Neon column-wise addmul_4
Richard Henderson
rth at twiddle.net
Tue Feb 26 17:10:12 CET 2013
On 02/26/2013 05:14 AM, Niels Möller wrote:
> Untried tricks: One could try to use vuzp to separate high and low
> parts of the products. Then only the low parts need shifting around.
> I guess I'll try that with addmul_4 first, to see if it makes for any
> improvement. One could maybe use vaddw, to delay adding in one of the
> carry limbs, reducing the recurrency to only vuzp, vaddw (but if the
> recurrency isn't the bottleneck, that won't help).
For discussion. FWIW, this comes in at 4.0 c/l. That's slower than the
numbers from the previously posted row-wise addmul_4, but I'd unrolled that one
4 times, so it's not quite a fair comparison. That said...
From Niels' message down-thread:
> 2 vmlal
> 2 vext
> 2 vpaddl
> 2 ld1 (scalar)
> 1 st1 (scalar)
> 1 subs
> 1 bne
---
11
Here:
3 vadd
2 vmull
2 vpadal
2 ld1
1 vpaddl
1 vuzp
1 vext
1 st1
1 subs
1 bne
---
15
It requires 4 more insns per loop. That should not be surprising, since we're
producing a single folded carry term per loop rather than keeping the carry
distributed across the elements of a vector.
Because the carry is consolidated, we cannot use vmlal: the carry may be as
large as 35 bits, so we must wait until the product has been decomposed into
high and low halves before adding the carry in, to avoid losing data.
r~
dnl ARM neon mpn_addmul_4.
dnl
dnl Copyright 2013 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
.fpu neon
.arch armv6t2
.arm
C cycles/limb: approximately 4.0 (see the message above)
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')
define(`Qu03', `q0') C v4si of four multiplicand limbs
define(`Du01', `d0') C v2si aliases
define(`Du23', `d1')
define(`Dv10', `d2') C v2si of two multiplier limbs
define(`Dv32', `d3') C ... 2nd
define(`Da0', `d4') C one zero-extended addend limb
define(`Dsum', `d5') C Three names for the same register, used in the
define(`Dcin', `d5') C ... appropriate context for clarity regarding
define(`Dcout', `d5') C ... the role it's currently playing in the round.
define(`Qu4', `q3') C one multiplicand limb, viewed inside v4si
define(`Du4', `d6') C ... or v2si.
define(`Qp01', `q8') C v2di of product
define(`Dp0', `d16') C v1di (scalar) aliases of Qp01
define(`Dp1', `d17')
define(`Qp23', `q9') C ... 2nd
define(`Dp2', `d18')
define(`Dp3', `d19')
define(`Qzero', `q15') C The zero constant.
define(`Dzero', `d31')
ASM_START()
PROLOGUE(mpn_addmul_4)
vld1.32 {Dv10, Dv32}, [vp] @ load v0-v3
mov vp, rp @ read from vp, write to rp
vld1.32 {Du01, Du23}, [up]! @ load u0-u3
vld1.32 {Da0}, [vp]! @ load a0-a1
vmov.i32 Qzero, #0
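@ Strategy: each round computes one result limb.  sum = carry-in
@ + the next addend limb (while any remain) + the low halves of the
@ products landing in this column; the low 32 bits of sum are stored,
@ and hi(sum) plus the products' high halves are folded into a single
@ carry (up to 35 bits) for the next round.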
@ Column 0: only consider the U0*V0 product.
vmull.u32 Qp01, Du01, Dv10 @ v2di{ u0*v0, ignored }
vext.32 Dv32, Dv32, Dv32, #1 @ arrange v0-v3 as v3-v0
vext.32 Dv10, Dv10, Dv10, #1
vaddw.u32 Qp01, Qp01, Da0 @ v2di{ u0*v0+a0, ignored }
vshr.u64 Dcout, Dp0, #32 @ cout = hi(p0)
vshr.u64 Da0, Da0, #32 @ zero-extend a1
vst1.32 {Dp0[0]}, [rp]! @ output lo(p0)
@ Column 1: only consider the U1*V0 and U0*V1 products.
vmull.u32 Qp01, Du01, Dv10 @ v2di{ u0*v1, v0*u1 }
vadd.i64 Dsum, Dcin, Da0 @ sum = cin + a1
vld1.32 {Da0[0]}, [vp]! @ load (zero-extended) a2
@ Reorganize the high and low halves of the 64-bit products:
@ Dp0 = v1di{ u0*v1 } = v2si{ a, b }
@ Dp1 = v1di{ u1*v0 } = v2si{ x, y }
vuzp.32 Dp0, Dp1
@ Dp0 = v2si{ a, x } = { lo(u0*v1), lo(u1*v0) }
@ Dp1 = v2si{ y, b } = { hi(u1*v0), hi(u0*v1) }
vpadal.u32 Dsum, Dp0 @ sum = cin + a1 + a + x
vst1.32 {Dsum[0]}, [rp]! @ output lo(sum)
vshr.u64 Dcout, Dsum, #32 @ cout = hi(sum)
vpadal.u32 Dcout, Dp1 @ cout = hi(sum) + y + b
@ Column 2: We have 3 products, which is awkward to handle on its
@ own, so we drop into the main loop with a zero shifted in to
@ nullify the multiplication with v3.
vshr.u64 Du4, Du23, #32 @ v2si{ u3, 0 }
vext.32 Qu03, Qzero, Qu03, #3 @ v4si{ 0, u0, u1, u2 }
@ We've done two rounds so far. Since we load data for round N+1
@ inside the loop, make sure we exit the loop one round early.
sub n, n, #2+1
ALIGN(16)
L(loop):
@ 4 x 1 multiplies, all producing results in the same column.
vmull.u32 Qp01, Du01, Dv32
vadd.i64 Dsum, Dcin, Da0 @ sum = cin + a0
vld1.32 {Da0[0]}, [vp]! @ load next A limb
vmull.u32 Qp23, Du23, Dv10
vext.32 Qu03, Qu03, Qu4, #1 @ { u4, u3, u2, u1 }
vld1.32 {Du4[0]}, [up]! @ load next U limb
@ Reorganize the high and low halves of the 64-bit products:
@ Qp01 = v2di{ u0*v3, u1*v2 } = v4si{ a, b, c, d }
@ Qp23 = v2di{ u2*v1, u3*v0 } = v4si{ w, x, y, z }
vuzp.32 Qp01, Qp23
@ Qp01 = v4si{ x, z, b, d } = v4si{ lo(u2*v1), ..., lo(u1*v2) }
@ Qp23 = v4si{ w, y, a, c } = v4si{ hi(u2*v1), ..., hi(u1*v2) }
@ Independently sum the high and low portions.
vpadal.u32 Dsum, Dp0 @ sum += x + z
vpaddl.u32 Qp23, Qp23 @ v2di{ w+y, a+c }
vpadal.u32 Dsum, Dp1 @ sum += b + d
vadd.i64 Dp2, Dp2, Dp3 @ p2 = w+y+a+c
vst1.i32 {Dsum[0]}, [rp]! @ output lo(sum)
vshr.u64 Dcout, Dsum, #32 @ cout = hi(sum)
subs n, n, #1
vadd.i64 Dcout, Dp2 @ cout = hi(sum) + w+y+a+c
bne L(loop)
@ Column N: Four products, but no more data to prefetch.
vmull.u32 Qp01, Du01, Dv32
vadd.i64 Dsum, Dcin, Da0 @ sum = cin + a0
vmull.u32 Qp23, Du23, Dv10
vext.32 Qu03, Qu03, Qzero, #1 @ { 0, u3, u2, u1 }
vuzp.32 Qp01, Qp23 @ separate hi's and lo's
vpadal.u32 Dsum, Dp0 @ sum += x + z
vpaddl.u32 Qp23, Qp23 @ v2di{ w+y, a+c }
vpadal.u32 Dsum, Dp1 @ sum += b + d
vadd.i64 Dp2, Dp2, Dp3 @ p2 = w+y+a+c
vst1.i32 {Dsum[0]}, [rp]! @ output lo(sum)
vshr.u64 Dcout, Dsum, #32 @ cout = hi(sum)
vadd.i64 Dcout, Dp2 @ cout = hi(sum) + w+y+a+c
@ Column N+1: Three products. We shifted in a zero so that
@ we could continue to use the four product pattern.
vmull.u32 Qp01, Du01, Dv32
vmull.u32 Qp23, Du23, Dv10
vext.32 Du23, Du01, Du23, #1 @ rebuild { u2, u3 }
@ Qp01 = v2di{ u0*v3, u1*v2 } = v4si{ a, b, c, d }
@ Qp23 = v2di{ u2*v1, 0 } = v4si{ w, x, 0, 0 }
vuzp.32 Qp01, Qp23
@ Qp01 = { x, 0, b, d }, Qp23 = { w, 0, a, c }
vpadal.u32 Dsum, Dp0 @ sum += x + z
vpaddl.u32 Qp23, Qp23 @ v2di{ w, a+c }
vpadal.u32 Dsum, Dp1 @ sum += b + d
vadd.i64 Dp2, Dp2, Dp3 @ p2 = w+a+c
vst1.i32 {Dsum[0]}, [rp]! @ output lo(sum)
vshr.u64 Dcout, Dsum, #32 @ cout = hi(sum)
vadd.i64 Dcout, Dp2 @ cout = hi(sum) + w+a+c
@ Column N+2: only consider the U3*V2 and U2*V3 products.
vmull.u32 Qp01, Du23, Dv32
vuzp.32 Dp0, Dp1 @ separate hi's and lo's
vpadal.u32 Dsum, Dp0 @ sum = cin + a + x
vpaddl.u32 Dp1, Dp1 @ p1 = b + y
vst1.32 {Dsum[0]}, [rp]! @ output lo(sum)
vsra.u64 Dp1, Dsum, #32 @ p1 is cout this round
@ Column N+3: only consider the U3*V3 product.
vmlal.u32 Qp01, Du23, Dv32[0] @ v2di{ ignored, u3*v3+cin }
vst1.32 {Dp1[0]}, [rp]! @ output lo(p1)
vmov.32 r0, Dp1[1] @ return hi(p1)
bx lr
EPILOGUE()