arm "neon"

Fri Feb 22 05:52:58 CET 2013

On 2013-02-21 06:28, Torbjorn Granlund wrote:
> I'd advise strongly against that.  Creating hard-to-trigger carry
> propagation bugs is not unlikely when playing with these primitives, and
> addmul_N.c will be much better at finding these, and will also shorten
> your development cycle (fast results, no need for make && make check).
> It will also print a diff table which makes debugging easier.
>
> Typical command:
>
> gmpsrc=[...]
> gcc -O1 -I. -I$gmpsrc $gmpsrc/tests/devel/addmul_N.c -DN=2 -DTIMES \
> -DCLOCK=3200000000 .libs/libgmp.a tests/.libs/libtests.a && ./a.out
>

Indeed, the last version that Niels posted doesn't pass this test.

The following does pass, but if I'm to believe the arithmetic it's still
fairly slow -- around 12 cycles/limb.

If one is even more clever than I, one could do a 4x unroll, making best 
use of vld4.  But when you do that, getting the carries right becomes 
even more tricky.  But I think any correct solution will involve chains 
of vsra to shift and add up the chain.

r~
-------------- next part --------------
dnl  ARM neon mpn_addmul_2.

dnl  Contributed to the GNU project by Niels Möller

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arch	armv6t2
	.arm

C	     cycles/limb

C Integer register aliases.  These are the four arguments of mpn_addmul_2.
C Note that vp is only read once (to fetch v0 and v1); it is then overwritten
C with a copy of rp and used as the read pointer into the destination, while
C rp itself is used as the write pointer.
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C NEON register aliases.  Names with two digits hold two 32-bit values in
C one d register, or two 64-bit lanes in one q register.
define(`v01', `d0')	C low half of q0: the two multiplier limbs v0, v1
define(`l01', `d1')	C high half of q0: two limbs loaded from the destination
define(`u00', `d2')	C low half of q1: u0 replicated into both 32-bit lanes
define(`u11', `d3')	C high half of q1: u1 replicated into both 32-bit lanes
define(`c01', `q2')	C d4, d5: the 64-bit accumulators c0 and c1
define(`c0', `d4')
define(`c1', `d5')
define(`c12', `q3')	C d6, d7: scratch copy of low(c1), and accumulator c2
define(`c1p', `d6')
define(`c2', `d7')
C ext is the mask 0x00000000ffffffff in each 64-bit lane (built in the
C function prologue).  It lives in d8/d9, which the function saves with vpush
C and restores with vpop.
define(`ext', `q4')
define(`ext0', `d8')
define(`ext1', `d9')

C
C	      c1 c0
C	      r1 c0'
C	      u0*v0
C	   u0*v1
C	------------
C	   c1 c0 r0
C	      c0'

C SAVE_AND_CARRY: emit one finished result limb and slide the carry window
C down by one limb.
C
C The running sum is c0 + c1*2^32 + c2*2^64, where c0 and c1 are 64-bit
C lanes that may each hold more than 32 bits.  The macro stores the low 32
C bits of c0 at [rp] (post-incrementing rp) and then divides the running sum
C by 2^32: on exit c0 and c1 are each below 2^32 and c2 holds whatever is
C left above them.
C
C Requires ext (0x00000000ffffffff in both lanes) to have been set up by the
C caller.  Clobbers c1p.  Consists only of NEON data operations and a store,
C so the integer condition flags are left untouched.
.macro	SAVE_AND_CARRY

	vand		c1p, c1, ext0		@ c1p = low 32 bits of c1
	vst1.32		{c0[0]}, [rp]!		@ store low(c0), advance rp by one limb
	vsra.u64	c12, c01, #32		@ c1p += c0 >> 32;  c2 += c1 >> 32
	vsra.u64	c2, c1p, #32		@ c2 += c1p >> 32 (carry out of the new low limb)
	vand		c01, c12, ext		@ c0 = low(c1p);  c1 = low(c2)
	vshr.u64	c2, c2, #32		@ c2 keeps only what overflowed the new c1

.endm

C mpn_addmul_2 (rp, up, n, vp)
C
C Computes {rp,n} += {up,n} * {vp,2} on 32-bit limbs.  The n+1 low limbs of
C the result are written to rp[0] .. rp[n]; the most significant limb is
C returned in r0.
C
C On entry:  r0 = rp, r1 = up, r2 = n, r3 = vp.
C Clobbers:  r0-r3, the integer flags, d0-d7.  d8 and d9 hold the 32-bit
C            mask and are saved and restored here.
C
C Structure: two source limbs are consumed per loop iteration.  After each
C multiply-accumulate SAVE_AND_CARRY writes one finished limb and shifts the
C three-limb carry window (c0, c1, c2) down by one limb.  An odd n leaves one
C source limb for the tail.
C
C NOTE(review): the first vld2/vld1 read two limbs from up and from rp
C unconditionally and the loop body consumes both, so this looks valid only
C for n >= 2 -- confirm against the caller contract.
ASM_START()
PROLOGUE(mpn_addmul_2)
	vpush		{ d8, d9 }		@ the mask register q4 is d8/d9
	vld1.32		{v01}, [vp]		@ fetch both multiplier limbs v0, v1
	mov		vp, rp			@ load from vp, store to rp
	vld2.32		{u00[], u11[]}, [up]!	@ load and replicate u0 and u1
	vmov.i64	c01, #0
	vld1.32		{l01}, [vp]!		@ load l0 and l1
	vmov.i64	c12, #0
	pld		[up, #64-8]		@ prefetch 1 cacheline out
	vmov.i32	ext0, #-1
	pld		[vp, #64-4]
	vmovl.u32	ext, ext0		@ 0x00000000ffffffff*2
	sub		n, n, #1		@ exit loop early for odd n
	b		.Lentry

	@ Rotated main loop; all original inputs have been consumed.
	.balign		16
.Loop:
	vld2.32		{u00[], u11[]}, [up]!	@ load for next iteration
	vld1.32		{l01}, [vp]!
	SAVE_AND_CARRY

.Lentry:
	@ The values in each element of c012 should be no larger than 32-bit,
	@ so that we can perform the multiply and two additions without
	@ carry-out of the 64-bit element.
	vmlal.u32	c01, u00, v01		@ { u0*vN+cN }
	subs		n, n, #2
	pld		[up, #128-16]		@ prefetch 2 cachelines out
	vaddw.u32	c01, c01, l01		@ { u0*vN+cN+lN }
	pld		[vp, #128-8]
	SAVE_AND_CARRY
	vmlal.u32	c01, u11, v01		@ { u1*vN }
	bgt		.Loop

	@ Tail of the main loop.  Finish computing c012.
	SAVE_AND_CARRY

	@ Here N = 0 if we originally had an odd number of limbs, or -1
	@ if we originally had an even number of limbs.  The flags are
	@ still set up from the subtract above; SAVE_AND_CARRY does not
	@ modify them.
	bne		1f

	@ Here we have 1 limb left to multiply and sum.
	vld1.32		{u00[]}, [up]
	vmov.i32	l01, #0			@ only one destination limb remains
	vld1.32		{l01[0]}, [vp]
	vmlal.u32	c01, u00, v01
	vaddw.u32	c01, c01, l01
	@ The accumulators are wider than 32 bits again, so one more result
	@ limb has to be written and the window moved down before the final
	@ carry is stored.  Without this step the odd case wrote one limb
	@ too few and returned the limb that belongs in memory.
	SAVE_AND_CARRY
1:
	@ Both paths arrive with the two carry limbs already reduced below
	@ 2^32.  Store the lower one and return the upper one.
	vst1.32		{c0[0]}, [rp]
	vsra.u64	c1, c0, #32		@ adds zero: high half of the low carry is clear
	vmov.32		r0, c1[0]
	vpop		{ d8, d9 }
	bx		lr

EPILOGUE()