arm "neon"

Richard Henderson rth at
Fri Feb 22 05:52:58 CET 2013

On 2013-02-21 06:28, Torbjorn Granlund wrote:
> I'd advise strongly against that.  Creating hard-to-trigger carry
> propagation bugs is not unlikely when playing with these primitives, and
> addmul_N.c will be much better at finding these, and will also shorten
> your development cycle (fast results, no need for make && make check).
> It will also print a diff table which makes debugging easier.
> Typical command:
> gmpsrc=[...]
> gcc -O1 -I. -I$gmpsrc $gmpsrc/tests/devel/addmul_N.c -DN=2 -DTIMES \
> -DCLOCK=3200000000 .libs/libgmp.a tests/.libs/libtests.a && ./a.out

Indeed, the last version that Niels posted doesn't pass this test.

The following does pass, but if I'm to believe the arithmetic it's still
fairly slow -- around 12 cycles/limb.

If one is even more clever than I, one could do a 4x unroll, making best 
use of vld4.  But when you do that, getting the carries right becomes 
even more tricky.  But I think any correct solution will involve chains 
of vsra to shift and add up the chain.

-------------- next part --------------
dnl  ARM neon mpn_addmul_2.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


	.fpu	neon
	.arch	armv6t2

C	     cycles/limb
define(`n', `r2')

define(`v01', `d0')	C q0
define(`l01', `d1')
define(`u00', `d2')	C q1
define(`u11', `d3')
define(`c01', `q2')	C d4, d5
define(`c0', `d4')
define(`c1', `d5')
define(`c12', `q3')	C d6, d7
define(`c1p', `d6')
define(`c2', `d7')
define(`ext', `q4')
define(`ext0', `d8')
define(`ext1', `d9')

C	      c1 c0
C	      r1 c0'
C	      u0*v0
C	   u0*v1
C	------------
C	   c1 c0 r0
C	      c0'

	vand		c1p, c1, ext0		@ copy low(c1) to c1'
	vst1.32		{c0[0]}, [rp]!		@ store low(c0)
	vsra.u64	c12, c01, #32		@ carry high parts up
	vsra.u64	c2, c1p, #32
	vand		c01, c12, ext		@ ... and shift lows down
	vshr.u64	c2, c2, #32


	vpush		{ d8, d9 }
	vld1.32		{v01}, [vp]
	mov		vp, rp			@ load from vp, store to rp
	vld2.32		{u00[], u11[]}, [up]!	@ load and replicate u0 and u1
	vmov.i64	c01, #0
	vld1.32		{l01}, [vp]!		@ load l0 and l1
	vmov.i64	c12, #0
	pld		[up, #64-8]		@ prefetch 1 cacheline out
	vmov.i32	ext0, #-1
	pld		[vp, #64-4]
	vmovl.u32	ext, ext0		@ 0x00000000ffffffff*2
	sub		n, n, #1		@ exit loop early for odd n
	b		.Lentry

	@ Rotated main loop; all original inputs have been consumed.
	.balign		16
	vld2.32		{u00[], u11[]}, [up]!	@ load for next iteration
	vld1.32		{l01}, [vp]!

	@ The values in each element of c012 should be no larger than 32-bit,
	@ so that we can perform the multiply and two additions without
	@ carry-out of the 64-bit element.
	vmlal.u32	c01, u00, v01		@ { u0*vN+cN }
	subs		n, n, #2
	pld		[up, #128-16]		@ prefetch 2 cachlines out
	vaddw.u32	c01, c01, l01		@ { u0*vN+cN+lN }
	pld		[vp, #128-8]
	vmlal.u32	c01, u11, v01		@ { u1*vN }
	bgt		.Loop

	@ Tail of the main loop.  Finish computing c012.

	@ Here N = 0 if we originally had an odd number of limbs, or -1
	@ if we originally had an even number of limbs.  The flags are
	@ still set up from the subtract above.
	bne		1f

	@ Here we have 1 limb left to multiply and sum.
	vld1.32		{u00[]}, [up]
	vmov.i32	l01, #0
	vld1.32		{l01[0]}, [vp]
	vmlal.u32	c01, u00, v01
	vaddw.u32	c01, c01, l01
	@ Here we only have the c01 carry remaining.  Store it.
	vst1.32		{c0[0]}, [rp]
	vsra.u64	c1, c0, #32
	vmov.32		r0, c1[0]
	vpop		{ d8, d9 }
	bx		lr


More information about the gmp-devel mailing list