arm "neon"
Richard Henderson
rth at twiddle.net
Fri Feb 22 05:52:58 CET 2013
On 2013-02-21 06:28, Torbjorn Granlund wrote:
> I'd advise strongly against that. Creating hard-to-trigger carry
> propagation bugs is not unlikely when playing with these primitives;
> addmul_N.c will be much better at finding them, and will also shorten
> your development cycle (fast results, no need for make && make check).
> It will also print a diff table which makes debugging easier.
>
> Typical command:
>
> gmpsrc=[...]
> gcc -O1 -I. -I$gmpsrc $gmpsrc/tests/devel/addmul_N.c -DN=2 -DTIMES \
> -DCLOCK=3200000000 .libs/libgmp.a tests/.libs/libtests.a && ./a.out
>
Indeed, the last version that Niels posted doesn't pass this test.
The following does pass, but if I'm to believe the arithmetic it's still
fairly slow -- around 12 cycles/limb.
If one is even more clever than I, one could do a 4x unroll, making best
use of vld4. But when you do that, getting the carries right becomes
even trickier. I think any correct solution will involve chains of
vsra to shift the carries up and add them along the chain.
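(For illustration only -- this sketch is mine, not part of the patch: a
scalar C model of such a vsra carry chain. Here acc[i] stands for the
64-bit lane holding limb i plus whatever has accumulated above 32 bits;
one pass of shift-right-and-accumulate folds each lane's high half into
the lane above, as a chain of vsra.u64 #32 would.)

#include <stdint.h>

/* Scalar model of a chain of vsra.u64 #32 steps; the name and
   interface are hypothetical, not taken from the patch.  */
static void
carry_chain (uint64_t *acc, int n)
{
  int i;
  for (i = 0; i + 1 < n; i++)
    {
      acc[i + 1] += acc[i] >> 32;   /* vsra: shift right, accumulate */
      acc[i] &= 0xffffffff;         /* vand: keep the low 32 bits */
    }
}

/* Afterwards acc[0] .. acc[n-2] are normalized 32-bit limbs and
   acc[n-1] holds whatever carry remains.  */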
r~
-------------- next part --------------
dnl ARM neon mpn_addmul_2.
dnl Contributed to the GNU project by Niels Möller
dnl Copyright 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
.fpu neon
.arch armv6t2
.arm
C cycles/limb
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')
define(`v01', `d0') C q0
define(`l01', `d1')
define(`u00', `d2') C q1
define(`u11', `d3')
define(`c01', `q2') C d4, d5
define(`c0', `d4')
define(`c1', `d5')
define(`c12', `q3') C d6, d7
define(`c1p', `d6')
define(`c2', `d7')
define(`ext', `q4')
define(`ext0', `d8')
define(`ext1', `d9')
C
C           c1 c0
C           r1 c0'
C           u0*v0
C        u0*v1
C     ------------
C        c1 c0 r0
C           c0'
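C (Each column above is one 32-bit limb position; the 64-bit
C products u0*v0 and u0*v1 each span two adjacent columns.)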
.macro SAVE_AND_CARRY
vand c1p, c1, ext0 @ copy low(c1) to c1'
vst1.32 {c0[0]}, [rp]! @ store low(c0)
vsra.u64 c12, c01, #32 @ carry high parts up
vsra.u64 c2, c1p, #32
vand c01, c12, ext @ ... and shift lows down
vshr.u64 c2, c2, #32
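@ On exit, c0 and c1 again hold at most 32 significant bits each;
@ c2 keeps the residual carry for the next round.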
.endm
ASM_START()
PROLOGUE(mpn_addmul_2)
vpush { d8, d9 }
vld1.32 {v01}, [vp]
mov vp, rp @ load from vp, store to rp
vld2.32 {u00[], u11[]}, [up]! @ load and replicate u0 and u1
vmov.i64 c01, #0
vld1.32 {l01}, [vp]! @ load l0 and l1
vmov.i64 c12, #0
pld [up, #64-8] @ prefetch 1 cacheline out
vmov.i32 ext0, #-1
pld [vp, #64-4]
vmovl.u32 ext, ext0 @ 0x00000000ffffffff*2
sub n, n, #1 @ exit loop early for odd n
b .Lentry
@ Rotated main loop; all original inputs have been consumed.
.balign 16
.Loop:
vld2.32 {u00[], u11[]}, [up]! @ load for next iteration
vld1.32 {l01}, [vp]!
SAVE_AND_CARRY
.Lentry:
@ The value in each element of c012 should be no wider than 32 bits,
@ so that we can perform the multiply and two additions without
@ carry-out of the 64-bit element.
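@ (Worst case: (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1, which just fits,
@ so the 64-bit lane cannot overflow.)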
vmlal.u32 c01, u00, v01 @ { u0*vN+cN }
subs n, n, #2
pld [up, #128-16] @ prefetch 2 cachelines out
vaddw.u32 c01, c01, l01 @ { u0*vN+cN+lN }
pld [vp, #128-8]
SAVE_AND_CARRY
vmlal.u32 c01, u11, v01 @ { u1*vN }
bgt .Loop
@ Tail of the main loop. Finish computing c012.
SAVE_AND_CARRY
@ Here n == 0 if we originally had an odd number of limbs, or -1
@ if we originally had an even number of limbs. The flags are
@ still set up from the subtract above.
bne 1f
@ Here we have 1 limb left to multiply and sum.
vld1.32 {u00[]}, [up]
vmov.i32 l01, #0
vld1.32 {l01[0]}, [vp]
vmlal.u32 c01, u00, v01
vaddw.u32 c01, c01, l01
1:
@ Here we only have the c01 carry remaining. Store it.
vst1.32 {c0[0]}, [rp] @ store low(c0)
vsra.u64 c1, c0, #32 @ fold high(c0) into c1
vmov.32 r0, c1[0] @ return the final carry limb
vpop { d8, d9 }
bx lr
EPILOGUE()