Neon addmul_8
Richard Henderson
rth at twiddle.net
Sun Feb 24 02:38:32 CET 2013
gcc -O2 -g3 [...] addmul_N.c -DN=8 -DCLOCK=1694000000
$ ./t.out
mpn_addmul_8: 2845ms (1.782 cycles/limb) [973.59 Gb/s]
mpn_addmul_8: 2620ms (1.641 cycles/limb) [1057.20 Gb/s]
mpn_addmul_8: 2625ms (1.644 cycles/limb) [1055.19 Gb/s]
mpn_addmul_8: 2625ms (1.644 cycles/limb) [1055.19 Gb/s]
mpn_addmul_8: 2625ms (1.644 cycles/limb) [1055.19 Gb/s]
(Clearly there's something wrong with the bandwith calculation. ;-)
It's somewhat unfortunate that neon load/store insns are not
conditional. I've made do with conditionally loading limbs into core
registers and copying them over.
It might be worth a try having two copies of this main loop, one in
which there are more than 8 limbs remaining (requiring loads of the 9th
RP limb), and one for the final 8 limbs. But I don't have time today,
and my suspicion is that you'll get no better than 1.60 cyc/limb, if
indeed one can measure any improvement at all.
Any real improvement on this routine will have to be algorithmic, using
a different mechanism to compute and apply carries. And that's where
it's likely someone else that's going to have to be clever.
What sizes are really interesting for addmul decomposition?
Would it be useful do a prime like addmul_7? It would run at pretty
much the same speed as addmul_8, since nearly the same insns would be
used. Just the last vector would be a D register instead of a Q register.
There's still lots of free registers -- over half in fact. The same
scheme would still easily fit k=16 without spilling. But given the
distance between dependencies already I can't imagine but that we're
near the point of diminishing returns.
r~
-------------- next part --------------
dnl ARM neon mpn_addmul_8.
dnl
dnl Copyright 2013 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
.fpu neon
.arch armv6t2
.arm
C cycles/limb
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')
define(`Dv01', `d0') C v2si of two multiplier limbs
define(`Dv23', `d1') C ... 2nd
define(`Dv45', `d2') C ... 3rd
define(`Dv67', `d3') C ... 4th
define(`Du0', `d4') C v2si of one replicated multiplicand limb
define(`Qa9', `q3') C v4si with [0] containing 9th addend limb
define(`Da9', `d6')
define(`Qc01', `q8') C v2di of product or carry
define(`Dc0', `d16') C v1di (scalar) aliases of Qc01
define(`Dc1', `d17')
define(`Qc23', `q9') C ... 2nd
define(`Dc2', `d18')
define(`Dc3', `d19')
define(`Qc45', `q10') C ... 3rd
define(`Dc4', `d20')
define(`Dc5', `d21')
define(`Qc67', `q11') C ... 4th
define(`Dc6', `d22')
define(`Dc7', `d23')
ASM_START()
PROLOGUE(mpn_addmul_8)
vld1.32 {Dc0, Dc1, Dc2, Dc3}, [rp] @ load 8 rp limbs
cmp n, #8 @ more than 8 limbs?
vld1.32 {Dv01, Dv23, Dv45, Dv67}, [vp] @ load all vp limbs
vld1.32 {Du0[0]}, [up] @ load first up
ldr ip, [up, #4] @ load 2nd up limb
ldrgt r3, [rp, #32] @ maybe load 9th rp
add up, up, #8
moveq r3, #0 @ maybe zero 9th rp
vmov.i32 Qa9, #0 @ clear addend input
vmovl.u32 Qc67, Dc3 @ extend rp limbs to carry-in
vmovl.u32 Qc45, Dc2
vmovl.u32 Qc23, Dc1
vmovl.u32 Qc01, Dc0
.balign 16
@ Main loop -- 8 x 1 multiplications
.Loop:
vmlal.u32 Qc01, Dv01, Du0[0] @ compute all products
vmov.32 Da9[0], r3 @ move 9th rp into place
vmlal.u32 Qc23, Dv23, Du0[0]
cmp n, #9 @ more than 8 limbs left?
vmlal.u32 Qc45, Dv45, Du0[0]
ldrgt r3, [rp, #36] @ maybe load next rp limb
vmlal.u32 Qc67, Dv67, Du0[0]
movle r3, #0 @ maybe zero next rp limb
vmov.32 Du0[0], ip @ move next up limb into place
subs n, n, #1
vst1.u32 {Dc0[0]}, [rp]! @ store lowest product limb
ldrne ip, [up], #4 @ maybe load next up limb
vext.32 Qc01, Qc01, Qc23, #1 @ rotate carry down one
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Qc67, Qc67, Qa9, #1 @ rotate in 9th addend or 0.
vpaddl.u32 Qc01, Qc01 @ fold carries
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Qc67, Qc67
bne .Loop
@ Done with all products. Finish propagating carries,
@ Store seven carry limbs and return the eighth.
@ c0-c7 = { c7l, c6, c5, c4, c3, c2, c1, c0 }
vst1.32 {Dc0[0]}, [rp]! @ one...
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Dc6, Dc6, Dc7, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Dc6, Dc6
@ c0-c7 = { 0, c6l, c5, c4, c3, c2, c1, c0 }
vst1.32 {Dc0[0]}, [rp]! @ two...
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
@ c0-c7 = { 0, 0, c5l, c4, c3, c2, c1, c0 }
vst1.32 {Dc0[0]}, [rp]! @ three...
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Dc4, Dc4, Dc5, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Dc4, Dc4
@ c0-c7 = { 0, 0, 0, c4l, c3, c2, c1, c0 }
vst1.32 {Dc0[0]}, [rp]! @ four...
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
@ c0-c7 = { 0, 0, 0, 0, c3l, c2, c1, c0 }
vst1.32 {Dc0[0]}, [rp]! @ five...
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Dc2, Dc2, Dc3, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Dc2, Dc2
@ c0-c7 = { 0, 0, 0, 0, 0, c2l, c1, c0 }
vst1.32 {Dc0[0]}, [rp]! @ six...
vext.32 Qc01, Qc01, Qc23, #1
vpaddl.u32 Qc01, Qc01
@ c0-c7 = { 0, 0, 0, 0, 0, 0, c1l, c0 }
vst1.32 {Dc0[0]}, [rp]! @ seven...
vext.32 Dc0, Dc0, Dc1, #1
vpaddl.u32 Dc0, Dc0
@ c0-c7 = { 0, 0, 0, 0, 0, 0, 0, c0l }
vmov.32 r0, Dc0[0] @ eight...
bx lr
EPILOGUE()
More information about the gmp-devel
mailing list