arm "neon"
Richard Henderson
rth at twiddle.net
Sat Feb 23 01:05:13 CET 2013
On 02/22/2013 12:08 PM, Richard Henderson wrote:
> Perhaps I should give this another go...
Down to 5.8 cyc/limb. Good, but not fantastic. I'm gonna try one more time
with larger unrolling to make full use of the vector load insns, and less
over-prefetching.
I guess the target is anything under 2.5 cyc/limb, against the armv6 integer
version?
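
For reference, here is a plain C sketch of what mpn_addmul_2 has to compute
with 32-bit limbs (my reading of the usual mpn conventions, not GMP's
reference code); the two 64-bit accumulators c0/c1 play the same role as
Dc0/Dc1 in the attached asm:

  #include <stdint.h>
  #include <stddef.h>

  /* Compute {rp,n} + {up,n} * {vp,2}; write the low n+1 limbs of the sum
     to rp[0..n] and return the top limb.  Assumes n >= 2.  */
  static uint32_t
  ref_addmul_2 (uint32_t *rp, const uint32_t *up, size_t n, const uint32_t *vp)
  {
    uint64_t v0 = vp[0], v1 = vp[1];
    uint64_t c0 = rp[0], c1 = rp[1];        /* a0, a1 seed the accumulators */
    size_t i;

    for (i = 0; i < n; i++)
      {
        uint64_t u = up[i];
        uint64_t a = (i + 2 < n) ? rp[i + 2] : 0;   /* next addend limb */

        c0 += u * v0;                       /* columns i and i+1 */
        c1 += u * v1;                       /* columns i+1 and i+2 */
        rp[i] = (uint32_t) c0;              /* column i is final */
        c0 = (c0 >> 32) + (uint32_t) c1;    /* rotate: carry into column i+1 */
        c1 = (c1 >> 32) + a;
      }

    rp[n] = (uint32_t) c0;                  /* low carry-out limb */
    return (uint32_t) ((c0 >> 32) + c1);    /* high carry-out limb */
  }

If I'm reading mul_basecase right, the caller does
rp[n + 1] = mpn_addmul_2 (rp, up, n, vp), stepping vp and rp by two limbs
per pass.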
r~
-------------- next part --------------
dnl ARM neon mpn_addmul_2.
dnl
dnl Copyright 2013 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
.fpu neon
.arch armv6t2
.arm
C cycles/limb
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')
define(`Da23', `d0') C v2si of two addend limbs
define(`Qa23', `q0') C v4si alias of Da23 with garbage in high half
define(`Dv01', `d1') C v2si of the two multiplier limbs
define(`Du00', `d2') C v2si of one multiplicand limb, duplicated
define(`Du11', `d3') C v2si of second multiplicand limb
define(`Qc01', `q2') C v2di of product or carry
define(`Dc0', `d4') C v1di (scalar) aliases of Qc01
define(`Dc1', `d5')
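C The two 64-bit lanes of Qc01 hold the next two result limbs as column
C sums.  Each per-limb step does c0 += u*v0, c1 += u*v1 (vmlal), stores the
C low 32 bits of c0, then rotates the pipeline with vext/vpaddl:
C   c0 = (c0 >> 32) + low32(c1),  c1 = (c1 >> 32) + next addend limb.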
ASM_START()
PROLOGUE(mpn_addmul_2)
vmov.i32 Qc01, #0 @ clear carry in
vld2.32 {Du00[], Du11[]}, [up]! @ load and duplicate u0 and u1
vld2.32 {Dc0[0], Dc1[0]}, [rp] @ load a0 and a1 as carry-in
vld1.32 {Dv01}, [vp] @ load v0 and v1
subs n, n, #3 @ less than 4 limbs?
add vp, rp, #8 @ read from vp, write to rp
ble .Lsmall
vld1.32 {Da23}, [vp]! @ load a2 and a3
pld [up, #64-8] @ prefetch line + 1
pld [rp, #64]
pld [up, #128-8] @ prefetch line + 2
pld [rp, #128]
pld [up, #192-8] @ prefetch line + 3
pld [rp, #192]
b .Lentry
.balign 16
@ Rotated main loop tail. We branch here as soon as all U inputs
@ are consumed, so that we can schedule loads for the next round.
.Loop:
vld2.32 {Du00[], Du11[]}, [up]!
vst1.u32 {Dc0[0]}, [rp]!
vext.32 Qc01, Qc01, Qa23, #1
vld1.32 {Da23}, [vp]!
vpaddl.u32 Qc01, Qc01
@ Rotated main loop entry.
.Lentry:
vmlal.u32 Qc01, Du00, Dv01 @ v2di{ c1, c0 }
pld [up, #256-8] @ prefetch line + 4
vst1.u32 {Dc0[0]}, [rp]! @ output lowest in-flight limb
vext.32 Qc01, Qc01, Qa23, #1 @ v4si{ a2, c1h, c1l, c0h }
vshr.u64 Da23, Da23, #32 @ v2si{ 0, a3 }
vpaddl.u32 Qc01, Qc01 @ v2di{ a2+c1h, c1l+c0h }
pld [rp, #256-4] @ prefetch line + 4
subs n, n, #2
vmlal.u32 Qc01, Du11, Dv01
bgt .Loop
@ Rotated main loop tail.
vld2.32 {Du00[], Du11[]}, [up]! @ load last full pair of U
vst1.u32 {Dc0[0]}, [rp]!
vext.32 Qc01, Qc01, Qa23, #1
vpaddl.u32 Qc01, Qc01
@ Note that we began with a bias of -3 to N, so if we hit 0 exactly
@ we have three limbs left; the flags are still valid from above.
beq .Lfinish_three
@ We have two limbs left to process.
.Lfinish_two:
vmlal.u32 Qc01, Du00, Dv01 @ c0 += u*v0, c1 += u*v1
vmov.i32 Da23, #0 @ no more addend limbs
vst1.u32 {Dc0[0]}, [rp]!
vext.32 Qc01, Qc01, Qa23, #1 @ v4si{ 0, c1h, c1l, c0h }
vpaddl.u32 Qc01, Qc01
vmlal.u32 Qc01, Du11, Dv01 @ accumulate the last U limb
vst1.u32 {Dc0[0]}, [rp]!
vext.u32 Qc01, Qc01, Qa23, #1
vpaddl.u32 Qc01, Qc01
vst1.u32 {Dc0[0]}, [rp]! @ store low carry out
vext.32 Dc0, Dc0, Dc1, #1 @ v2si{ c1l, c0h }
vpaddl.u32 Dc0, Dc0
vmov.32 r0, Dc0[0] @ return high carry out
bx lr
@ We reach here if we began with N < 4.  We are guaranteed N >= 2, and
@ the flags from the subs above still hold: EQ means N == 3, LT means N == 2.
.Lsmall:
blt .Lfinish_two
@ We branch here from the main loop if we began with N >= 5 and N odd, and
@ fall through from .Lsmall when N == 3.  Either way, three limbs remain to process.
.Lfinish_three:
vld1.u32 {Da23[0]}, [vp] @ load last addend limb
vmlal.u32 Qc01, Du00, Dv01
vmov Du00, Du11 @ prepare to...
vld1.32 {Du11[]}, [up] @ load last U limb
vext.u32 Qc01, Qc01, Qa23, #1
vpaddl.u32 Qc01, Qc01
b .Lfinish_two
EPILOGUE()