Neon addmul_4
Richard Henderson
rth at twiddle.net
Sat Feb 23 21:59:48 CET 2013
Down to 2.8-3.0 cyc/limb.
r~
-------------- next part --------------
dnl ARM neon mpn_addmul_4.
dnl
dnl Copyright 2013 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
.fpu neon
@ NOTE(review): NEON normally implies ARMv7-A; `.arch armv6t2` here is
@ presumably just to satisfy the assembler for the non-NEON instructions
@ used (e.g. `add pc, ...`) -- confirm against the GMP build setup.
.arch armv6t2
.arm
C cycles/limb
C This implementation: roughly 2.8-3.0 c/l (per the posting header).
C Argument registers (standard AAPCS: r0-r3 carry the four parameters).
define(`rp',`r0') C result/addend pointer (read a-limbs, write products)
define(`up',`r1') C multiplicand vector pointer
define(`n', `r2') C limb count
define(`vp',`r3') C pointer to the four multiplier limbs v0-v3
C NEON register aliases.  Q registers are documented as vectors of the
C element type they hold; the D aliases name their two 64-bit halves.
define(`Du01', `d0') C v2si of two multiplicand limbs
define(`Du23', `d1') C ... 2nd
define(`Qa47', `q1') C v4si of addend limbs
define(`Da45', `d2')
define(`Da67', `d3')
define(`Qc01', `q2') C v2di of product or carry
define(`Dc0', `d4') C v1di (scalar) aliases of Qc01; Dc0[0] is the resolved low limb
define(`Dc1', `d5')
define(`Qc23', `q3') C ... 2nd
define(`Dc2', `d6')
define(`Dc3', `d7')
define(`Dv01', `d16') C v2si of two multiplier limbs
define(`Dv23', `d17') C ... 2nd
define(`Do0', `d18') C scalar output limbs, queued
define(`Do1', `d19')
define(`Do2', `d20')
define(`Do3', `d21')
define(`Qzero', `q11') C constant zero, shifted into Qa47 once addends run out
ASM_START()
C mpn_addmul_4(rp, up, n, vp)
C
C Multiply the n-limb vector at up[] by the four limbs at vp[], accumulate
C into rp[], and return a final carry limb in r0 after storing three carry
C limbs at the end of rp[] (see .Lfinish).
C
C Technique: the four 64-bit column accumulators live in Qc01/Qc23.  After
C each vmlal pair, a vext/vpaddl pair shifts the whole accumulator pipeline
C down by one 32-bit lane, folding each column's high half into the next
C column and pulling one fresh addend limb in from Qa47.  Each fully
C resolved low limb appears in Dc0[0] and is queued in Do0-Do3 so four
C output limbs can be stored with a single vst4.
PROLOGUE(mpn_addmul_4)
vld1.32 {Dv01, Dv23}, [vp] @ load v0-v3 (multiplier limbs, used throughout)
@ Load and extend a0-a3 into the carry-in
vmov.i32 Qc01, #0
vmov.i32 Qc23, #0
vld4.32 {Dc0[0], Dc1[0], Dc2[0], Dc3[0]}, [rp] @ scatter a0-a3 into the accumulator low halves
subs n, n, #7 @ less than 8 limbs?
@ vp is no longer needed for the multiplier; repoint it 4 limbs ahead of rp
@ so addend limbs are read through vp while outputs are written through rp.
add vp, rp, #16 @ read from vp, write to rp
ble .Lsmall
vld1.32 {Du01, Du23}, [up]! @ load u0-u3
vld1.32 {Da45, Da67}, [vp]! @ load a4-a7
b .Lentry
.balign 16
@ Rotated main loop body. We branch here as soon as all U inputs
@ are consumed, so that we can schedule loads for the next round.
.Loop:
vld1.32 {Du01, Du23}, [up]! @ load u4-u7
vmov Do3, Dc0 @ queue c3l
vext.32 Qc01, Qc01, Qc23, #1 @ v4si{ c5l, c4h, c4l, c3h }
vext.32 Qc23, Qc23, Qa47, #1 @ v4si{ a7, c6h, c6l, c5h }
vld1.32 {Da45, Da67}, [vp]! @ load a8-a11
vpaddl.u32 Qc01, Qc01 @ v2di{ c5l+c4h, c4l+c3h }
vpaddl.u32 Qc23, Qc23 @ v2di{ a7+c6h, c6l+c5h }
@ Store all queued output limbs
vst4.32 {Do0[0], Do1[0], Do2[0], Do3[0]}, [rp]!
@ Rotated main loop entry.
.Lentry:
vmlal.u32 Qc01, Dv01, Du01[0] @ v2di{ c1, c0 }
vmlal.u32 Qc23, Dv23, Du01[0] @ v2di{ c3, c2 }
vmov Do0, Dc0 @ queue c0l
vext.32 Qc01, Qc01, Qc23, #1 @ v4si{ c2l, c1h, c1l, c0h }
vext.32 Qc23, Qc23, Qa47, #1 @ v4si{ a4, c3h, c3l, c2h }
vpaddl.u32 Qc01, Qc01 @ v2di{ c2l+c1h, c1l+c0h }
vpaddl.u32 Qc23, Qc23 @ v2di{ a4+c3h, c3l+c2h }
vext.32 Qa47, Qa47, Qa47, #1 @ v4si{ a4, a7, a6, a5 } (rotate next addend into position)
vmlal.u32 Qc01, Dv01, Du01[1] @ v2di{ c2, c1 }
vmlal.u32 Qc23, Dv23, Du01[1] @ v2di{ c4, c3 }
vmov Do1, Dc0 @ queue c1l
vext.32 Qc01, Qc01, Qc23, #1 @ v4si{ c3l, c2h, c2l, c1h }
vext.32 Qc23, Qc23, Qa47, #1 @ v4si{ a5, c4h, c4l, c3h }
vpaddl.u32 Qc01, Qc01 @ v2di{ c3l+c2h, c2l+c1h }
vpaddl.u32 Qc23, Qc23 @ v2di{ a5+c4h, c4l+c3h }
vext.32 Qa47, Qa47, Qa47, #1 @ v4si{ a5, a4, a7, a6 }
vmlal.u32 Qc01, Dv01, Du23[0] @ v2di{ c3, c2 }
vmlal.u32 Qc23, Dv23, Du23[0] @ v2di{ c5, c4 }
vmov Do2, Dc0 @ queue c2l
vext.32 Qc01, Qc01, Qc23, #1 @ v4si{ c4l, c3h, c3l, c2h }
vext.32 Qc23, Qc23, Qa47, #1 @ v4si{ a6, c5h, c5l, c4h }
vpaddl.u32 Qc01, Qc01 @ v2di{ c4l+c3h, c3l+c2h }
vpaddl.u32 Qc23, Qc23 @ v2di{ a6+c5h, c5l+c4h }
vext.32 Qa47, Qa47, Qa47, #1 @ v4si{ a6, a5, a4, a7 }
subs n, n, #4 @ four u-limbs consumed per iteration
vmlal.u32 Qc01, Dv01, Du23[1] @ v2di{ c4, c3 }
vmlal.u32 Qc23, Dv23, Du23[1] @ v2di{ c6, c5 }
bgt .Loop
@ Rotated main loop tail.
vmov Do3, Dc0 @ queue c3l
vext.32 Qc01, Qc01, Qc23, #1 @ v4si{ c5l, c4h, c4l, c3h }
vext.32 Qc23, Qc23, Qa47, #1 @ v4si{ a7, c6h, c6l, c5h }
vpaddl.u32 Qc01, Qc01 @ v2di{ c5l+c4h, c4l+c3h }
vpaddl.u32 Qc23, Qc23 @ v2di{ a7+c6h, c6l+c5h }
vst4.32 {Do0[0], Do1[0], Do2[0], Do3[0]}, [rp]!
@ Here, there are fewer than 8 limbs remaining.
.Lsmall:
adds n, n, #2 @ any addend limbs remaining?
vmov.i32 Qa47, #0
vmov.i32 Qzero, #0
blt 9f
@ Computed dispatch: in ARM state pc reads as (this insn + 8), so n=0
@ lands on case "1:"; each case below is exactly two 4-byte insns (8
@ bytes), matching the `lsl #3` scale.  Case k loads k addend limbs.
add pc, pc, n, lsl #3 @ load remaining addend limbs
nop
1: vld1.32 {Da45[0]}, [vp]
b 9f
2: vld1.32 {Da45}, [vp]
b 9f
3: vld1.32 {Da45}, [vp]!
vld1.32 {Da67[0]}, [vp]
9:
adds n, n, #5 @ finish unbiasing N (undoes the -7/+2 biases)
beq .Lfinish
vld1.32 {Du01[]}, [up]! @ load and broadcast u0
b .Lsingle_entry
.balign 16
@ Process remaining limbs one at a time.
@ Rotated single loop body. Branch here as soon as U is consumed.
.Lsingle_loop:
vld1.32 {Du01[]}, [up]! @ load and broadcast u0
vst1.32 {Dc0[0]}, [rp]! @ store output limb
vext.32 Qc01, Qc01, Qc23, #1 @ v4si{ c2l, c1h, c1l, c0h }
vext.32 Qc23, Qc23, Qa47, #1 @ v4si{ a4, c3h, c3l, c2h }
vpaddl.u32 Qc01, Qc01 @ v2di{ c2l+c1h, c1l+c0h }
vpaddl.u32 Qc23, Qc23 @ v2di{ a4+c3h, c3l+c2h }
vext.32 Qa47, Qa47, Qzero, #1 @ v4si{ 0, a7, a6, a5 } (zeros shift in once addends are gone)
.Lsingle_entry:
vmlal.u32 Qc01, Dv01, Du01 @ v2di{ c1, c0 }
subs n, n, #1
vmlal.u32 Qc23, Dv23, Du01 @ v2di{ c3, c2 }
bne .Lsingle_loop
@ Rotated single loop tail.
vst1.32 {Dc0[0]}, [rp]! @ store output limb
vext.32 Qc01, Qc01, Qc23, #1 @ v4si{ c2l, c1h, c1l, c0h }
vext.32 Qc23, Qc23, Qa47, #1 @ v4si{ a4, c3h, c3l, c2h }
vpaddl.u32 Qc01, Qc01 @ v2di{ c2l+c1h, c1l+c0h }
vpaddl.u32 Qc23, Qc23 @ v2di{ a4+c3h, c3l+c2h }
@ We're done with all products. Finish propagating carries,
@ store three carry limbs and return the fourth.
.Lfinish:
@ c0-c3 = { 0, c3l, c2h, c2l, c1h, c1l, c0h, c0l }
vst1.32 {Dc0[0]}, [rp]! @ one...
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Dc2, Dc2, Dc3, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Dc2, Dc2
@ c0-c3 = { 0, 0, 0, c2l, c1h, c1l, c0h, c0l }
vst1.32 {Dc0[0]}, [rp]! @ two...
vext.32 Qc01, Qc01, Qc23, #1
vpaddl.u32 Qc01, Qc01
@ c0-c3 = { 0, 0, 0, 0, 0, c1l, c0h, c0l }
vst1.32 {Dc0[0]}, [rp]! @ three...
vext.32 Dc0, Dc0, Dc1, #1
vpaddl.u32 Dc0, Dc0
@ c0-c3 = { 0, 0, 0, 0, 0, 0, 0, c0l }
vmov.32 r0, Dc0[0] @ four... (return the final carry limb)
bx lr
EPILOGUE()
More information about the gmp-devel mailing list