Neon addmul_8
Niels Möller
nisse at lysator.liu.se
Tue Feb 26 16:01:44 CET 2013
nisse at lysator.liu.se (Niels Möller) writes:
> Maybe later, but for now, A9 is my target platform. But it seems you're
> right that Neon is almost useless there.
I'm attaching the functions I've been testing, in case anyone else would
like to play with them.
/Niels
-------------- next part --------------
dnl ARM neon mpn_addmul_4.
dnl Contributed to the GNU project by Niels Möller
dnl Copyright 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
.fpu neon
.arm
.arch armv6t2

C Function arguments
define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
define(`vp', `r3')

C Invariant v limbs, loaded once from vp: Dv01 = [v1, v0], Dv23 = [v3, v2]
define(`Dv01', `d0')
define(`Dv23', `d1')

C Recurrency variables: four 64-bit column sums carried between passes
define(`Qc01', `q1')
define(`Qc23', `q2')

C The same registers seen as 64-bit halves (q1 = d3:d2, q2 = d5:d4)
define(`Dc0', `d2')
define(`Dc1', `d3')
define(`Dc2', `d4')
define(`Dc3', `d5')

C Current u limb, duplicated into both 32-bit lanes
define(`Du00', `d6')

C Scratch for the word shifted in at the top of Qc23.
C Dtmp is the low half of Qtmp (q8 = d17:d16).
define(`Qtmp', `q8')
define(`Dtmp', `d16')

ASM_START()
C mpn_addmul_4 takes rp, up, n and vp in r0, r1, r2 and r3.
C
C It multiplies the n 32-bit limbs at up by the four 32-bit limbs at vp,
C adds the n limbs found at rp, stores the n+3 low limbs of the sum at rp
C and returns the most significant limb in r0.
C
C Four limbs are loaded from rp up front and the main loop count is n-4,
C so n must be at least 4.
C
C Qc01 and Qc23 hold four 64-bit column sums.  Every pass adds v[i]*u to
C column i, stores the finished low word, shifts the register pair down by
C one 32-bit word and folds each high word into the next column with a
C pairwise add.
PROLOGUE(mpn_addmul_4)
        vld1.32     {Dv01, Dv23}, [vp]
C From here on vp is the read pointer into the result area and rp the
C write pointer; after the load below vp runs four limbs ahead of rp.
        mov         vp, rp
        vld1.32     {Dc0, Dc1}, [vp]!
        vmov.i32    Qc23, #0
C Interleave with zeros: Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
        vzip.32     Qc01, Qc23
C n = number of result limbs still to be read through vp
        subs        n, n, #4
        beq         .Ldrain
        .balign     16
.Lfeed:
        vld1.32     {Du00[]}, [up]!
C Critical path starts here
        vmlal.u32   Qc01, Dv01, Du00
        vmlal.u32   Qc23, Dv23, Du00
        vld1.32     {Dtmp[0]}, [vp]!
        subs        n, n, #1
C Now:        Qc23 = [c3, l2, c2, l1],  Qc01 = [c1, l0, c0, r0]
C After vext: Qc23 = [r4, c3, l2, c2],  Qc01 = [l1, c1, l0, c0]
C After add:  Qc23 = [r4+c3, l2+c2],    Qc01 = [l1+c1, l0+c0]
        vst1.32     {Dc0[0]}, [rp]!
        vext.32     Qc01, Qc01, Qc23, #1
        vext.32     Qc23, Qc23, Qtmp, #1
        vpaddl.u32  Qc01, Qc01
        vpaddl.u32  Qc23, Qc23
        bne         .Lfeed
.Ldrain:
C Four more passes, shifting in zero instead of limbs read through vp
        vmov.i32    Dtmp, #0
        mov         n, #4
.Ldrain_loop:
        vld1.32     {Du00[]}, [up]!
        vmlal.u32   Qc01, Dv01, Du00
        vmlal.u32   Qc23, Dv23, Du00
        subs        n, n, #1
        vst1.32     {Dc0[0]}, [rp]!
        vext.32     Qc01, Qc01, Qc23, #1
        vext.32     Qc23, Qc23, Qtmp, #1
        vpaddl.u32  Qc01, Qc01
        vpaddl.u32  Qc23, Qc23
        bne         .Ldrain_loop
C Qc23 = [c3, c2] and Qc01 = [c1, c0] are now (small) 64-bit values.
C Store each low word and carry the high word into the next column;
C the low word of the last column is the return value.
        vst1.32     {Dc0[0]}, [rp]!
        vshr.u64    Dc0, Dc0, #32
        vadd.i64    Dc1, Dc1, Dc0
        vst1.32     {Dc1[0]}, [rp]!
        vshr.u64    Dc1, Dc1, #32
        vadd.i64    Dc2, Dc2, Dc1
        vst1.32     {Dc2[0]}, [rp]!
        vshr.u64    Dc2, Dc2, #32
        vadd.i64    Dc3, Dc3, Dc2
        vmov.32     r0, Dc3[0]
        bx          lr
EPILOGUE()
-------------- next part --------------
dnl ARM neon mpn_addmul_6.
dnl Contributed to the GNU project by Niels Möller
dnl Copyright 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
.fpu neon
.arm
.arch armv6t2

C Function arguments
define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
define(`vp', `r3')

C Invariant v limbs, loaded once from vp
define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')

C Current u limb, duplicated into both 32-bit lanes
define(`Du00', `d3')

C Recurrency variables: six 64-bit column sums carried between passes
define(`Qc01', `q2')
define(`Qc23', `q3')
define(`Qc45', `q8')

C The same registers seen as 64-bit halves
C (q2 = d5:d4, q3 = d7:d6, q8 = d17:d16)
define(`Dc0', `d4')
define(`Dc1', `d5')
define(`Dc2', `d6')
define(`Dc3', `d7')
define(`Dc4', `d16')
define(`Dc5', `d17')

C Scratch for the word shifted in at the top of Qc45.
C Dtmp is the low half of Qtmp (q9 = d19:d18).
define(`Qtmp', `q9')
define(`Dtmp', `d18')

ASM_START()
C mpn_addmul_6 takes rp, up, n and vp in r0, r1, r2 and r3.
C
C It multiplies the n 32-bit limbs at up by the six 32-bit limbs at vp,
C adds the n limbs found at rp, stores the n+5 low limbs of the sum at rp
C and returns the most significant limb in r0.
C
C Six limbs are loaded from rp up front and the main loop count is n-6,
C so n must be at least 6.
C
C Qc01, Qc23 and Qc45 hold six 64-bit column sums.  Every pass adds v[i]*u
C to column i, stores the finished low word, shifts the registers down by
C one 32-bit word and folds each high word into the next column with a
C pairwise add.
PROLOGUE(mpn_addmul_6)
        vldm        vp, {Dv01, Dv23, Dv45}
C From here on vp is the read pointer into the result area and rp the
C write pointer; after the load below vp runs six limbs ahead of rp.
        mov         vp, rp
C r0..r3 land in Qc01; r4, r5 land in Dc2 and are moved out to Dc4
C before Qc23 (which contains Dc2) is cleared.
        vldm        vp!, {Dc0, Dc1, Dc2}
        vmov        Dc4, Dc2
        vmov.i32    Qc23, #0
C Interleave with zeros: Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
        vzip.32     Qc01, Qc23
        vmov.i32    Dc5, #0
C Interleave with zeros: Qc45 = [Dc5, Dc4] = [0, r5, 0, r4]
        vzip.32     Dc4, Dc5
C n = number of result limbs still to be read through vp
        subs        n, n, #6
        beq         .Ldrain
.Lfeed:
        vld1.32     {Du00[]}, [up]!
C Critical path starts here
        vmlal.u32   Qc01, Dv01, Du00
        vmlal.u32   Qc23, Dv23, Du00
        vmlal.u32   Qc45, Dv45, Du00
        vld1.32     {Dtmp[0]}, [vp]!
        subs        n, n, #1
C Listed as Qc45, Qc23, Qc01:
C Now:        [c5, l4, c4, l3], [c3, l2, c2, l1], [c1, l0, c0, r0]
C After vext: [r6, c5, l4, c4], [l3, c3, l2, c2], [l1, c1, l0, c0]
C After add:  [r6+c5, l4+c4],   [l3+c3, l2+c2],   [l1+c1, l0+c0]
        vst1.32     {Dc0[0]}, [rp]!
        vext.32     Qc01, Qc01, Qc23, #1
        vext.32     Qc23, Qc23, Qc45, #1
        vext.32     Qc45, Qc45, Qtmp, #1
        vpaddl.u32  Qc01, Qc01
        vpaddl.u32  Qc23, Qc23
        vpaddl.u32  Qc45, Qc45
        bne         .Lfeed
.Ldrain:
C Six more passes, shifting in zero instead of limbs read through vp
        vmov.i32    Dtmp, #0
        mov         n, #6
.Ldrain_loop:
        vld1.32     {Du00[]}, [up]!
        vmlal.u32   Qc01, Dv01, Du00
        vmlal.u32   Qc23, Dv23, Du00
        vmlal.u32   Qc45, Dv45, Du00
        subs        n, n, #1
        vst1.32     {Dc0[0]}, [rp]!
        vext.32     Qc01, Qc01, Qc23, #1
        vext.32     Qc23, Qc23, Qc45, #1
        vext.32     Qc45, Qc45, Qtmp, #1
        vpaddl.u32  Qc01, Qc01
        vpaddl.u32  Qc23, Qc23
        vpaddl.u32  Qc45, Qc45
        bne         .Ldrain_loop
C Qc45 = [c5, c4], Qc23 = [c3, c2] and Qc01 = [c1, c0] are now (small)
C 64-bit values.  Store each low word and carry the high word into the
C next column; the low word of the last column is the return value.
        vst1.32     {Dc0[0]}, [rp]!
        vshr.u64    Dc0, Dc0, #32
        vadd.i64    Dc1, Dc1, Dc0
        vst1.32     {Dc1[0]}, [rp]!
        vshr.u64    Dc1, Dc1, #32
        vadd.i64    Dc2, Dc2, Dc1
        vst1.32     {Dc2[0]}, [rp]!
        vshr.u64    Dc2, Dc2, #32
        vadd.i64    Dc3, Dc3, Dc2
        vst1.32     {Dc3[0]}, [rp]!
        vshr.u64    Dc3, Dc3, #32
        vadd.i64    Dc4, Dc4, Dc3
        vst1.32     {Dc4[0]}, [rp]!
        vshr.u64    Dc4, Dc4, #32
        vadd.i64    Dc5, Dc5, Dc4
        vmov.32     r0, Dc5[0]
        bx          lr
EPILOGUE()
-------------- next part --------------
dnl ARM neon mpn_addmul_8.
dnl Contributed to the GNU project by Richard Henderson and Niels Möller
dnl Copyright 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
.fpu neon
.arm
.arch armv6t2

C Function arguments
define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
define(`vp', `r3')

C Invariant v limbs, loaded once from vp
define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')
define(`Dv67', `d3')

C Recurrency variables: eight 64-bit column sums carried between passes
define(`Qc01', `q2')
define(`Qc23', `q3')
define(`Qc45', `q8')
define(`Qc67', `q9')

C The same registers seen as 64-bit halves
C (q2 = d5:d4, q3 = d7:d6, q8 = d17:d16, q9 = d19:d18)
define(`Dc0', `d4')
define(`Dc1', `d5')
define(`Dc2', `d6')
define(`Dc3', `d7')
define(`Dc4', `d16')
define(`Dc5', `d17')
define(`Dc6', `d18')
define(`Dc7', `d19')

C Current u limb, duplicated into both 32-bit lanes
define(`Du00', `d22')

C Scratch for the word shifted in at the top of Qc67.
C Dtmp is the low half of Qtmp (q10 = d21:d20).
define(`Qtmp', `q10')
define(`Dtmp', `d20')

ASM_START()
C mpn_addmul_8 takes rp, up, n and vp in r0, r1, r2 and r3.
C
C It multiplies the n 32-bit limbs at up by the eight 32-bit limbs at vp,
C adds the n limbs found at rp, stores the n+7 low limbs of the sum at rp
C and returns the most significant limb in r0.
C
C Eight limbs are loaded from rp up front and the main loop count is n-8,
C so n must be at least 8.
C
C Qc01 .. Qc67 hold eight 64-bit column sums.  Every pass adds v[i]*u to
C column i, stores the finished low word, shifts the registers down by one
C 32-bit word and folds each high word into the next column with a
C pairwise add.  The u limb for a pass is loaded during the previous pass.
PROLOGUE(mpn_addmul_8)
        vldm        vp, {Dv01, Dv23, Dv45, Dv67}
C From here on vp is the read pointer into the result area and rp the
C write pointer; after the load below vp runs eight limbs ahead of rp.
        mov         vp, rp
C r0..r3 land in Qc01 and r4..r7 in Qc23; the latter are moved to Qc45
C before Qc23 is cleared.
        vldm        vp!, {Dc0, Dc1, Dc2, Dc3}
        vmov        Qc45, Qc23
        vmov.i32    Qc23, #0
        vmov.i32    Qc67, Qc23
C Interleave with zeros: Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
        vzip.32     Qc01, Qc23
C Interleave with zeros: Qc45 = [0, r5, 0, r4], Qc67 = [0, r7, 0, r6]
        vzip.32     Qc45, Qc67
C n = number of result limbs still to be read through vp
        subs        n, n, #8
C Preload the first u limb; the branch below still tests the subs result
        vld1.32     {Du00[]}, [up]!
        beq         .Ldrain
        .balign     16
.Lfeed:
        vld1.32     {Dtmp[0]}, [vp]!
C Critical path starts here
        vmlal.u32   Qc01, Dv01, Du00
        vmlal.u32   Qc23, Dv23, Du00
        vmlal.u32   Qc45, Dv45, Du00
        vmlal.u32   Qc67, Dv67, Du00
C u limb for the next pass
        vld1.32     {Du00[]}, [up]!
        subs        n, n, #1
C Store the finished word, then shift down one word and add pairwise
        vst1.32     {Dc0[0]}, [rp]!
        vext.32     Qc01, Qc01, Qc23, #1
        vext.32     Qc23, Qc23, Qc45, #1
        vext.32     Qc45, Qc45, Qc67, #1
        vext.32     Qc67, Qc67, Qtmp, #1
        vpaddl.u32  Qc01, Qc01
        vpaddl.u32  Qc23, Qc23
        vpaddl.u32  Qc45, Qc45
        vpaddl.u32  Qc67, Qc67
        bne         .Lfeed
.Ldrain:
C Eight more passes shifting in zero instead of limbs read through vp:
C seven in the loop below, which still loads the next u limb, and a final
C one after it that loads nothing further from up.
        vmov.i32    Dtmp, #0
        mov         n, #7
.Ldrain_loop:
        vmlal.u32   Qc01, Dv01, Du00
        vmlal.u32   Qc23, Dv23, Du00
        vmlal.u32   Qc45, Dv45, Du00
        vmlal.u32   Qc67, Dv67, Du00
        vld1.32     {Du00[]}, [up]!
        subs        n, n, #1
        vst1.32     {Dc0[0]}, [rp]!
        vext.32     Qc01, Qc01, Qc23, #1
        vext.32     Qc23, Qc23, Qc45, #1
        vext.32     Qc45, Qc45, Qc67, #1
        vext.32     Qc67, Qc67, Qtmp, #1
        vpaddl.u32  Qc01, Qc01
        vpaddl.u32  Qc23, Qc23
        vpaddl.u32  Qc45, Qc45
        vpaddl.u32  Qc67, Qc67
        bne         .Ldrain_loop
C Wind down: last pass, Du00 has already been read
        vmlal.u32   Qc01, Dv01, Du00
        vmlal.u32   Qc23, Dv23, Du00
        vmlal.u32   Qc45, Dv45, Du00
        vmlal.u32   Qc67, Dv67, Du00
        vst1.32     {Dc0[0]}, [rp]!
        vext.32     Qc01, Qc01, Qc23, #1
        vext.32     Qc23, Qc23, Qc45, #1
        vext.32     Qc45, Qc45, Qc67, #1
        vext.32     Qc67, Qc67, Qtmp, #1
        vpaddl.u32  Qc01, Qc01
        vpaddl.u32  Qc23, Qc23
        vpaddl.u32  Qc45, Qc45
        vpaddl.u32  Qc67, Qc67
C FIXME: Somehow combine the vext and vpaddl above with the additions below?
C c0 .. c7 are now (small) 64-bit values.  Store each low word and carry
C the high word into the next column; the low word of the last column is
C the return value.
        vst1.32     {Dc0[0]}, [rp]!
        vshr.u64    Dc0, Dc0, #32
        vadd.i64    Dc1, Dc1, Dc0
        vst1.32     {Dc1[0]}, [rp]!
        vshr.u64    Dc1, Dc1, #32
        vadd.i64    Dc2, Dc2, Dc1
        vst1.32     {Dc2[0]}, [rp]!
        vshr.u64    Dc2, Dc2, #32
        vadd.i64    Dc3, Dc3, Dc2
        vst1.32     {Dc3[0]}, [rp]!
        vshr.u64    Dc3, Dc3, #32
        vadd.i64    Dc4, Dc4, Dc3
        vst1.32     {Dc4[0]}, [rp]!
        vshr.u64    Dc4, Dc4, #32
        vadd.i64    Dc5, Dc5, Dc4
        vst1.32     {Dc5[0]}, [rp]!
        vshr.u64    Dc5, Dc5, #32
        vadd.i64    Dc6, Dc6, Dc5
        vst1.32     {Dc6[0]}, [rp]!
        vshr.u64    Dc6, Dc6, #32
        vadd.i64    Dc7, Dc7, Dc6
        vmov.32     r0, Dc7[0]
        bx          lr
EPILOGUE()
-------------- next part --------------
--
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.
More information about the gmp-devel
mailing list