Neon addmul_8

Tue Feb 26 16:01:44 CET 2013

nisse at lysator.liu.se (Niels Möller) writes:

> Maybe later, but for now, A9 is my target platform. But it seems you're
> right that Neon is almost useless there.

I'm attaching the functions I've been testing, in case anyone else would
like to play with them.

/Niels

-------------- next part --------------
dnl  ARM neon mpn_addmul_4.

dnl  Contributed to the GNU project by Niels Möller

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Assembler setup: enable NEON instructions and select ARM encoding.
	.fpu	neon
	.arm
	.arch	armv6t2

C Core registers the function reads on entry.  After the v limbs have been
C loaded, vp is reused as a read cursor into the rp area.
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C Invariant v limbs: four 32-bit words, loaded once and never modified.
define(`Dv01', `d0')
define(`Dv23', `d1')

C Recurrence variables, carried from one loop iteration to the next.
define(`Qc01', `q1')
define(`Qc23', `q2')
C Aliases: the 64-bit halves of the two accumulators above
C (q1 = d2:d3, q2 = d4:d5).
define(`Dc0', `d2')
define(`Dc1', `d3')
define(`Dc2', `d4')
define(`Dc3', `d5')

C Du00 receives the current u limb, replicated into both 32-bit lanes.
C Dtmp (d16) is the low half of Qtmp (q8); only its lane 0 is consumed.
define(`Du00', `d6')
define(`Qtmp', `q8')
define(`Dtmp', `d16')
C NOTE(review): Ttmp and the seven names below are not referenced by the
C code in this file; they look like leftovers from experiments.
define(`Ttmp', `d30')

define(`TDu00', `d7')
define(`Tc01', `q9')
define(`Tc23', `q10')
define(`Ec01', `q11')
define(`Ec23', `q12')
define(`Ac01', `q13')
define(`Ac23', `q14')

C mpn_addmul_4
C
C On entry r0-r3 hold rp, up, n and vp (named by the defines earlier in this
C file).  Four 32-bit limbs are loaded from vp once and kept in Dv01/Dv23.
C vp is then repointed at rp and serves as the read cursor, while rp itself
C is the write cursor.
C
C Limb accounting, as the code is written:
C   - reads n limbs from up (n-4 in .Loop, 4 in .Lend_loop)
C   - reads n limbs through the rp read cursor (4 up front, n-4 in .Loop)
C   - stores n+3 limbs through rp (n-4 in .Loop, 4 in .Lend_loop, 3 in the
C     final carry chain)
C   - returns one further limb in r0
C NOTE(review): presumably this is {rp,n} += {up,n} * {vp,4} with the top
C limb returned, in the manner of the other mpn_addmul_k functions -- confirm.
C NOTE(review): n < 4 is not handled.  The first subs would leave n negative
C and .Loop only exits when n reaches exactly zero.  Presumably callers
C guarantee n >= 4 -- confirm.
C
C Only d0-d6 and d16 are touched.
ASM_START()
PROLOGUE(mpn_addmul_4)
	vld1.32	{Dv01, Dv23}, [vp]
	C We read at vp, and store at rp
	mov	vp, rp
	C Load the first four rp limbs and spread them into the low word of
	C four 64-bit lanes.
	vld1.32	{Dc0, Dc1}, [vp]!
	vmov.i32	Qc23, #0
	vzip.32	Qc01, Qc23	C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
	C Let n denote the number of result words left to read
	subs	n, #4
	beq	.Lend
	.balign 16
.Loop:

	C Fetch the next u limb, replicated into both lanes of Du00
	vld1.32	{Du00[]}, [up]!
	C Critical path starts here
	C Each 64-bit lane accumulates v[i] * u
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	C Next rp limb to be folded in at the top; only lane 0 of Dtmp is used
	vld1.32		Dtmp[0], [vp]!
	subs	n, #1

	C We have:   Qc23 = [c3, l2, c2, l1], Qc01 = [c1, l0, c0, r0]
	C Rotate to: Qc23 = [r4, c3, l2, c2], Qc01 = [l1, c1, l0, c0]
	C Then add:  Qc23 = [r4+ c3, l2+ c2], Qc01 = [l1+ c1, l0+ c0]
	C The lowest 32-bit word is a finished result limb and is stored first.
	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	bne	.Loop

.Lend:
	C Repeat 4 more times, without reading any new limbs from vp
	C Dtmp is zeroed so the vext below shifts a zero into the top lane.
	vmov.i32	Dtmp, #0
	mov	n, #4
.Lend_loop:

	vld1.32	{Du00[]}, [up]!
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	subs	n, #1

	C Same store, rotate and pairwise add as in .Loop
	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	bne	.Lend_loop

	C We have Qc23 = [c3, c2], Qc01 = [c1, c0] as (small) 64-bit values
	C and need to add it together
	C Carry chain: store the low word of each lane, then add its high part
	C into the next lane.
	vst1.32		Dc0[0], [rp]!
	vshr.u64	Dc0, Dc0, #32
	vadd.i64	Dc1, Dc1, Dc0
	vst1.32		Dc1[0], [rp]!
	vshr.u64	Dc1, Dc1, #32
	vadd.i64	Dc2, Dc2, Dc1
	vst1.32		Dc2[0], [rp]!
	vshr.u64	Dc2, Dc2, #32
	vadd.i64	Dc3, Dc3, Dc2
	C The low word of the last lane is the return value
	vmov.32	r0, Dc3[0]

	bx		lr
EPILOGUE()

-------------- next part --------------
dnl  ARM neon mpn_addmul_6.

dnl  Contributed to the GNU project by Niels Möller

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Assembler setup: enable NEON instructions and select ARM encoding.
	.fpu	neon
	.arm
	.arch	armv6t2

C Core registers the function reads on entry.  After the v limbs have been
C loaded, vp is reused as a read cursor into the rp area.
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C Invariant v limbs
C (six 32-bit words in Dv01, Dv23, Dv45, loaded once and never modified)
define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')
C Du00 is not invariant: it is reloaded with the current u limb, replicated
C into both 32-bit lanes, on every iteration.
define(`Du00', `d3')

C Recurrence variables, carried from one loop iteration to the next.
define(`Qc01', `q2')
define(`Qc23', `q3')
define(`Qc45', `q8')
C Aliases: the 64-bit halves of the three accumulators above
C (q2 = d4:d5, q3 = d6:d7, q8 = d16:d17).
define(`Dc0', `d4')
define(`Dc1', `d5')
define(`Dc2', `d6')
define(`Dc3', `d7')
define(`Dc4', `d16')
define(`Dc5', `d17')

C Dtmp (d18) is the low half of Qtmp (q9); only its lane 0 is consumed.
define(`Qtmp', `q9')
define(`Dtmp', `d18')

C mpn_addmul_6
C
C On entry r0-r3 hold rp, up, n and vp (named by the defines earlier in this
C file).  Six 32-bit limbs are loaded from vp once and kept in
C Dv01/Dv23/Dv45.  vp is then repointed at rp and serves as the read cursor,
C while rp itself is the write cursor.
C
C Limb accounting, as the code is written:
C   - reads n limbs from up (n-6 in .Loop, 6 in .Lend_loop)
C   - reads n limbs through the rp read cursor (6 up front, n-6 in .Loop)
C   - stores n+5 limbs through rp (n-6 in .Loop, 6 in .Lend_loop, 5 in the
C     final carry chain)
C   - returns one further limb in r0
C NOTE(review): presumably this is {rp,n} += {up,n} * {vp,6} with the top
C limb returned, in the manner of the other mpn_addmul_k functions -- confirm.
C NOTE(review): n < 6 is not handled.  The first subs would leave n negative
C and .Loop only exits when n reaches exactly zero.  Presumably callers
C guarantee n >= 6 -- confirm.
C
C Only d0-d7 and d16-d18 are touched.
ASM_START()
PROLOGUE(mpn_addmul_6)
	vldm	vp, {Dv01,Dv23,Dv45}
	C We read at vp, and store at rp
	mov	vp, rp
	C Load the first six rp limbs and spread them into the low word of six
	C 64-bit lanes.  Dc2 is copied to Dc4 before Qc23 is cleared for the zip.
	vldm	vp!, {Dc0,Dc1,Dc2}
	vmov	Dc4, Dc2
	vmov.i32	Qc23, #0
	vzip.32	Qc01, Qc23	C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
	vmov.i32	Dc5, #0
	vzip.32	Dc4, Dc5	C Pad to get Qc45 = [Dc5, Dc4] = [ 0, r5, 0, r4 ]
	C Let n denote the number of result words left to read
	subs	n, #6
	beq	.Lend
	C Align the loop head, as the 4- and 8-limb variants of this code do
	.balign 16
.Loop:

	C Fetch the next u limb, replicated into both lanes of Du00
	vld1.32	{Du00[]}, [up]!	
	C Critical path starts here
	C Each 64-bit lane accumulates v[i] * u
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	C Next rp limb to be folded in at the top; only lane 0 of Dtmp is used
	vld1.32		Dtmp[0], [vp]!
	subs	n, #1

	C We have:   Qc45, Qc23, Qc01 = [c5, l4, c4, l3], [c3, l2, c2, l1], [c1, l0, c0, r0]
	C Rotate to: Qc45, Qc23, Qc01 = [r6, c5, l4, c4], [l3, c3, l2, c2], [l1, c1, l0, c0]
	C Then add:  Qc45, Qc23, Qc01 = [r6+ c5, l4+ c4], [l3+ c3, l2+ c2], [l1+ c1, l0+ c0]
	C The lowest 32-bit word is a finished result limb and is stored first.
	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	bne	.Loop

.Lend:
	C Repeat 6 more times, without reading any new limbs from vp
	C Dtmp is zeroed so the vext below shifts a zero into the top lane.
	vmov.i32	Dtmp, #0
	mov	n, #6
.Lend_loop:

	vld1.32	{Du00[]}, [up]!
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	subs	n, #1

	C Same store, rotate and pairwise add as in .Loop
	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	bne	.Lend_loop

	C We have Qc45, Qc23, Qc01 = [c5, c4],[c3, c2], [c1, c0] as (small) 64-bit values
	C and need to add it together
	C Carry chain: store the low word of each lane, then add its high part
	C into the next lane.
	vst1.32		Dc0[0], [rp]!
	vshr.u64	Dc0, Dc0, #32
	vadd.i64	Dc1, Dc1, Dc0
	vst1.32		Dc1[0], [rp]!
	vshr.u64	Dc1, Dc1, #32
	vadd.i64	Dc2, Dc2, Dc1
	vst1.32		Dc2[0], [rp]!
	vshr.u64	Dc2, Dc2, #32
	vadd.i64	Dc3, Dc3, Dc2
	vst1.32		Dc3[0], [rp]!
	vshr.u64	Dc3, Dc3, #32
	vadd.i64	Dc4, Dc4, Dc3
	vst1.32		Dc4[0], [rp]!
	vshr.u64	Dc4, Dc4, #32
	vadd.i64	Dc5, Dc5, Dc4

	C The low word of the last lane is the return value
	vmov.32	r0, Dc5[0]

	bx		lr
EPILOGUE()
-------------- next part --------------
dnl  ARM neon mpn_addmul_8.

dnl  Contributed to the GNU project by Richard Henderson and Niels Möller

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Assembler setup: enable NEON instructions and select ARM encoding.
	.fpu	neon
	.arm
	.arch	armv6t2

C Core registers the function reads on entry.  After the v limbs have been
C loaded, vp is reused as a read cursor into the rp area.
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C Invariant v limbs
C (eight 32-bit words in Dv01..Dv67, loaded once and never modified)
define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')
define(`Dv67', `d3')

C Recurrence variables, carried from one loop iteration to the next.
define(`Qc01', `q2')
define(`Qc23', `q3')
define(`Qc45', `q8')
define(`Qc67', `q9')
C Aliases: the 64-bit halves of the four accumulators above
C (q2 = d4:d5, q3 = d6:d7, q8 = d16:d17, q9 = d18:d19).
define(`Dc0', `d4')
define(`Dc1', `d5')
define(`Dc2', `d6')
define(`Dc3', `d7')
define(`Dc4', `d16')
define(`Dc5', `d17')
define(`Dc6', `d18')
define(`Dc7', `d19')

C Du00 receives the current u limb, replicated into both 32-bit lanes.
define(`Du00', `d22')

C Dtmp (d20) is the low half of Qtmp (q10); only its lane 0 is consumed.
define(`Qtmp', `q10')
define(`Dtmp', `d20')

C mpn_addmul_8
C
C On entry r0-r3 hold rp, up, n and vp (named by the defines earlier in this
C file).  Eight 32-bit limbs are loaded from vp once and kept in
C Dv01..Dv67.  vp is then repointed at rp and serves as the read cursor,
C while rp itself is the write cursor.
C
C The u load is software pipelined: the first u limb is fetched before the
C loop, and each iteration fetches the limb for the following one.
C
C Limb accounting, as the code is written:
C   - reads n limbs from up (1 before .Loop, n-8 in .Loop, 7 in .Lend_loop)
C   - reads n limbs through the rp read cursor (8 up front, n-8 in .Loop)
C   - stores n+7 limbs through rp (n-8 in .Loop, 7 in .Lend_loop, 1 in the
C     wind-down, 7 in the final carry chain)
C   - returns one further limb in r0
C NOTE(review): presumably this is {rp,n} += {up,n} * {vp,8} with the top
C limb returned, in the manner of the other mpn_addmul_k functions -- confirm.
C NOTE(review): n < 8 is not handled.  The first subs would leave n negative
C and .Loop only exits when n reaches exactly zero.  Presumably callers
C guarantee n >= 8 -- confirm.
C
C Only d0-d7, d16-d20 and d22 are touched.
ASM_START()
PROLOGUE(mpn_addmul_8)
	vldm	vp, {Dv01,Dv23,Dv45,Dv67}
	C We read at vp, and store at rp
	mov	vp, rp
	C Load the first eight rp limbs into Qc01 and Qc23, move the upper four
	C to Qc45, then zero Qc23 and Qc67 so the zips below spread each limb
	C into the low word of its own 64-bit lane.
	vldm	vp!, {Dc0,Dc1,Dc2,Dc3}
	vmov	Qc45, Qc23
	vmov.i32	Qc23, #0
	vmov.i32	Qc67, Qc23
	vzip.32	Qc01, Qc23	C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
	vzip.32	Qc45, Qc67	C Pad to get Qc45 = [0, r5, 0, r4], Qc67 = [0, r7, 0, r6]
	C Let n denote the number of result words left to read
	subs	n, #8
	C Preload the first u limb; the beq still tests the flags set by the
	C subs above.
	vld1.32	{Du00[]}, [up]!	
	beq	.Lend

	.balign 16
.Loop:
	C Next rp limb to be folded in at the top; only lane 0 of Dtmp is used
	vld1.32		Dtmp[0], [vp]!

	C Critical path starts here
	C Each 64-bit lane accumulates v[i] * u
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	vmlal.u32	Qc67, Dv67, Du00
	C Fetch the u limb for the next iteration
	vld1.32	{Du00[]}, [up]!	
	subs	n, #1

	C Shift and add
	C The lowest 32-bit word is a finished result limb and is stored first.
	C Every accumulator then moves down one 32-bit word, taking its new top
	C word from the next register, and adjacent words are summed into
	C 64-bit lanes.
	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qc67, #1
	vext.32 Qc67, Qc67, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	bne	.Loop

.Lend:
	C Repeat 8 more times, without reading any new limbs from vp:
	C 7 iterations of the loop below plus the unrolled wind-down after it.
	C Dtmp is zeroed so the vext below shifts a zero into the top lane.
	vmov.i32	Dtmp, #0
	mov	n, #7
.Lend_loop:

	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	vmlal.u32	Qc67, Dv67, Du00
	C Fetch the u limb for the next iteration (or for the wind-down)
	vld1.32	{Du00[]}, [up]!
	subs	n, #1

	C Same store, rotate and pairwise add as in .Loop
	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qc67, #1
	vext.32 Qc67, Qc67, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	bne	.Lend_loop

	C Wind down, already read Du00
	C This is the final multiply step; no further u limb is fetched.
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	vmlal.u32	Qc67, Dv67, Du00

	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qc67, #1
	vext.32 Qc67, Qc67, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67

	C FIXME: Somehow combine above vext vpaddl with below additions?

	C We have c7-c0 as (small) 64-bit values and need to add it together
	C Carry chain: store the low word of each lane, then add its high part
	C into the next lane.
	vst1.32		Dc0[0], [rp]!
	vshr.u64	Dc0, Dc0, #32
	vadd.i64	Dc1, Dc1, Dc0
	vst1.32		Dc1[0], [rp]!
	vshr.u64	Dc1, Dc1, #32
	vadd.i64	Dc2, Dc2, Dc1
	vst1.32		Dc2[0], [rp]!
	vshr.u64	Dc2, Dc2, #32
	vadd.i64	Dc3, Dc3, Dc2
	vst1.32		Dc3[0], [rp]!
	vshr.u64	Dc3, Dc3, #32
	vadd.i64	Dc4, Dc4, Dc3
	vst1.32		Dc4[0], [rp]!
	vshr.u64	Dc4, Dc4, #32
	vadd.i64	Dc5, Dc5, Dc4
	vst1.32		Dc5[0], [rp]!
	vshr.u64	Dc5, Dc5, #32
	vadd.i64	Dc6, Dc6, Dc5
	vst1.32		Dc6[0], [rp]!
	vshr.u64	Dc6, Dc6, #32
	vadd.i64	Dc7, Dc7, Dc6

	C The low word of the last lane is the return value
	vmov.32	r0, Dc7[0]

	bx		lr
EPILOGUE()
-------------- next part --------------

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.