Neon addmul_8

Niels Möller nisse at lysator.liu.se
Tue Feb 26 16:01:44 CET 2013


nisse at lysator.liu.se (Niels Möller) writes:

> Maybe later, but for now, A9 is my target platform. But it seems you're
> right that Neon is almost useless there.

I'm attaching the functions I've been testing, in case anyone else would
like to play with them.

/Niels

-------------- next part --------------
dnl  ARM neon mpn_addmul_4.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arm
	.arch	armv6t2

C Argument registers.  Once the v limbs have been loaded, vp is reused
C as the read pointer into the rp area.
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C Invariant v limbs, two 32-bit limbs per D register
define(`Dv01', `d0')
define(`Dv23', `d1')

C Recurrency variables: the running column sums, kept across iterations
define(`Qc01', `q1')
define(`Qc23', `q2')
C Aliases: the D register halves of the Q accumulators above
define(`Dc0', `d2')
define(`Dc1', `d3')
define(`Dc2', `d4')
define(`Dc3', `d5')
	
C Du00 holds the current u limb in both lanes.  Dtmp is the low half of
C Qtmp and supplies the word that is shifted into Qc23.
define(`Du00', `d6')
define(`Qtmp', `q8')
define(`Dtmp', `d16')
	
C mpn_addmul_4 (rp, up, n, vp), arguments in r0-r3 as named by the defines.
C
C Multiply-accumulates the limbs at up with the four 32-bit limbs at vp
C into the area at rp, consuming one limb of up per iteration.  As written
C the code reads n limbs from up and n limbs from rp, stores n + 3 limbs
C at rp, and returns the topmost word in r0.
C
C NOTE(review): n < 4 is not handled.  The first load from rp fetches four
C limbs and only n == 4 bypasses the main loop, so callers presumably
C guarantee n >= 4 -- confirm.
ASM_START()
PROLOGUE(mpn_addmul_4)
	C Load the four invariant v limbs
	vld1.32	{Dv01, Dv23}, [vp]
	C The v limbs are now in registers, so vp is free.  From here on
	C we read at vp, and store at rp
	mov	vp, rp
	C Fetch the first four limbs of the rp area, advancing the read pointer
	vld1.32	{Dc0, Dc1}, [vp]!
	vmov.i32	Qc23, #0
	vzip.32	Qc01, Qc23	C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
	C Let n denote the number of result words left to read
	subs	n, #4
	C Nothing left to read: go straight to the tail
	beq	.Lend
	.balign 16
.Loop:
	
	C Load the next u limb into both lanes of Du00
	vld1.32	{Du00[]}, [up]!
	C Critical path starts here
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	C The next limb of the rp area goes into lane 0 of Dtmp, the low half
	C of Qtmp.  Only that lane is consumed by the vext below.
	vld1.32		Dtmp[0], [vp]!
	subs	n, #1

	C We have:   Qc23 = [c3, l2, c2, l1], Qc01 = [c1, l0, c0, r0]
	C Rotate to: Qc23 = [r4, c3, l2, c2], Qc01 = [l1, c1, l0, c0]
	C Then add:  Qc23 = [r4+ c3, l2+ c2], Qc01 = [l1+ c1, l0+ c0]
	C The low word of Dc0 (r0 above) is a finished result limb
	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	C The branch tests the flags set by the subs above
	bne	.Loop

.Lend:
	C Repeat 4 more times, without reading any new limbs from vp.
	C With Dtmp = 0 the vext shifts in a zero word instead.
	vmov.i32	Dtmp, #0
	mov	n, #4
.Lend_loop:

	vld1.32	{Du00[]}, [up]!
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	subs	n, #1

	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	bne	.Lend_loop

	C We have Qc23 = [c3, c2], Qc01 = [c1, c0] as (small) 64-bit values
	C and need to add it together: store the low word of each sum, then
	C carry its high word into the next one
	vst1.32		Dc0[0], [rp]!
	vshr.u64	Dc0, Dc0, #32
	vadd.i64	Dc1, Dc1, Dc0
	vst1.32		Dc1[0], [rp]!
	vshr.u64	Dc1, Dc1, #32
	vadd.i64	Dc2, Dc2, Dc1
	vst1.32		Dc2[0], [rp]!
	vshr.u64	Dc2, Dc2, #32
	vadd.i64	Dc3, Dc3, Dc2
	C The low word of the last sum is the return value
	vmov.32	r0, Dc3[0]

	bx		lr
EPILOGUE()
	
-------------- next part --------------
dnl  ARM neon mpn_addmul_6.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arm
	.arch	armv6t2

C Argument registers.  Once the v limbs have been loaded, vp is reused
C as the read pointer into the rp area.
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C Invariant v limbs, two 32-bit limbs per D register
define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')
C Current u limb, held in both lanes
define(`Du00', `d3')

C Recurrency variables: the running column sums, kept across iterations
define(`Qc01', `q2')
define(`Qc23', `q3')
define(`Qc45', `q8')
C Aliases: the D register halves of the Q accumulators above
define(`Dc0', `d4')
define(`Dc1', `d5')
define(`Dc2', `d6')
define(`Dc3', `d7')
define(`Dc4', `d16')
define(`Dc5', `d17')
	
C Dtmp is the low half of Qtmp and supplies the word that is shifted
C into Qc45
define(`Qtmp', `q9')
define(`Dtmp', `d18')

C mpn_addmul_6 (rp, up, n, vp), arguments in r0-r3 as named by the defines.
C
C Multiply-accumulates the limbs at up with the six 32-bit limbs at vp
C into the area at rp, consuming one limb of up per iteration.  As written
C the code reads n limbs from up and n limbs from rp, stores n + 5 limbs
C at rp, and returns the topmost word in r0.
C
C NOTE(review): n < 6 is not handled.  The first load from rp fetches six
C limbs and only n == 6 bypasses the main loop, so callers presumably
C guarantee n >= 6 -- confirm.
ASM_START()
PROLOGUE(mpn_addmul_6)
	C Load the six invariant v limbs
	vldm	vp, {Dv01,Dv23,Dv45}
	C We read at vp, and store at rp
	mov	vp, rp
	C Fetch the first six limbs of the rp area, advancing the read pointer
	vldm	vp!, {Dc0,Dc1,Dc2}
	C Limbs 4 and 5 landed in Dc2, which is the low half of Qc23.  Move
	C them to Dc4 before Qc23 is cleared.
	vmov	Dc4, Dc2
	vmov.i32	Qc23, #0
	vzip.32	Qc01, Qc23	C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
	vmov.i32	Dc5, #0
	vzip.32	Dc4, Dc5	C Pad to get Qc45 = [Dc5, Dc4] = [ 0, r5, 0, r4 ]
	C Let n denote the number of result words left to read
	subs	n, #6
	C Nothing left to read: go straight to the tail
	beq	.Lend
.Loop:

	C Load the next u limb into both lanes of Du00
	vld1.32	{Du00[]}, [up]!	
	C Critical path starts here
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	C The next limb of the rp area goes into lane 0 of Dtmp, the low half
	C of Qtmp.  Only that lane is consumed by the vext below.
	vld1.32		Dtmp[0], [vp]!
	subs	n, #1

	C We have:   Qc45, Qc23, Qc01 = [c5, l4, c4, l3], [c3, l2, c2, l1], [c1, l0, c0, r0]
	C Rotate to: Qc45, Qc23, Qc01 = [r6, c5, l4, c4], [l3, c3, l2, c2], [l1, c1, l0, c0]
	C Then add:  Qc45, Qc23, Qc01 = [r6+ c5, l4+ c4], [l3+ c3, l2+ c2], [l1+ c1, l0+ c0]
	C The low word of Dc0 (r0 above) is a finished result limb
	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	C The branch tests the flags set by the subs above
	bne	.Loop

.Lend:
	C Repeat 6 more times, without reading any new limbs from vp.
	C With Dtmp = 0 the vext shifts in a zero word instead.
	vmov.i32	Dtmp, #0
	mov	n, #6
.Lend_loop:

	vld1.32	{Du00[]}, [up]!
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	subs	n, #1

	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	bne	.Lend_loop

	C We have Qc45, Qc23, Qc01 = [c5, c4],[c3, c2], [c1, c0] as (small) 64-bit values
	C and need to add it together: store the low word of each sum, then
	C carry its high word into the next one
	vst1.32		Dc0[0], [rp]!
	vshr.u64	Dc0, Dc0, #32
	vadd.i64	Dc1, Dc1, Dc0
	vst1.32		Dc1[0], [rp]!
	vshr.u64	Dc1, Dc1, #32
	vadd.i64	Dc2, Dc2, Dc1
	vst1.32		Dc2[0], [rp]!
	vshr.u64	Dc2, Dc2, #32
	vadd.i64	Dc3, Dc3, Dc2
	vst1.32		Dc3[0], [rp]!
	vshr.u64	Dc3, Dc3, #32
	vadd.i64	Dc4, Dc4, Dc3
	vst1.32		Dc4[0], [rp]!
	vshr.u64	Dc4, Dc4, #32
	vadd.i64	Dc5, Dc5, Dc4
	
	C The low word of the last sum is the return value
	vmov.32	r0, Dc5[0]

	bx		lr
EPILOGUE()
-------------- next part --------------
dnl  ARM neon mpn_addmul_8.

dnl  Contributed to the GNU project by Richard Henderson and Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arm
	.arch	armv6t2

C Argument registers.  Once the v limbs have been loaded, vp is reused
C as the read pointer into the rp area.
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C Invariant v limbs, two 32-bit limbs per D register
define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')
define(`Dv67', `d3')

C Recurrency variables: the running column sums, kept across iterations
define(`Qc01', `q2')
define(`Qc23', `q3')
define(`Qc45', `q8')
define(`Qc67', `q9')
C Aliases: the D register halves of the Q accumulators above
define(`Dc0', `d4')
define(`Dc1', `d5')
define(`Dc2', `d6')
define(`Dc3', `d7')
define(`Dc4', `d16')
define(`Dc5', `d17')
define(`Dc6', `d18')
define(`Dc7', `d19')

C Current u limb, held in both lanes
define(`Du00', `d22')
	
C Dtmp is the low half of Qtmp and supplies the word that is shifted
C into Qc67
define(`Qtmp', `q10')
define(`Dtmp', `d20')

C mpn_addmul_8 (rp, up, n, vp), arguments in r0-r3 as named by the defines.
C
C Multiply-accumulates the limbs at up with the eight 32-bit limbs at vp
C into the area at rp, consuming one limb of up per iteration.  The load
C of each u limb is issued one step ahead of the multiplies that use it.
C As written the code reads n limbs from up and n limbs from rp, stores
C n + 7 limbs at rp, and returns the topmost word in r0.
C
C NOTE(review): n < 8 is not handled.  The first load from rp fetches
C eight limbs and only n == 8 bypasses the main loop, so callers
C presumably guarantee n >= 8 -- confirm.
ASM_START()
PROLOGUE(mpn_addmul_8)
	C Load the eight invariant v limbs
	vldm	vp, {Dv01,Dv23,Dv45,Dv67}
	C We read at vp, and store at rp
	mov	vp, rp
	C Fetch the first eight limbs of the rp area, advancing the read pointer
	vldm	vp!, {Dc0,Dc1,Dc2,Dc3}
	C Limbs 4-7 landed in Dc2 and Dc3, the two halves of Qc23.  Move them
	C to Qc45 before Qc23 is cleared.
	vmov	Qc45, Qc23
	vmov.i32	Qc23, #0
	C Qc67 = 0 as well, copied from the just cleared Qc23
	vmov.i32	Qc67, Qc23
	vzip.32	Qc01, Qc23	C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
	vzip.32	Qc45, Qc67	C Pad to get Qc45 = [0, r5, 0, r4], Qc67 = [0, r7, 0, r6]
	C Let n denote the number of result words left to read
	subs	n, #8
	C Preload the first u limb into both lanes of Du00.  The branch below
	C still tests the flags set by the subs.
	vld1.32	{Du00[]}, [up]!	
	beq	.Lend

	.balign 16
.Loop:
	C The next limb of the rp area goes into lane 0 of Dtmp, the low half
	C of Qtmp.  Only that lane is consumed by the vext below.
	vld1.32		Dtmp[0], [vp]!

	C Critical path starts here
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	vmlal.u32	Qc67, Dv67, Du00
	C Du00 has been consumed; load the u limb for the next pass
	vld1.32	{Du00[]}, [up]!	
	subs	n, #1

	C Shift and add: store the finished low word, move every accumulator
	C down by one 32-bit word, then sum adjacent words into 64-bit lanes
	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qc67, #1
	vext.32 Qc67, Qc67, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	C The branch tests the flags set by the subs above
	bne	.Loop

.Lend:
	C Repeat 8 more times (7 passes of the loop below plus the unrolled
	C wind down after it), without reading any new limbs from vp.
	C With Dtmp = 0 the vext shifts in a zero word instead.
	vmov.i32	Dtmp, #0
	mov	n, #7
.Lend_loop:

	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	vmlal.u32	Qc67, Dv67, Du00
	C Load the u limb for the next pass, or for the wind down
	vld1.32	{Du00[]}, [up]!
	subs	n, #1

	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qc67, #1
	vext.32 Qc67, Qc67, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	bne	.Lend_loop

	C Wind down, already read Du00.  Same step as the loop body, but
	C without a further load from up.
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	vmlal.u32	Qc67, Dv67, Du00

	vst1.32		Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32 Qc23, Qc23, Qc45, #1
	vext.32 Qc45, Qc45, Qc67, #1
	vext.32 Qc67, Qc67, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67

	C FIXME: Somehow combine above vext vpaddl with below additions?

	C We have c7-c0 as (small) 64-bit values and need to add it together:
	C store the low word of each sum, then carry its high word into the
	C next one
	vst1.32		Dc0[0], [rp]!
	vshr.u64	Dc0, Dc0, #32
	vadd.i64	Dc1, Dc1, Dc0
	vst1.32		Dc1[0], [rp]!
	vshr.u64	Dc1, Dc1, #32
	vadd.i64	Dc2, Dc2, Dc1
	vst1.32		Dc2[0], [rp]!
	vshr.u64	Dc2, Dc2, #32
	vadd.i64	Dc3, Dc3, Dc2
	vst1.32		Dc3[0], [rp]!
	vshr.u64	Dc3, Dc3, #32
	vadd.i64	Dc4, Dc4, Dc3
	vst1.32		Dc4[0], [rp]!
	vshr.u64	Dc4, Dc4, #32
	vadd.i64	Dc5, Dc5, Dc4
	vst1.32		Dc5[0], [rp]!
	vshr.u64	Dc5, Dc5, #32
	vadd.i64	Dc6, Dc6, Dc5
	vst1.32		Dc6[0], [rp]!
	vshr.u64	Dc6, Dc6, #32
	vadd.i64	Dc7, Dc7, Dc6
	
	C The low word of the last sum is the return value
	vmov.32	r0, Dc7[0]

	bx		lr
EPILOGUE()
-------------- next part --------------

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.


More information about the gmp-devel mailing list