Neon addmul_4

Richard Henderson rth at twiddle.net
Sat Feb 23 21:59:48 CET 2013


Down to 2.8-3.0 cyc/limb.


r~
-------------- next part --------------
dnl  ARM neon mpn_addmul_4.
dnl
dnl  Copyright 2013 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon			@ enable the neon vector instructions used below
	.arch	armv6t2
	.arm				@ 4-byte encodings; the computed jump below counts on 8-byte table slots

C	     cycles/limb
	
C Incoming argument registers.  After the multiplier limbs have been
C loaded, vp is reused as the read pointer for addend limbs (rp + 16).
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

define(`Du01', `d0')	C v2si of two multiplicand limbs
define(`Du23', `d1')	C ... 2nd

define(`Qa47', `q1')	C v4si of addend limbs
define(`Da45', `d2')	C low half of Qa47
define(`Da67', `d3')	C high half of Qa47

define(`Qc01', `q2')	C v2di of product or carry
define(`Dc0', `d4')	C v1di (scalar) aliases of Qc01
define(`Dc1', `d5')
define(`Qc23', `q3')	C ... 2nd
define(`Dc2', `d6')
define(`Dc3', `d7')

define(`Dv01', `d16')	C v2si of two multiplier limbs
define(`Dv23', `d17')	C ... 2nd

C Do0-Do3 are consecutive d registers so that a single vst4.32 lane
C store can write all four queued limbs at once.
define(`Do0', `d18')	C scalar output limbs, queued
define(`Do1', `d19')
define(`Do2', `d20')
define(`Do3', `d21')

define(`Qzero', `q11')	C all-zero vector, fed into Qa47 by the single-limb loop

ASM_START()
PROLOGUE(mpn_addmul_4)
	@ Multiply the limb vector at up by the four limbs at vp and add
	@ the product into the limb vector at rp.
	@
	@ In:   r0 = rp  addend and destination
	@       r1 = up  multiplicand limbs
	@       r2 = n   number of multiplicand limbs
	@       r3 = vp  four multiplier limbs
	@ Out:  one limb is stored per multiplicand limb, then three more
	@       carry limbs; the fourth carry limb is returned in r0.
	@
	@ The four running column sums c0-c3 are held as 64-bit lanes in
	@ Qc01/Qc23.  After every multiply-accumulate the low word of c0 is
	@ the next finished output limb; the remaining words are moved down
	@ one 32-bit position with vext and re-paired with vpaddl, so that
	@ each new lane is (low word of the next column) + (high word of
	@ this column).  The next addend limb enters at the top of Qc23.
	@
	@ NOTE(review): four addend limbs are loaded from rp unconditionally
	@ below, so this looks like it expects at least four limbs of input;
	@ confirm against the callers.
	vld1.32		{Dv01, Dv23}, [vp]	@ load v0-v3
	@ Load and extend a0-a3 into the carry-in
	vmov.i32	Qc01, #0
	vmov.i32	Qc23, #0
	vld4.32		{Dc0[0], Dc1[0], Dc2[0], Dc3[0]}, [rp]
	@ From here on the count is kept biased; it is restored in two
	@ steps (+2, then +5) once the four-limb loop is done.
	subs		n, n, #7		@ less than 8 limbs?
	add		vp, rp, #16		@ read from vp, write to rp
	ble		.Lsmall

	vld1.32		{Du01, Du23}, [up]!	@ load u0-u3
	vld1.32		{Da45, Da67}, [vp]!	@ load a4-a7
	b		.Lentry

	.balign	16
	@ Rotated main loop body.  We branch here as soon as all U inputs
	@ are consumed, so that we can schedule loads for the next round.
	@ Each pass handles four multiplicand limbs and four addend limbs.
.Loop:
	vld1.32		{Du01, Du23}, [up]!	@ load u4-u7
	vmov		Do3, Dc0		@ queue c3l
	vext.32		Qc01, Qc01, Qc23, #1	@ v4si{ c5l, c4h, c4l, c3h }
	vext.32		Qc23, Qc23, Qa47, #1	@ v4si{ a7, c6h, c6l, c5h }
	vld1.32		{Da45, Da67}, [vp]!	@ load a8-a11
	vpaddl.u32	Qc01, Qc01		@ v2di{ c5l+c4h, c4l+c3h }
	vpaddl.u32	Qc23, Qc23		@ v2di{ a7+c6h, c6l+c5h }
	@ Store all queued output limbs
	vst4.32		{Do0[0], Do1[0], Do2[0], Do3[0]}, [rp]!

	@ Rotated main loop entry.
.Lentry:
	vmlal.u32	Qc01, Dv01, Du01[0]	@ v2di{ c1, c0 }
	vmlal.u32	Qc23, Dv23, Du01[0]	@ v2di{ c3, c2 }
	vmov		Do0, Dc0		@ queue c0l
	vext.32		Qc01, Qc01, Qc23, #1	@ v4si{ c2l, c1h, c1l, c0h }
	vext.32		Qc23, Qc23, Qa47, #1	@ v4si{ a4, c3h, c3l, c2h }
	vpaddl.u32	Qc01, Qc01		@ v2di{ c2l+c1h, c1l+c0h }
	vpaddl.u32	Qc23, Qc23		@ v2di{ a4+c3h, c3l+c2h }
	vext.32		Qa47, Qa47, Qa47, #1	@ v4si{ a4, a7, a6, a5 }

	vmlal.u32	Qc01, Dv01, Du01[1]	@ v2di{ c2, c1 }
	vmlal.u32	Qc23, Dv23, Du01[1]	@ v2di{ c4, c3 }
	vmov		Do1, Dc0		@ queue c1l
	vext.32		Qc01, Qc01, Qc23, #1	@ v4si{ c3l, c2h, c2l, c1h }
	vext.32		Qc23, Qc23, Qa47, #1	@ v4si{ a5, c4h, c4l, c3h }
	vpaddl.u32	Qc01, Qc01		@ v2di{ c3l+c2h, c2l+c1h }
	vpaddl.u32	Qc23, Qc23		@ v2di{ a5+c4h, c4l+c3h }
	vext.32		Qa47, Qa47, Qa47, #1	@ v4si{ a5, a4, a7, a6 }

	vmlal.u32	Qc01, Dv01, Du23[0]	@ v2di{ c3, c2 }
	vmlal.u32	Qc23, Dv23, Du23[0]	@ v2di{ c5, c4 }
	vmov		Do2, Dc0		@ queue c2l
	vext.32		Qc01, Qc01, Qc23, #1	@ v4si{ c4l, c3h, c3l, c2h }
	vext.32		Qc23, Qc23, Qa47, #1	@ v4si{ a6, c5h, c5l, c4h }
	vpaddl.u32	Qc01, Qc01		@ v2di{ c4l+c3h, c3l+c2h }
	vpaddl.u32	Qc23, Qc23		@ v2di{ a6+c5h, c5l+c4h }
	vext.32		Qa47, Qa47, Qa47, #1	@ v4si{ a6, a5, a4, a7 }

	@ The flags set by subs are still valid at the bgt: the vector
	@ multiplies in between do not write the condition flags.
	subs		n, n, #4
	vmlal.u32	Qc01, Dv01, Du23[1]	@ v2di{ c4, c3 }
	vmlal.u32	Qc23, Dv23, Du23[1]	@ v2di{ c6, c5 }
	bgt		.Loop

	@ Rotated main loop tail.
	vmov		Do3, Dc0		@ queue c3l
	vext.32		Qc01, Qc01, Qc23, #1	@ v4si{ c5l, c4h, c4l, c3h }
	vext.32		Qc23, Qc23, Qa47, #1	@ v4si{ a7, c6h, c6l, c5h }
	vpaddl.u32	Qc01, Qc01		@ v2di{ c5l+c4h, c4l+c3h }
	vpaddl.u32	Qc23, Qc23		@ v2di{ a7+c6h, c6l+c5h }
	vst4.32		{Do0[0], Do1[0], Do2[0], Do3[0]}, [rp]!

	@ Here, there are fewer than 8 limbs remaining.
	@ After the +2 below the count equals (limbs remaining) - 5, which
	@ is one less than the number of addend limbs not yet loaded.
.Lsmall:
	adds		n, n, #2		@ any addend limbs remaining?
	vmov.i32	Qa47, #0
	vmov.i32	Qzero, #0
	blt		9f
	@ Computed jump into the table below.  In arm state pc reads as
	@ this instruction + 8, which is label 1, and every table slot is
	@ two instructions = 8 bytes, hence the lsl #3.  A count of 0, 1
	@ or 2 selects the slot that loads 1, 2 or 3 addend limbs.
	add		pc, pc, n, lsl #3	@ load remaining addend limbs
	nop					@ filler so that slot 0 sits at pc + 8
1:	vld1.32		{Da45[0]}, [vp]
	b		9f
2:	vld1.32		{Da45}, [vp]
	b		9f
3:	vld1.32		{Da45}, [vp]!
	vld1.32		{Da67[0]}, [vp]
9:
	adds		n, n, #5		@ finish unbiasing N
	beq		.Lfinish
	vld1.32		{Du01[]}, [up]!		@ load and broadcast u0
	b		.Lsingle_entry

	.balign		16
	@ Process remaining limbs one at a time.
	@ Rotated single loop body.  Branch here as soon as U is consumed.
.Lsingle_loop:
	vld1.32		{Du01[]}, [up]!		@ load and broadcast u0
	vst1.32		{Dc0[0]}, [rp]!		@ store output limb
	vext.32		Qc01, Qc01, Qc23, #1	@ v4si{ c2l, c1h, c1l, c0h }
	vext.32		Qc23, Qc23, Qa47, #1	@ v4si{ a4, c3h, c3l, c2h }
	vpaddl.u32	Qc01, Qc01		@ v2di{ c2l+c1h, c1l+c0h }
	vpaddl.u32	Qc23, Qc23		@ v2di{ a4+c3h, c3l+c2h }
	vext.32		Qa47, Qa47, Qzero, #1	@ v4si{ 0, a7, a6, a5 }
.Lsingle_entry:
	@ Du01 holds the same limb in both lanes, so the vector form
	@ multiplies v0-v3 by that one limb.
	vmlal.u32	Qc01, Dv01, Du01	@ v2di{ c1, c0 }
	subs		n, n, #1
	vmlal.u32	Qc23, Dv23, Du01	@ v2di{ c3, c2 }
	bne		.Lsingle_loop

	@ Rotated single loop tail.
	vst1.32		{Dc0[0]}, [rp]!		@ store output limb
	vext.32		Qc01, Qc01, Qc23, #1	@ v4si{ c2l, c1h, c1l, c0h }
	vext.32		Qc23, Qc23, Qa47, #1	@ v4si{ a4, c3h, c3l, c2h }
	vpaddl.u32	Qc01, Qc01		@ v2di{ c2l+c1h, c1l+c0h }
	vpaddl.u32	Qc23, Qc23		@ v2di{ a4+c3h, c3l+c2h }

	@ We're done with all products.  Finish propagating carries,
	@ store three carry limbs and return the fourth.
	@ Each step below narrows the live state by one column; the words
	@ listed as 0 in the state comments are taken to be zero already.
.Lfinish:
	@ c0-c3 = { 0, c3l, c2h, c2l, c1h, c1l, c0h, c0l }
	vst1.32		{Dc0[0]}, [rp]!		@ one...
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Dc2, Dc2, Dc3, #1	@ d-register form: only Dc2 is rewritten
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Dc2, Dc2		@ Dc2 = c3l + c2h

	@ c0-c3 = { 0, 0, 0, c2l, c1h, c1l, c0h, c0l }
	vst1.32		{Dc0[0]}, [rp]!		@ two...
	vext.32		Qc01, Qc01, Qc23, #1	@ only the low word of Dc2 is taken from Qc23
	vpaddl.u32	Qc01, Qc01

	@ c0-c3 = { 0, 0, 0, 0, 0, c1l, c0h, c0l }
	vst1.32		{Dc0[0]}, [rp]!		@ three...
	vext.32		Dc0, Dc0, Dc1, #1
	vpaddl.u32	Dc0, Dc0

	@ c0-c3 = { 0, 0, 0, 0, 0, 0, 0, c0l }
	vmov.32		r0, Dc0[0]		@ four...
	bx		lr

EPILOGUE()


More information about the gmp-devel mailing list