arm "neon"

Richard Henderson rth at
Sat Feb 23 01:05:13 CET 2013

On 02/22/2013 12:08 PM, Richard Henderson wrote:
> Perhaps I should give this another go...

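Down to 5.8 cyc/limb.  Good, but not fantastic.  I'm gonna try one more time
with larger unrolling to make full use of the vector load insns, and less
[... rest of this sentence is missing from the archived text]

I guess the target is anything under 2.5 cyc/limb, against the armv6 integer
[... rest of this sentence is missing from the archived text]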

-------------- next part --------------
dnl  ARM neon mpn_addmul_2.
dnl  Copyright 2013 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


	.fpu	neon
	.arch	armv6t2

C	     cycles/limb

C Integer register roles.  The code below uses rp, up and vp as well as n,
C but only n had a definition here.  The three added definitions follow the
C AAPCS argument order for mpn_addmul_2 (rp, up, n, vp), which agrees with
C n being r2.
define(`rp', `r0')	C result/addend limb pointer; r0 also returns the carry
define(`up', `r1')	C multiplicand limb pointer
define(`n', `r2')	C limb count
define(`vp', `r3')	C multiplier limb pointer, later reused to read addends

C NEON register roles.  Qa23 overlays Da23 (d0/d1 pair) and Qc01 overlays
C Dc0 and Dc1 (d4/d5 pair).
define(`Da23', `d0')	C v2si of two addend limbs
define(`Qa23', `q0')	C v4si alias of Da23 with garbage in high half
define(`Dv01', `d1')	C v2si of the two multiplier limbs
define(`Du00', `d2')	C v2si of one multiplicand limb, duplicated
define(`Du11', `d3')	C v2si of second multiplicand limb
define(`Qc01', `q2')	C v2di of product or carry
define(`Dc0', `d4')	C v1di (scalar) aliases of Qc01
define(`Dc1', `d5')

	@ Function setup.  The carry accumulator starts out as v2di{ a1, a0 }:
	@ it is cleared and then the first two addend limbs are loaded into the
	@ low word of each 64-bit half.  The first two U limbs are loaded with
	@ each one duplicated across its own vector.
	@ NOTE(review): no entry label or function-start macro is visible
	@ before this point -- confirm against the complete file.
	vmov.i32	Qc01, #0		@ clear carry in
	vld2.32		{Du00[], Du11[]}, [up]!	@ load and duplicate u0 and u1
	vld2.32		{Dc0[0], Dc1[0]}, [rp]	@ load a0 and a1 as carry-in
	vld1.32		{Dv01}, [vp]		@ load v0 and v1
	subs		n, n, #3		@ less than 4 limbs?
	@ The add below has no S suffix, so the flags from the subs above are
	@ still the ones tested by the ble.
	add		vp, rp, #8		@ read from vp, write to rp
	ble		.Lsmall

	@ At least four limbs.  Load the next addend pair, issue prefetches
	@ for both streams, then jump into the middle of the rotated loop.
	@ The U pointer was already advanced by 8 bytes by the load above,
	@ which presumably explains the -8 in the offsets on that stream.
	vld1.32		{Da23}, [vp]!		@ load a2 and a3
	pld		[up, #64-8]		@ prefetch line + 1
	pld		[rp, #64]
	pld		[up, #128-8]		@ prefetch line + 2
	pld		[rp, #128]
	pld		[up, #192-8]		@ prefetch line + 3
	pld		[rp, #192]
	b		.Lentry

	.balign	16
	@ Rotated main loop tail.  We branch here as soon as all U inputs
	@ are consumed, so that we can schedule loads for the next round.
	@ Each trip through the loop consumes two U limbs and two addend
	@ limbs and stores two result limbs.
	@ This label is the target of the bgt at the bottom of the loop; its
	@ definition was absent from the text.
.Loop:
	vld2.32		{Du00[], Du11[]}, [up]!	@ next pair of U, each duplicated
	vst1.u32	{Dc0[0]}, [rp]!		@ output lowest in-flight limb
	vext.32		Qc01, Qc01, Qa23, #1	@ v4si{ a3, c1h, c1l, c0h }
	vld1.32		{Da23}, [vp]!		@ next two addend limbs
	vpaddl.u32	Qc01, Qc01		@ v2di{ a3+c1h, c1l+c0h }

	@ Rotated main loop entry.  The setup code branches to this label;
	@ its definition was absent from the text.
.Lentry:
	vmlal.u32	Qc01, Du00, Dv01	@ v2di{ c1, c0 }
	pld		[up, #256-8]		@ prefetch line + 4
	vst1.u32	{Dc0[0]}, [rp]!		@ output lowest in-flight limb
	vext.32		Qc01, Qc01, Qa23, #1	@ v4si{ a2, c1h, c1l, c0h }
	vshr.u64	Da23, Da23, #32		@ v2si{ 0, a3 }
	vpaddl.u32	Qc01, Qc01		@ v2di{ a2+c1h, c1l+c0h }
	pld		[rp, #256-4]		@ prefetch line + 4

	@ The flags set here are also tested after the loop exits; none of
	@ the NEON instructions in between modify them.
	subs		n, n, #2
	vmlal.u32	Qc01, Du11, Dv01
	bgt		.Loop

	@ Rotated main loop tail.
	@ Same steps as the top of the loop, except that no further addend
	@ pair is loaded here.
	vld2.32		{Du00[], Du11[]}, [up]!	@ load last full pair of U
	vst1.u32	{Dc0[0]}, [rp]!
	vext.32		Qc01, Qc01, Qa23, #1
	vpaddl.u32	Qc01, Qc01

	@ Note that we began with a bias of -3 to N, so if we hit 0 exactly
	@ we have three limbs left; the flags are still valid from above.
	beq		.Lfinish_three

	@ We have two limbs left to process.
	@ This label is reached by fall-through from above and by branches
	@ from the small-N and three-limb paths; its definition was absent
	@ from the text.
.Lfinish_two:
	vmlal.u32	Qc01, Du00, Dv01
	vmov.i32	Da23, #0		@ no more addend limbs
	vst1.u32	{Dc0[0]}, [rp]!
	vext.32		Qc01, Qc01, Qa23, #1	@ v4si{ 0, c1h, c1l, c0h }
	vpaddl.u32	Qc01, Qc01

	vmlal.u32	Qc01, Du11, Dv01
	vst1.u32	{Dc0[0]}, [rp]!
	vext.32		Qc01, Qc01, Qa23, #1	@ v4si{ 0, c1h, c1l, c0h }
	vpaddl.u32	Qc01, Qc01

	@ All products are accumulated.  Store the low remaining limb and
	@ fold what is left into a single word for the return value.
	vst1.u32	{Dc0[0]}, [rp]!		@ store low carry out
	vext.32		Dc0, Dc0, Dc1, #1	@ v2si{ c1l, c0h }
	vpaddl.u32	Dc0, Dc0
	vmov.32		r0, Dc0[0]		@ return high carry out
	bx		lr

	@ We reach here if we began with N < 4.  We're ensured that N >= 2,
	@ and the flags are still set up so that EQ -> 3 and thus LT -> 2.
	@ This label is the target of the ble in the setup code; its
	@ definition was absent from the text.
.Lsmall:
	blt		.Lfinish_two
	@ We reach here if we began with N > 4 and N odd, or by falling
	@ through from the branch above when N is exactly 3.
	@ We have three limbs left to process.
	@ This label is the target of the beq after the main loop; its
	@ definition was absent from the text.
.Lfinish_three:
	vld1.32		{Da23[0]}, [vp]		@ load last addend limb
	vmlal.u32	Qc01, Du00, Dv01
	vmov		Du00, Du11		@ prepare to...
	vld1.32		{Du11[]}, [up]		@   load last U limb
	@ The low word must be written out before the vext below shifts it
	@ away, as after every other vmlal in this file.  Without this store
	@ the three-limb path writes only N result limbs instead of N + 1.
	vst1.u32	{Dc0[0]}, [rp]!		@ output lowest in-flight limb
	vext.32		Qc01, Qc01, Qa23, #1	@ bring in the last addend limb
	vpaddl.u32	Qc01, Qc01
	b		.Lfinish_two


More information about the gmp-devel mailing list