arm "neon"

Richard Henderson rth at twiddle.net
Sat Feb 23 01:05:13 CET 2013


On 02/22/2013 12:08 PM, Richard Henderson wrote:
> Perhaps I should give this another go...

Down to 5.8 cyc/limb.  Good, but not fantastic.  I'm gonna try one more time
with larger unrolling to make full use of the vector load insns, and less
over-prefetching.

I guess the target is anything under 2.5 cyc/limb, against the armv6 integer
version?
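
For reference, here is the operation as a plain C model of what the
attached code computes, judging from its loads and stores: n addend
limbs read, n+1 result limbs stored, and the top limb returned.  Limbs
are 32 bits on this target; the function name below is just
illustrative, not the generic code.

#include <stdint.h>

/* Model of mpn_addmul_2 for 32-bit limbs:  {rp,n} += {up,n} * {vp,2},
   storing the low n+1 limbs of the result at rp[0..n] and returning
   limb n+1.  Requires n >= 2.  */
uint32_t
ref_addmul_2 (uint32_t *rp, const uint32_t *up, long n, const uint32_t *vp)
{
  uint64_t c0 = rp[0], c1 = rp[1];      /* a0, a1 as carry-in */
  long i;

  for (i = 0; i < n; i++)
    {
      c0 += (uint64_t) up[i] * vp[0];   /* contribution at limb i */
      c1 += (uint64_t) up[i] * vp[1];   /* contribution at limb i+1 */
      rp[i] = (uint32_t) c0;            /* retire one result limb */
      c0 = (c0 >> 32) + (uint32_t) c1;  /* fold c0's carry with low of c1 */
      c1 = (c1 >> 32) + (i + 2 < n ? rp[i + 2] : 0);  /* pull in next addend */
    }
  rp[n] = (uint32_t) c0;                /* low limb of carry out */
  return (uint32_t) ((c0 >> 32) + c1);  /* high limb of carry out */
}

That's the same two-accumulator fold the NEON code does with Qc01,
retiring one result limb per vmlal.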


r~
-------------- next part --------------
dnl  ARM neon mpn_addmul_2.
dnl
dnl  Copyright 2013 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arch	armv6t2
	.arm

C	     cycles/limb
	
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

define(`Da23', `d0')	C v2si of two addend limbs
define(`Qa23', `q0')	C v4si alias of Da23 with garbage in high half
define(`Dv01', `d1')	C v2si of the two multiplier limbs
define(`Du00', `d2')	C v2si of one multiplicand limb, duplicated
define(`Du11', `d3')	C v2si of second multiplicand limb
define(`Qc01', `q2')	C v2di of product or carry
define(`Dc0', `d4')	C v1di (scalar) aliases of Qc01
define(`Dc1', `d5')
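
C  Main-loop scheme: Qc01 holds two 64-bit accumulators, c0 for the limb
C  about to be retired and c1 for the next one.  Each vmlal adds u*v0
C  into c0 and u*v1 into c1.  After the low 32 bits of c0 are stored,
C  the vext/vpaddl pair folds the high half of c0 together with the low
C  half of c1 into the new c0, and starts the new c1 from the high half
C  of c1 plus the next addend limb shifted in from Qa23.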

ASM_START()
PROLOGUE(mpn_addmul_2)
	vmov.i32	Qc01, #0		@ clear carry in
	vld2.32		{Du00[], Du11[]}, [up]!	@ load and duplicate u0 and u1
	vld2.32		{Dc0[0], Dc1[0]}, [rp]	@ load a0 and a1 as carry-in
	vld1.32		{Dv01}, [vp]		@ load v0 and v1
	subs		n, n, #3		@ less than 4 limbs?
	add		vp, rp, #8		@ reuse vp to read addend limbs ahead of the stores through rp
	ble		.Lsmall

	vld1.32		{Da23}, [vp]!		@ load a2 and a3
	pld		[up, #64-8]		@ prefetch line + 1
	pld		[rp, #64]
	pld		[up, #128-8]		@ prefetch line + 2
	pld		[rp, #128]
	pld		[up, #192-8]		@ prefetch line + 3
	pld		[rp, #192]
	b		.Lentry

	.balign	16
	@ Rotated main loop tail.  We branch here as soon as all U inputs
	@ are consumed, so that we can schedule loads for the next round.
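	@ Each iteration of the loop consumes two U limbs and retires two
	@ result limbs, in 15 instructions.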
.Loop:
	vld2.32		{Du00[], Du11[]}, [up]!
	vst1.u32	{Dc0[0]}, [rp]!
	vext.32		Qc01, Qc01, Qa23, #1
	vld1.32		{Da23}, [vp]!
	vpaddl.u32	Qc01, Qc01

	@ Rotated main loop entry.
.Lentry:
	vmlal.u32	Qc01, Du00, Dv01	@ v2di{ c1, c0 }
	pld		[up, #256-8]		@ prefetch line + 4
	vst1.u32	{Dc0[0]}, [rp]!		@ output lowest in-flight limb
	vext.32		Qc01, Qc01, Qa23, #1	@ v4si{ a2, c1h, c1l, c0h }
	vshr.u64	Da23, Da23, #32		@ v2si{ 0, a3 }
	vpaddl.u32	Qc01, Qc01		@ v2di{ a2+c1h, c1l+c0h }
	pld		[rp, #256-4]		@ prefetch line + 4

	subs		n, n, #2
	vmlal.u32	Qc01, Du11, Dv01
	bgt		.Loop

	@ Rotated main loop tail.
	vld2.32		{Du00[], Du11[]}, [up]!	@ load last full pair of U
	vst1.u32	{Dc0[0]}, [rp]!
	vext.32		Qc01, Qc01, Qa23, #1
	vpaddl.u32	Qc01, Qc01

	@ Note that we began with a bias of -3 to N, so if we hit 0 exactly
	@ we have three limbs left; the flags are still valid from above.
	beq		.Lfinish_three

	@ We have two limbs left to process.
.Lfinish_two:
	vmlal.u32	Qc01, Du00, Dv01
	vmov.i32	Da23, #0		@ no more addend limbs
	vst1.u32	{Dc0[0]}, [rp]!
	vext.32		Qc01, Qc01, Qa23, #1	@ v4si{ 0, c1h, c1l, c0h }
	vpaddl.u32	Qc01, Qc01

	vmlal.u32	Qc01, Du11, Dv01
	vst1.u32	{Dc0[0]}, [rp]!
	vext.32		Qc01, Qc01, Qa23, #1
	vpaddl.u32	Qc01, Qc01

	vst1.u32	{Dc0[0]}, [rp]!		@ store low carry out
	vext.32		Dc0, Dc0, Dc1, #1	@ v2si{ c1l, c0h }
	vpaddl.u32	Dc0, Dc0
	vmov.32		r0, Dc0[0]		@ return high carry out
	bx		lr

	@ We reach here if we began with N < 4.  We're ensured that N >= 2,
	@ and the flags are still set up so that EQ -> 3 and thus LT -> 2.
.Lsmall:
	blt		.Lfinish_two
	
	@ We reach here with three limbs left to process: either N == 3,
	@ falling through from .Lsmall, or N > 4 and N odd, arriving via
	@ the beq in the main loop tail above.
.Lfinish_three:
	vld1.u32	{Da23[0]}, [vp]		@ load last addend limb
	vmlal.u32	Qc01, Du00, Dv01
	vmov		Du00, Du11		@ prepare to...
	vld1.32		{Du11[]}, [up]		@   load last U limb
	vext.32		Qc01, Qc01, Qa23, #1
	vpaddl.u32	Qc01, Qc01
	b		.Lfinish_two

EPILOGUE()

