Neon addmul_8

Sun Feb 24 02:38:32 CET 2013

gcc -O2 -g3 [...] addmul_N.c -DN=8 -DCLOCK=1694000000
$ ./t.out
mpn_addmul_8:     2845ms (1.782 cycles/limb) [973.59 Gb/s]
mpn_addmul_8:     2620ms (1.641 cycles/limb) [1057.20 Gb/s]
mpn_addmul_8:     2625ms (1.644 cycles/limb) [1055.19 Gb/s]
mpn_addmul_8:     2625ms (1.644 cycles/limb) [1055.19 Gb/s]
mpn_addmul_8:     2625ms (1.644 cycles/limb) [1055.19 Gb/s]

(Clearly there's something wrong with the bandwith calculation.  ;-)

It's somewhat unfortunate that neon load/store insns are not 
conditional.  I've made do with conditionally loading limbs into core 
registers and copying them over.

It might be worth a try having two copies of this main loop, one in 
which there are more than 8 limbs remaining (requiring loads of the 9th 
RP limb), and one for the final 8 limbs.  But I don't have time today, 
and my suspicion is that you'll get no better than 1.60 cyc/limb, if 
indeed one can measure any improvement at all.

Any real improvement on this routine will have to be algorithmic, using 
a different mechanism to compute and apply carries.  And that's where 
it's likely someone else that's going to have to be clever.

What sizes are really interesting for addmul decomposition?

Would it be useful do a prime like addmul_7?  It would run at pretty 
much the same speed as addmul_8, since nearly the same insns would be 
used.  Just the last vector would be a D register instead of a Q register.

There's still lots of free registers -- over half in fact.  The same 
scheme would still easily fit k=16 without spilling.  But given the 
distance between dependencies already I can't imagine but that we're 
near the point of diminishing returns.

r~
-------------- next part --------------
dnl  ARM neon mpn_addmul_8.
dnl
dnl  Copyright 2013 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arch	armv6t2
	.arm

C	     cycles/limb

define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

define(`Dv01', `d0')	C v2si of two multiplier limbs
define(`Dv23', `d1')	C ... 2nd
define(`Dv45', `d2')	C ... 3rd
define(`Dv67', `d3')	C ... 4th
define(`Du0', `d4')	C v2si of one replicated multiplicand limb
define(`Qa9', `q3')	C v4si with [0] containing 9th addend limb
define(`Da9', `d6')

define(`Qc01', `q8')	C v2di of product or carry
define(`Dc0', `d16')	C v1di (scalar) aliases of Qc01
define(`Dc1', `d17')
define(`Qc23', `q9')	C ... 2nd
define(`Dc2', `d18')
define(`Dc3', `d19')
define(`Qc45', `q10')	C ... 3rd
define(`Dc4', `d20')
define(`Dc5', `d21')
define(`Qc67', `q11')	C ... 4th
define(`Dc6', `d22')
define(`Dc7', `d23')

ASM_START()
PROLOGUE(mpn_addmul_8)
	vld1.32		{Dc0, Dc1, Dc2, Dc3}, [rp]	@ load 8 rp limbs
	cmp		n, #8				@ more than 8 limbs?
	vld1.32		{Dv01, Dv23, Dv45, Dv67}, [vp]	@ load all vp limbs
	vld1.32		{Du0[0]}, [up]			@ load first up
	ldr		ip, [up, #4]			@ load 2nd up limb
	ldrgt		r3, [rp, #32]			@ maybe load 9th rp
	add		up, up, #8
	moveq		r3, #0				@ maybe zero 9th rp

	vmov.i32	Qa9, #0			@ clear addend input
	vmovl.u32	Qc67, Dc3		@ extend rp limbs to carry-in
	vmovl.u32	Qc45, Dc2
	vmovl.u32	Qc23, Dc1
	vmovl.u32	Qc01, Dc0

	.balign	16
	@ Main loop -- 8 x 1 multiplications
.Loop:
	vmlal.u32	Qc01, Dv01, Du0[0]	@ compute all products
	vmov.32		Da9[0], r3		@ move 9th rp into place
	vmlal.u32	Qc23, Dv23, Du0[0]
	cmp		n, #9			@ more than 8 limbs left?
	vmlal.u32	Qc45, Dv45, Du0[0]
	ldrgt		r3, [rp, #36]		@ maybe load next rp limb
	vmlal.u32	Qc67, Dv67, Du0[0]
	movle		r3, #0			@ maybe zero next rp limb
	vmov.32		Du0[0], ip		@ move next up limb into place
	subs		n, n, #1
	vst1.u32	{Dc0[0]}, [rp]!		@ store lowest product limb
	ldrne		ip, [up], #4		@ maybe load next up limb

	vext.32		Qc01, Qc01, Qc23, #1	@ rotate carry down one
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Qc67, Qc67, Qa9, #1	@ rotate in 9th addend or 0.
	vpaddl.u32	Qc01, Qc01		@ fold carries
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	bne		.Loop

	@ Done with all products.  Finish propagating carries,
	@ Store seven carry limbs and return the eighth.
	@ c0-c7 = { c7l, c6, c5, c4, c3, c2, c1, c0 }
	vst1.32		{Dc0[0]}, [rp]!		@ one...
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Dc6, Dc6, Dc7, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Dc6, Dc6

	@ c0-c7 = { 0, c6l, c5, c4, c3, c2, c1, c0 }
	vst1.32		{Dc0[0]}, [rp]!		@ two...
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45

	@ c0-c7 = { 0, 0, c5l, c4, c3, c2, c1, c0 }
	vst1.32		{Dc0[0]}, [rp]!		@ three...
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Dc4, Dc4, Dc5, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Dc4, Dc4

	@ c0-c7 = { 0, 0, 0, c4l, c3, c2, c1, c0 }
	vst1.32		{Dc0[0]}, [rp]!		@ four...
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23

	@ c0-c7 = { 0, 0, 0, 0, c3l, c2, c1, c0 }
	vst1.32		{Dc0[0]}, [rp]!		@ five...
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Dc2, Dc2, Dc3, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Dc2, Dc2

	@ c0-c7 = { 0, 0, 0, 0, 0, c2l, c1, c0 }
	vst1.32		{Dc0[0]}, [rp]!		@ six...
	vext.32		Qc01, Qc01, Qc23, #1
	vpaddl.u32	Qc01, Qc01

	@ c0-c7 = { 0, 0, 0, 0, 0, 0, c1l, c0 }
	vst1.32		{Dc0[0]}, [rp]!		@ seven...
	vext.32		Dc0, Dc0, Dc1, #1
	vpaddl.u32	Dc0, Dc0

	@ c0-c7 = { 0, 0, 0, 0, 0, 0, 0, c0l }
	vmov.32		r0, Dc0[0]		@ eight...
	bx		lr

EPILOGUE()