Neon mul_basecase

Richard Henderson rth at twiddle.net
Mon Feb 25 07:04:40 CET 2013


I'm not 100% sure how to interpret the speed results, since they're not 
really in the same units as before.  But compared with mpn_mul_2 they're 
not encouraging.  Am I doing something wrong with the testing, or did the 
1.64 cyc/limb I got for addmul_8 not really carry over to this test?

But at least it passes make check...

> $ ./speed -C -s 1-400 -f 1.5 mpn_mul_2
> clock_gettime is 1.000ns accurate
> overhead 399.46 cycles, precision 10000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
>             mpn_mul_2
> 1                 n/a
> 2            637.7681
> 3            484.2773
> 4            407.5081
> 6            418.4830
> 9            329.4083
> 13           338.0055
> 19           307.1374
> 28           221.8464
> 42           223.5204
> 63           283.4794
> 94           199.7416
> 141          150.1862
> 211          114.0748
> 316           91.5833
> [rth at fremont tune]$ ./speed -C -s 1-400 -f 1.5 mpn_mul_basecase
> clock_gettime is 1.000ns accurate
> overhead 405.05 cycles, precision 10000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
>         mpn_mul_basecase
> 1           1890.2391
> 2           1072.0477
> 3           1517.6313
> 4           1505.7726
> 6           1494.0550
> 9           1062.7654
> 13          1427.9960
> 19          1177.6670
> 28          1010.8937
> 42          1018.4768
> 63          1101.3801
> 94          1353.1714
> 141         1836.2722
> 211         2595.3451
> 316         3823.3425

P.S. This version incorporates none of what we talked about this 
afternoon.  I just wanted a working starting point to compare against.
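
For reference, the control flow of the attached mpn_mul_basecase boils
down to roughly the following C skeleton (a sketch only, not real GMP
code; the mulbc_addmul_N routines are the private helpers defined in
the attachment, which store their top limb instead of returning it):

  void
  mul_basecase_sketch (mp_ptr rp, mp_srcptr up, mp_size_t un,
                       mp_srcptr vp, mp_size_t vn)
  {
    /* Initial multiply sans accumulate; make the remaining vn even.  */
    if (vn & 1)
      {
        rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
        rp += 1, vp += 1, vn -= 1;
      }
    else
      {
        rp[un + 1] = mpn_mul_2 (rp, up, un, vp);
        rp += 2, vp += 2, vn -= 2;
      }

    /* Bulk of the work, 14 vp limbs at a time.  */
    while (vn >= 14)
      {
        mulbc_addmul_14 (rp, up, un, vp);
        rp += 14, vp += 14, vn -= 14;
      }

    /* vn is now 0, 2, 4, 6, 8, 10 or 12; a jump table dispatches to
       combinations of addmul_2, mulbc_addmul_6 and mulbc_addmul_8.  */
  }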


r~
-------------- next part --------------
dnl  ARM Neon mpn_mul_basecase.
dnl
dnl  Copyright 2013 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arch	armv6t2
	.arm

C	     cycles/limb

C Matching the push below.
define(`POPFRAME', `pop { r4, r5, r6, r7, r8, lr }')
define(`POPRETURN', `pop { r4, r5, r6, r7, r8, pc }')

ASM_START()
PROLOGUE(mpn_mul_basecase)
	.cfi_startproc
	push		{ r4, r5, r6, r7, r8, lr }
	.cfi_adjust_cfa_offset 24
	.cfi_rel_offset r4, 0
	.cfi_rel_offset r5, 4
	.cfi_rel_offset r6, 8
	.cfi_rel_offset r7, 12
	.cfi_rel_offset r8, 16
	.cfi_rel_offset lr, 20

	C Save the parameters in call-saved registers; we always keep the
	C normal parameter registers (r0-r3) up to date for the next call.
	ldr		r8, [sp, #24]	@ vn
	mov		r4, r0		@ rp
	mov		r5, r1		@ up
	mov		r6, r2		@ un
	mov		r7, r3		@ vp

	C Defer to other routines for the initial multiply sans accumulate.
	C Arrange for VN to be even after this.
	tst		r8, #1
	beq		1f

	ldr		r3, [r3]
	bl		mpn_mul_1
	str		r0, [r4, r6, lsl #2]	@ rp[un] = mul_1 carry
	add		r4, r4, #4
	add		r7, r7, #4
	sub		r8, r8, #1
	b		2f
1:
	bl		mpn_mul_2
	add		ip, r4, r6, lsl #2
	add		r4, r4, #8
	add		r7, r7, #8
	sub		r8, r8, #2
	str		r0, [ip, #4]		@ rp[un+1] = mul_2 high limb
2:
	mov		r0, r4		@ setup args for next addmul call
	mov		r1, r5
	mov		r2, r6
	mov		r3, r7

	C Use a special addmul_14 until we've only a few left.
	subs		r8, r8, #14	@ vn >= 14?
	blt		4f
3:
	bl		mulbc_addmul_14
	subs		r8, r8, #14
	add		r4, r4, #14*4
	add		r7, r7, #14*4
	mov		r0, r4		@ setup args for next addmul call
	mov		r1, r5
	mov		r2, r6
	mov		r3, r7
	bge		3b

	C Tablejump to select the logic for the last 0-12 vp limbs (always even here).
4:	add		r8, r8, #14
	ldr		ip, [pc, r8, lsl #1]
5:	add		pc, pc, ip
	.word		0f - 5b - 8
	.word		2f - 5b - 8
	.word		4f - 5b - 8
	.word		6f - 5b - 8
	.word		8f - 5b - 8
	.word		10f - 5b - 8
	.word		12f - 5b - 8
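	C Remaining vn -> handler: 0 returns; 2 -> addmul_2;
	C 4 -> 2 x addmul_2; 6 -> addmul_6; 8 -> addmul_8;
	C 10 -> addmul_2 + addmul_8; 12 -> 2 x addmul_6.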

4:	bl		L(do_addmul_2)
2:	bl		L(do_addmul_2)
0:	POPRETURN

10:	bl		L(do_addmul_2)
8:	POPFRAME
	b		mulbc_addmul_8

12:	bl		mulbc_addmul_6
	add		r0, r4, #6*4
	mov		r1, r5
	mov		r2, r6
	add		r3, r7, #6*4
6:	POPFRAME
	b		mulbc_addmul_6

L(do_addmul_2):
	mov		r8, lr
	bl		mpn_addmul_2
	add		ip, r4, r6, lsl #2
	add		r4, r4, #8
	add		r7, r7, #8
	str		r0, [ip, #4]
	mov		r0, r4
	mov		r1, r5
	mov		r2, r6
	mov		r3, r7
	bx		r8

	.cfi_endproc
EPILOGUE()

C
C These addmul_N are like the standard set, except that they do NOT
C return the last limb.  They store it in RP like all the rest.
C
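C In C terms each mulbc_addmul_N amounts to roughly this (a sketch,
C written in terms of the real mpn_addmul_1):
C
C	for (i = 0; i < N; i++)
C	  rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C
C i.e. {rp,un+N} = {rp,un} + {up,un} * {vp,N}, with all un+N result
C limbs stored at rp and nothing returned.
C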

define(`Dv01', `d0')	C v2si of two multiplier limbs
define(`Dv23', `d1')	C ... 2nd
define(`Dv45', `d2')	C ... 3rd
define(`Dv67', `d3')	C ... 4th
define(`Dv89', `d4')	C ... 5th
define(`DvAB', `d5')	C ... 6th
define(`DvCD', `d6')	C ... 7th

define(`Du0', `d7')	C v2si of the next one or two multiplicand (up) limbs

define(`Qc01', `q8')	C v2di of product or carry
define(`Qc23', `q9')	C ... 2nd
define(`Qc45', `q10')	C ... 3rd
define(`Qc67', `q11')	C ... 4th
define(`Qc89', `q12')	C ... 5th
define(`QcAB', `q13')	C ... 6th
define(`QcCD', `q14')	C ... 7th
define(`Dc0', `d16')	C v1di (scalar) aliases of Qc01
define(`Dc1', `d17')
define(`Dc2', `d18')
define(`Dc3', `d19')
define(`Dc4', `d20')
define(`Dc5', `d21')
define(`Dc6', `d22')
define(`Dc7', `d23')
define(`Dc8', `d24')
define(`Dc9', `d25')
define(`DcA', `d26')
define(`DcB', `d27')
define(`DcC', `d28')
define(`DcD', `d29')

define(`QaN', `q15')	C v4si with [0] containing next addend limb
define(`DaN', `d30')
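
C Common scheme for the routines below: one 64-bit accumulator per result
C column is kept in flight, two columns per Qc register.  For each up limb
C we vmlal a 32x32->64 product into every column, store the low 32 bits of
C the least significant column as a finished limb, then rotate the whole
C chain down one 32-bit lane with vext and re-fold it with vpaddl, which
C adds each column's carry (its high half) into the next column while the
C next addend limb (or zero, via QaN) enters at the top.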

	.cfi_startproc

	TYPE(mulbc_addmul_6, function)
	ALIGN(16)

mulbc_addmul_6:
	vld1.32		{Dc0, Dc1, Dc2}, [r0]		@ load 6 rp limbs
	vld1.32		{Dv01, Dv23, Dv45}, [r3]	@ load all vp limbs
	add		r3, r0, #6*4

	vmov.i32	QaN, #0			@ clear addend input
	vmovl.u32	Qc45, Dc2		@ extend rp limbs to carry-in
	vmovl.u32	Qc23, Dc1
	vmovl.u32	Qc01, Dc0
	subs		r2, r2, #7		@ 6, 7, or 8 limbs?
	ble		L(small_6)

	vld1.32		{Du0}, [r1]!		@ load 2 up limbs
	vld1.32		{DaN}, [r3]!		@ load a6 and a7
	b		L(entry_6)

	C Rotated main loop body.  Processing 2 UP limbs.
	ALIGN(16)
L(loop_6):
	vld1.32		{Du0}, [r1]!		@ load next two up limbs
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, QaN, #1
	vld1.32		{DaN}, [r3]!		@ load next two rp limbs
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45

	C Rotated main loop entry.
L(entry_6):
	vmlal.u32	Qc01, Dv01, Du0[0]	@ compute u0 products
	vmlal.u32	Qc23, Dv23, Du0[0]
	vmlal.u32	Qc45, Dv45, Du0[0]
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, QaN, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vmlal.u32	Qc01, Dv01, Du0[1]	@ compute u1 products
	vshr.u64	DaN, DaN, #32		@ shift 2nd addend limb down for the next rotate
	vmlal.u32	Qc23, Dv23, Du0[1]
	subs		r2, r2, #2		@ at least 8 limbs left?
	vmlal.u32	Qc45, Dv45, Du0[1]
	bgt		L(loop_6)

	C Rotated main loop tail.
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, QaN, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vmov.i32	DaN, #0			@ no more rp limbs

L(small_6):
	C We have 6 or 7 limbs left to process.  Handle them one at a time.
	vld1.32		{Du0[0]}, [r1]!
	vmov.i32	DaN, #0			@ zero accumulate
	add		r2, r2, #7		@ undo bias of n
	bne		L(single_entry)		@ flags still set for N==7
	vld1.32		{DaN[0]}, [r3]!		@ load last addend limb
	b		L(single_entry)

	C Rotated single multiply loop body, excluding accumulate.
	ALIGN(16)
L(single_loop):
	vld1.32		{Du0[0]}, [r1]!
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, QaN, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vmov.i32	DaN, #0			@ zero accumulate

L(single_entry):
	vmlal.u32	Qc01, Dv01, Du0[0]	@ compute u0 products
	subs		r2, r2, #1
	vmlal.u32	Qc23, Dv23, Du0[0]
	vmlal.u32	Qc45, Dv45, Du0[0]
	bne		L(single_loop)

	C Rotated single multiply loop tail.  Normally we'd have a copy of
	C the loop body here, followed by a branch to finish_6 to tidy up
	C all of the carries.  Except that finish_7 is exactly that, so
	C long as we ensure that Qc67 is zero.
	vmov.i32	Qc67, #0
	b		L(finish_7)

	SIZE(mulbc_addmul_6, `.-mulbc_addmul_6')

	TYPE(mulbc_addmul_8, function)
	ALIGN(16)

mulbc_addmul_8:
	vld1.32		{Dc0, Dc1, Dc2, Dc3}, [r0]	@ load 8 rp limbs
	vld1.32		{Dv01, Dv23, Dv45, Dv67}, [r3]	@ load all vp limbs
	cmp		r2, #8				@ more than 8 limbs?
	vld1.32		{Du0[0]}, [r1]!			@ load first up
	ldrgt		r3, [r0, #8*4]			@ maybe load 9th rp
	ldr		ip, [r1], #4			@ load 2nd up limb
	moveq		r3, #0				@ maybe zero 9th rp

	vmov.i32	QaN, #0			@ clear addend input
	vmovl.u32	Qc67, Dc3		@ extend rp limbs to carry-in
	vmovl.u32	Qc45, Dc2
	vmovl.u32	Qc23, Dc1
	vmovl.u32	Qc01, Dc0

	ALIGN(16)
	@ Main loop -- 8 x 1 multiplications
L(loop_8):
	vmlal.u32	Qc01, Dv01, Du0[0]	@ compute all products
	vmov.32		DaN[0], r3		@ move 9th rp into place
	vmlal.u32	Qc23, Dv23, Du0[0]
	cmp		r2, #9			@ more than 8 limbs left?
	vmlal.u32	Qc45, Dv45, Du0[0]
	ldrgt		r3, [r0, #36]		@ maybe load next rp limb
	vmlal.u32	Qc67, Dv67, Du0[0]
	movle		r3, #0			@ maybe zero next rp limb
	vmov.32		Du0[0], ip		@ move next up limb into place
	subs		r2, r2, #1
	vst1.u32	{Dc0[0]}, [r0]!		@ store lowest product limb
	ldrne		ip, [r1], #4		@ maybe load next up limb

	vext.32		Qc01, Qc01, Qc23, #1	@ rotate carry down one
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Qc67, Qc67, QaN, #1	@ rotate in 9th addend or 0.
	vpaddl.u32	Qc01, Qc01		@ fold carries
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	bne		L(loop_8)

	@ Done with all products.  Finish propagating carries.
	b		L(finish_8)

	SIZE(mulbc_addmul_8, `.-mulbc_addmul_8')

	TYPE(mulbc_addmul_14, function)
	ALIGN(16)

mulbc_addmul_14:
	vld1.32		{Dv01, Dv23, Dv45, Dv67}, [r3]!	@ load 8 vp limbs
	vld1.32		{Dv89, DvAB, DvCD}, [r3]	@ load 6 more vp
	add		r3, r0, #32
	vld1.32		{Dc0, Dc1, Dc2, Dc3}, [r0]	@ load 8 rp limbs
	vld1.32		{Dc4, Dc5, Dc6}, [r3]		@ load 6 more rp

	cmp		r2, #14				@ more than 14 limbs?
	vld1.32		{Du0[0]}, [r1]!			@ load first up
	ldrgt		r3, [r0, #14*4]			@ maybe load 15th rp
	ldr		ip, [r1], #4			@ load 2nd up limb
	moveq		r3, #0				@ maybe zero 15th rp

	vmov.i32	QaN, #0			@ clear addend input
	vmovl.u32	QcCD, Dc6		@ extend rp limbs to carry-in
	vmovl.u32	QcAB, Dc5
	vmovl.u32	Qc89, Dc4
	vmovl.u32	Qc67, Dc3
	vmovl.u32	Qc45, Dc2
	vmovl.u32	Qc23, Dc1
	vmovl.u32	Qc01, Dc0

	ALIGN(16)
	@ Main loop -- 14 x 1 multiplications
L(loop_14):
	vmlal.u32	Qc01, Dv01, Du0[0]	@ compute all products
	vmov.32		DaN[0], r3		@ move 15th rp into place
	vmlal.u32	Qc23, Dv23, Du0[0]
	cmp		r2, #14+1		@ more than 14 limbs left?
	vmlal.u32	Qc45, Dv45, Du0[0]
	ldrgt		r3, [r0, #(14+1)*4]	@ maybe load next rp limb
	vmlal.u32	Qc67, Dv67, Du0[0]
	movle		r3, #0			@ maybe zero next rp limb
	vmlal.u32	Qc89, Dv89, Du0[0]
	subs		r2, r2, #1
	vmlal.u32	QcAB, DvAB, Du0[0]
	vmlal.u32	QcCD, DvCD, Du0[0]
	vmov.32		Du0[0], ip		@ move next up limb into place
	vst1.u32	{Dc0[0]}, [r0]!		@ store lowest product limb
	ldrne		ip, [r1], #4		@ maybe load next up limb

	vext.32		Qc01, Qc01, Qc23, #1	@ rotate carry down one
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Qc67, Qc67, Qc89, #1
	vext.32		Qc89, Qc89, QcAB, #1
	vext.32		QcAB, QcAB, QcCD, #1
	vext.32		QcCD, QcCD, QaN, #1	@ rotate in next addend or 0.
	vpaddl.u32	Qc01, Qc01		@ fold carries
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	vpaddl.u32	Qc89, Qc89
	vpaddl.u32	QcAB, QcAB
	vpaddl.u32	QcCD, QcCD
	bne		L(loop_14)

	@ Done with all products.  Finish propagating carries.
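	@ Each finish_N block drains one result limb, folds the carries
	@ into the remaining accumulators, and falls through to finish_N-1.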
L(finish_14):
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Qc67, Qc67, Qc89, #1
	vext.32		Qc89, Qc89, QcAB, #1
	vext.32		QcAB, QcAB, QcCD, #1
	vext.32		DcC, DcC, DcD, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	vpaddl.u32	Qc89, Qc89
	vpaddl.u32	QcAB, QcAB
	vpaddl.u32	DcC, DcC

L(finish_13):
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Qc67, Qc67, Qc89, #1
	vext.32		Qc89, Qc89, QcAB, #1
	vext.32		QcAB, QcAB, QcCD, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	vpaddl.u32	Qc89, Qc89
	vpaddl.u32	QcAB, QcAB

L(finish_12):
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Qc67, Qc67, Qc89, #1
	vext.32		Qc89, Qc89, QcAB, #1
	vext.32		DcA, DcA, DcB, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	vpaddl.u32	Qc89, Qc89
	vpaddl.u32	DcA, DcA

L(finish_11):
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Qc67, Qc67, Qc89, #1
	vext.32		Qc89, Qc89, QcAB, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	vpaddl.u32	Qc89, Qc89

L(finish_10):
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Qc67, Qc67, Qc89, #1
	vext.32		Dc8, Dc8, Dc9, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	vpaddl.u32	Dc8, Dc8

L(finish_9):
	vst1.u32	{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Qc67, Qc67, Qc89, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67

L(finish_8):
	vst1.32		{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vext.32		Dc6, Dc6, Dc7, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Dc6, Dc6

L(finish_7):
	vst1.32		{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Qc45, Qc45, Qc67, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45

L(finish_6):
	vst1.32		{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vext.32		Dc4, Dc4, Dc5, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Dc4, Dc4

L(finish_5):
	vst1.32		{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Qc23, Qc23, Qc45, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23

L(finish_4):
	vst1.32		{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vext.32		Dc2, Dc2, Dc3, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Dc2, Dc2

L(finish_3):
	vst1.32		{Dc0[0]}, [r0]!
	vext.32		Qc01, Qc01, Qc23, #1
	vpaddl.u32	Qc01, Qc01

L(finish_2):
	vst1.32		{Dc0[0]}, [r0]!
	vext.32		Dc0, Dc0, Dc1, #1
	vpaddl.u32	Dc0, Dc0

L(finish_1):
	vst1.32		{Dc0[0]}, [r0]!
	bx		lr

	SIZE(mulbc_addmul_14, `.-mulbc_addmul_14')
	.cfi_endproc

