[Gmp-commit] /var/hg/gmp: Further x86_64 gcd_1 improvements.

mercurial at gmplib.org mercurial at gmplib.org
Sat Mar 17 15:19:14 CET 2012


details:   /var/hg/gmp/rev/b319b5e7816c
changeset: 14761:b319b5e7816c
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Mar 17 15:19:11 2012 +0100
description:
Further x86_64 gcd_1 improvements.

diffstat:

 ChangeLog                  |    5 +
 mpn/x86_64/core2/gcd_1.asm |   65 +++++++++++++++---------
 mpn/x86_64/gcd_1.asm       |  113 +++++++++++++++++++++++---------------------
 3 files changed, 105 insertions(+), 78 deletions(-)

diffs (294 lines):

diff -r 5027cd1d7fbe -r b319b5e7816c ChangeLog
--- a/ChangeLog	Thu Mar 15 20:04:10 2012 +0100
+++ b/ChangeLog	Sat Mar 17 15:19:11 2012 +0100
@@ -1,3 +1,8 @@
+2012-03-17  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86_64/core2/gcd_1.asm: Conditionally suppress reduction calls.
+	* mpn/x86_64/gcd_1.asm: Rewrite.
+
 2012-03-15  Torbjorn Granlund  <tege at gmplib.org>
 
 	* mpn/generic/gcd_1.c: Parameterise zerotab code.
diff -r 5027cd1d7fbe -r b319b5e7816c mpn/x86_64/core2/gcd_1.asm
--- a/mpn/x86_64/core2/gcd_1.asm	Thu Mar 15 20:04:10 2012 +0100
+++ b/mpn/x86_64/core2/gcd_1.asm	Sat Mar 17 15:19:11 2012 +0100
@@ -25,21 +25,25 @@
 
 
 C	     cycles/bit (approx)
-C AMD K8,K9	 9.79
-C AMD K10	 5.34
-C AMD bd1	 ?
-C AMD bobcat	11.3
-C Intel P4	20.8
-C Intel core2	 5.09
-C Intel NHM	 6.27
-C Intel SBR	 5.30
-C Intel atom	19.6
-C VIA nano	 6.75
-C Numbers measured with: speed -c -s64 mpn_gcd_1
+C AMD K8,K9	 8.50
+C AMD K10	 4.30
+C AMD bd1	 5.00
+C AMD bobcat	10.0
+C Intel P4	18.6
+C Intel core2	 3.83
+C Intel NHM	 5.17
+C Intel SBR	 4.69
+C Intel atom	17.0
+C VIA nano	 5.44
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
 
 C TODO
 C  * Optimise inner-loop for specific CPUs.
 
+C Threshold of when to call bmod when U is one limb.  Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`BMOD_THRES_LOG2', 6)
+
 C INPUT PARAMETERS
 define(`up',    `%rdi')
 define(`n',     `%rsi')
@@ -62,21 +66,35 @@
 	ALIGN(16)
 PROLOGUE(mpn_gcd_1)
 	DOS64_ENTRY(3)
-	mov	(%rdi), %r8		C src low limb
-	or	%rdx, %r8		C x | y
-	bsf	%r8, %r8		C common twos
+	mov	(up), %rax	C U low limb
+	or	v0, %rax
+	bsf	%rax, %rax	C min(ctz(u0),ctz(v0))
 
-	bsf	%rdx, %rcx
-	shr	R8(%rcx), %rdx
+	bsf	v0, %rcx
+	shr	R8(%rcx), v0
 
-	push	%r8			C preserve common twos over call
-	push	%rdx			C preserve v0 argument over call
-	sub	$8, %rsp		C maintain ABI required rsp alignment
+	push	%rax		C preserve common twos over call
+	push	v0		C preserve v0 argument over call
+	sub	$8, %rsp	C maintain ABI required rsp alignment
+
+	cmp	$1, n
+	jnz	L(reduce_nby1)
+
+C Both U and V are single limbs, reduce with bmod if there are many more bits
+C in u0 than in v0.
+	mov	(up), %r8
+	mov	%r8, %rax
+	shr	$BMOD_THRES_LOG2, %r8
+	cmp	%r8, v0
+	ja	L(reduced)
+	jmp	L(bmod)
+
+L(reduce_nby1):
 
 IFDOS(`	mov	%rdx, %r8	')
 IFDOS(`	mov	%rsi, %rdx	')
 IFDOS(`	mov	%rdi, %rcx	')
-	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, %rsi
+	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
 	jl	L(bmod)
 	CALL(	mpn_mod_1)
 	jmp	L(reduced)
@@ -86,15 +104,12 @@
 
 	add	$8, %rsp
 	pop	%rdx
-	pop	%r8
 
 	bsf	%rax, %rcx
 	test	%rax, %rax
 	jnz	L(mid)
 	jmp	L(end)
 
-C FIXME: 1st sub to rdx would shorten path
-
 	ALIGN(16)		C               K10   BD    C2    NHM   SBR
 L(top):	cmovc	%r10, %rax	C if x-y < 0    0,3   0,3   0,6   0,5   0,5
 	cmovc	%r9, %rdx	C use x,y-x     0,3   0,3   2,8   1,7   1,7
@@ -106,8 +121,8 @@
 	sub	%rdx, %rax	C               2     2     4     3     4
 	jnz	L(top)		C
 
-L(end):	mov	%rdx, %rax
-	mov	%r8, %rcx
+L(end):	pop	%rcx
+	mov	%rdx, %rax
 	shl	R8(%rcx), %rax
 	DOS64_EXIT()
 	ret
diff -r 5027cd1d7fbe -r b319b5e7816c mpn/x86_64/gcd_1.asm
--- a/mpn/x86_64/gcd_1.asm	Thu Mar 15 20:04:10 2012 +0100
+++ b/mpn/x86_64/gcd_1.asm	Sat Mar 17 15:19:11 2012 +0100
@@ -25,23 +25,22 @@
 
 
 C	     cycles/bit (approx)
-C AMD K8,K9	 6.75
-C AMD K10	 6.75
-C AMD bd1	 7.75
-C AMD bobcat	 7.5
-C Intel P4	18
-C Intel core2	 9
-C Intel NHM	 9
-C Intel SBR	10
-C Intel atom	10.5
-C VIA nano	 8.5
-
-C Numbers measured with: speed -CD -s1-64 mpn_gcd_1
+C AMD K8,K9	 5.21                 (4.95)
+C AMD K10	 5.15                 (5.00)
+C AMD bd1	 5.42                 (5.14)
+C AMD bobcat	 6.71                 (6.56)
+C Intel P4	13.5                 (12.75)
+C Intel core2	 6.20                 (6.16)
+C Intel NHM	 6.49                 (6.25)
+C Intel SBR	 7.75                 (7.57)
+C Intel atom	 8.77                 (8.54)
+C VIA nano	 6.60                 (6.20)
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
 
 
 C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
 
-deflit(MAXSHIFT, 6)
+deflit(MAXSHIFT, 7)
 deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
 
 DEF_OBJECT(ctz_table,64)
@@ -51,6 +50,9 @@
 ')
 END_OBJECT(ctz_table)
 
+C Threshold of when to call bmod when U is one limb.  Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`BMOD_THRES_LOG2', 8)
 
 C INPUT PARAMETERS
 define(`up',    `%rdi')
@@ -65,31 +67,45 @@
 	ALIGN(16)
 PROLOGUE(mpn_gcd_1)
 	DOS64_ENTRY(3)
-	mov	(%rdi), %r8		C src low limb
-	or	%rdx, %r8		C x | y
+	mov	(up), %rax		C src low limb
+	or	v0, %rax		C x | y
 	mov	$-1, R32(%rcx)
 
 L(twos):
 	inc	R32(%rcx)
-	shr	%r8
+	shr	%rax
 	jnc	L(twos)
 
-	shr	R8(%rcx), %rdx
-	mov	R32(%rcx), R32(%r8)	C common twos
+	shr	R8(%rcx), v0
+	mov	R32(%rcx), R32(%rax)	C common twos
 
 L(divide_strip_y):
-	shr	%rdx
+	shr	v0
 	jnc	L(divide_strip_y)
-	adc	%rdx, %rdx
+	adc	v0, v0
 
-	push	%r8
-	push	%rdx
+	push	%rax
+	push	v0
 	sub	$8, %rsp		C maintain ABI required rsp alignment
 
+	cmp	$1, n
+	jnz	L(reduce_nby1)
+
+C Both U and V are single limbs, reduce with bmod if there are many more bits
+C in u0 than in v0.
+	mov	(up), %r8
+	mov	%r8, %rax
+	shr	$BMOD_THRES_LOG2, %r8
+	cmp	%r8, v0
+	ja	L(reduced)
+	jmp	L(bmod)
+
+L(reduce_nby1):
+
 IFDOS(`	mov	%rdx, %r8	')
 IFDOS(`	mov	%rsi, %rdx	')
 IFDOS(`	mov	%rdi, %rcx	')
-	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, %rsi
+	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
 	jl	L(bmod)
 	CALL(	mpn_mod_1)
 	jmp	L(reduced)
@@ -99,43 +115,34 @@
 
 	add	$8, %rsp
 	pop	%rdx
-	pop	%r8
 
+	LEA(	ctz_table, %rsi)
 	test	%rax, %rax
+	mov	%rax, %rcx
+	jnz	L(mid)
+	jmp	L(end)
 
-	mov	%rax, %rcx
-	LEA(	ctz_table, %r9)
-	jnz	L(strip_x_top)
+	ALIGN(16)			C               K8    BC    P4    NHM   SBR
+L(top):	cmovc	%rcx, %rax		C if x-y < 0	0
+	cmovc	%rdi, %rdx		C use x,y-x	0
+L(mid):	and	$MASK, R32(%rcx)	C		0
+	movzbl	(%rsi,%rcx), R32(%rcx)	C		1
+	jz	L(shift_alot)		C		1
+	shr	R8(%rcx), %rax		C		3
+	mov	%rax, %rdi		C		4
+	mov	%rdx, %rcx		C		3
+	sub	%rax, %rcx		C		4
+	sub	%rdx, %rax		C		4
+	jnz	L(top)			C		5
 
+L(end):	pop	%rcx
 	mov	%rdx, %rax
-	jmp	L(done)
-
-	ALIGN(16)
-L(top):
-	cmovc	%r10, %rcx		C if x-y gave carry, use x,y-x	0
-	cmovc	%rax, %rdx		C				0
-
-L(strip_x_top):
-	mov	%rcx, %rax		C				1
-	and	$MASK, R32(%rcx)	C				1
-
-	movzbl	(%r9,%rcx), R32(%rcx)	C use this insn form to placate Oracle
-
-	shr	R8(%rcx), %rax		C				4
-	cmp	$MAXSHIFT, R8(%rcx)	C				4
-
-	mov	%rax, %rcx		C				5
-	mov	%rdx, %r10		C				5
-	je	L(strip_x_top)		C				5
-
-	sub	%rax, %r10		C				6
-	sub	%rdx, %rcx		C				6
-	jnz	L(top)			C				6
-
-L(done):
-	mov	%r8, %rcx
 	shl	R8(%rcx), %rax
 	DOS64_EXIT()
 	ret
 
+L(shift_alot):
+	shr	$MAXSHIFT, %rax
+	mov	%rax, %rcx
+	jmp	L(mid)
 EPILOGUE()


More information about the gmp-commit mailing list