[Gmp-commit] /home/hgfiles/gmp: 4 new changesets
mercurial at gmplib.org
Wed Mar 17 09:36:47 CET 2010

details: /home/hgfiles/gmp/rev/7842ac165fde
changeset: 13497:7842ac165fde
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Mar 15 14:10:52 2010 +0100
description:
Update comments.

details: /home/hgfiles/gmp/rev/e7c49c30ec52
changeset: 13498:e7c49c30ec52
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Mar 16 18:28:39 2010 +0100
description:
Add FLAG_R_OPTIONAL for many binops.

details: /home/hgfiles/gmp/rev/cfd215ba8932
changeset: 13499:cfd215ba8932
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Mar 16 18:33:41 2010 +0100
description:
Set x bit.

details: /home/hgfiles/gmp/rev/ad57ab3094a5
changeset: 13500:ad57ab3094a5
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Mar 16 23:38:05 2010 +0100
description:
Use mpn_invert_limb instead of div insn.
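
Background on this last change: the div-based sequence being removed and
the mpn_invert_limb call replacing it both produce the reciprocal
floor((B^2 - 1)/d) - B of a normalized divisor d (top bit set), where
B = 2^64; the point of the change is computing that value without the
slow hardware divide.  A minimal C reference sketch, assuming GCC/Clang's
unsigned __int128 and illustrative only (this is not GMP's actual
mpn_invert_limb, which computes the same value without any division
instruction):

    #include <stdint.h>

    uint64_t
    invert_limb_ref (uint64_t d)        /* requires d >= 2^63 */
    {
      /* n = (B-1-d)*B + (B-1) = B^2 - 1 - d*B */
      unsigned __int128 n = ((unsigned __int128) ~d << 64) | ~(uint64_t) 0;
      return (uint64_t) (n / d);        /* = floor((B^2-1)/d) - B */
    }

In the old code, `mov $-1, %rax' plus `not %rdx' (with d copied to %rdx)
set up exactly this dividend for the `div d' instruction.
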
diffstat:
 ChangeLog                     |   7 ++++
 mpn/alpha/ev6/mod_1_4.asm     |   5 +-
 mpn/x86_64/core2/divrem_1.asm |  68 ++++++++++++++++++++++++------------------
 mpn/x86_64/divrem_1.asm       |  61 ++++++++++++++++++++++----------------
 tune/speed.c                  |  16 +++++-----
 5 files changed, 91 insertions(+), 66 deletions(-)

diffs (truncated from 315 to 300 lines):
diff -r 66b94f02bf84 -r ad57ab3094a5 ChangeLog
--- a/ChangeLog Mon Mar 15 13:16:46 2010 +0100
+++ b/ChangeLog Tue Mar 16 23:38:05 2010 +0100
@@ -1,3 +1,10 @@
+2010-03-16 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/divrem_1.asm: Use mpn_invert_limb instead of div insn.
+ * mpn/x86_64/core2/divrem_1.asm: Likewise.
+
+ * tune/speed.c (routine): Add FLAG_R_OPTIONAL for many binops.
+
2010-03-15 Torbjorn Granlund <tege at gmplib.org>
* mpn/alpha/ev6/mod_1_4.asm (mpn_mod_1s_4p_cps): Rewrite.
diff -r 66b94f02bf84 -r ad57ab3094a5 mpn/alpha/ev6/mod_1_4.asm
--- a/mpn/alpha/ev6/mod_1_4.asm Mon Mar 15 13:16:46 2010 +0100
+++ b/mpn/alpha/ev6/mod_1_4.asm Tue Mar 16 23:38:05 2010 +0100
@@ -26,10 +26,9 @@
C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
C * Optimise feed-in code, starting the sw pipeline in switch code.
C * Shorten software pipeline. The mul instructions are scheduled too far
-C from their users.
-C * Use fewer registers. Use r28 and r27.
+C from their users. Fixing this will allow us to use fewer registers.
C * If we cannot reduce register usage, write perhaps small-n basecase.
-C * Does it work for PIC?
+C * Does this work for PIC?
C cycles/limb
C EV4: ?
diff -r 66b94f02bf84 -r ad57ab3094a5 mpn/x86_64/core2/divrem_1.asm
--- a/mpn/x86_64/core2/divrem_1.asm Mon Mar 15 13:16:46 2010 +0100
+++ b/mpn/x86_64/core2/divrem_1.asm Tue Mar 16 23:38:05 2010 +0100
@@ -20,24 +20,21 @@
include(`../config.m4')
+
C norm unorm frac
-C AMD K8,K9 14 14 12
-C AMD K10 14 14 12
-C Intel P4 ? ? ?
+C AMD K8,K9 13 14 12 The norm number assumes special code
+C AMD K10 13 14 12 The norm number assumes special code
+C Intel P4 47 45 43
C Intel core2 23 23 19.5
-C Intel corei 19 19 18
-C Intel atom ? ? ?
-C VIA nano ? ? ?
-
-C TODO
-C * Compute the inverse without relying on the slow div instruction, instead
-C call invert_limb.
-C * Tune prologue.
+C Intel corei 19 19 18 The norm number assumes !special code
+C Intel atom 43 51 36 The norm number assumes special code
+C VIA nano 25 43 24
C The code for unnormalized divisors works also for normalized divisors, but
C for some reason it runs really slowly (on K8) for that case. Intel Atom runs
C the code for unnormalized poorly due to shld slowness.
-define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',0)
+ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',,
+`define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',0)')
C mp_limb_t
C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
@@ -127,13 +124,18 @@
mov %rax, (qp)
lea -8(qp), qp
L(8):
- mov d, %rdx
- mov $-1, %rax
- not %rdx
- div d C FREE rax rdx rcx r9 r10 r11
+ push %rdi
+ push %rsi
+ push %r8
+ mov d, %rdi
+ CALL( mpn_invert_limb)
+ pop %r8
+ pop %rsi
+ pop %rdi
+
mov %rax, dinv
mov %rbp, %rax
- lea (%rbp), %rax C
+ inc %rbp
jmp L(nent)
ALIGN(16)
@@ -181,20 +183,28 @@
L(44):
bsr d, %rcx
not R32(%rcx)
- sal %cl, d
- sal %cl, %rbp
- mov d, %rdx
- mov $-1, %rax
- not %rdx
- div d C FREE rax rdx r9 r10 r11
- test un, un
+ sal R8(%rcx), d
+ sal R8(%rcx), %rbp
+
+ push %rcx
+ push %rdi
+ push %rsi
+ push %r8
+ mov d, %rdi
+ CALL( mpn_invert_limb)
+ pop %r8
+ pop %rsi
+ pop %rdi
+ pop %rcx
+
mov %rax, dinv
mov %rbp, %rax
+ test un, un
je L(87)
L(uent):
mov -8(up,un,8), %rbp
- shr %cl, %rax
- shld %cl, %rbp, %rax
+ shr R8(%rcx), %rax
+ shld R8(%rcx), %rbp, %rax
sub $2, un
js L(ulast)
@@ -203,7 +213,7 @@
lea 1(%rax), %r11
mul dinv
mov (up,un,8), %r10
- shld %cl, %r10, %rbp
+ shld R8(%rcx), %r10, %rbp
add %rbp, %rax
adc %r11, %rdx
mov %rax, %r11
@@ -225,7 +235,7 @@
jns L(uloop)
L(ulast):
lea 1(%rax), %r11
- sal %cl, %rbp
+ sal R8(%rcx), %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
@@ -274,7 +284,7 @@
dec fn C
jns L(floop) C
- shr %cl, %rax
+ shr R8(%rcx), %rax
L(ret): pop %rbx
pop %rbp
pop %r12
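
Two notes on the rewritten file above.  R8() and R32() are GMP m4 macros
giving the 8-bit and 32-bit names of a 64-bit register, so
`shr R8(%rcx), %rax' assembles to the same instruction as the old
`shr %cl, %rax' while keeping the register choice symbolic.  And the
ifdef() wrapper around SPECIAL_CODE_FOR_NORMALIZED_DIVISOR makes the 0 a
default rather than a hard-coded value: the define now happens only when
the symbol is not already set, so a build can override it from outside
(for instance with an m4 -D definition) instead of by editing the file.
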
diff -r 66b94f02bf84 -r ad57ab3094a5 mpn/x86_64/divrem_1.asm
--- a/mpn/x86_64/divrem_1.asm Mon Mar 15 13:16:46 2010 +0100
+++ b/mpn/x86_64/divrem_1.asm Tue Mar 16 23:38:05 2010 +0100
@@ -24,21 +24,17 @@
C norm unorm frac
C AMD K8,K9 13 13 12
C AMD K10 13 13 12
-C Intel P4 48 48 43
+C Intel P4 47 47 43
C Intel core2 24.62 24.62 19.5
C Intel corei 20 20 18
-C Intel atom 43 52 36
-C VIA nano ? ? ?
-
-C TODO
-C * Compute the inverse without relying on the slow div instruction, instead
-C call invert_limb.
-C * Tune prologue.
+C Intel atom 43 52 36 The norm number assumes special code
+C VIA nano 25 46 24 The norm number assumes special code
C The code for unnormalized divisors works also for normalized divisors, but
C for some reason it runs really slowly (on K8) for that case. Intel Atom runs
C the code for unnormalized poorly due to shld slowness.
-define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',0)
+ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',,
+`define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',0)')
C mp_limb_t
C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
@@ -128,13 +124,18 @@
mov %rax, (qp)
lea -8(qp), qp
L(8):
- mov d, %rdx
- mov $-1, %rax
- not %rdx
- div d C FREE rax rdx rcx r9 r10 r11
+ push %rdi
+ push %rsi
+ push %r8
+ mov d, %rdi
+ CALL( mpn_invert_limb)
+ pop %r8
+ pop %rsi
+ pop %rdi
+
mov %rax, dinv
mov %rbp, %rax
- lea (%rbp), %rax C
+ inc %rbp
jmp L(nent)
ALIGN(16)
@@ -182,20 +183,28 @@
L(44):
bsr d, %rcx
not R32(%rcx)
- sal %cl, d
- sal %cl, %rbp
- mov d, %rdx
- mov $-1, %rax
- not %rdx
- div d C FREE rax rdx r9 r10 r11
- test un, un
+ sal R8(%rcx), d
+ sal R8(%rcx), %rbp
+
+ push %rcx
+ push %rdi
+ push %rsi
+ push %r8
+ mov d, %rdi
+ CALL( mpn_invert_limb)
+ pop %r8
+ pop %rsi
+ pop %rdi
+ pop %rcx
+
mov %rax, dinv
mov %rbp, %rax
+ test un, un
je L(87)
L(uent):
mov -8(up,un,8), %rbp
- shr %cl, %rax
- shld %cl, %rbp, %rax
+ shr R8(%rcx), %rax
+ shld R8(%rcx), %rbp, %rax
sub $2, un
lea 1(%rax), %r11
js L(ulast)
@@ -203,7 +212,7 @@
ALIGN(16)
L(uloop):
mov (up,un,8), %r10
- shld %cl, %r10, %rbp
+ shld R8(%rcx), %r10, %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
@@ -226,7 +235,7 @@
lea 1(%rax), %r11
jns L(uloop)
L(ulast):
- sal %cl, %rbp
+ sal R8(%rcx), %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
@@ -275,7 +284,7 @@
dec fn C
jns L(floop) C
- shr %cl, %rax
+ shr R8(%rcx), %rax
L(ret): pop %rbx
pop %rbp
pop %r12
diff -r 66b94f02bf84 -r ad57ab3094a5 tune/speed.c
--- a/tune/speed.c Mon Mar 15 13:16:46 2010 +0100
+++ b/tune/speed.c Tue Mar 16 23:38:05 2010 +0100
@@ -380,28 +380,28 @@
{ "mpn_copyd", speed_mpn_copyd },
#endif
#if HAVE_NATIVE_mpn_addlsh1_n
- { "mpn_addlsh1_n", speed_mpn_addlsh1_n },
+ { "mpn_addlsh1_n", speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh1_n
- { "mpn_sublsh1_n", speed_mpn_sublsh1_n },
+ { "mpn_sublsh1_n", speed_mpn_sublsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_rsblsh1_n
- { "mpn_rsblsh1_n", speed_mpn_rsblsh1_n },
+ { "mpn_rsblsh1_n", speed_mpn_rsblsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n
- { "mpn_addlsh2_n", speed_mpn_addlsh2_n },
+ { "mpn_addlsh2_n", speed_mpn_addlsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh2_n
- { "mpn_sublsh2_n", speed_mpn_sublsh2_n },
+ { "mpn_sublsh2_n", speed_mpn_sublsh2_n, FLAG_R_OPTIONAL },