[Gmp-commit] /home/hgfiles/gmp: 4 new changesets
mercurial at gmplib.org
Wed Mar 17 09:36:47 CET 2010

details: /home/hgfiles/gmp/rev/7842ac165fde
changeset: 13497:7842ac165fde
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Mar 15 14:10:52 2010 +0100
description:
Update comments.

details: /home/hgfiles/gmp/rev/e7c49c30ec52
changeset: 13498:e7c49c30ec52
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Mar 16 18:28:39 2010 +0100
description:
Add FLAG_R_OPTIONAL for many binops.

details: /home/hgfiles/gmp/rev/cfd215ba8932
changeset: 13499:cfd215ba8932
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Mar 16 18:33:41 2010 +0100
description:
Set x bit.

details: /home/hgfiles/gmp/rev/ad57ab3094a5
changeset: 13500:ad57ab3094a5
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Mar 16 23:38:05 2010 +0100
description:
Use mpn_invert_limb instead of div insn.
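
Background on this last change: the div-based sequence being removed and
the mpn_invert_limb call replacing it both produce the reciprocal
floor((B^2 - 1)/d) - B of a normalized divisor d (top bit set), where
B = 2^64; the point of the change is computing that value without the
slow hardware divide.  A minimal C reference sketch, assuming GCC/Clang's
unsigned __int128 and illustrative only (this is not GMP's actual
mpn_invert_limb, which computes the same value without any division
instruction):

    #include <stdint.h>

    uint64_t
    invert_limb_ref (uint64_t d)        /* requires d >= 2^63 */
    {
      /* n = (B-1-d)*B + (B-1) = B^2 - 1 - d*B */
      unsigned __int128 n = ((unsigned __int128) ~d << 64) | ~(uint64_t) 0;
      return (uint64_t) (n / d);        /* = floor((B^2-1)/d) - B */
    }

In the old code, `mov $-1, %rax' plus `not %rdx' (with d copied to %rdx)
set up exactly this dividend for the `div d' instruction.
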
diffstat:
 ChangeLog                     |   7 ++++
 mpn/alpha/ev6/mod_1_4.asm     |   5 +-
 mpn/x86_64/core2/divrem_1.asm |  68 ++++++++++++++++++++++++------------------
 mpn/x86_64/divrem_1.asm       |  61 ++++++++++++++++++++++----------------
 tune/speed.c                  |  16 +++++-----
 5 files changed, 91 insertions(+), 66 deletions(-)

diffs (truncated from 315 to 300 lines):
diff -r 66b94f02bf84 -r ad57ab3094a5 ChangeLog
--- a/ChangeLog Mon Mar 15 13:16:46 2010 +0100
+++ b/ChangeLog Tue Mar 16 23:38:05 2010 +0100
@@ -1,3 +1,10 @@
+2010-03-16 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/divrem_1.asm: Use mpn_invert_limb instead of div insn.
+ * mpn/x86_64/core2/divrem_1.asm: Likewise.
+
+ * tune/speed.c (routine): Add FLAG_R_OPTIONAL for many binops.
+
2010-03-15 Torbjorn Granlund <tege at gmplib.org>
* mpn/alpha/ev6/mod_1_4.asm (mpn_mod_1s_4p_cps): Rewrite.
diff -r 66b94f02bf84 -r ad57ab3094a5 mpn/alpha/ev6/mod_1_4.asm
--- a/mpn/alpha/ev6/mod_1_4.asm Mon Mar 15 13:16:46 2010 +0100
+++ b/mpn/alpha/ev6/mod_1_4.asm Tue Mar 16 23:38:05 2010 +0100
@@ -26,10 +26,9 @@
C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
C * Optimise feed-in code, starting the sw pipeline in switch code.
C * Shorten software pipeline. The mul instructions are scheduled too far
-C from their users.
-C * Use fewer registers. Use r28 and r27.
+C from their users. Fixing this will allow us to use fewer registers.
C * If we cannot reduce register usage, write perhaps small-n basecase.
-C * Does it work for PIC?
+C * Does this work for PIC?
C cycles/limb
C EV4: ?
diff -r 66b94f02bf84 -r ad57ab3094a5 mpn/x86_64/core2/divrem_1.asm
--- a/mpn/x86_64/core2/divrem_1.asm Mon Mar 15 13:16:46 2010 +0100
+++ b/mpn/x86_64/core2/divrem_1.asm Tue Mar 16 23:38:05 2010 +0100
@@ -20,24 +20,21 @@
include(`../config.m4')
+
C norm unorm frac
-C AMD K8,K9 14 14 12
-C AMD K10 14 14 12
-C Intel P4 ? ? ?
+C AMD K8,K9 13 14 12 The norm number assumes special code
+C AMD K10 13 14 12 The norm number assumes special code
+C Intel P4 47 45 43
C Intel core2 23 23 19.5
-C Intel corei 19 19 18
-C Intel atom ? ? ?
-C VIA nano ? ? ?
-
-C TODO
-C * Compute the inverse without relying on the slow div instruction, instead
-C call invert_limb.
-C * Tune prologue.
+C Intel corei 19 19 18 The norm number assumes !special code
+C Intel atom 43 51 36 The norm number assumes special code
+C VIA nano 25 43 24
C The code for unnormalized divisors works also for normalized divisors, but
C for some reason it runs really slowly (on K8) for that case. Intel Atom runs
C the code for unnormalized poorly due to shld slowness.
-define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',0)
+ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',,
+`define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',0)')
C mp_limb_t
C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
@@ -127,13 +124,18 @@
mov %rax, (qp)
lea -8(qp), qp
L(8):
- mov d, %rdx
- mov $-1, %rax
- not %rdx
- div d C FREE rax rdx rcx r9 r10 r11
+ push %rdi
+ push %rsi
+ push %r8
+ mov d, %rdi
+ CALL( mpn_invert_limb)
+ pop %r8
+ pop %rsi
+ pop %rdi
+
mov %rax, dinv
mov %rbp, %rax
- lea (%rbp), %rax C
+ inc %rbp
jmp L(nent)
ALIGN(16)
@@ -181,20 +183,28 @@
L(44):
bsr d, %rcx
not R32(%rcx)
- sal %cl, d
- sal %cl, %rbp
- mov d, %rdx
- mov $-1, %rax
- not %rdx
- div d C FREE rax rdx r9 r10 r11
- test un, un
+ sal R8(%rcx), d
+ sal R8(%rcx), %rbp
+
+ push %rcx
+ push %rdi
+ push %rsi
+ push %r8
+ mov d, %rdi
+ CALL( mpn_invert_limb)
+ pop %r8
+ pop %rsi
+ pop %rdi
+ pop %rcx
+
mov %rax, dinv
mov %rbp, %rax
+ test un, un
je L(87)
L(uent):
mov -8(up,un,8), %rbp
- shr %cl, %rax
- shld %cl, %rbp, %rax
+ shr R8(%rcx), %rax
+ shld R8(%rcx), %rbp, %rax
sub $2, un
js L(ulast)
@@ -203,7 +213,7 @@
lea 1(%rax), %r11
mul dinv
mov (up,un,8), %r10
- shld %cl, %r10, %rbp
+ shld R8(%rcx), %r10, %rbp
add %rbp, %rax
adc %r11, %rdx
mov %rax, %r11
@@ -225,7 +235,7 @@
jns L(uloop)
L(ulast):
lea 1(%rax), %r11
- sal %cl, %rbp
+ sal R8(%rcx), %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
@@ -274,7 +284,7 @@
dec fn C
jns L(floop) C
- shr %cl, %rax
+ shr R8(%rcx), %rax
L(ret): pop %rbx
pop %rbp
pop %r12
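
Two notes on the rewritten file above.  R8() and R32() are GMP m4 macros
giving the 8-bit and 32-bit names of a 64-bit register, so
`shr R8(%rcx), %rax' assembles to the same instruction as the old
`shr %cl, %rax' while keeping the register choice symbolic.  And the
ifdef() wrapper around SPECIAL_CODE_FOR_NORMALIZED_DIVISOR makes the 0 a
default rather than a hard-coded value: the define now happens only when
the symbol is not already set, so a build can override it from outside
(for instance with an m4 -D definition) instead of by editing the file.
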
diff -r 66b94f02bf84 -r ad57ab3094a5 mpn/x86_64/divrem_1.asm
--- a/mpn/x86_64/divrem_1.asm Mon Mar 15 13:16:46 2010 +0100
+++ b/mpn/x86_64/divrem_1.asm Tue Mar 16 23:38:05 2010 +0100
@@ -24,21 +24,17 @@
C norm unorm frac
C AMD K8,K9 13 13 12
C AMD K10 13 13 12
-C Intel P4 48 48 43
+C Intel P4 47 47 43
C Intel core2 24.62 24.62 19.5
C Intel corei 20 20 18
-C Intel atom 43 52 36
-C VIA nano ? ? ?
-
-C TODO
-C * Compute the inverse without relying on the slow div instruction, instead
-C call invert_limb.
-C * Tune prologue.
+C Intel atom 43 52 36 The norm number assumes special code
+C VIA nano 25 46 24 The norm number assumes special code
C The code for unnormalized divisors works also for normalized divisors, but
C for some reason it runs really slowly (on K8) for that case. Intel Atom runs
C the code for unnormalized poorly due to shld slowness.
-define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',0)
+ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',,
+`define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',0)')
C mp_limb_t
C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
@@ -128,13 +124,18 @@
mov %rax, (qp)
lea -8(qp), qp
L(8):
- mov d, %rdx
- mov $-1, %rax
- not %rdx
- div d C FREE rax rdx rcx r9 r10 r11
+ push %rdi
+ push %rsi
+ push %r8
+ mov d, %rdi
+ CALL( mpn_invert_limb)
+ pop %r8
+ pop %rsi
+ pop %rdi
+
mov %rax, dinv
mov %rbp, %rax
- lea (%rbp), %rax C
+ inc %rbp
jmp L(nent)
ALIGN(16)
@@ -182,20 +183,28 @@
L(44):
bsr d, %rcx
not R32(%rcx)
- sal %cl, d
- sal %cl, %rbp
- mov d, %rdx
- mov $-1, %rax
- not %rdx
- div d C FREE rax rdx r9 r10 r11
- test un, un
+ sal R8(%rcx), d
+ sal R8(%rcx), %rbp
+
+ push %rcx
+ push %rdi
+ push %rsi
+ push %r8
+ mov d, %rdi
+ CALL( mpn_invert_limb)
+ pop %r8
+ pop %rsi
+ pop %rdi
+ pop %rcx
+
mov %rax, dinv
mov %rbp, %rax
+ test un, un
je L(87)
L(uent):
mov -8(up,un,8), %rbp
- shr %cl, %rax
- shld %cl, %rbp, %rax
+ shr R8(%rcx), %rax
+ shld R8(%rcx), %rbp, %rax
sub $2, un
lea 1(%rax), %r11
js L(ulast)
@@ -203,7 +212,7 @@
ALIGN(16)
L(uloop):
mov (up,un,8), %r10
- shld %cl, %r10, %rbp
+ shld R8(%rcx), %r10, %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
@@ -226,7 +235,7 @@
lea 1(%rax), %r11
jns L(uloop)
L(ulast):
- sal %cl, %rbp
+ sal R8(%rcx), %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
@@ -275,7 +284,7 @@
dec fn C
jns L(floop) C
- shr %cl, %rax
+ shr R8(%rcx), %rax
L(ret): pop %rbx
pop %rbp
pop %r12
diff -r 66b94f02bf84 -r ad57ab3094a5 tune/speed.c
--- a/tune/speed.c Mon Mar 15 13:16:46 2010 +0100
+++ b/tune/speed.c Tue Mar 16 23:38:05 2010 +0100
@@ -380,28 +380,28 @@
{ "mpn_copyd", speed_mpn_copyd },
#endif
#if HAVE_NATIVE_mpn_addlsh1_n
- { "mpn_addlsh1_n", speed_mpn_addlsh1_n },
+ { "mpn_addlsh1_n", speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh1_n
- { "mpn_sublsh1_n", speed_mpn_sublsh1_n },
+ { "mpn_sublsh1_n", speed_mpn_sublsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_rsblsh1_n
- { "mpn_rsblsh1_n", speed_mpn_rsblsh1_n },
+ { "mpn_rsblsh1_n", speed_mpn_rsblsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n
- { "mpn_addlsh2_n", speed_mpn_addlsh2_n },
+ { "mpn_addlsh2_n", speed_mpn_addlsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh2_n
- { "mpn_sublsh2_n", speed_mpn_sublsh2_n },
+ { "mpn_sublsh2_n", speed_mpn_sublsh2_n, FLAG_R_OPTIONAL },