[Gmp-commit] /var/hg/gmp: Further x86_64 gcd_1 improvements.
mercurial at gmplib.org
mercurial at gmplib.org
Sat Mar 17 15:19:14 CET 2012
details: /var/hg/gmp/rev/b319b5e7816c
changeset: 14761:b319b5e7816c
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Mar 17 15:19:11 2012 +0100
description:
Further x86_64 gcd_1 improvements.
diffstat:
ChangeLog | 5 +
mpn/x86_64/core2/gcd_1.asm | 65 +++++++++++++++---------
mpn/x86_64/gcd_1.asm | 113 +++++++++++++++++++++++---------------------
3 files changed, 105 insertions(+), 78 deletions(-)
diffs (294 lines):
diff -r 5027cd1d7fbe -r b319b5e7816c ChangeLog
--- a/ChangeLog Thu Mar 15 20:04:10 2012 +0100
+++ b/ChangeLog Sat Mar 17 15:19:11 2012 +0100
@@ -1,3 +1,8 @@
+2012-03-17 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/core2/gcd_1.asm: Conditionally suppress reduction calls.
+ * mpn/x86_64/gcd_1.asm: Rewrite.
+
2012-03-15 Torbjorn Granlund <tege at gmplib.org>
* mpn/generic/gcd_1.c: Parameterise zerotab code.
diff -r 5027cd1d7fbe -r b319b5e7816c mpn/x86_64/core2/gcd_1.asm
--- a/mpn/x86_64/core2/gcd_1.asm Thu Mar 15 20:04:10 2012 +0100
+++ b/mpn/x86_64/core2/gcd_1.asm Sat Mar 17 15:19:11 2012 +0100
@@ -25,21 +25,25 @@
C cycles/bit (approx)
-C AMD K8,K9 9.79
-C AMD K10 5.34
-C AMD bd1 ?
-C AMD bobcat 11.3
-C Intel P4 20.8
-C Intel core2 5.09
-C Intel NHM 6.27
-C Intel SBR 5.30
-C Intel atom 19.6
-C VIA nano 6.75
-C Numbers measured with: speed -c -s64 mpn_gcd_1
+C AMD K8,K9 8.50
+C AMD K10 4.30
+C AMD bd1 5.00
+C AMD bobcat 10.0
+C Intel P4 18.6
+C Intel core2 3.83
+C Intel NHM 5.17
+C Intel SBR 4.69
+C Intel atom 17.0
+C VIA nano 5.44
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
C TODO
C * Optimise inner-loop for specific CPUs.
+C Threshold of when to call bmod when U is one limb. Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`BMOD_THRES_LOG2', 6)
+
C INPUT PARAMETERS
define(`up', `%rdi')
define(`n', `%rsi')
@@ -62,21 +66,35 @@
ALIGN(16)
PROLOGUE(mpn_gcd_1)
DOS64_ENTRY(3)
- mov (%rdi), %r8 C src low limb
- or %rdx, %r8 C x | y
- bsf %r8, %r8 C common twos
+ mov (up), %rax C U low limb
+ or v0, %rax
+ bsf %rax, %rax C min(ctz(u0),ctz(v0))
- bsf %rdx, %rcx
- shr R8(%rcx), %rdx
+ bsf v0, %rcx
+ shr R8(%rcx), v0
- push %r8 C preserve common twos over call
- push %rdx C preserve v0 argument over call
- sub $8, %rsp C maintain ABI required rsp alignment
+ push %rax C preserve common twos over call
+ push v0 C preserve v0 argument over call
+ sub $8, %rsp C maintain ABI required rsp alignment
+
+ cmp $1, n
+ jnz L(reduce_nby1)
+
+C Both U and V are single limbs, reduce with bmod if there are many more bits
+C in u0 than in v0.
+ mov (up), %r8
+ mov %r8, %rax
+ shr $BMOD_THRES_LOG2, %r8
+ cmp %r8, v0
+ ja L(reduced)
+ jmp L(bmod)
+
+L(reduce_nby1):
IFDOS(` mov %rdx, %r8 ')
IFDOS(` mov %rsi, %rdx ')
IFDOS(` mov %rdi, %rcx ')
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, %rsi
+ cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
jl L(bmod)
CALL( mpn_mod_1)
jmp L(reduced)
@@ -86,15 +104,12 @@
add $8, %rsp
pop %rdx
- pop %r8
bsf %rax, %rcx
test %rax, %rax
jnz L(mid)
jmp L(end)
-C FIXME: 1st sub to rdx would shorten path
-
ALIGN(16) C K10 BD C2 NHM SBR
L(top): cmovc %r10, %rax C if x-y < 0 0,3 0,3 0,6 0,5 0,5
cmovc %r9, %rdx C use x,y-x 0,3 0,3 2,8 1,7 1,7
@@ -106,8 +121,8 @@
sub %rdx, %rax C 2 2 4 3 4
jnz L(top) C
-L(end): mov %rdx, %rax
- mov %r8, %rcx
+L(end): pop %rcx
+ mov %rdx, %rax
shl R8(%rcx), %rax
DOS64_EXIT()
ret
diff -r 5027cd1d7fbe -r b319b5e7816c mpn/x86_64/gcd_1.asm
--- a/mpn/x86_64/gcd_1.asm Thu Mar 15 20:04:10 2012 +0100
+++ b/mpn/x86_64/gcd_1.asm Sat Mar 17 15:19:11 2012 +0100
@@ -25,23 +25,22 @@
C cycles/bit (approx)
-C AMD K8,K9 6.75
-C AMD K10 6.75
-C AMD bd1 7.75
-C AMD bobcat 7.5
-C Intel P4 18
-C Intel core2 9
-C Intel NHM 9
-C Intel SBR 10
-C Intel atom 10.5
-C VIA nano 8.5
-
-C Numbers measured with: speed -CD -s1-64 mpn_gcd_1
+C AMD K8,K9 5.21 (4.95)
+C AMD K10 5.15 (5.00)
+C AMD bd1 5.42 (5.14)
+C AMD bobcat 6.71 (6.56)
+C Intel P4 13.5 (12.75)
+C Intel core2 6.20 (6.16)
+C Intel NHM 6.49 (6.25)
+C Intel SBR 7.75 (7.57)
+C Intel atom 8.77 (8.54)
+C VIA nano 6.60 (6.20)
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-deflit(MAXSHIFT, 6)
+deflit(MAXSHIFT, 7)
deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
DEF_OBJECT(ctz_table,64)
@@ -51,6 +50,9 @@
')
END_OBJECT(ctz_table)
+C Threshold of when to call bmod when U is one limb. Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`BMOD_THRES_LOG2', 8)
C INPUT PARAMETERS
define(`up', `%rdi')
@@ -65,31 +67,45 @@
ALIGN(16)
PROLOGUE(mpn_gcd_1)
DOS64_ENTRY(3)
- mov (%rdi), %r8 C src low limb
- or %rdx, %r8 C x | y
+ mov (up), %rax C src low limb
+ or v0, %rax C x | y
mov $-1, R32(%rcx)
L(twos):
inc R32(%rcx)
- shr %r8
+ shr %rax
jnc L(twos)
- shr R8(%rcx), %rdx
- mov R32(%rcx), R32(%r8) C common twos
+ shr R8(%rcx), v0
+ mov R32(%rcx), R32(%rax) C common twos
L(divide_strip_y):
- shr %rdx
+ shr v0
jnc L(divide_strip_y)
- adc %rdx, %rdx
+ adc v0, v0
- push %r8
- push %rdx
+ push %rax
+ push v0
sub $8, %rsp C maintain ABI required rsp alignment
+ cmp $1, n
+ jnz L(reduce_nby1)
+
+C Both U and V are single limbs, reduce with bmod if there are many more bits
+C in u0 than in v0.
+ mov (up), %r8
+ mov %r8, %rax
+ shr $BMOD_THRES_LOG2, %r8
+ cmp %r8, v0
+ ja L(reduced)
+ jmp L(bmod)
+
+L(reduce_nby1):
+
IFDOS(` mov %rdx, %r8 ')
IFDOS(` mov %rsi, %rdx ')
IFDOS(` mov %rdi, %rcx ')
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, %rsi
+ cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
jl L(bmod)
CALL( mpn_mod_1)
jmp L(reduced)
@@ -99,43 +115,34 @@
add $8, %rsp
pop %rdx
- pop %r8
+ LEA( ctz_table, %rsi)
test %rax, %rax
+ mov %rax, %rcx
+ jnz L(mid)
+ jmp L(end)
- mov %rax, %rcx
- LEA( ctz_table, %r9)
- jnz L(strip_x_top)
+ ALIGN(16) C K8 BC P4 NHM SBR
+L(top): cmovc %rcx, %rax C if x-y < 0 0
+ cmovc %rdi, %rdx C use x,y-x 0
+L(mid): and $MASK, R32(%rcx) C 0
+ movzbl (%rsi,%rcx), R32(%rcx) C 1
+ jz L(shift_alot) C 1
+ shr R8(%rcx), %rax C 3
+ mov %rax, %rdi C 4
+ mov %rdx, %rcx C 3
+ sub %rax, %rcx C 4
+ sub %rdx, %rax C 4
+ jnz L(top) C 5
+L(end): pop %rcx
mov %rdx, %rax
- jmp L(done)
-
- ALIGN(16)
-L(top):
- cmovc %r10, %rcx C if x-y gave carry, use x,y-x 0
- cmovc %rax, %rdx C 0
-
-L(strip_x_top):
- mov %rcx, %rax C 1
- and $MASK, R32(%rcx) C 1
-
- movzbl (%r9,%rcx), R32(%rcx) C use this insn form to placate Oracle
-
- shr R8(%rcx), %rax C 4
- cmp $MAXSHIFT, R8(%rcx) C 4
-
- mov %rax, %rcx C 5
- mov %rdx, %r10 C 5
- je L(strip_x_top) C 5
-
- sub %rax, %r10 C 6
- sub %rdx, %rcx C 6
- jnz L(top) C 6
-
-L(done):
- mov %r8, %rcx
shl R8(%rcx), %rax
DOS64_EXIT()
ret
+L(shift_alot):
+ shr $MAXSHIFT, %rax
+ mov %rax, %rcx
+ jmp L(mid)
EPILOGUE()
More information about the gmp-commit
mailing list