[Gmp-commit] /var/hg/gmp: Speed up loop code, simplify non-loop code.
mercurial at gmplib.org
Thu Mar 15 15:53:27 CET 2012
details: /var/hg/gmp/rev/7c7a9cba56d3
changeset: 14757:7c7a9cba56d3
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 15 15:53:22 2012 +0100
description:
Speed up loop code, simplify non-loop code.
diffstat:
ChangeLog | 5 +++
mpn/x86_64/core2/gcd_1.asm | 62 ++++++++++++++++++++-------------------------
2 files changed, 32 insertions(+), 35 deletions(-)
diffs (111 lines):
diff -r d16912b36656 -r 7c7a9cba56d3 ChangeLog
--- a/ChangeLog Tue Mar 13 20:24:40 2012 +0100
+++ b/ChangeLog Thu Mar 15 15:53:22 2012 +0100
@@ -1,3 +1,8 @@
+2012-03-15 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/core2/gcd_1.asm: Speed up loop code, simplify non-loop
+ code.
+
2012-03-13 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/core2/gcd_1.asm: Add hack to support fat builds.
diff -r d16912b36656 -r 7c7a9cba56d3 mpn/x86_64/core2/gcd_1.asm
--- a/mpn/x86_64/core2/gcd_1.asm Tue Mar 13 20:24:40 2012 +0100
+++ b/mpn/x86_64/core2/gcd_1.asm Thu Mar 15 15:53:22 2012 +0100
@@ -25,21 +25,20 @@
C cycles/bit (approx)
-C AMD K8,K9 8.5
-C AMD K10 5
-C AMD bd1 5
-C AMD bobcat 10
-C Intel P4 18
-C Intel core2 4.3
-C Intel NHM 5
-C Intel SBR 5
-C Intel atom 17
-C VIA nano 5.3
-C Numbers measured with: speed -CD -s1-64 mpn_gcd_1
+C AMD K8,K9 9.79
+C AMD K10 5.34
+C AMD bd1 ?
+C AMD bobcat 11.3
+C Intel P4 20.8
+C Intel core2 5.09
+C Intel NHM 6.27
+C Intel SBR 5.30
+C Intel atom 19.6
+C VIA nano 6.75
+C Numbers measured with: speed -c -s64 mpn_gcd_1
C TODO
-C * Optimise inner-loop for specific CPUs. The code relies too much on OoO
-C execution.
+C * Optimise inner-loop for specific CPUs.
C INPUT PARAMETERS
define(`up', `%rdi')
@@ -64,20 +63,14 @@
PROLOGUE(mpn_gcd_1)
DOS64_ENTRY(3)
mov (%rdi), %r8 C src low limb
- mov %r8, %r10
or %rdx, %r8 C x | y
-
- bsf %r8, %rcx
-
- shr R8(%rcx), %r10
- shr R8(%rcx), %rdx
- mov R32(%rcx), R32(%r8) C common twos
+ bsf %r8, %r8 C common twos
bsf %rdx, %rcx
shr R8(%rcx), %rdx
- push %r8
- push %rdx
+ push %r8 C preserve common twos over call
+ push %rdx C preserve v0 argument over call
sub $8, %rsp C maintain ABI required rsp alignment
IFDOS(` mov %rdx, %r8 ')
@@ -96,25 +89,24 @@
pop %r8
bsf %rax, %rcx
-
test %rax, %rax
jnz L(mid)
+ jmp L(end)
- jmp L(done)
+C FIXME: 1st sub to rdx would shorten path
- ALIGN(16) C K10 BD C2 NHM SBR
-L(top): cmovc %r10, %rax C if x-y < 0 0,6 0,5 0,6 0,5 0,6
- cmovc %r9, %rdx C use x,y-x 0 0 2 1 1
- bsf %r10, %rcx C 0 0 0 0 0
-L(mid): mov %rdx, %r10 C 1 1 4 3 3
- shr R8(%rcx), %rax C 4 3 2 3 3
- mov %rax, %r9 C 5 4 3 4 5
- sub %rax, %r10 C 5 4 5 4 5
- sub %rdx, %rax C 5 4 4 4 5
+ ALIGN(16) C K10 BD C2 NHM SBR
+L(top): cmovc %r10, %rax C if x-y < 0 0,3 0,3 0,6 0,5 0,5
+ cmovc %r9, %rdx C use x,y-x 0,3 0,3 2,8 1,7 1,7
+L(mid): shr R8(%rcx), %rax C 1,7 1,6 2,8 2,8 2,8
+ mov %rdx, %r10 C 1 1 4 3 3
+ sub %rax, %r10 C 2 2 5 4 4
+ bsf %r10, %rcx C 3 3 6 5 5
+ mov %rax, %r9 C 2 2 3 3 4
+ sub %rdx, %rax C 2 2 4 3 4
jnz L(top) C
-L(done):
- mov %rdx, %rax
+L(end): mov %rdx, %rax
mov %r8, %rcx
shl R8(%rcx), %rax
DOS64_EXIT()
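
For reference, the L(top)/L(mid) loop above is the classic subtract-and-shift (binary) GCD step on two odd single-limb values: the difference of two odd numbers is even, so its trailing zeros are counted with bsf and shifted out before the next iteration, and the common factors of two stripped in the prologue are restored by the final shl. The C below is only an illustrative sketch of that computation, not GMP source: ctz stands in for bsf via the GCC/Clang builtin, both inputs are assumed nonzero, and the reduction of the multi-limb {up,n} operand through the call in the middle of the real function is omitted.

  #include <stdint.h>

  /* Count trailing zero bits; stands in for the bsf instruction.
     Assumes x != 0.  */
  static int ctz(uint64_t x) { return __builtin_ctzll(x); }

  /* Subtract-and-shift GCD of two odd values, as in the L(top)/L(mid)
     loop: the difference is even, so shift out its trailing zeros
     before the next iteration.  */
  static uint64_t gcd_odd(uint64_t u, uint64_t v)
  {
    while (u != v)
      {
        if (u > v)
          u = (u - v) >> ctz(u - v);
        else
          v = (v - u) >> ctz(v - u);
      }
    return u;
  }

  /* Caller-side view: strip the common twos first (the bsf on x|y in
     the prologue), reduce, then restore them with the final shl.
     Assumes x != 0 and y != 0.  */
  static uint64_t gcd_1_sketch(uint64_t x, uint64_t y)
  {
    int common_twos = ctz(x | y);
    x >>= ctz(x);
    y >>= ctz(y);
    return gcd_odd(x, y) << common_twos;
  }
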