[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Fri Mar 4 22:49:44 CET 2011
details: /var/hg/gmp/rev/e8ee112300ea
changeset: 13997:e8ee112300ea
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Mar 04 22:37:40 2011 +0100
description:
Rewrite for linear performance.
details: /var/hg/gmp/rev/81e561509e1a
changeset: 13998:81e561509e1a
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Mar 04 22:38:26 2011 +0100
description:
*** empty log message ***
diffstat:
ChangeLog | 4 +
mpn/x86_64/addmul_2.asm | 169 +++++++++++++++++++++++------------------------
2 files changed, 88 insertions(+), 85 deletions(-)
diffs (245 lines):
diff -r 8c3dd0608bb7 -r 81e561509e1a ChangeLog
--- a/ChangeLog Fri Mar 04 09:20:33 2011 +0100
+++ b/ChangeLog Fri Mar 04 22:38:26 2011 +0100
@@ -1,3 +1,7 @@
+2011-03-04 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/addmul_2.asm: Rewrite for linear performance.
+
2011-03-03 Torbjorn Granlund <tege at gmplib.org>
* mpn/generic/mod_1_1.c (add_mssaaaa): Canonicalise layout. Add arm
diff -r 8c3dd0608bb7 -r 81e561509e1a mpn/x86_64/addmul_2.asm
--- a/mpn/x86_64/addmul_2.asm Fri Mar 04 09:20:33 2011 +0100
+++ b/mpn/x86_64/addmul_2.asm Fri Mar 04 22:38:26 2011 +0100
@@ -1,7 +1,7 @@
dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl add the result to a third limb vector.
-dnl Copyright 2008 Free Software Foundation, Inc.
+dnl Copyright 2008, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -25,18 +25,16 @@
C AMD K10 2.375
C Intel P4 15-16
C Intel core2 4.45
-C Intel corei 4.35
+C Intel NHM 4.32
+C Intel SBR 3.4
C Intel atom ?
-C VIA nano 4.5
+C VIA nano 4.4
C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.
C TODO
-C * Work on feed-in and wind-down code.
-C * Convert "mov $0" to "xor".
-C * Adjust initial lea to save some bytes.
-C * Perhaps adjust n from n_param&3 value?
+C * Tune feed-in and wind-down code.
C INPUT PARAMETERS
define(`rp', `%rdi')
@@ -52,119 +50,120 @@
define(`w3', `%r10')
define(`n', `%r11')
-ASM_START()
TEXT
ALIGN(16)
+ASM_START()
PROLOGUE(mpn_addmul_2)
+ mov n_param, n
push %rbx
push %rbp
- mov (vp), v0
+ mov 0(vp), v0
mov 8(vp), v1
- mov n_param, n
+ mov R32(n_param), R32(%rbx)
+ mov (up), %rax
+ lea -8(up,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ mul v0
neg n
- lea -32(up,n_param,8), up
- lea -32(rp,n_param,8), rp
+ and $3, R32(%rbx)
+ jz L(b0)
+ cmp $2, R32(%rbx)
+ jc L(b1)
+ jz L(b2)
- and $3, R32(n_param)
- jz L(am2p0)
- cmp $2, R32(n_param)
- jc L(am2p1)
- jz L(am2p2)
-L(am2p3):
- mov 32(up,n,8), %rax
- mul v0
- mov %rax, w1
- mov 32(up,n,8), %rax
+L(b3): mov %rax, w1
mov %rdx, w2
xor R32(w3), R32(w3)
- add $2, n
- jmp L(am3)
-L(am2p0):
- mov 32(up,n,8), %rax
- mul v0
+ mov 8(up,n,8), %rax
+ dec n
+ jmp L(lo3)
+
+L(b2): mov %rax, w2
+ mov 8(up,n,8), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ add $-2, n
+ jmp L(lo2)
+
+L(b1): mov %rax, w3
+ mov 8(up,n,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ inc n
+ jmp L(lo1)
+
+L(b0): mov $0, R32(w3)
mov %rax, w0
- mov 32(up,n,8), %rax
+ mov 8(up,n,8), %rax
mov %rdx, w1
xor R32(w2), R32(w2)
- add $3, n
- jmp L(am0)
-L(am2p1):
- mov 32(up,n,8), %rax
- mul v0
- mov %rax, w3
- mov 32(up,n,8), %rax
- mov %rdx, w0
- xor R32(w1), R32(w1)
- jmp L(am1)
-L(am2p2):
- mov 32(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov 32(up,n,8), %rax
- mov %rdx, w3
- xor R32(w0), R32(w0)
- xor R32(w1), R32(w1)
- add $1, n
- jmp L(am2)
+ jmp L(lo0)
ALIGN(32)
-L(top):
- add w3, (rp,n,8) C 0 21
- adc %rax, w0 C 1 24
+L(top): mov $0, R32(w1)
+ mul v0
+ add %rax, w3
+ mov (up,n,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(lo1): mul v1
+ add w3, (rp,n,8)
+ mov $0, R32(w3)
+ adc %rax, w0
+ mov $0, R32(w2)
mov 8(up,n,8), %rax
- adc %rdx, w1 C 3 26
- mov $0, R32(w2)
+ adc %rdx, w1
mul v0
- add %rax, w0 C 2 26
+ add %rax, w0
mov 8(up,n,8), %rax
- adc %rdx, w1 C 4 28
- adc $0, R32(w2) C 6 30
-L(am0): mul v1
- add w0, 8(rp,n,8) C 3 27
- adc %rax, w1 C 6 30
- adc %rdx, w2 C 8 32
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(lo0): mul v1
+ add w0, 8(rp,n,8)
+ adc %rax, w1
+ adc %rdx, w2
mov 16(up,n,8), %rax
- mov $0, R32(w3)
mul v0
- add %rax, w1 C 8
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
mov 16(up,n,8), %rax
- adc %rdx, w2 C 10
- adc $0, R32(w3) C 12
-L(am3): mul v1
- add w1, 16(rp,n,8) C 9
- adc %rax, w2 C 12
+L(lo3): mul v1
+ add w1, 16(rp,n,8)
+ adc %rax, w2
+ adc %rdx, w3
+ xor R32(w0), R32(w0)
mov 24(up,n,8), %rax
- adc %rdx, w3 C 14
mul v0
- mov $0, R32(w0)
- add %rax, w2 C 14
- adc %rdx, w3 C 16
- mov $0, R32(w1)
+ add %rax, w2
mov 24(up,n,8), %rax
- adc $0, R32(w0) C 18
-L(am2): mul v1
- add w2, 24(rp,n,8) C 15
- adc %rax, w3 C 18
- adc %rdx, w0 C 20
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(lo2): mul v1
+ add w2, 24(rp,n,8)
+ adc %rax, w3
+ adc %rdx, w0
mov 32(up,n,8), %rax
- mul v0
- add %rax, w3 C 20
- mov 32(up,n,8), %rax
- adc %rdx, w0 C 22
- adc $0, R32(w1) C 24
-L(am1): mul v1
add $4, n
js L(top)
- add w3, 24(rp)
+L(end): xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w3
+ mov (up), %rax
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w3, (rp)
adc %rax, w0
adc %rdx, w1
- mov w0, 32(rp)
+ mov w0, 8(rp)
mov w1, %rax
pop %rbp
pop %rbx
ret
EPILOGUE()
+
More information about the gmp-commit
mailing list