[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Mon Oct 24 09:12:27 CEST 2011
details: /var/hg/gmp/rev/a203e27eadf2
changeset: 14382:a203e27eadf2
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Oct 24 09:12:00 2011 +0200
description:
Put intermediate result into R, don't allocate any stack space.
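Here "R" is the result operand rp: instead of reserving a fixed STACK_ALLOC scratch block (8*2*SQR_TOOM2_THRESHOLD_MAX bytes) on the stack for the cross-product triangle, the code now points tp into rp itself, which already has room for the full 2n-limb square. Below is a minimal C sketch of the same idea using only public mpn calls; it assumes n >= 2 and non-overlapping rp/up, the function name is illustrative rather than GMP's, and the actual assembly uses addmul_2-based loops plus a fused sqr_diag_addlsh1 pass rather than this plain schoolbook form.

#include <gmp.h>

/* Square the n-limb number at up into the 2n-limb area at rp, using rp
   itself for the intermediate cross-product triangle (no scratch space). */
static void
sqr_basecase_no_scratch (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
{
  mp_size_t i;
  mp_limb_t cy = 0;

  /* Cross products u[i]*u[j], i < j, accumulated straight into rp[1..2n-2]. */
  rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
  for (i = 1; i < n - 1; i++)
    rp[n + i] = mpn_addmul_1 (rp + 2 * i + 1, up + i + 1, n - 1 - i, up[i]);

  /* Double the triangle; the shifted-out bit becomes the top limb. */
  rp[2 * n - 1] = mpn_lshift (rp + 1, rp + 1, 2 * n - 2, 1);
  rp[0] = 0;

  /* Add the diagonal squares u[i]^2 at even limb positions, carrying on. */
  for (i = 0; i < n; i++)
    {
      mp_limb_t sq[2];
      sq[1] = mpn_mul_1 (sq, up + i, 1, up[i]);  /* sq = u[i]^2               */
      mpn_add_1 (sq, sq, 2, cy);                 /* fold in carry; cannot overflow */
      cy = mpn_add_n (rp + 2 * i, rp + 2 * i, sq, 2);
    }
}

Since u^2 always fits in 2n limbs, the doubled triangle plus the diagonal squares never need more room than rp provides, which is what lets the STACK_ALLOC area (and its %rsp adjustment) be dropped.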
details: /var/hg/gmp/rev/b3e498c3dff7
changeset: 14383:b3e498c3dff7
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Oct 24 09:12:18 2011 +0200
description:
*** empty log message ***
diffstat:
ChangeLog | 5 +++
mpn/x86_64/sqr_basecase.asm | 71 +++++++++++++++++++-------------------------
2 files changed, 36 insertions(+), 40 deletions(-)
diffs (215 lines):
diff -r a7f13a059476 -r b3e498c3dff7 ChangeLog
--- a/ChangeLog Sun Oct 23 21:57:00 2011 +0200
+++ b/ChangeLog Mon Oct 24 09:12:18 2011 +0200
@@ -1,3 +1,8 @@
+2011-10-24 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/sqr_basecase.asm: Put intermediate result into R, don't
+ allocate any stack space.
+
2011-10-23 Torbjorn Granlund <tege at gmplib.org>
* mpn/s390_64/logops_n.asm: Use nc, oc, xc when possible.
diff -r a7f13a059476 -r b3e498c3dff7 mpn/x86_64/sqr_basecase.asm
--- a/mpn/x86_64/sqr_basecase.asm Sun Oct 23 21:57:00 2011 +0200
+++ b/mpn/x86_64/sqr_basecase.asm Mon Oct 24 09:12:18 2011 +0200
@@ -30,11 +30,11 @@
C code which uses addmul_2s from the start, conditionally leaving a 1x1
C multiply to the end. (In assembly code, one would stop invoking
C addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
-C * This code only handles operands up to SQR_TOOM2_THRESHOLD_MAX. That
-C means we can safely use 32-bit operations for all sizes, unlike in e.g.,
-C mpn_addmul_1.
+C * Align more labels, should shave off a few cycles.
+C * We can safely use 32-bit size operations, since operands with (2^32)
+C limbs will lead to non-termination in practice.
C * The jump table could probably be optimized, at least for non-pic.
-C * The special code for n=1,2,3 was quickly written. It is probably too
+C * The special code for n <= 4 was quickly written. It is probably too
C large and unnecessarily slow.
C * Consider combining small cases code so that the n=k-1 code jumps into the
C middle of the n=k code.
@@ -62,12 +62,6 @@
define(`up', `%rsi')
define(`n_param', `%rdx')
-C We should really trim this, for better spatial locality. Alternatively,
-C we could grab the upper part of the stack area, leaving the lower part
-C instead of the upper part unused.
-deflit(SQR_TOOM2_THRESHOLD_MAX, 80)
-define(`STACK_ALLOC', eval(8*2*SQR_TOOM2_THRESHOLD_MAX))
-
define(`n', `%r11')
define(`tp', `%r12')
define(`i', `%r8')
@@ -85,12 +79,12 @@
ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
- add $-48, %rsp
- mov %rbx, 40(%rsp)
- mov %rbp, 32(%rsp)
- mov %r12, 24(%rsp)
- mov %r13, 16(%rsp)
- mov %r14, 8(%rsp)
+ add $-40, %rsp
+ mov %rbx, 32(%rsp)
+ mov %rbp, 24(%rsp)
+ mov %r12, 16(%rsp)
+ mov %r13, 8(%rsp)
+ mov %r14, (%rsp)
mov R32(n_param), R32(n) C free original n register (rdx)
mov R32(n_param), R32(%rcx)
@@ -117,7 +111,7 @@
mul %rax
mov %rax, (rp)
mov %rdx, 8(rp)
- add $40, %rsp
+ add $32, %rsp
pop %rbx
ret
@@ -141,7 +135,7 @@
mov %r10, 16(rp)
adc $0, %r11
mov %r11, 24(rp)
- add $40, %rsp
+ add $32, %rsp
pop %rbx
ret
@@ -186,7 +180,7 @@
adc %r10, 24(rp)
adc %r11, 32(rp)
adc %rbx, 40(rp)
- add $40, %rsp
+ add $32, %rsp
pop %rbx
ret
@@ -256,15 +250,15 @@
adc %r12, 40(rp)
adc %rbp, 48(rp)
adc %rbx, 56(rp)
- add $24, %rsp
+ add $16, %rsp
pop %r12
pop %rbp
pop %rbx
ret
-L(0m4): add $-STACK_ALLOC, %rsp
- lea -24(%rsp,n,8), tp C point tp in middle of result operand
+L(0m4):
+ lea -16(rp,n,8), tp C point tp in middle of result operand
mov (up), v0
mov 8(up), %rax
lea (up,n,8), up C point up at end of input operand
@@ -321,8 +315,8 @@
jmp L(dowhile)
-L(1m4): add $-STACK_ALLOC, %rsp
- lea (%rsp,n,8), tp C point tp in middle of result operand
+L(1m4):
+ lea 8(rp,n,8), tp C point tp in middle of result operand
mov (up), v0 C u0
mov 8(up), %rax C u1
lea 8(up,n,8), up C point up at end of input operand
@@ -336,7 +330,7 @@
mul v0 C u0 * u1
mov %rdx, w1
xor R32(w2), R32(w2)
- mov %rax, (%rsp)
+ mov %rax, 8(rp)
jmp L(m0)
ALIGN(16)
@@ -399,8 +393,8 @@
jmp L(dowhile_end)
-L(2m4): add $-STACK_ALLOC, %rsp
- lea -24(%rsp,n,8), tp C point tp in middle of result operand
+L(2m4):
+ lea -16(rp,n,8), tp C point tp in middle of result operand
mov (up), v0
mov 8(up), %rax
lea (up,n,8), up C point up at end of input operand
@@ -456,8 +450,8 @@
jmp L(dowhile_mid)
-L(3m4): add $-STACK_ALLOC, %rsp
- lea (%rsp,n,8), tp C point tp in middle of result operand
+L(3m4):
+ lea 8(rp,n,8), tp C point tp in middle of result operand
mov (up), v0 C u0
mov 8(up), %rax C u1
lea 8(up,n,8), up C point up at end of input operand
@@ -472,7 +466,7 @@
mov %rdx, w3
xor R32(w0), R32(w0)
xor R32(w1), R32(w1)
- mov %rax, (%rsp)
+ mov %rax, 8(rp)
jmp L(m2)
ALIGN(16)
@@ -709,11 +703,9 @@
C Function mpn_sqr_diag_addlsh1
lea -4(n,n), j
- mov (%rsp), %r11
-
+ mov 8(rp), %r11
+ lea -8(up), up
lea (rp,j,8), rp
- lea -8(up), up
- lea 8(%rsp,j,8), tp
neg j
mov (up,j,4), %rax
mul %rax
@@ -741,9 +733,9 @@
adc %rdx, %r11
mov %r10, (rp,j,8)
L(d0): mov %r11, 8(rp,j,8)
- mov (tp,j,8), %r10
+ mov 16(rp,j,8), %r10
adc %r10, %r10
- mov 8(tp,j,8), %r11
+ mov 24(rp,j,8), %r11
adc %r11, %r11
nop
sbb R32(%rbp), R32(%rbp) C save CF
@@ -754,9 +746,9 @@
adc %rdx, %r11
mov %r10, 16(rp,j,8)
L(d1): mov %r11, 24(rp,j,8)
- mov 16(tp,j,8), %r10
+ mov 32(rp,j,8), %r10
adc %r10, %r10
- mov 24(tp,j,8), %r11
+ mov 40(rp,j,8), %r11
adc %r11, %r11
sbb R32(%rbx), R32(%rbx) C save CF
add $4, j
@@ -769,7 +761,7 @@
adc %rdx, %r11
mov %r10, (rp)
mov %r11, 8(rp)
- mov (tp), %r10
+ mov 16(rp), %r10
adc %r10, %r10
sbb R32(%rbp), R32(%rbp) C save CF
neg R32(%rbp)
@@ -781,7 +773,6 @@
mov %r10, 16(rp)
mov %rdx, 24(rp)
- add $eval(8+STACK_ALLOC), %rsp
pop %r14
pop %r13
pop %r12
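For anyone experimenting with a sketch like the one above, the public mpn_sqr entry point (available since GMP 5) makes a convenient reference. This hypothetical harness assumes sqr_basecase_no_scratch from the earlier sketch is compiled in the same file; mpn_random is the old-style (obsolete but still provided) operand filler, and the program links with -lgmp.

#include <stdio.h>
#include <gmp.h>

#define N 17                    /* any size >= 2 */

int
main (void)
{
  mp_limb_t up[N], want[2 * N], got[2 * N];

  mpn_random (up, N);                    /* arbitrary test operand        */
  mpn_sqr (want, up, N);                 /* reference result              */
  sqr_basecase_no_scratch (got, up, N);  /* sketch from the earlier block */
  printf (mpn_cmp (want, got, 2 * N) == 0 ? "match\n" : "MISMATCH\n");
  return 0;
}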