[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Mon Apr 27 20:44:17 UTC 2015
details: /var/hg/gmp/rev/1b1a6be2b9f8
changeset: 16599:1b1a6be2b9f8
user: Torbjorn Granlund <torbjorng at google.com>
date: Mon Apr 27 22:43:24 2015 +0200
description:
Tune broadwell basecase functions.
details: /var/hg/gmp/rev/c679b9059102
changeset: 16600:c679b9059102
user: Torbjorn Granlund <torbjorng at google.com>
date: Mon Apr 27 22:44:13 2015 +0200
description:
Trivial merge.
diffstat:
mpn/generic/invert.c | 6 +-
mpn/x86_64/coreibwl/mul_basecase.asm | 128 +++++++++++++++-------------------
mpn/x86_64/coreibwl/sqr_basecase.asm | 70 ++++++++----------
3 files changed, 89 insertions(+), 115 deletions(-)
diffs (truncated from 431 to 300 lines):
diff -r 0c6b3f298b36 -r c679b9059102 mpn/generic/invert.c
--- a/mpn/generic/invert.c Sun Apr 26 20:51:31 2015 +0200
+++ b/mpn/generic/invert.c Mon Apr 27 22:44:13 2015 +0200
@@ -57,10 +57,10 @@
xp = scratch; /* 2 * n limbs */
/* n > 1 here */
- i = n - 1;
+ i = n;
do
- xp[i] = GMP_NUMB_MAX;
- while (--i >= 0);
+ xp[--i] = GMP_NUMB_MAX;
+ while (i);
mpn_com (xp + n, dp, n);
if (n == 2) {
mpn_divrem_2 (ip, 0, xp, 4, dp);
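An aside on the invert.c hunk: the rewritten loop decrements the index before the store and exits on zero, so it no longer depends on the index type being signed (with an unsigned type, the old `while (--i >= 0)` test is always true and the loop never terminates). A minimal C sketch of the new form, with illustrative names (`fill_max`, and ULONG_MAX standing in for GMP_NUMB_MAX):

#include <limits.h>
#include <stddef.h>

/* Illustrative sketch of the patched loop: set xp[0 .. n-1] to an
   all-ones sentinel, n >= 1.  Decrement-before-store with a plain
   zero test terminates correctly even for an unsigned index,
   unlike the old `while (--i >= 0)` form. */
static void
fill_max (unsigned long *xp, size_t n)
{
  size_t i = n;
  do
    xp[--i] = ULONG_MAX;        /* mirrors xp[--i] = GMP_NUMB_MAX */
  while (i);
}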
diff -r 0c6b3f298b36 -r c679b9059102 mpn/x86_64/coreibwl/mul_basecase.asm
--- a/mpn/x86_64/coreibwl/mul_basecase.asm Sun Apr 26 20:51:31 2015 +0200
+++ b/mpn/x86_64/coreibwl/mul_basecase.asm Mon Apr 27 22:44:13 2015 +0200
@@ -54,11 +54,9 @@
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
C TODO
-C * Tune non-loop code. Very little effort has been spent there.
+C * Do overlapped software pipelining.
C * When changing this, make sure the code which falls into the inner loops
-C does not execute too many no-ops.
-C * Eliminate rp_save and up_save by keeping un_save as a negated, scaled
-C counter, similar to the sqr_basecase of this directory.
+C does not execute too many no-ops (for both PIC and non-PIC).
define(`rp', `%rdi')
define(`up', `%rsi')
@@ -67,10 +65,9 @@
define(`vn', `%r8')
define(`n', `%rcx')
-define(`rp_save', `%r13')
-define(`up_save', `%rbx')
-define(`un_save', `%rbp')
+define(`n_save', `%rbp')
define(`vp', `%r14')
+define(`unneg', `%rbx')
define(`v0', `%rdx')
define(`jaddr', `%rax')
@@ -79,9 +76,6 @@
define(`w2', `%r10')
define(`w3', `%r11')
-C %rax %rbx %rcx %rdx %rdi %rsi %rbp
-C %r8 %r9 %r10 %r11 %r12 %r13 %r14 %r15
-
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
@@ -134,19 +128,18 @@
push %rbx
push %rbp
push %r12
- push %r13
push %r14
mov vp_param, vp
- mov un_param, un_save
- mov rp, rp_save
- mov up, up_save
-
- mov R32(un_save), R32(%rax)
- shr $3, un_save
+ lea 1(un_param), unneg
+ mov un_param, n_save
+ mov R32(un_param), R32(%rax)
+ and $-8, unneg
+ shr $3, n_save C loop count
+ neg unneg
and $7, R32(%rax) C clear CF for adc as side-effect
C note that rax lives very long
- mov un_save, n
+ mov n_save, n
mov (vp), v0
lea 8(vp), vp
@@ -159,52 +152,50 @@
jmp *(%r10,%rax,8)
')
-L(mf0): mulx( (up_save), w2, w3)
- lea 56(up_save), up
- lea -8(rp_save), rp
+L(mf0): mulx( (up), w2, w3)
+ lea 56(up), up
+ lea -8(rp), rp
jmp L(mb0)
-L(mf3): mulx( (up_save), w0, w1)
- lea 16(up_save), up
- lea 16(rp_save), rp
+L(mf3): mulx( (up), w0, w1)
+ lea 16(up), up
+ lea 16(rp), rp
inc n
jmp L(mb3)
-L(mf4): mulx( (up_save), w2, w3)
- lea 24(up_save), up
- lea 24(rp_save), rp
+L(mf4): mulx( (up), w2, w3)
+ lea 24(up), up
+ lea 24(rp), rp
inc n
jmp L(mb4)
-L(mf5): mulx( (up_save), w0, w1)
- lea 32(up_save), up
- lea 32(rp_save), rp
+L(mf5): mulx( (up), w0, w1)
+ lea 32(up), up
+ lea 32(rp), rp
inc n
jmp L(mb5)
-L(mf6): mulx( (up_save), w2, w3)
- lea 40(up_save), up
- lea 40(rp_save), rp
+L(mf6): mulx( (up), w2, w3)
+ lea 40(up), up
+ lea 40(rp), rp
inc n
jmp L(mb6)
-L(mf7): mulx( (up_save), w0, w1)
- lea 48(up_save), up
- lea 48(rp_save), rp
+L(mf7): mulx( (up), w0, w1)
+ lea 48(up), up
+ lea 48(rp), rp
inc n
jmp L(mb7)
-L(mf1): mulx( (up_save), w0, w1)
+L(mf1): mulx( (up), w0, w1)
jmp L(mb1)
-L(mf2): mulx( (up_save), w2, w3)
- lea 8(up_save), up
- lea 8(rp_save), rp
+L(mf2): mulx( (up), w2, w3)
+ lea 8(up), up
+ lea 8(rp), rp
mulx( (up), w0, w1)
- test n, n
- jz L(m1end)
- ALIGN(32)
+ ALIGN(16)
L(m1top):
mov w2, -8(rp)
adc w3, w0
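A note on the dispatch above: `jmp *(%r10,%rax,8)` indexes the jump table with un mod 8 (the `and $7` result in %rax) and enters the 8-way unrolled loop at one of the L(mf0)..L(mf7) points. The C analogue is a Duff's-device-style switch into the loop body; a minimal sketch under illustrative names (the body is a plain copy, whereas the real mul_1 code also propagates carries):

/* Sketch of entering an 8-way unrolled loop at an offset chosen by
   n mod 8, as the mtab jump table does above.  Names are
   illustrative; n must be > 0. */
static void
copy_unrolled (unsigned long *rp, const unsigned long *up, long n)
{
  long i = 0;
  switch (n & 7) {             /* like `and $7` + jmp *(mtab,%rax,8) */
  case 0: do { rp[i] = up[i]; i++;
  case 7:      rp[i] = up[i]; i++;
  case 6:      rp[i] = up[i]; i++;
  case 5:      rp[i] = up[i]; i++;
  case 4:      rp[i] = up[i]; i++;
  case 3:      rp[i] = up[i]; i++;
  case 2:      rp[i] = up[i]; i++;
  case 1:      rp[i] = up[i]; i++;
          } while (i < n);
  }
}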
@@ -254,47 +245,39 @@
')
L(outer):
- mov un_save, n
+ lea (up,unneg,8), up
+ mov n_save, n
mov (vp), v0
lea 8(vp), vp
- lea 8(rp_save), rp_save
jmp *jaddr
-C addmul_1
-L(f0): mulx( (up_save), w2, w3)
- lea -8(up_save), up
- lea -8(rp_save), rp
+L(f0): mulx( 8,(up), w2, w3)
+ lea 8(rp,unneg,8), rp
lea -1(n), n
jmp L(b0)
-L(f3): mulx( (up_save), w0, w1)
- lea 16(up_save), up
- lea -48(rp_save), rp
+L(f3): mulx( -16,(up), w0, w1)
+ lea -56(rp,unneg,8), rp
jmp L(b3)
-L(f4): mulx( (up_save), w2, w3)
- lea 24(up_save), up
- lea -40(rp_save), rp
+L(f4): mulx( -24,(up), w2, w3)
+ lea -56(rp,unneg,8), rp
jmp L(b4)
-L(f5): mulx( (up_save), w0, w1)
- lea 32(up_save), up
- lea -32(rp_save), rp
+L(f5): mulx( -32,(up), w0, w1)
+ lea -56(rp,unneg,8), rp
jmp L(b5)
-L(f6): mulx( (up_save), w2, w3)
- lea 40(up_save), up
- lea -24(rp_save), rp
+L(f6): mulx( -40,(up), w2, w3)
+ lea -56(rp,unneg,8), rp
jmp L(b6)
-L(f7): mulx( (up_save), w0, w1)
- lea -16(up_save), up
- lea -16(rp_save), rp
+L(f7): mulx( 16,(up), w0, w1)
+ lea 8(rp,unneg,8), rp
jmp L(b7)
-L(f1): mulx( (up_save), w0, w1)
- lea (up_save), up
- lea (rp_save), rp
+L(f1): mulx( (up), w0, w1)
+ lea 8(rp,unneg,8), rp
jmp L(b1)
L(am1end):
@@ -304,22 +287,21 @@
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
- dec vn
+ dec vn C clear CF and OF as side-effect
jnz L(outer)
L(done):
pop %r14
- pop %r13
pop %r12
pop %rbp
pop %rbx
ret
-L(f2): mulx( (up_save), w2, w3)
- lea 8(up_save), up
- lea 8(rp_save), rp
+L(f2):
+ mulx( -8,(up), w2, w3)
+ lea 8(rp,unneg,8), rp
mulx( (up), w0, w1)
- ALIGN(32)
+ ALIGN(16)
L(am1top):
adox( -8,(rp), w2)
adcx( w3, w0)
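The register savings in this file come from the negated, scaled counter that the old TODO asked for: instead of keeping base pointers in rp_save and up_save across the outer loop, unneg holds minus the rounded operand size, so `lea (up,unneg,8), up` at L(outer) rewinds the source pointer and `lea 8(rp,unneg,8), rp` re-bases rp, freeing %r13 entirely. A minimal C analogue of the idiom, with illustrative names:

/* C analogue of the negated, scaled index idiom: address from the
   end of the array with a negative index rising to zero, so one
   register serves as both loop counter and offset and no saved
   base pointer is needed.  Illustrative names; n > 0. */
static unsigned long
sum (const unsigned long *a, long n)
{
  const unsigned long *end = a + n;   /* pointer past the last limb */
  long i = -n;                        /* plays the role of unneg    */
  unsigned long s = 0;
  do
    s += end[i];                      /* end[i] walks a[0]..a[n-1]  */
  while (++i != 0);                   /* index doubles as exit test */
  return s;
}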
diff -r 0c6b3f298b36 -r c679b9059102 mpn/x86_64/coreibwl/sqr_basecase.asm
--- a/mpn/x86_64/coreibwl/sqr_basecase.asm Sun Apr 26 20:51:31 2015 +0200
+++ b/mpn/x86_64/coreibwl/sqr_basecase.asm Mon Apr 27 22:44:13 2015 +0200
@@ -63,10 +63,8 @@
C * Replace sqr_diag_addlsh1 code (from haswell) with adx-aware code. We have
C 3 variants below, but the haswell code turns out to be fastest.
C * Do overlapped software pipelining.
-C * Tune non-loop code. Very little effort has been spent there.
C * When changing this, make sure the code which falls into the inner loops
-C does not execute too many no-ops.
-C * Improve awkward un_save computation.
+C does not execute too many no-ops (for both PIC and non-PIC).
define(`rp', `%rdi')
define(`up', `%rsi')
@@ -167,14 +165,14 @@
push up
push un_param
- lea -3(un_param), un_save
- lea -1(un_param), R32(%rax) C FIXME: rotate jump tables instead
- shr $3, un_save
- lea 1(un_save), n
- neg un_save
- shl $3, un_save
+ lea -3(un_param), R32(un_save)
+ lea 5(un_param), n
+ mov R32(un_param), R32(%rax)
+ and $-8, R32(un_save)
+ shr $3, R32(n) C count for mul_1 loop
+	neg	un_save			C 8*count and offset for addmul_1 loops
and $7, R32(%rax) C clear CF for adc as side-effect
- C note that rax lives very long
+
mov (up), u0
lea L(mtab)(%rip), %r10
@@ -224,7 +222,7 @@
L(mf2): mulx( 8,(up), w2, w3)
lea 16(up), up
lea 16(rp), rp
- dec n
+ dec R32(n)
mulx( (up), w0, w1)
ALIGN(16)
@@ -253,7 +251,7 @@
L(mb3): mulx( -8,(up), w2, w3)
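For the sqr_basecase counter hunk above, the setup arithmetic can be cross-checked in C; a small sketch, assuming the constants from the hunk (8-way unrolled loops, with `un` standing for un_param):

#include <stdio.h>

/* Hypothetical cross-check of the sqr_basecase counter setup above:
   n is the mul_1 loop count, un_save the negated value serving as
   8*count and offset for the addmul_1 loops (per the comment in the
   hunk), rem the jump-table index (its `and` also clears CF for the
   later adc). */
int
main (void)
{
  for (long un = 4; un <= 12; un++)
    {
      long n       = (un + 5) >> 3;     /* lea 5(un_param),n; shr $3 */
      long un_save = -((un - 3) & -8);  /* lea -3; and $-8; neg      */
      long rem     = un & 7;            /* and $7, %rax              */
      printf ("un=%2ld  n=%ld  un_save=%ld  rem=%ld\n",
              un, n, un_save, rem);
    }
  return 0;
}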
More information about the gmp-commit mailing list