[Gmp-commit] /var/hg/gmp: Rewrite to do 2x and limb squaring in main loop.
mercurial at gmplib.org
Mon Jun 26 21:44:17 UTC 2017
details: /var/hg/gmp/rev/78e40fad7642
changeset: 17456:78e40fad7642
user: Torbjorn Granlund <tg at gmplib.org>
date: Mon Jun 26 23:44:10 2017 +0200
description:
Rewrite to do 2x and limb squaring in main loop.
diffstat:
mpn/x86_64/coreibwl/sqr_basecase.asm | 423 +++++++++++++++++-----------------
1 files changed, 210 insertions(+), 213 deletions(-)
diffs (truncated from 657 to 300 lines):
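The rewrite folds both squaring-specific steps into the main addmul loops: doubling the off-diagonal multiplier (the "2x") and adding the diagonal limb squares, apparently replacing the separate sqr_diag_addlsh1 pass mentioned in the old TODO list. As a rough C model of the resulting structure (an illustrative sketch with 64-bit limbs, not GMP's actual code; the real loops below are 8-way unrolled and use mulx/adox/adcx):

  #include <stdint.h>
  #include <stddef.h>

  typedef uint64_t limb;
  typedef unsigned __int128 dlimb;

  /* Schoolbook squaring with the doubling and the diagonal squares
     folded into the main loop.  2*up[i] may not fit in a limb, so the
     shifted-out bit h is deferred: it becomes a +h on the next row's
     multiplier (which is even, so this cannot overflow) and a masked
     correction ci added under the next diagonal square.  */
  static void
  sqr_model (limb *rp, const limb *up, size_t n)
  {
    limb h = 0;                      /* bit shifted out of 2*up[i-1] */

    for (size_t i = 0; i < 2 * n; i++)
      rp[i] = 0;

    for (size_t i = 0; i < n; i++)
      {
        limb ci = (0 - h) & up[i];   /* deferred h*up[i] term ("ci") */
        limb m = (up[i] << 1) | h;   /* doubled multiplier for this row */
        dlimb t = (dlimb) up[i] * up[i] + ci + rp[2 * i];
        limb cy;

        rp[2 * i] = (limb) t;        /* diagonal square at position 2i */
        cy = (limb) (t >> 64);

        for (size_t j = i + 1; j < n; j++)  /* doubled row at 2i+1 ... */
          {
            t = (dlimb) m * up[j] + rp[i + j] + cy;
            rp[i + j] = (limb) t;
            cy = (limb) (t >> 64);
          }
        rp[i + n] += cy;
        h = up[i] >> 63;
      }
  }
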
diff -r ef66a6a32972 -r 78e40fad7642 mpn/x86_64/coreibwl/sqr_basecase.asm
--- a/mpn/x86_64/coreibwl/sqr_basecase.asm Tue Jun 20 15:55:33 2017 +0200
+++ b/mpn/x86_64/coreibwl/sqr_basecase.asm Mon Jun 26 23:44:10 2017 +0200
@@ -1,6 +1,6 @@
dnl AMD64 mpn_sqr_basecase optimised for Intel Broadwell.
-dnl Copyright 2015 Free Software Foundation, Inc.
+dnl Copyright 2015, 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -61,12 +61,11 @@
C hardly allow correct branch prediction. On 2nd thought, we now might make
C each of the 8 loop branches be poorly predicted since they will be
C executed fewer times for each time. With just one addmul_1 loop, the loop
-C count will change only once each 8th time!
-C * Replace sqr_diag_addlsh1 code (from haswell) with adx-aware code. We have
-C 3 variants below, but the haswell code turns out to be fastest.
+C count will change only once each 8th time.
C * Do overlapped software pipelining.
-C * When changing this, make sure the code which falls into the inner loops
-C does not execute too many no-ops (for both PIC and non-PIC).
+C * Perhaps load in shrx/sarx, eliminating separate load insn.
+C * Schedule add+stored in small n code.
+C * Try swapping adox and adcx insn, making mulx have more time to run.
define(`rp', `%rdi')
define(`up', `%rsi')
@@ -163,12 +162,8 @@
L(gt3): push %rbx
- push rp
- push up
- push un_param
-
lea -3(un_param), R32(un_save)
- lea 5(un_param), n
+ lea 5(un_param), R32(n)
mov R32(un_param), R32(%rax)
and $-8, R32(un_save)
shr $3, R32(n) C count for mul_1 loop
@@ -186,45 +181,75 @@
jmp *(%r10,%rax,8)
')
-L(mf0): mulx( 8,(up), w2, w3)
+L(mf0): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
lea 64(up), up
-C lea (rp), rp
+ add w1, w2
jmp L(mb0)
-L(mf3): mulx( 8,(up), w0, w1)
+L(mf3): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mov w2, (rp)
+ mulx( 8,(up), w0, w1)
lea 24(up), up
lea 24(rp), rp
+ add w3, w0
jmp L(mb3)
-L(mf4): mulx( 8,(up), w2, w3)
+L(mf4): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
lea 32(up), up
lea 32(rp), rp
+ add w1, w2
jmp L(mb4)
-L(mf5): mulx( 8,(up), w0, w1)
+L(mf5): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
lea 40(up), up
lea 40(rp), rp
+ add w3, w0
jmp L(mb5)
-L(mf6): mulx( 8,(up), w2, w3)
+L(mf6): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
lea 48(up), up
lea 48(rp), rp
+ add w1, w2
jmp L(mb6)
-L(mf7): mulx( 8,(up), w0, w1)
+L(mf7): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
lea 56(up), up
lea 56(rp), rp
+ add w3, w0
jmp L(mb7)
-L(mf1): mulx( 8,(up), w0, w1)
+L(mf1): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
lea 8(up), up
lea 8(rp), rp
+ add w3, w0
jmp L(mb1)
-L(mf2): mulx( 8,(up), w2, w3)
+L(mf2): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
lea 16(up), up
lea 16(rp), rp
dec R32(n)
+ add w1, w2
mulx( (up), w0, w1)
ALIGN(16)
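
The eight entry points L(mf0)..L(mf7) above exist because the feed-in is 8-way unrolled and dispatched through the jump table on the size mod 8, so the first, partial trip enters the unrolled body in the middle. (The new code additionally interleaves the up[0]^2 product and the doubling into each entry.) In C the dispatch skeleton corresponds to a computed goto in the style of Duff's device; a sketch of a plain mul_1 under the same 64-bit-limb assumptions as above (requires n > 0):

  /* 8-way unrolled mul_1 entered mid-body via switch dispatch,
     modelling the L(mf*)/L(mb*) jump-table structure.  */
  static limb
  mul_1_model (limb *rp, const limb *up, size_t n, limb v)
  {
    dlimb t;
    limb cy = 0;
    size_t i = 0, trips = (n + 7) / 8;

  #define STEP t = (dlimb) up[i] * v + cy; \
               rp[i] = (limb) t; cy = (limb) (t >> 64); i++
    switch (n % 8)
      {
      case 0: do { STEP;
      case 7:      STEP;
      case 6:      STEP;
      case 5:      STEP;
      case 4:      STEP;
      case 3:      STEP;
      case 2:      STEP;
      case 1:      STEP;
              } while (--trips != 0);
      }
  #undef STEP
    return cy;
  }
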
@@ -233,8 +258,8 @@
L(mb1): mulx( 8,(up), w2, w3)
adc w1, w2
lea 64(up), up
- mov w0, (rp)
-L(mb0): mov w2, 8(rp)
+L(mb0): mov w0, (rp)
+ mov w2, 8(rp)
mulx( -48,(up), w0, w1)
lea 64(rp), rp
adc w3, w0
@@ -259,29 +284,35 @@
L(end): mov w2, -8(rp)
adc w3, w0
- mov w0, (rp)
- adc %rcx, w1
- mov w1, 8(rp)
+C mov w0, (rp)
+C adc %rcx, w1
+C mov w1, 8(rp)
lea L(atab)(%rip), %r10
ifdef(`PIC',
` movslq (%r10,%rax,4), %r11
lea (%r11, %r10), %r11
- jmp *%r11
',`
- jmp *(%r10,%rax,8)
+ mov (%r10,%rax,8), %r11
')
+ mov $63, R32(%rax)
+ jmp *%r11
L(ed0): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
- mov w0, (rp)
+L(f7): mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
-L(f7): lea -64(up,un_save,8), up
- or R32(un_save), R32(n)
- mov 8(up), u0
- mulx( 16,(up), w0, w1)
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
lea -56(rp,un_save,8), rp
+ mov (up), w1 C up[-1]
+ mov 8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
jmp L(b7)
ALIGN(16)
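
With %rax holding 63, the L(f7) feed-in above (and the analogous L(f0)/L(f1)/L(f2) blocks further down) computes the doubled multiplier and the correction term that the sqr_model sketch calls m and ci: shrx extracts the bit that doubling up[-1] shifts out, sarx turns that same bit into a 0/all-ones mask, and the lea forms 2*up[0] plus the carried-in bit. A line-by-line C rendering (illustrative only, assuming the typedefs from the first sketch; prev and cur stand for the asm's up[-1] and up[0]):

  /* One feed-in step of the 2x row setup.  */
  static void
  feed_in_model (limb prev, limb cur, limb *ci, limb *m)
  {
    limb hi = prev >> 63;                       /* shrx $63: bit out of 2*prev */
    limb mask = (limb) ((int64_t) prev >> 63);  /* sarx $63: 0 or all-ones */
    *ci = mask & cur;                           /* and: "ci" in C code */
    *m = (cur << 1) + hi;                       /* lea (w0,u0,2): "u0" arg */
  }
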
@@ -292,9 +323,9 @@
mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
- mov w0, (rp)
+L(b0): mov w0, (rp)
adcx( w1, w2)
-L(b0): mulx( 16,(up), w0, w1)
+ mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
@@ -325,14 +356,22 @@
L(ed1): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
- mov w0, (rp)
+L(f0): mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
-L(f0): lea -64(up,un_save,8), up
- or R32(un_save), R32(n)
- mov (up), u0
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -8(up), w3 C up[-1]
+ mov (up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
mulx( 8,(up), w2, w3)
- lea -56(rp,un_save,8), rp
+ adox( (rp), w0)
jmp L(b0)
ALIGN(16)
@@ -376,15 +415,25 @@
L(ed2): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
- mov w0, (rp)
+L(f1): mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
-L(f1): lea (up,un_save,8), up
- or R32(un_save), R32(n)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
lea 8(un_save), un_save
- mov -8(up), u0
+ lea -56(rp,un_save,8), rp
+ mov -16(up), w1 C up[-1]
+ mov -8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2) C FIXME: crossjump?
mulx( (up), w0, w1)
- lea -56(rp,un_save,8), rp
+ adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
jmp L(b1)
ALIGN(16)
@@ -418,7 +467,7 @@
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
- adox( 48,(rp), w0)
+L(b2): adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
@@ -428,17 +477,22 @@
L(ed3): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
- mov w0, (rp)
+L(f2): mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
-L(f2): lea (up,un_save,8), up
+ lea (up,un_save,8), up
or R32(un_save), R32(n)
- jz L(corner2)
- mov -16(up), u0
- mulx( -8,(up), w2, w3)
- lea 8(rp,un_save,8), rp
- mulx( (up), w0, w1)
- jmp L(tp2)
+ jz L(cor3)
+ lea -56(rp,un_save,8), rp
+ mov -24(up), w3 C up[-1]
+ mov -16(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ jmp L(b2)
ALIGN(16)
L(tp3): adox( -8,(rp), w2)
@@ -467,11 +521,11 @@
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
- mulx( -16,(up), w0, w1)
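
For reference, the sqr_model sketch can be checked against a plain schoolbook multiply (again purely illustrative, assuming the definitions from the first sketch; the diff above is truncated):

  #include <stdio.h>
  #include <string.h>

  /* Reference schoolbook multiply; rp[] receives 2n limbs.  */
  static void
  mul_ref (limb *rp, const limb *up, const limb *vp, size_t n)
  {
    memset (rp, 0, 2 * n * sizeof (limb));
    for (size_t i = 0; i < n; i++)
      {
        limb cy = 0;
        for (size_t j = 0; j < n; j++)
          {
            dlimb t = (dlimb) up[i] * vp[j] + rp[i + j] + cy;
            rp[i + j] = (limb) t;
            cy = (limb) (t >> 64);
          }
        rp[i + n] = cy;
      }
  }

  int
  main (void)
  {
    limb u[4] = { ~(limb) 0, 0x0123456789abcdef, (limb) 1 << 63, 42 };
    limb r1[8], r2[8];

    sqr_model (r1, u, 4);
    mul_ref (r2, u, u, 4);
    puts (memcmp (r1, r2, sizeof r1) == 0 ? "ok" : "MISMATCH");
    return 0;
  }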