[Gmp-commit] /var/hg/gmp: Rewrite to do 2x and limb squaring in main loop.
mercurial at gmplib.org
Thu Apr 27 16:20:44 UTC 2017
details: /var/hg/gmp/rev/6e4e07f8ac81
changeset: 17369:6e4e07f8ac81
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Apr 27 18:20:35 2017 +0200
description:
Rewrite to do 2x and limb squaring in main loop.
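For readers skimming the diff, a minimal plain-C sketch of what "2x and limb squaring in main loop" means may help. This is not GMP's internal code: the limb type, the add_128 helper and the function name are made up for illustration, and the GCC/Clang unsigned __int128 extension is assumed (fine on 64-bit hosts). Each outer pass now accumulates the diagonal square up[i]^2 together with the doubled cross products 2*up[i]*up[j], instead of leaving the doubling and the diagonal squares to a separate sqr_diag_addlsh1 pass. Here the doubling is expressed by simply adding each product twice; a sketch of the limb-wise doubled multiplier the asm actually uses follows after the diff.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;              /* one 64-bit limb, illustration only */

/* Add the 128-bit value v into rp[] at limb position pos, propagating the
   carry upwards as far as needed. */
static void
add_128 (limb *rp, size_t pos, unsigned __int128 v)
{
  limb add[2] = { (limb) v, (limb) (v >> 64) };
  limb cy = 0;
  for (int k = 0; k < 2; k++)
    {
      unsigned __int128 s = (unsigned __int128) rp[pos + k] + add[k] + cy;
      rp[pos + k] = (limb) s;
      cy = (limb) (s >> 64);
    }
  for (pos += 2; cy != 0; pos++)
    {
      rp[pos] += cy;
      cy = rp[pos] < cy;
    }
}

/* rp[0..2n-1] = up[0..n-1]^2, schoolbook, n >= 1 */
static void
sqr_basecase_sketch (limb *rp, const limb *up, size_t n)
{
  for (size_t i = 0; i < 2 * n; i++)
    rp[i] = 0;

  for (size_t i = 0; i < n; i++)
    {
      /* diagonal limb square, folded into this pass of the main loop */
      add_128 (rp, 2 * i, (unsigned __int128) up[i] * up[i]);

      /* doubled cross products; adding the product twice here avoids the
         129-bit overflow of 2*up[i]*up[j] */
      for (size_t j = i + 1; j < n; j++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * up[j];
          add_128 (rp, i + j, p);
          add_128 (rp, i + j, p);
        }
    }
}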
diffstat:
mpn/x86_64/zen/sqr_basecase.asm | 294 ++++++++++++++++++++++-----------------
1 files changed, 168 insertions(+), 126 deletions(-)
diffs (truncated from 415 to 300 lines):
diff -r 36b4b377a950 -r 6e4e07f8ac81 mpn/x86_64/zen/sqr_basecase.asm
--- a/mpn/x86_64/zen/sqr_basecase.asm Tue Apr 25 22:23:51 2017 +0200
+++ b/mpn/x86_64/zen/sqr_basecase.asm Thu Apr 27 18:20:35 2017 +0200
@@ -1,6 +1,6 @@
dnl AMD64 mpn_sqr_basecase optimised for AMD Zen.
-dnl Copyright 2012, 2013, 2015, 2017 Free Software Foundation, Inc.
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -31,22 +31,27 @@
include(`../config.m4')
C TODO
-C * Try 2x unrolling instead of current 4x, at least for mul_1. Else consider
-C shallower sw pipelining of mul_1/addmul_1 loops, allowing 4 instead of 8
-C product registers.
-C * Replace sqr_diag_addlsh1 code with zen optimised code.
+C * Polish.
+C * Micro-schedule.
C * Do overlapped software pipelining.
-C * Re-allocate to benefit more from 32-bit encoding (register rbp is free).
-C * Polish.
+C * Consider shallower sw pipelining of mul_1/addmul_1 loops, allowing 4
+C instead of 8 product registers. Keep 4x unrolling or go to 2x. This
+C would allow leaner feed-in as the size congruence classes (mod 2) would
+C share the same feed-in, except the final branch.
+C * Expand inner loops 4x in the outer loop, to both save some (poorly branch
+C predicted) bookkeeping, and to allow some overlapped sw pipelining.
+C * It is tempting to use 32-bit loop counts, but it is tricky as we keep all
+C counts negative, and 32-bit ops zero extend. It would work if we first
+C offset ptrs by 2^64-2^32...
define(`rp', `%rdi')
define(`up', `%rsi')
define(`un_param',`%rdx')
-define(`un', `%r14')
+define(`un', `%rbp')
define(`n', `%rcx')
-C these are used just for the small op code
+C these are used just for the small op code
define(`w0', `%r8')
define(`w1', `%r9')
define(`w2', `%r10')
@@ -62,7 +67,7 @@
PROLOGUE(mpn_sqr_basecase)
FUNC_ENTRY(3)
- cmp $2, un_param
+ cmp $2, R32(un_param)
jae L(gt1)
mov (up), %rdx
@@ -93,7 +98,7 @@
FUNC_EXIT()
ret
-L(gt2): cmp $4, un_param
+L(gt2): cmp $4, R32(un_param)
jae L(gt3)
push %rbx
@@ -133,57 +138,69 @@
FUNC_EXIT()
ret
-L(gt3):
- push %r15
- push %r14
+L(gt3): push %r15
+C push %r14
push %r13
push %r12
push %rbp
push %rbx
- mov un_param, un
+ mov R32(un_param), R32(un)
mov (up), %rdx C up[0]
mov 8(up), %r9 C up[1]
+ mulx( %rdx, %rax, %r15) C up[0]^2
+ mov %rax, (rp)
+ shl %rdx
+
lea (up,un,8), up
lea -32(rp,un,8), rp
neg un
- lea 1(un), n
+ lea 4(un), n
+ and $-4, n
- bt $0, R32(n)
- jnc L(mx0)
-L(mx1): bt $1, R32(n)
- jnc L(mb3)
+ test $1, R8(un)
+ jnz L(mx0)
+L(mx1): test $2, R8(un)
+ jz L(mb3)
L(mb1): mulx( %r9, %rbx, %rax)
- add $1, n C clear cy as side-effect
- .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
- .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ `mulx' 16(up,un,8), %r9, %r8
+ `mulx' 24(up,un,8), %r11, %r10
+ add %r15, %rbx
jmp L(mlo1)
L(mb3): mulx( %r9, %r11, %r10)
- .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12
- .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax
- sub $-3, n C clear cy as side-effect
- jz L(mwd3)
- test R32(%rdx), R32(%rdx) C clear cy
+ `mulx' 16(up,un,8), %r13, %r12
+ `mulx' 24(up,un,8), %rbx, %rax
+ add %r15, %r11
+ jrcxz L(n4)
jmp L(mlo3)
+L(n4): mov %r11, 8(rp)
+ adc %r10, %r13
+ mov %r13, 16(rp) C FIXME: suppress
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %rbx, 24(rp) C FIXME: suppress
+ jmp L(m)
-L(mx0): bt $1, R32(n)
- jnc L(mb0)
+L(mx0): test $2, R8(un)
+ jnz L(mb0)
L(mb2): mulx( %r9, %r13, %r12)
- .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax
- add $2, n C clear cy as side-effect
- .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ `mulx' 16(up,un,8), %rbx, %rax
+ `mulx' 24(up,un,8), %r9, %r8
+ add %r15, %r13
jmp L(mlo2)
L(mb0): mulx( %r9, %r9, %r8)
- .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
- .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ `mulx' 16(up,un,8), %r11, %r10
+ `mulx' 24(up,un,8), %r13, %r12
+ add %r15, %r9
jmp L(mlo0)
+ ALIGN(64)
L(mtop):jrcxz L(mend)
adc %r8, %r11
mov %r9, (rp,n,8)
@@ -202,7 +219,7 @@
L(mend):mov %r9, (rp)
adc %r8, %r11
-L(mwd3):mov %r11, 8(rp)
+ mov %r11, 8(rp)
adc %r10, %r13
mov %r13, 16(rp)
adc %r12, %rbx
@@ -210,54 +227,69 @@
mov %rbx, 24(rp)
mov %rax, 32(rp)
- lea 2(un), %r15
+ lea 2(un), un C FIXME: Incorporate above
L(outer):
- mov -8(up,%r15,8), %rdx C v0 = up[0]
- mov %r15, n
+ mov -8(up,un,8), %rdx C up[0]
+ lea 3(un), n
+ and $-4, n
+
+ mov -16(up,un,8), %r9 C up[-1]
+ sar $63, %r9
+ and %rdx, %r9 C "ci" in C code
+ add 32(rp,un,8), %r9
+ mulx( %rdx, %rax, %r15) C up[0]^2
+ mov (up,un,8), %r8 C up[1]
+ adc $0, %r15
+ add %rax, %r9
+ adc $0, %r15 C "cin" in C code
+ mov %r9, 32(rp,un,8)
lea 8(rp), rp
- mov (up,%r15,8), %r8 C v0 = up[1]
- bt $0, R32(n)
- jnc L(x0)
-L(x1): bt $1, R32(n)
- jnc L(b3)
+ mov -16(up,un,8), %r10 C up[-1]
+ shr $63, %r10
+ lea (%r10,%rdx,2), %rdx C "u0" arg in C code
+
+ test $1, R8(un)
+ jz L(x0)
+L(x1): test $2, R8(un)
+ jz L(b3)
L(b1): mulx( %r8, %rbx, %rax)
- add $1, n C clear cy as side-effect
- .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
- .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ add %r15, %rbx
+ adc $0, %rax
+ `mulx' 8(up,un,8), %r9, %r8
+ `mulx' 16(up,un,8), %r11, %r10
jmp L(lo1)
L(b0): mulx( %r8, %r9, %r8)
- .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
- .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
- xor R32(%rax), R32(%rax)
+ `mulx' 8(up,un,8), %r11, %r10
+ `mulx' 16(up,un,8), %r13, %r12
+ add %r15, %r9
jmp L(lo0)
-L(b3): mulx( %r8, %r11, %r10)
- .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12
- .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax
- add %r10, %r13
+L(x0): test $2, R8(un)
+ jz L(b0)
+
+L(b2): mulx( %r8, %r13, %r12)
+ `mulx' 8(up,un,8), %rbx, %rax
+ add %r15, %r13
adc %r12, %rbx
adc $0, %rax
- sub $-3, n C clear cy as side-effect
- jz L(wd3)
- test R32(%rdx), R32(%rdx) C clear cy
+ `mulx' 16(up,un,8), %r9, %r8
+ jmp L(lo2)
+
+L(b3): mulx( %r8, %r11, %r10)
+ `mulx' 8(up,un,8), %r13, %r12
+ `mulx' 16(up,un,8), %rbx, %rax
+ add %r15, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jrcxz L(xit3)
jmp L(lo3)
-L(x0): bt $1, R32(n)
- jnc L(b0)
-
-L(b2): mulx( %r8, %r13, %r12)
- .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax
- add %r12, %rbx
- adc $0, %rax
- lea 2(n), n
- jrcxz L(xit2)
- .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
- jmp L(lo2)
-
+ ALIGN(64)
L(top): add %r9, (rp,n,8)
L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
adc %r11, 8(rp,n,8)
@@ -270,76 +302,86 @@
adc %r8, %r11
adc %r10, %r13
adc %r12, %rbx
- adc $0, %rax C rax = carry limb
+ adc $0, %rax
add $4, n
- js L(top)
+ jnz L(top)
add %r9, (rp)
-L(wd3): adc %r11, 8(rp)
-L(wd2): adc %r13, 16(rp)
-L(wd1): adc %rbx, 24(rp)
- adc $0, %rax
- mov %rax, 32(rp)
-
- add $1, %r15
- jmp L(outer)
-
-L(xit2):add %r13, 16(rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
adc %rbx, 24(rp)
adc $0, %rax
mov %rax, 32(rp)
- mov -16(up), %rdx
- lea 8(rp), rp
- mov -8(up), %r8
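The outer-loop feed-in above (the lines commented "ci" and "u0") forms the doubled multiplier one limb at a time rather than adding each product twice. A hedged C model of that trick, reusing the limb type and add_128 helper from the sketch near the top of this message (again illustrative names, not GMP's actual code): the multiplier for outer index i is u0 = (up[i] << 1) | (up[i-1] >> 63). The top bit of up[i-1] carried into u0 multiplies only the limbs above up[i], so the missing bit*up[i] contribution is added as the ci fix-up together with the diagonal square up[i]^2 at position 2*i.

/* Same result as sqr_basecase_sketch above, but expressing the 2x the way
   the Zen code does: a 64-bit doubled multiplier per outer limb, with a
   fix-up term at the diagonal position. */
static void
sqr_basecase_doubling_sketch (limb *rp, const limb *up, size_t n)
{
  for (size_t i = 0; i < 2 * n; i++)
    rp[i] = 0;

  for (size_t i = 0; i < n; i++)
    {
      limb prev = (i == 0) ? 0 : up[i - 1];
      limb u0 = (up[i] << 1) | (prev >> 63); /* limb i of 2*U       ("u0") */
      limb ci = (prev >> 63) ? up[i] : 0;    /* doubling fix-up     ("ci") */

      /* diagonal: up[i]^2 plus the fix-up, at position 2*i */
      add_128 (rp, 2 * i, (unsigned __int128) up[i] * up[i] + ci);

      /* the doubled limb times the higher limbs, at positions i+j */
      for (size_t j = i + 1; j < n; j++)
        add_128 (rp, i + j, (unsigned __int128) u0 * up[j]);
    }
}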