[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Jun 14 04:49:05 UTC 2017
details: /var/hg/gmp/rev/d1b353dbca0d
changeset: 17445:d1b353dbca0d
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Jun 11 20:37:22 2017 +0200
description:
Crossjump for code size and mix lead-in insns for lower overhead.
details: /var/hg/gmp/rev/1c2a86e2c1ad
changeset: 17446:1c2a86e2c1ad
user: Torbjorn Granlund <tg at gmplib.org>
date: Wed Jun 14 06:45:02 2017 +0200
description:
Provide Zen and Broadwell mpn_mullo_basecase.
details: /var/hg/gmp/rev/0ed5f12cb7d8
changeset: 17447:0ed5f12cb7d8
user: Torbjorn Granlund <tg at gmplib.org>
date: Wed Jun 14 06:45:30 2017 +0200
description:
ChangeLog
diffstat:
ChangeLog | 27 ++-
mpn/x86_64/coreibwl/mullo_basecase.asm | 395 +++++++++++++++++++++++++++++++++
mpn/x86_64/popham.asm | 68 ++---
mpn/x86_64/zen/mullo_basecase.asm | 300 +++++++++++++++++++++++++
4 files changed, 746 insertions(+), 44 deletions(-)
diffs (truncated from 873 to 300 lines):
diff -r 4d3ad139a326 -r 0ed5f12cb7d8 ChangeLog
--- a/ChangeLog Fri Jun 09 23:25:36 2017 +0200
+++ b/ChangeLog Wed Jun 14 06:45:30 2017 +0200
@@ -1,5 +1,26 @@
+2017-06-14 Torbjörn Granlund <tg at gmplib.org>
+
+ * mpn/x86_64/zen/mullo_basecase.asm: New file.
+ * mpn/x86_64/coreibwl/mullo_basecase.asm: New file.
+
+2017-06-11 Torbjörn Granlund <tg at gmplib.org>
+
+ * mpn/x86_64/popham.asm: Crossjump for code size and mix lead-in insns
+ for lower overhead.
+
+2017-06-09 Torbjörn Granlund <tg at gmplib.org>
+
+ * configure.ac: Set GMP_NONSTD_ABI protecting against dots in the abi.
+ (hppa): Remove old GNU/Linux restriction to 32-bit ABI.
+
+2017-06-07 Torbjörn Granlund <tg at gmplib.org>
+
+ * mpn/x86_64/bd1/addmul_2.asm: New file.
+
2017-06-06 Torbjörn Granlund <tg at gmplib.org>
+ * mpn/x86_64/coreihwl/aors_n.asm: New file.
+
* mpn/x86_64/x86_64-defs.m4 (c4_helper): New macro.
(mulx, shlx, shrx): Use c4_helper.
@@ -18,9 +39,9 @@
Make zen path correspond to non-fat path.
* mpn/x86/fat/fat.c (__gmpn_cpuvec_init): Adapt to bt1/bt2 changes.
- * mpn/x86_64/bt2/copyi.asm: New file.
- * mpn/x86_64/bt2/copyd.asm: New file.
- * mpn/x86_64/bt2/com.asm: New file.
+ * mpn/x86_64/bt2/copyi.asm: New grabber file.
+ * mpn/x86_64/bt2/copyd.asm: New grabber file.
+ * mpn/x86_64/bt2/com.asm: New grabber file.
2017-06-03 Torbjörn Granlund <tg at gmplib.org>
diff -r 4d3ad139a326 -r 0ed5f12cb7d8 mpn/x86_64/coreibwl/mullo_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreibwl/mullo_basecase.asm Wed Jun 14 06:45:30 2017 +0200
@@ -0,0 +1,395 @@
+dnl X86-64 mpn_mullo_basecase optimised for Intel Broadwell.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r11')
+define(`jmpreg',`%rbx')
+define(`nn', `%rbp')
+
+C TODO
+C * Suppress more rp[] rewrites in corner.
+C * Rearrange feed-in jumps for short branch forms.
+C * Perhaps roll out the heavy artillery and 8-way unroll outer loop. Since
+C feed-in code implodes, the blow-up will not be more than perhaps 4x.
+C * Micro-optimise critical lead-in code block around L(ent).
+C * Write n < 4 code specifically for Broadwell (current code is for Haswell).
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+ cmp $4, R32(n)
+ jae L(big)
+
+ mov vp_param, vp
+ mov (up), %rdx
+
+ cmp $2, R32(n)
+ jae L(gt1)
+L(n1): imul (vp), %rdx
+ mov %rdx, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp), %r9
+ mulx( %r9, %rax, %rdx)
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp), %r9
+ mulx( %r9, %rax, %r10) C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rdx
+ mulx( %r9, %rax, %rdx) C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r8
+ mov (up), %rdx
+ mulx( %r8, %rax, %rdx) C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r8 C u1 x v1
+ add %r8, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(big): push %r14
+ push %r12
+ push %rbx
+ push %rbp
+ mov -8(vp_param,n,8), %r14 C FIXME Put at absolute end
+ imul (up), %r14 C FIXME Put at absolute end
+ lea -3(n), R32(nn)
+ lea 8(vp_param), vp
+ mov (vp_param), %rdx
+
+ mov R32(n), R32(%rax)
+ shr $3, R32(n)
+ and $7, R32(%rax) C clear OF, CF as side-effect
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( (up), %r10, %r8)
+ lea 56(up), up
+ lea -8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(mb0)
+
+L(mf3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea 16(rp), rp
+ jrcxz L(mc)
+ inc R32(n)
+ lea L(f2)(%rip), jmpreg
+ jmp L(mb3)
+
+L(mc): mulx( -8,(up), %r10, %r8)
+ add %rax, %r10
+ mov %r9, -16(rp)
+ mulx( (up), %r9, %rax)
+ mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ jmp L(c2)
+
+L(mf4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc R32(n)
+ lea L(f3)(%rip), jmpreg
+ jmp L(mb4)
+
+L(mf5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc R32(n)
+ lea L(f4)(%rip), jmpreg
+ jmp L(mb5)
+
+L(mf6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc R32(n)
+ lea L(f5)(%rip), jmpreg
+ jmp L(mb6)
+
+L(mf7): mulx( (up), %r9, %rax)
+ lea 48(up), up
+ lea 48(rp), rp
+ lea L(f6)(%rip), jmpreg
+ jmp L(mb7)
+
+L(mf1): mulx( (up), %r9, %rax)
+ lea L(f0)(%rip), jmpreg
+ jmp L(mb1)
+
+L(mf2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ lea L(f1)(%rip), jmpreg
+ mulx( (up), %r9, %rax)
+
+C FIXME ugly fallthrough FIXME
+ ALIGN(32)
+L(mtop):mov %r10, -8(rp)
+ adc %r8, %r9
+L(mb1): mulx( 8,(up), %r10, %r8)
+ adc %rax, %r10
+ lea 64(up), up
+ mov %r9, (rp)
+L(mb0): mov %r10, 8(rp)
+ mulx( -48,(up), %r9, %rax)
+ lea 64(rp), rp
+ adc %r8, %r9
+L(mb7): mulx( -40,(up), %r10, %r8)
+ mov %r9, -48(rp)
+ adc %rax, %r10
+L(mb6): mov %r10, -40(rp)
+ mulx( -32,(up), %r9, %rax)
+ adc %r8, %r9
+L(mb5): mulx( -24,(up), %r10, %r8)
+ mov %r9, -32(rp)
+ adc %rax, %r10
+L(mb4): mulx( -16,(up), %r9, %rax)
+ mov %r10, -24(rp)
+ adc %r8, %r9
+L(mb3): mulx( -8,(up), %r10, %r8)
+ adc %rax, %r10
+ mov %r9, -16(rp)
+ dec R32(n)
+ mulx( (up), %r9, %rax)
+ jnz L(mtop)
+
+L(mend):mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ adc %rcx, %rax
+
+ lea 8(,nn,8), %r12
+ neg %r12
+ shr $3, R32(nn)
+ jmp L(ent)
+
+L(f0): mulx( (up), %r10, %r8)
+ lea -8(up), up
+ lea -8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(b0)
+
+L(f1): mulx( (up), %r9, %rax)
+ lea -1(nn), R32(nn)
+ lea L(f0)(%rip), jmpreg
+ jmp L(b1)
+
+L(end): adox( (rp), %r9)
+ mov %r9, (rp)
+ adox( %rcx, %rax) C relies on rcx = 0
+ adc %rcx, %rax C FIXME suppress, use adc below; reqs ent path edits
+ lea 8(%r12), %r12
More information about the gmp-commit
mailing list