[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sun Apr 16 22:44:32 UTC 2017
details: /var/hg/gmp/rev/0492ea7fdb27
changeset: 17348:0492ea7fdb27
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Apr 16 22:59:23 2017 +0200
description:
(addmul_1 variant): Rewrite to compute in-place and to avoid a re-computation.
details: /var/hg/gmp/rev/17e04afe49f2
changeset: 17349:17e04afe49f2
user: Torbjorn Granlund <tg at gmplib.org>
date: Mon Apr 17 00:07:50 2017 +0200
description:
Fix header comments.
details: /var/hg/gmp/rev/38e4409e2f03
changeset: 17350:38e4409e2f03
user: Torbjorn Granlund <tg at gmplib.org>
date: Mon Apr 17 00:42:55 2017 +0200
description:
Add more AMD Zen files.
details: /var/hg/gmp/rev/26e8e5269bbf
changeset: 17351:26e8e5269bbf
user: Torbjorn Granlund <tg at gmplib.org>
date: Mon Apr 17 00:43:35 2017 +0200
description:
Use immediate op instead of reg.
details: /var/hg/gmp/rev/830a47fb463f
changeset: 17352:830a47fb463f
user: Torbjorn Granlund <tg at gmplib.org>
date: Mon Apr 17 00:44:28 2017 +0200
description:
ChangeLog
diffstat:
ChangeLog | 36 ++++
mpn/generic/sqr_basecase.c | 47 ++---
mpn/x86_64/k10/lshift.asm | 2 +-
mpn/x86_64/k10/lshiftc.asm | 2 +-
mpn/x86_64/k10/rshift.asm | 2 +-
mpn/x86_64/zen/mul_1.asm | 2 +-
mpn/x86_64/zen/mul_basecase.asm | 265 ++++++++++++++++++++++++++++++
mpn/x86_64/zen/sqr_basecase.asm | 347 ++++++++++++++++++++++++++++++++++++++++
8 files changed, 672 insertions(+), 31 deletions(-)
diffs (truncated from 776 to 300 lines):
diff -r 834c423765dc -r 830a47fb463f ChangeLog
--- a/ChangeLog Sun Apr 16 05:40:32 2017 +0200
+++ b/ChangeLog Mon Apr 17 00:44:28 2017 +0200
@@ -1,8 +1,44 @@
+2017-04-17 Torbjörn Granlund <tg at gmplib.org>
+
+ * mpn/x86_64/zen/mul_basecase.asm: New file.
+ * mpn/x86_64/zen/sqr_basecase.asm: New file.
+
+2017-04-16 Torbjörn Granlund <tg at gmplib.org>
+
+ * mpn/generic/sqr_basecase.c (addmul_1 variant): Rewrite to compute
+ in-place and to avoid a re-computation.
+
+ * configure.ac: Remove k8, k10 from zen path.
+
+2017-04-15 Torbjörn Granlund <tg at gmplib.org>
+
+ * mpn/x86_64/zen/gmp-mparam.h: New file.
+
+ * mpn/x86_64/zen/com.asm: New file, grabbing another asm file.
+ * mpn/x86_64/zen/copyd.asm: Likewise.
+ * mpn/x86_64/zen/copyi.asm: Likewise.
+ * mpn/x86_64/zen/gcd_1.asm: Likewise.
+ * mpn/x86_64/zen/hamdist.asm: Likewise.
+ * mpn/x86_64/zen/lshift.asm: Likewise.
+ * mpn/x86_64/zen/lshiftc.asm: Likewise.
+ * mpn/x86_64/zen/popcount.asm: Likewise.
+ * mpn/x86_64/zen/rshift.asm: Likewise.
+
+ * config.guess: Recognise AMD zen.
+ * acinclude.m4 (X86_64_PATTERN): Add zen.
+ * config.sub: Corresponding changes.
+ * configure.ac: Corresponding changes.
+ * mpn/x86_64/fat/fat.c: Corresponding changes.
+
2017-03-27 Marco Bodrato <bodrato at mail.dm.unipi.it>
* mpz/oddfac_1.c (limb_apprsqrt): Better approximation.
* mpz/bin_uiui.c (limb_apprsqrt): Likewise.
+2017-03-12 Torbjörn Granlund <tg at gmplib.org>
+
+ * tests/mpf/t-get_d_2exp.c (check_data): Rewrite of check_onebit.
+
2017-03-08 Marco Bodrato <bodrato at mail.dm.unipi.it>
* mpn/generic/sqrtrem.c: Direct use of sqrtrem2 when n==2.
diff -r 834c423765dc -r 830a47fb463f mpn/generic/sqr_basecase.c
--- a/mpn/generic/sqr_basecase.c Sun Apr 16 05:40:32 2017 +0200
+++ b/mpn/generic/sqr_basecase.c Mon Apr 17 00:44:28 2017 +0200
@@ -5,8 +5,8 @@
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011 Free Software
-Foundation, Inc.
+Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2017 Free
+Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -289,36 +289,29 @@
void
mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
- mp_size_t i;
-
- ASSERT (n >= 1);
- ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));
-
- {
- mp_limb_t ul, lpl;
- ul = up[0];
- umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
- rp[0] = lpl >> GMP_NAIL_BITS;
- }
- if (n > 1)
+ if (n == 1)
{
- mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
- mp_ptr tp = tarr;
- mp_limb_t cy;
-
- /* must fit 2*n limbs in tarr */
- ASSERT (n <= SQR_TOOM2_THRESHOLD);
+ mp_limb_t ul, lpl;
+ ul = up[0];
+ umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
+ rp[0] = lpl >> GMP_NAIL_BITS;
+ }
+ else
+ {
+ mp_size_t i;
+ mp_ptr xp;
- cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
- tp[n - 1] = cy;
- for (i = 2; i < n; i++)
+ rp += 1;
+ rp[n - 1] = mpn_mul_1 (rp, up + 1, n - 1, up[0]);
+ for (i = n - 2; i != 0; i--)
{
- mp_limb_t cy;
- cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
- tp[n + i - 2] = cy;
+ up += 1;
+ rp += 2;
+ rp[i] = mpn_addmul_1 (rp, up + 1, i, up[0]);
}
- MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n);
+ xp = rp - 2 * n + 3;
+ MPN_SQR_DIAG_ADDLSH1 (xp, xp + 1, up - n + 2, n);
}
}
#endif
diff -r 834c423765dc -r 830a47fb463f mpn/x86_64/k10/lshift.asm
--- a/mpn/x86_64/k10/lshift.asm Sun Apr 16 05:40:32 2017 +0200
+++ b/mpn/x86_64/k10/lshift.asm Mon Apr 17 00:44:28 2017 +0200
@@ -1,4 +1,4 @@
-dnl X86-64 mpn_lshift optimised for Intel Sandy Bridge.
+dnl X86-64 mpn_lshift optimised for AMD K10.
dnl Copyright 2012 Free Software Foundation, Inc.
diff -r 834c423765dc -r 830a47fb463f mpn/x86_64/k10/lshiftc.asm
--- a/mpn/x86_64/k10/lshiftc.asm Sun Apr 16 05:40:32 2017 +0200
+++ b/mpn/x86_64/k10/lshiftc.asm Mon Apr 17 00:44:28 2017 +0200
@@ -1,4 +1,4 @@
-dnl X86-64 mpn_lshiftc optimised for Intel Sandy Bridge.
+dnl X86-64 mpn_lshiftc optimised for AMD K10.
dnl Copyright 2012 Free Software Foundation, Inc.
diff -r 834c423765dc -r 830a47fb463f mpn/x86_64/k10/rshift.asm
--- a/mpn/x86_64/k10/rshift.asm Sun Apr 16 05:40:32 2017 +0200
+++ b/mpn/x86_64/k10/rshift.asm Mon Apr 17 00:44:28 2017 +0200
@@ -1,4 +1,4 @@
-dnl X86-64 mpn_rshift optimised for Intel Sandy Bridge.
+dnl X86-64 mpn_rshift optimised for AMD K10.
dnl Copyright 2012 Free Software Foundation, Inc.
diff -r 834c423765dc -r 830a47fb463f mpn/x86_64/zen/mul_1.asm
--- a/mpn/x86_64/zen/mul_1.asm Sun Apr 16 05:40:32 2017 +0200
+++ b/mpn/x86_64/zen/mul_1.asm Mon Apr 17 00:44:28 2017 +0200
@@ -150,7 +150,7 @@
L(wd2): adc %r10, %r13
mov %r13, 16(rp)
L(wd1): adc %r12, %rbx
- adc n, %rax
+ adc $0, %rax
mov %rbx, 24(rp)
pop %r13
diff -r 834c423765dc -r 830a47fb463f mpn/x86_64/zen/mul_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/zen/mul_basecase.asm Mon Apr 17 00:44:28 2017 +0200
@@ -0,0 +1,265 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD Zen.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Try 2x unrolling instead of current 4x, at least for mul_1. Else consider
+C shallower sw pipelining of mul_1/addmul_1 loops, allowing 4 instead of 8
+C product registers.
+C * Do overlapped software pipelining.
+C * Let vn_param be vn to save a copy.
+C * Re-allocate to benefit more from 32-bit encoding.
+C * Polish.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx')
+define(`vn_param', `%r8')
+
+define(`un', `%r14')
+define(`vp', `%rbp')
+define(`v0', `%rdx')
+define(`n', `%rcx')
+define(`vn', `%r15')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ cmp $2, un_param
+ ja L(gen)
+ mov (vp_param), %rdx
+ mulx( (up), %rax, %r9) C 0 1
+ je L(s2x)
+
+L(s11): mov %rax, (rp)
+ mov %r9, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(s2x): cmp $2, vn_param
+ mulx( 8,(up), %r8, %r10) C 1 2
+ je L(s22)
+
+L(s21): add %r8, %r9
+ adc $0, %r10
+ mov %rax, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(s22): add %r8, %r9 C 1
+ adc $0, %r10 C 2
+ mov 8(vp_param), %rdx
+ mov %rax, (rp)
+ mulx( (up), %r8, %r11) C 1 2
+ mulx( 8,(up), %rax, %rdx) C 2 3
+ add %r11, %rax C 2
+ adc $0, %rdx C 3
+ add %r8, %r9 C 1
+ adc %rax, %r10 C 2
+ adc $0, %rdx C 3
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+
+L(gen): push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov un_param, un
+ mov vp_param, vp
+ mov vn_param, vn
+
+ mov (up), %r9
+ mov (vp), v0
+
+ lea (up,un,8), up
+ lea -32(rp,un,8), rp
+
+ neg un
+ mov un, n
+ bt $0, R32(un)
+ jnc L(mx0)
+L(mx1): bt $1, R32(un)
+ jnc L(mb3)
+
+L(mb1): mulx( %r9, %rbx, %rax)
+ add $1, n C clear cy
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ jmp L(mlo1)
+
+L(mb3): mulx( %r9, %r11, %r10)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax
+ sub $-3, n
+ jz L(mwd3)
+ add $0, %r11
More information about the gmp-commit
mailing list