[Gmp-commit] /var/hg/gmp: 7 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Jul 19 22:25:53 UTC 2017
details: /var/hg/gmp/rev/31be3698f320
changeset: 17464:31be3698f320
user: Torbjorn Granlund <tg at gmplib.org>
date: Wed Jul 19 23:33:52 2017 +0200
description:
Get pentium4 code instead of k6 code for better speed on modern Intel P6 cores.
details: /var/hg/gmp/rev/79840c3eb0fe
changeset: 17465:79840c3eb0fe
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Jul 20 00:15:04 2017 +0200
description:
(define_mpn): Add sbpi1_bdiv_q, sbpi1_bdiv_qr, sbpi1_bdiv_r.
details: /var/hg/gmp/rev/819493052b69
changeset: 17466:819493052b69
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Jul 20 00:17:16 2017 +0200
description:
(mpn_sbpi1_bdiv_r): Declare.
details: /var/hg/gmp/rev/b74ff42f4bbd
changeset: 17467:b74ff42f4bbd
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Jul 20 00:18:17 2017 +0200
description:
New file.
details: /var/hg/gmp/rev/48d04f272174
changeset: 17468:48d04f272174
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Jul 20 00:22:09 2017 +0200
description:
New file.
details: /var/hg/gmp/rev/b3104e2006fb
changeset: 17469:b3104e2006fb
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Jul 20 00:25:18 2017 +0200
description:
Minor tweak.
details: /var/hg/gmp/rev/f2247fcc6a86
changeset: 17470:f2247fcc6a86
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Jul 20 00:25:49 2017 +0200
description:
ChangeLog
diffstat:
ChangeLog | 25 +
gmp-impl.h | 3 +
mpn/asm-defs.m4 | 3 +
mpn/generic/sbpi1_bdiv_r.c | 79 ++++++
mpn/x86/p6/sse2/submul_1.asm | 2 +-
mpn/x86_64/zen/sbpi1_bdiv_r.asm | 507 ++++++++++++++++++++++++++++++++++++++++
6 files changed, 618 insertions(+), 1 deletions(-)
diffs (truncated from 661 to 300 lines):
diff -r 62abbaeaab13 -r f2247fcc6a86 ChangeLog
--- a/ChangeLog Wed Jul 19 23:30:07 2017 +0200
+++ b/ChangeLog Thu Jul 20 00:25:49 2017 +0200
@@ -1,3 +1,28 @@
+2017-07-20 Torbjörn Granlund <tg at gmplib.org>
+
+ * gmp-impl.h (mpn_sbpi1_bdiv_r): Declare.
+
+ * mpn/asm-defs.m4 (define_mpn): Add sbpi1_bdiv_q, sbpi1_bdiv_qr,
+ sbpi1_bdiv_r.
+
+ * mpn/generic/sbpi1_bdiv_r.c: New file.
+ * mpn/x86_64/zen/sbpi1_bdiv_r.asm: New file.
+
+2017-07-19 Torbjörn Granlund <tg at gmplib.org>
+
+ * mpn/x86/p6/sse2/submul_1.asm: Get pentium4 code instead of k6 code
+ for better speed on modern Intel P6 cores.
+
+2017-07-02 Torbjörn Granlund <tg at gmplib.org>
+
+ * tune/tuneup.c (tune_mullo): For MULLO_BASECASE_THRESHOLD start at 2.
+ (tune_sqrlo): Likewise.
+
+2017-06-28 Torbjörn Granlund <tg at gmplib.org>
+
+ * tests/Makefile.am tests/*/Makefile.am tune/Makefile.am (AM_LDFLAGS):
+ Define. (Thanks to Emmanuel Thomé and Vincent Lefevre.)
+
2017-06-27 Torbjörn Granlund <tg at gmplib.org>
* mpn/x86_64/zen/sqr_basecase.asm: Expand to use 4 addmul_1 loops.
diff -r 62abbaeaab13 -r f2247fcc6a86 gmp-impl.h
--- a/gmp-impl.h Wed Jul 19 23:30:07 2017 +0200
+++ b/gmp-impl.h Thu Jul 20 00:25:49 2017 +0200
@@ -1507,6 +1507,9 @@
#define mpn_sbpi1_bdiv_q __MPN(sbpi1_bdiv_q)
__GMP_DECLSPEC void mpn_sbpi1_bdiv_q (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
+#define mpn_sbpi1_bdiv_r __MPN(sbpi1_bdiv_r)
+__GMP_DECLSPEC mp_limb_t mpn_sbpi1_bdiv_r (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
+
#define mpn_dcpi1_bdiv_qr __MPN(dcpi1_bdiv_qr)
__GMP_DECLSPEC mp_limb_t mpn_dcpi1_bdiv_qr (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
#define mpn_dcpi1_bdiv_qr_n_itch __MPN(dcpi1_bdiv_qr_n_itch)
diff -r 62abbaeaab13 -r f2247fcc6a86 mpn/asm-defs.m4
--- a/mpn/asm-defs.m4 Wed Jul 19 23:30:07 2017 +0200
+++ b/mpn/asm-defs.m4 Thu Jul 20 00:25:49 2017 +0200
@@ -1453,6 +1453,9 @@
define_mpn(rsh1sub_nc)
define_mpn(rshift)
define_mpn(rshiftc)
+define_mpn(sbpi1_bdiv_q)
+define_mpn(sbpi1_bdiv_qr)
+define_mpn(sbpi1_bdiv_r)
define_mpn(scan0)
define_mpn(scan1)
define_mpn(set_str)
diff -r 62abbaeaab13 -r f2247fcc6a86 mpn/generic/sbpi1_bdiv_r.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/generic/sbpi1_bdiv_r.c Thu Jul 20 00:25:49 2017 +0200
@@ -0,0 +1,79 @@
+/* mpn_sbpi1_bdiv_r -- schoolbook Hensel division with precomputed inverse,
+ returning remainder.
+
+ Contributed to the GNU project by Niels Möller and Torbjörn Granlund.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.
+ IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+
+
+/* Computes a binary quotient of size qn = un - dn.
+   Output:
+
+      Q = -U * D^{-1} mod B^qn,
+
+      R = (U + Q * D) * B^(-qn)
+
+   Stores the dn least significant limbs of R at {up + un - dn, dn},
+   and returns the carry from the addition U + Q*D.
+
+   The quotient limbs are not stored anywhere; {up, un} is used as working
+   space.
+
+   D must be odd.  dinv is (-D)^-1 mod B.  */
+
+mp_limb_t
+mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+                  mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+{
+  mp_size_t i;
+  mp_limb_t cy;
+
+  ASSERT (dn > 0);
+  ASSERT (un > dn);
+  ASSERT ((dp[0] & 1) != 0);
+  /* dinv is (-D)^-1 mod B, hence D * dinv == -1 (mod B).  The product can
+     never be 0 for an odd D, so test for -1 rather than 0.  */
+  ASSERT (-(dp[0] * dinv) == 1);
+
+  for (i = un - dn, cy = 0; i != 0; i--)
+    {
+      /* Quotient limb: q * dp[0] == -up[0] (mod B).  */
+      mp_limb_t q = dinv * up[0];
+      mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q);
+
+      /* Add the limb returned by mpn_addmul_1 and the carry saved from the
+         previous iteration into up[dn]; cy collects the new carry-out.  */
+      hi += cy;
+      cy = hi < cy;
+      hi += up[dn];
+      cy += hi < up[dn];
+      up[dn] = hi;
+      up++;
+    }
+
+  return cy;
+}
diff -r 62abbaeaab13 -r f2247fcc6a86 mpn/x86/p6/sse2/submul_1.asm
--- a/mpn/x86/p6/sse2/submul_1.asm Wed Jul 19 23:30:07 2017 +0200
+++ b/mpn/x86/p6/sse2/submul_1.asm Thu Jul 20 00:25:49 2017 +0200
@@ -32,4 +32,4 @@
MULFUNC_PROLOGUE(mpn_submul_1)
-include_mpn(`x86/k6/aorsmul_1.asm')
+include_mpn(`x86/pentium4/sse2/submul_1.asm')
diff -r 62abbaeaab13 -r f2247fcc6a86 mpn/x86_64/zen/sbpi1_bdiv_r.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/zen/sbpi1_bdiv_r.asm Thu Jul 20 00:25:49 2017 +0200
@@ -0,0 +1,507 @@
+dnl AMD64 mpn_sbpi1_bdiv_r optimised for AMD Zen
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(`up', `%rdi')
+define(`un_param', `%rsi')
+define(`dp_param', `%rdx')
+define(`dn_param', `%rcx')
+define(`dinv', `%r8')
+
+define(`i', `%rcx')
+define(`dn', `%r14')
+
+define(`dp', `%rsi')
+define(`un', `%r15')
+
+C TODO
+C * The o1...o8 loops for special dn counts were naively hand-optimised by
+C folding the generic loops. They can probably be tuned. The speculative
+C quotient limb generation might not be in the optimal spot.
+C * Perhaps avoid late-in-loop jumps, e.g., lo0.
+C * Improve regalloc wrt dn_param/dn and un_param/un to save some moves.
+
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), dinv ')
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ sub dn_param, un_param C outer loop count
+ mov dn_param, dn C FIXME: Suppress by reg re-alloc
+ push dinv C keep dinv on stack
+ mov un_param, un C FIXME: Suppress by reg re-alloc
+ xor R32(%rbp), R32(%rbp)
+
+ lea (dp_param,dn_param,8), dp
+
+ mov (up), %rdx
+ imul dinv, %rdx C first quotient limb
+
+ neg dn
+ lea -32(up,dn_param,8), up
+
+ test $1, R8(dn_param)
+ jnz L(cx1)
+
+L(cx0): test $2, R8(dn_param)
+ jnz L(b2)
+
+
+C =============================================================================
+L(b0): cmp $-4, dn
+ jnz L(gt4)
+
+L(o4): mulx( -32,(dp), %r9, %r14)
+ mulx( -24,(dp), %r11, %r10)
+ mulx( -16,(dp), %r13, %r12)
+ mulx( -8,(dp), %rbx, %rax)
+ add %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add (up), %r9
+ adc 8(up), %r11
+ mov %r8, %rdx C dinv
+ mov %r11, 8(up)
+ mulx( %r11, %rdx, %r12) C next quotient
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o4)
+ jmp L(ret)
+
+L(gt4): cmp $-8, dn
+ jnz L(out0)
+
+L(o8): mulx( -64,(dp), %r9, %r14)
+ mulx( -56,(dp), %rcx, %r10)
+ mulx( -48,(dp), %r13, %r12)
+ mulx( -40,(dp), %rbx, %rax)
+ add %r14, %rcx
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add -32(up), %r9
+ mulx( -32,(dp), %r9, %r14)
+ adc -24(up), %rcx
+ mov %rcx, -24(up)
+ mulx( -24,(dp), %r11, %r10)
+ adc %r13, -16(up)
+ mulx( -16,(dp), %r13, %r12)
+ adc %rbx, -8(up)
+ adc %rax, %r9
+ mulx( -8,(dp), %rbx, %rax)
+ adc %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %r8, %rdx C dinv
+ mulx( %rcx, %rdx, %r12) C next quotient
+ add %r9, (up)
+ adc %r11, 8(up)
More information about the gmp-commit
mailing list