[Gmp-commit] /var/hg/gmp: 7 new changesets

Wed Jul 19 22:25:53 UTC 2017

details:   /var/hg/gmp/rev/31be3698f320
changeset: 17464:31be3698f320
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Wed Jul 19 23:33:52 2017 +0200
description:
Get pentium4 code instead of k6 code for better speed on modern Intel P6 cores.

details:   /var/hg/gmp/rev/79840c3eb0fe
changeset: 17465:79840c3eb0fe
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Jul 20 00:15:04 2017 +0200
description:
(define_mpn): Add sbpi1_bdiv_q, sbpi1_bdiv_qr, sbpi1_bdiv_r.

details:   /var/hg/gmp/rev/819493052b69
changeset: 17466:819493052b69
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Jul 20 00:17:16 2017 +0200
description:
(mpn_sbpi1_bdiv_r): Declare.

details:   /var/hg/gmp/rev/b74ff42f4bbd
changeset: 17467:b74ff42f4bbd
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Jul 20 00:18:17 2017 +0200
description:
New file.

details:   /var/hg/gmp/rev/48d04f272174
changeset: 17468:48d04f272174
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Jul 20 00:22:09 2017 +0200
description:
New file.

details:   /var/hg/gmp/rev/b3104e2006fb
changeset: 17469:b3104e2006fb
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Jul 20 00:25:18 2017 +0200
description:
Minor tweak.

details:   /var/hg/gmp/rev/f2247fcc6a86
changeset: 17470:f2247fcc6a86
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Jul 20 00:25:49 2017 +0200
description:
ChangeLog

diffstat:

 ChangeLog                       |   25 +
 gmp-impl.h                      |    3 +
 mpn/asm-defs.m4                 |    3 +
 mpn/generic/sbpi1_bdiv_r.c      |   79 ++++++
 mpn/x86/p6/sse2/submul_1.asm    |    2 +-
 mpn/x86_64/zen/sbpi1_bdiv_r.asm |  507 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 618 insertions(+), 1 deletions(-)

diffs (truncated from 661 to 300 lines):

diff -r 62abbaeaab13 -r f2247fcc6a86 ChangeLog

--- a/ChangeLog	Wed Jul 19 23:30:07 2017 +0200
+++ b/ChangeLog	Thu Jul 20 00:25:49 2017 +0200
@@ -1,3 +1,28 @@
+2017-07-20  Torbjörn Granlund  <tg at gmplib.org>
+
+	* gmp-impl.h (mpn_sbpi1_bdiv_r): Declare.
+
+	* mpn/asm-defs.m4 (define_mpn): Add sbpi1_bdiv_q, sbpi1_bdiv_qr,
+	sbpi1_bdiv_r.
+
+	* mpn/generic/sbpi1_bdiv_r.c: New file.
+	* mpn/x86_64/zen/sbpi1_bdiv_r.asm: New file.
+
+2017-07-19  Torbjörn Granlund  <tg at gmplib.org>
+
+	* mpn/x86/p6/sse2/submul_1.asm: Get pentium4 code instead of k6 code
+	for better speed on modern Intel P6 cores.
+
+2017-07-02  Torbjörn Granlund  <tg at gmplib.org>
+
+	* tune/tuneup.c (tune_mullo): For MULLO_BASECASE_THRESHOLD start at 2.
+	(tune_sqrlo): Likewise.
+
+2017-06-28  Torbjörn Granlund  <tg at gmplib.org>
+
+	* tests/Makefile.am tests/*/Makefile.am tune/Makefile.am (AM_LDFLAGS):
+	Define.  (Thanks to Emmanuel Thomé and Vincent Lefevre.)
+
 2017-06-27  Torbjörn Granlund  <tg at gmplib.org>
 
 	* mpn/x86_64/zen/sqr_basecase.asm: Expand to use 4 addmul_1 loops.
diff -r 62abbaeaab13 -r f2247fcc6a86 gmp-impl.h
--- a/gmp-impl.h	Wed Jul 19 23:30:07 2017 +0200
+++ b/gmp-impl.h	Thu Jul 20 00:25:49 2017 +0200
@@ -1507,6 +1507,9 @@
 #define   mpn_sbpi1_bdiv_q __MPN(sbpi1_bdiv_q)
 __GMP_DECLSPEC void      mpn_sbpi1_bdiv_q (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
 
+#define   mpn_sbpi1_bdiv_r __MPN(sbpi1_bdiv_r)
+__GMP_DECLSPEC mp_limb_t mpn_sbpi1_bdiv_r (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
+
 #define   mpn_dcpi1_bdiv_qr __MPN(dcpi1_bdiv_qr)
 __GMP_DECLSPEC mp_limb_t mpn_dcpi1_bdiv_qr (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
 #define   mpn_dcpi1_bdiv_qr_n_itch __MPN(dcpi1_bdiv_qr_n_itch)
diff -r 62abbaeaab13 -r f2247fcc6a86 mpn/asm-defs.m4
--- a/mpn/asm-defs.m4	Wed Jul 19 23:30:07 2017 +0200
+++ b/mpn/asm-defs.m4	Thu Jul 20 00:25:49 2017 +0200
@@ -1453,6 +1453,9 @@
 define_mpn(rsh1sub_nc)
 define_mpn(rshift)
 define_mpn(rshiftc)
+define_mpn(sbpi1_bdiv_q)
+define_mpn(sbpi1_bdiv_qr)
+define_mpn(sbpi1_bdiv_r)
 define_mpn(scan0)
 define_mpn(scan1)
 define_mpn(set_str)
diff -r 62abbaeaab13 -r f2247fcc6a86 mpn/generic/sbpi1_bdiv_r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/generic/sbpi1_bdiv_r.c	Thu Jul 20 00:25:49 2017 +0200
@@ -0,0 +1,79 @@
+/* mpn_sbpi1_bdiv_r -- schoolbook Hensel division with precomputed inverse,
+   returning remainder.
+
+   Contributed to the GNU project by Niels Möller and Torbjörn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.
+   IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
+   ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/* Computes a binary quotient of size qn = un - dn.
+   Output:
+
+      Q = -U * D^{-1} mod B^qn,
+
+      R = (U + Q * D) * B^(-qn)
+
+   Stores the dn least significant limbs of R at {up + un - dn, dn},
+   and returns the carry from the addition U + Q*D.
+
+   D must be odd. dinv is (-D)^-1 mod B, i.e. D * dinv = -1 mod B. */
+
+mp_limb_t
+mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+		  mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+{
+  mp_size_t i;
+  mp_limb_t cy;
+
+  ASSERT (dn > 0);
+  ASSERT (un > dn);
+  ASSERT ((dp[0] & 1) != 0);
+  ASSERT (-(dp[0] * dinv) == 1);	/* dinv must be (-D)^-1 mod B */
+
+  for (i = un - dn, cy = 0; i != 0; i--)
+    {
+      mp_limb_t q = dinv * up[0];	/* quotient limb that zeros up[0] */
+      mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q);
+
+      hi += cy;				/* fold in carry from previous round */
+      cy = hi < cy;
+      hi += up[dn];
+      cy += hi < up[dn];
+      up[dn] = hi;
+      up++;
+    }
+
+  return cy;
+}
diff -r 62abbaeaab13 -r f2247fcc6a86 mpn/x86/p6/sse2/submul_1.asm
--- a/mpn/x86/p6/sse2/submul_1.asm	Wed Jul 19 23:30:07 2017 +0200
+++ b/mpn/x86/p6/sse2/submul_1.asm	Thu Jul 20 00:25:49 2017 +0200
@@ -32,4 +32,4 @@
 
 
 MULFUNC_PROLOGUE(mpn_submul_1)
-include_mpn(`x86/k6/aorsmul_1.asm')
+include_mpn(`x86/pentium4/sse2/submul_1.asm')
diff -r 62abbaeaab13 -r f2247fcc6a86 mpn/x86_64/zen/sbpi1_bdiv_r.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/zen/sbpi1_bdiv_r.asm	Thu Jul 20 00:25:49 2017 +0200
@@ -0,0 +1,507 @@
+dnl  AMD64 mpn_sbpi1_bdiv_r optimised for AMD Zen
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(`up',       `%rdi')
+define(`un_param', `%rsi')
+define(`dp_param', `%rdx')
+define(`dn_param', `%rcx')
+define(`dinv',     `%r8')
+
+define(`i',        `%rcx')
+define(`dn',       `%r14')
+
+define(`dp',       `%rsi')
+define(`un',       `%r15')
+
+C TODO
+C  * The o1...o8  loops for special dn counts were naively hand-optimised by
+C    folding the generic loops.  They can probably be tuned.  The speculative
+C    quotient limb generation might not be in the optimal spot.
+C  * Perhaps avoid late-in-loop jumps, e.g., lo0.
+C  * Improve regalloc wrt dn_param/dn and un_param/un to save some moves.
+
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), dinv	')
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	sub	dn_param, un_param		C outer loop count
+	mov	dn_param, dn		C FIXME: Suppress by reg re-alloc
+	push	dinv				C keep dinv on stack
+	mov	un_param, un		C FIXME: Suppress by reg re-alloc
+	xor	R32(%rbp), R32(%rbp)
+
+	lea	(dp_param,dn_param,8), dp
+
+	mov	(up), %rdx
+	imul	dinv, %rdx			C first quotient limb
+
+	neg	dn
+	lea	-32(up,dn_param,8), up
+
+	test	$1, R8(dn_param)
+	jnz	L(cx1)
+
+L(cx0):	test	$2, R8(dn_param)
+	jnz	L(b2)
+
+
+C =============================================================================
+L(b0):	cmp	$-4, dn
+	jnz	L(gt4)
+
+L(o4):	mulx(	-32,(dp), %r9, %r14)
+	mulx(	-24,(dp), %r11, %r10)
+	mulx(	-16,(dp), %r13, %r12)
+	mulx(	-8,(dp), %rbx, %rax)
+	add	%r14, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	(up), %r9
+	adc	8(up), %r11
+	mov	%r8, %rdx			C dinv
+	mov	%r11, 8(up)
+	mulx(	%r11, %rdx, %r12)		C next quotient
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(o4)
+	jmp	L(ret)
+
+L(gt4):	cmp	$-8, dn
+	jnz	L(out0)
+
+L(o8):	mulx(	-64,(dp), %r9, %r14)
+	mulx(	-56,(dp), %rcx, %r10)
+	mulx(	-48,(dp), %r13, %r12)
+	mulx(	-40,(dp), %rbx, %rax)
+	add	%r14, %rcx
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	-32(up), %r9
+	mulx(	-32,(dp), %r9, %r14)
+	adc	-24(up), %rcx
+	mov	%rcx, -24(up)
+	mulx(	-24,(dp), %r11, %r10)
+	adc	%r13, -16(up)
+	mulx(	-16,(dp), %r13, %r12)
+	adc	%rbx, -8(up)
+	adc	%rax, %r9
+	mulx(	-8,(dp), %rbx, %rax)
+	adc	%r14, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	%r8, %rdx			C dinv
+	mulx(	%rcx, %rdx, %r12)		C next quotient
+	add	%r9, (up)
+	adc	%r11, 8(up)