[Gmp-commit] /var/hg/gmp: 2 new changesets

Fri Mar 11 22:33:41 CET 2011

details:   /var/hg/gmp/rev/a7959325467f
changeset: 14037:a7959325467f
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Mar 10 23:35:58 2011 +0100
description:
*** empty log message ***

details:   /var/hg/gmp/rev/2dc198f6c8d1
changeset: 14038:2dc198f6c8d1
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Fri Mar 11 22:33:39 2011 +0100
description:
Trivial merge.

diffstat:

 ChangeLog                           |   24 ++++
 configure.in                        |    6 +
 gmp-impl.h                          |   32 ++++++-
 mpn/asm-defs.m4                     |    7 +
 mpn/generic/toom_interpolate_5pts.c |    4 +-
 mpn/generic/toom_interpolate_6pts.c |    8 +-
 mpn/generic/toom_interpolate_8pts.c |    4 +-
 mpn/x86/atom/aorslshC_n.asm         |    8 +-
 mpn/x86/atom/sublsh1_n.asm          |    2 +-
 mpn/x86/atom/sublsh2_n.asm          |   10 +-
 mpn/x86/k7/sublsh1_n.asm            |  174 +++++++----------------------------
 tests/devel/try.c                   |   80 ++++++++++++++++-
 tests/refmpn.c                      |   60 ++++++++++++
 tests/tests.h                       |    9 +
 tune/common.c                       |   42 ++++++++
 tune/speed.c                        |   18 +++
 tune/speed.h                        |    6 +
 17 files changed, 335 insertions(+), 159 deletions(-)

diffs (truncated from 840 to 300 lines):

diff -r 747f8faeb304 -r 2dc198f6c8d1 ChangeLog

--- a/ChangeLog	Thu Mar 10 22:31:09 2011 +0100
+++ b/ChangeLog	Fri Mar 11 22:33:39 2011 +0100
@@ -1,7 +1,31 @@
+2011-03-11 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+	* gmp-impl.h (mpn_sublsh1_n_ip1): Declare.
+	* mpn/generic/toom_interpolate_5pts.c: Use mpn_sublsh1_n_ip1.
+
+	* tests/devel/try.c: Tests for {add,sub}lsh*_n_ip[12].
+	* tests/refmpn.c: New reference for mpn_{add,sub}lsh*_n_ip[12].
+	* tests/tests.h: Declarations for reference functions above.
+
+	* tune/common.c: New speed_mpn_{add,sub}lsh*_n_ip[12] functions.
+	* tune/speed.h: Prototypes for functions above.
+	* tune/speed.c: Support for mpn_{add,sub}lsh*_n_ip[12].
+
+	* mpn/x86/k7/sublsh1_n.asm: Replaced generic sublsh1 code with faster _ip1.
+	* mpn/x86/atom/sublsh1_n.asm: Changed PROLOGUE accordingly.
+
 2011-03-10  Marc Glisse  <marc.glisse at inria.fr>
 
 	* tests/cxx/t-istream.cc: Explicit conversion to streampos.
 
+2011-03-10  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86/atom/sse2/mul_basecase.asm: Suppress wind-down rp updates.
+
+	* Move new aorrlsh_n.asm to new k8 dir.  Revert
+	mpn/x86_64/aorrlsh_n.asm.
+	* configure.in: Setup path for new k8 directory.
+
 2011-03-10 Marco Bodrato <bodrato at mail.dm.unipi.it>
 
 	* mpn/x86/pentium4/sse2/bdiv_dbm1c.asm: New file, was in atom.
diff -r 747f8faeb304 -r 2dc198f6c8d1 configure.in
--- a/configure.in	Thu Mar 10 22:31:09 2011 +0100
+++ b/configure.in	Fri Mar 11 22:33:39 2011 +0100
@@ -3078,6 +3078,12 @@
 #undef HAVE_NATIVE_mpn_sublsh1_nc
 #undef HAVE_NATIVE_mpn_sublsh2_nc
 #undef HAVE_NATIVE_mpn_sublsh_nc
+#undef HAVE_NATIVE_mpn_sublsh1_n_ip1
+#undef HAVE_NATIVE_mpn_sublsh2_n_ip1
+#undef HAVE_NATIVE_mpn_sublsh_n_ip1
+#undef HAVE_NATIVE_mpn_sublsh1_nc_ip1
+#undef HAVE_NATIVE_mpn_sublsh2_nc_ip1
+#undef HAVE_NATIVE_mpn_sublsh_nc_ip1
 #undef HAVE_NATIVE_mpn_submul_1c
 #undef HAVE_NATIVE_mpn_udiv_qrnnd
 #undef HAVE_NATIVE_mpn_udiv_qrnnd_r
diff -r 747f8faeb304 -r 2dc198f6c8d1 gmp-impl.h
--- a/gmp-impl.h	Thu Mar 10 22:31:09 2011 +0100
+++ b/gmp-impl.h	Fri Mar 11 22:33:39 2011 +0100
@@ -818,11 +818,25 @@
 __GMP_DECLSPEC mp_limb_t mpn_addlsh_nc __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, unsigned int, mp_limb_t));
 
 /* mpn_sublsh1_n(c,a,b,n), when it exists, sets {c,n} to {a,n}-2*{b,n}, and
-   returns the borrow out (0, 1 or 2).  */
+   returns the borrow out (0, 1 or 2). Use _ip1 when a=c. */
 #define mpn_sublsh1_n __MPN(sublsh1_n)
 __GMP_DECLSPEC mp_limb_t mpn_sublsh1_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
 #define mpn_sublsh1_nc __MPN(sublsh1_nc)
 __GMP_DECLSPEC mp_limb_t mpn_sublsh1_nc __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
+#if HAVE_NATIVE_mpn_sublsh1_n && ! HAVE_NATIVE_mpn_sublsh1_n_ip1
+#define mpn_sublsh1_n_ip1(dst,src,n) mpn_sublsh1_n(dst,dst,src,n)
+#define HAVE_NATIVE_mpn_sublsh1_n_ip1 1
+#else
+#define mpn_sublsh1_n_ip1 __MPN(sublsh1_n_ip1)
+__GMP_DECLSPEC mp_limb_t mpn_sublsh1_n_ip1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+#if HAVE_NATIVE_mpn_sublsh1_nc && ! HAVE_NATIVE_mpn_sublsh1_nc_ip1
+#define mpn_sublsh1_nc_ip1(dst,src,n,c) mpn_sublsh1_nc(dst,dst,src,n,c)
+#define HAVE_NATIVE_mpn_sublsh1_nc_ip1 1
+#else
+#define mpn_sublsh1_nc_ip1 __MPN(sublsh1_nc_ip1)
+__GMP_DECLSPEC mp_limb_t mpn_sublsh1_nc_ip1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+#endif
 
 /* mpn_rsblsh1_n(c,a,b,n), when it exists, sets {c,n} to 2*{b,n}-{a,n}, and
    returns the carry out (-1, 0, 1).  */
@@ -832,11 +846,25 @@
 __GMP_DECLSPEC mp_limb_signed_t mpn_rsblsh1_nc __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
 
 /* mpn_sublsh2_n(c,a,b,n), when it exists, sets {c,n} to {a,n}-4*{b,n}, and
-   returns the borrow out (0, ..., 4).  */
+   returns the borrow out (0, ..., 4). Use _ip1 when a=c. */
 #define mpn_sublsh2_n __MPN(sublsh2_n)
 __GMP_DECLSPEC mp_limb_t mpn_sublsh2_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
 #define mpn_sublsh2_nc __MPN(sublsh2_nc)
 __GMP_DECLSPEC mp_limb_t mpn_sublsh2_nc __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
+#if HAVE_NATIVE_mpn_sublsh2_n && ! HAVE_NATIVE_mpn_sublsh2_n_ip1
+#define mpn_sublsh2_n_ip1(dst,src,n) mpn_sublsh2_n(dst,dst,src,n)
+#define HAVE_NATIVE_mpn_sublsh2_n_ip1 1
+#else
+#define mpn_sublsh2_n_ip1 __MPN(sublsh2_n_ip1)
+__GMP_DECLSPEC mp_limb_t mpn_sublsh2_n_ip1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+#if HAVE_NATIVE_mpn_sublsh2_nc && ! HAVE_NATIVE_mpn_sublsh2_nc_ip1
+#define mpn_sublsh2_nc_ip1(dst,src,n,c) mpn_sublsh2_nc(dst,dst,src,n,c)
+#define HAVE_NATIVE_mpn_sublsh2_nc_ip1 1
+#else
+#define mpn_sublsh2_nc_ip1 __MPN(sublsh2_nc_ip1)
+__GMP_DECLSPEC mp_limb_t mpn_sublsh2_nc_ip1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+#endif
 
 /* mpn_sublsh_n(c,a,b,n,k), when it exists, sets {c,n} to {a,n}-2^k*{b,n}, and
    returns the borrow out (0, ..., 2^k).  */
diff -r 747f8faeb304 -r 2dc198f6c8d1 mpn/asm-defs.m4
--- a/mpn/asm-defs.m4	Thu Mar 10 22:31:09 2011 +0100
+++ b/mpn/asm-defs.m4	Fri Mar 11 22:33:39 2011 +0100
@@ -1415,9 +1415,16 @@
 define_mpn(sub_n)
 define_mpn(sublsh1_n)
 define_mpn(sublsh1_nc)
+define_mpn(sublsh1_n_ip1)
+define_mpn(sublsh1_nc_ip1)
 define_mpn(sublsh2_n)
 define_mpn(sublsh2_nc)
+define_mpn(sublsh2_n_ip1)
+define_mpn(sublsh2_nc_ip1)
 define_mpn(sublsh_n)
+define_mpn(sublsh_nc)
+define_mpn(sublsh_n_ip1)
+define_mpn(sublsh_nc_ip1)
 define_mpn(sqrtrem)
 define_mpn(sub)
 define_mpn(sub_1)
diff -r 747f8faeb304 -r 2dc198f6c8d1 mpn/generic/toom_interpolate_5pts.c
--- a/mpn/generic/toom_interpolate_5pts.c	Thu Mar 10 22:31:09 2011 +0100
+++ b/mpn/generic/toom_interpolate_5pts.c	Fri Mar 11 22:33:39 2011 +0100
@@ -126,8 +126,8 @@
      result is v2 >= 0 */
   saved = vinf[0];       /* Remember v1's highest byte (will be overwritten). */
   vinf[0] = vinf0;       /* Set the right value for vinf0                     */
-#ifdef HAVE_NATIVE_mpn_sublsh1_n
-  cy = mpn_sublsh1_n (v2, v2, vinf, twor);
+#ifdef HAVE_NATIVE_mpn_sublsh1_n_ip1
+  cy = mpn_sublsh1_n_ip1 (v2, vinf, twor);
 #else
   /* Overwrite unused vm1 */
   cy = mpn_lshift (vm1, vinf, twor, 1);
diff -r 747f8faeb304 -r 2dc198f6c8d1 mpn/generic/toom_interpolate_6pts.c
--- a/mpn/generic/toom_interpolate_6pts.c	Thu Mar 10 22:31:09 2011 +0100
+++ b/mpn/generic/toom_interpolate_6pts.c	Fri Mar 11 22:33:39 2011 +0100
@@ -167,11 +167,11 @@
   MPN_INCR_U (pp + 3 * n + 1, n, cy);
 
   /* W2 -= W0<<2 */
-#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n
-#if HAVE_NATIVE_mpn_sublsh2_n
-  cy = mpn_sublsh2_n(w2, w2, w0, w0n);
+#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
+#if HAVE_NATIVE_mpn_sublsh2_n_ip1
+  cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
 #else
-  cy = mpn_sublsh_n(w2, w2, w0, w0n, 2);
+  cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
 #endif
 #else
   /* {W4,2*n+1} is now free and can be overwritten. */
diff -r 747f8faeb304 -r 2dc198f6c8d1 mpn/generic/toom_interpolate_8pts.c
--- a/mpn/generic/toom_interpolate_8pts.c	Thu Mar 10 22:31:09 2011 +0100
+++ b/mpn/generic/toom_interpolate_8pts.c	Fri Mar 11 22:33:39 2011 +0100
@@ -54,8 +54,8 @@
 #endif
 #endif
 
-#if HAVE_NATIVE_mpn_sublsh2_n
-#define DO_mpn_sublsh2_n(dst,src,n,ws) mpn_sublsh2_n(dst,dst,src,n)
+#if HAVE_NATIVE_mpn_sublsh2_n_ip1
+#define DO_mpn_sublsh2_n(dst,src,n,ws) mpn_sublsh2_n_ip1(dst,src,n)
 #else
 #define DO_mpn_sublsh2_n(dst,src,n,ws) DO_mpn_sublsh_n(dst,src,n,2,ws)
 #endif
diff -r 747f8faeb304 -r 2dc198f6c8d1 mpn/x86/atom/aorslshC_n.asm
--- a/mpn/x86/atom/aorslshC_n.asm	Thu Mar 10 22:31:09 2011 +0100
+++ b/mpn/x86/atom/aorslshC_n.asm	Fri Mar 11 22:33:39 2011 +0100
@@ -21,11 +21,11 @@
 
 include(`../config.m4')
 
-C mp_limb_t mpn_ip1_addlshC_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C mp_limb_t mpn_ip1_addlshC_nc (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
 C				mp_limb_t carry);
-C mp_limb_t mpn_ip1_sublshC_n (mp_ptr dst, mp_srcptr src, mp_size_t size,);
-C mp_limb_t mpn_ip1_sublshC_nc (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
 C				mp_signed_limb_t borrow);
 
 defframe(PARAM_CORB,	16)
diff -r 747f8faeb304 -r 2dc198f6c8d1 mpn/x86/atom/sublsh1_n.asm
--- a/mpn/x86/atom/sublsh1_n.asm	Thu Mar 10 22:31:09 2011 +0100
+++ b/mpn/x86/atom/sublsh1_n.asm	Fri Mar 11 22:33:39 2011 +0100
@@ -19,5 +19,5 @@
 
 include(`../config.m4')
 
-MULFUNC_PROLOGUE(mpn_sublsh1_n)
+MULFUNC_PROLOGUE(mpn_sublsh1_n_ip1)
 include_mpn(`x86/k7/sublsh1_n.asm')
diff -r 747f8faeb304 -r 2dc198f6c8d1 mpn/x86/atom/sublsh2_n.asm
--- a/mpn/x86/atom/sublsh2_n.asm	Thu Mar 10 22:31:09 2011 +0100
+++ b/mpn/x86/atom/sublsh2_n.asm	Fri Mar 11 22:33:39 2011 +0100
@@ -29,18 +29,18 @@
 	define(M4_opp,		subl)
 	define(M4_function,	mpn_addlsh2_n)
 	define(M4_function_c,	mpn_addlsh2_nc)
-	define(M4_ip_function_c, mpn_ip1_addlsh2_nc)
-	define(M4_ip_function,	mpn_ip1_addlsh2_n)
+	define(M4_ip_function_c, mpn_addlsh2_nc_ip1)
+	define(M4_ip_function,	mpn_addlsh2_n_ip1)
 ',`ifdef(`OPERATION_sublsh2_n', `
 	define(M4_inst,		sbbl)
 	define(M4_opp,		addl)
 	define(M4_function,	mpn_sublsh2_n)
 	define(M4_function_c,	mpn_sublsh2_nc)
-	define(M4_ip_function_c, mpn_ip1_sublsh2_nc)
-	define(M4_ip_function,	mpn_ip1_sublsh2_n)
+	define(M4_ip_function_c, mpn_sublsh2_nc_ip1)
+	define(M4_ip_function,	mpn_sublsh2_n_ip1)
 ',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_sublsh2_n
 ')')')
 
-MULFUNC_PROLOGUE(mpn_sublsh2_n mpn_sublsh2_nc mpn_ip1_sublsh2_n mpn_ip1_sublsh2_nc)
+MULFUNC_PROLOGUE(mpn_sublsh2_n mpn_sublsh2_nc mpn_sublsh2_n_ip1 mpn_sublsh2_nc_ip1)
 
 include_mpn(`x86/atom/aorslshC_n.asm')
diff -r 747f8faeb304 -r 2dc198f6c8d1 mpn/x86/k7/sublsh1_n.asm
--- a/mpn/x86/k7/sublsh1_n.asm	Thu Mar 10 22:31:09 2011 +0100
+++ b/mpn/x86/k7/sublsh1_n.asm	Fri Mar 11 22:33:39 2011 +0100
@@ -1,4 +1,4 @@
-dnl  AMD K7 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
 
 dnl  Copyright 2011 Free Software Foundation, Inc.
 
@@ -42,211 +42,109 @@
 C AMD K8
 
 C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
-C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
-C that means we need an initial magic multiply.
+C processors.  It uses 2*4-way unrolling, for good reasons.
 C 
 C Breaking carry recurrency might be a good idea.  We would then need separate
 C registers for the shift carry and add/subtract carry, which in turn would
 C force us to 2*2-way unrolling.
 
-defframe(PARAM_SIZE,	16)
-defframe(PARAM_DBLD,	12)
+defframe(PARAM_SIZE,	12)
 defframe(PARAM_SRC,	 8)
 defframe(PARAM_DST,	 4)
 
 dnl  re-use parameter space
 define(VAR_COUNT,`PARAM_SIZE')
-define(VAR_TMP,`PARAM_DBLD')
 define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_VP,`PARAM_DST')
+define(SAVE_EBP,`PARAM_DST')
 
 ASM_START()
 	TEXT
 	ALIGN(8)
-PROLOGUE(mpn_sublsh1_n)
+PROLOGUE(mpn_sublsh1_n_ip1)
 deflit(`FRAME',0)
 
 define(`rp',  `%edi')
 define(`up',  `%esi')
-define(`vp',  `%ebp')
 
+	movl	PARAM_SIZE, %eax	C size
 	push	up			FRAME_pushl()
+	push	rp			FRAME_pushl()
+	xorl	%edx, %edx
 	movl	PARAM_SRC, up
-	push	rp			FRAME_pushl()
 	movl	PARAM_DST, rp
 	movl	%ebx, SAVE_EBX
-	movl	PARAM_SIZE, %ebx	C size
-	movl	vp, SAVE_VP
-	movl	PARAM_DBLD, vp
-	cmp	up, rp
-	je	L(inplace)
+	movl	%eax, %ebx
+	shr	$3, %eax
 
-	mov	$0x2aaaaaab, %eax
-	mull	%ebx
-
-	not	%edx			C count = -(size\6)-1
-	mov	%edx, VAR_COUNT
-
-	leal	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
-	xorl	%edx, %edx