[Gmp-commit] /var/hg/gmp: 4 new changesets

Fri Mar 11 12:51:03 CET 2011

details:   /var/hg/gmp/rev/71bfeee49abb
changeset: 14033:71bfeee49abb
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Fri Mar 11 12:36:46 2011 +0100
description:
Declare and use sublsh1_ip1.

details:   /var/hg/gmp/rev/fbc7aba7ef17
changeset: 14034:fbc7aba7ef17
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Fri Mar 11 12:42:41 2011 +0100
description:
tests/devel/try support for some _ip1 and _ip2 functions.

details:   /var/hg/gmp/rev/29a1f71e0aac
changeset: 14035:29a1f71e0aac
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Fri Mar 11 12:46:51 2011 +0100
description:
tune/speed support for some _ip1 and _ip2 functions.

details:   /var/hg/gmp/rev/41fc97b31c4d
changeset: 14036:41fc97b31c4d
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Fri Mar 11 12:50:58 2011 +0100
description:
Replaced sublsh1 function with _ip1 version for k7 and atom.

diffstat:

 ChangeLog                           |   16 +++
 gmp-impl.h                          |   16 +++-
 mpn/generic/toom_interpolate_5pts.c |    4 +-
 mpn/x86/atom/sublsh1_n.asm          |    2 +-
 mpn/x86/k7/sublsh1_n.asm            |  174 +++++++----------------------------
 tests/devel/try.c                   |   80 ++++++++++++++++-
 tests/refmpn.c                      |   60 ++++++++++++
 tests/tests.h                       |    9 +
 tune/common.c                       |   42 ++++++++
 tune/speed.c                        |   18 +++
 tune/speed.h                        |    6 +
 11 files changed, 284 insertions(+), 143 deletions(-)

diffs (truncated from 686 to 300 lines):

diff -r 042d12fa4384 -r 41fc97b31c4d ChangeLog

--- a/ChangeLog	Fri Mar 11 07:38:08 2011 +0100
+++ b/ChangeLog	Fri Mar 11 12:50:58 2011 +0100
@@ -1,3 +1,19 @@
+2011-03-11  Marco Bodrato  <bodrato at mail.dm.unipi.it>
+
+	* gmp-impl.h (mpn_sublsh1_n_ip1): Declare.
+	* mpn/generic/toom_interpolate_5pts.c: Use mpn_sublsh1_n_ip1.
+
+	* tests/devel/try.c: Tests for {add,sub}lsh*_n_ip[12].
+	* tests/refmpn.c: New reference for mpn_{add,sub}lsh*_n_ip[12].
+	* tests/tests.h: Declarations for reference functions above.
+
+	* tune/common.c: New speed_mpn_{add,sub}lsh*_n_ip[12] functions.
+	* tune/speed.h: Prototypes for functions above.
+	* tune/speed.c: Support for mpn_{add,sub}lsh*_n_ip[12].
+
+	* mpn/x86/k7/sublsh1_n.asm: Replaced generic sublsh1 code with faster _ip1.
+	* mpn/x86/atom/sublsh1_n.asm: Changed PROLOGUE accordingly.
+
 2011-03-10  Marc Glisse  <marc.glisse at inria.fr>
 
 	* tests/cxx/t-istream.cc: Explicit conversion to streampos.
diff -r 042d12fa4384 -r 41fc97b31c4d gmp-impl.h
--- a/gmp-impl.h	Fri Mar 11 07:38:08 2011 +0100
+++ b/gmp-impl.h	Fri Mar 11 12:50:58 2011 +0100
@@ -818,11 +818,25 @@
 __GMP_DECLSPEC mp_limb_t mpn_addlsh_nc __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, unsigned int, mp_limb_t));
 
 /* mpn_sublsh1_n(c,a,b,n), when it exists, sets {c,n} to {a,n}-2*{b,n}, and
-   returns the borrow out (0, 1 or 2).  */
+   returns the borrow out (0, 1 or 2). Use _ip1 when a=c. */
 #define mpn_sublsh1_n __MPN(sublsh1_n)
 __GMP_DECLSPEC mp_limb_t mpn_sublsh1_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
 #define mpn_sublsh1_nc __MPN(sublsh1_nc)
 __GMP_DECLSPEC mp_limb_t mpn_sublsh1_nc __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
+#if HAVE_NATIVE_mpn_sublsh1_n && ! HAVE_NATIVE_mpn_sublsh1_n_ip1
+#define mpn_sublsh1_n_ip1(dst,src,n) mpn_sublsh1_n(dst,dst,src,n)
+#define HAVE_NATIVE_mpn_sublsh1_n_ip1 1
+#else
+#define mpn_sublsh1_n_ip1 __MPN(sublsh1_n_ip1)
+__GMP_DECLSPEC mp_limb_t mpn_sublsh1_n_ip1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+#if HAVE_NATIVE_mpn_sublsh1_nc && ! HAVE_NATIVE_mpn_sublsh1_nc_ip1
+#define mpn_sublsh1_nc_ip1(dst,src,n,c) mpn_sublsh1_nc(dst,dst,src,n,c)
+#define HAVE_NATIVE_mpn_sublsh1_nc_ip1 1
+#else
+#define mpn_sublsh1_nc_ip1 __MPN(sublsh1_nc_ip1)
+__GMP_DECLSPEC mp_limb_t mpn_sublsh1_nc_ip1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+#endif
 
 /* mpn_rsblsh1_n(c,a,b,n), when it exists, sets {c,n} to 2*{b,n}-{a,n}, and
    returns the carry out (-1, 0, 1).  */
diff -r 042d12fa4384 -r 41fc97b31c4d mpn/generic/toom_interpolate_5pts.c
--- a/mpn/generic/toom_interpolate_5pts.c	Fri Mar 11 07:38:08 2011 +0100
+++ b/mpn/generic/toom_interpolate_5pts.c	Fri Mar 11 12:50:58 2011 +0100
@@ -126,8 +126,8 @@
      result is v2 >= 0 */
   saved = vinf[0];       /* Remember v1's highest byte (will be overwritten). */
   vinf[0] = vinf0;       /* Set the right value for vinf0                     */
-#ifdef HAVE_NATIVE_mpn_sublsh1_n
-  cy = mpn_sublsh1_n (v2, v2, vinf, twor);
+#ifdef HAVE_NATIVE_mpn_sublsh1_n_ip1
+  cy = mpn_sublsh1_n_ip1 (v2, vinf, twor);
 #else
   /* Overwrite unused vm1 */
   cy = mpn_lshift (vm1, vinf, twor, 1);
diff -r 042d12fa4384 -r 41fc97b31c4d mpn/x86/atom/sublsh1_n.asm
--- a/mpn/x86/atom/sublsh1_n.asm	Fri Mar 11 07:38:08 2011 +0100
+++ b/mpn/x86/atom/sublsh1_n.asm	Fri Mar 11 12:50:58 2011 +0100
@@ -19,5 +19,5 @@
 
 include(`../config.m4')
 
-MULFUNC_PROLOGUE(mpn_sublsh1_n)
+MULFUNC_PROLOGUE(mpn_sublsh1_n_ip1)
 include_mpn(`x86/k7/sublsh1_n.asm')
diff -r 042d12fa4384 -r 41fc97b31c4d mpn/x86/k7/sublsh1_n.asm
--- a/mpn/x86/k7/sublsh1_n.asm	Fri Mar 11 07:38:08 2011 +0100
+++ b/mpn/x86/k7/sublsh1_n.asm	Fri Mar 11 12:50:58 2011 +0100
@@ -1,4 +1,4 @@
-dnl  AMD K7 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
 
 dnl  Copyright 2011 Free Software Foundation, Inc.
 
@@ -42,211 +42,109 @@
 C AMD K8
 
 C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
-C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
-C that means we need an initial magic multiply.
+C processors.  It uses 2*4-way unrolling, for good reasons.
 C 
 C Breaking carry recurrency might be a good idea.  We would then need separate
 C registers for the shift carry and add/subtract carry, which in turn would
 C force us to 2*2-way unrolling.
 
-defframe(PARAM_SIZE,	16)
-defframe(PARAM_DBLD,	12)
+defframe(PARAM_SIZE,	12)
 defframe(PARAM_SRC,	 8)
 defframe(PARAM_DST,	 4)
 
 dnl  re-use parameter space
 define(VAR_COUNT,`PARAM_SIZE')
-define(VAR_TMP,`PARAM_DBLD')
 define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_VP,`PARAM_DST')
+define(SAVE_EBP,`PARAM_DST')
 
 ASM_START()
 	TEXT
 	ALIGN(8)
-PROLOGUE(mpn_sublsh1_n)
+PROLOGUE(mpn_sublsh1_n_ip1)
 deflit(`FRAME',0)
 
 define(`rp',  `%edi')
 define(`up',  `%esi')
-define(`vp',  `%ebp')
 
+	movl	PARAM_SIZE, %eax	C size
 	push	up			FRAME_pushl()
+	push	rp			FRAME_pushl()
+	xorl	%edx, %edx
 	movl	PARAM_SRC, up
-	push	rp			FRAME_pushl()
 	movl	PARAM_DST, rp
 	movl	%ebx, SAVE_EBX
-	movl	PARAM_SIZE, %ebx	C size
-	movl	vp, SAVE_VP
-	movl	PARAM_DBLD, vp
-	cmp	up, rp
-	je	L(inplace)
+	movl	%eax, %ebx
+	shr	$3, %eax
 
-	mov	$0x2aaaaaab, %eax
-	mull	%ebx
-
-	not	%edx			C count = -(size\6)-1
-	mov	%edx, VAR_COUNT
-
-	leal	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
-	xorl	%edx, %edx
-	leal	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
-	orl	%ebx, %ebx
+	not	%eax			C count = -(size\8)-1
+	andl	$7, %ebx		C size % 8
 	jz	L(exact)
 
 L(oop):
 ifdef(`CPU_P6',`
 	shr	%edx ')			C restore 2nd saved carry bit
-	mov	(vp), %eax
-	adc	%eax, %eax
+	mov	(up), %ecx
+	adc	%ecx, %ecx
 	rcr	%edx			C restore 1st saved carry bit
-	mov	(up), %ecx
-	sbb	%eax, %ecx
-	mov	%ecx, (rp)
+	lea	4(up), up
+	sbb	%ecx, (rp)
+	lea	4(rp), rp
 	adc	%edx, %edx		C save a carry bit in edx
-	lea	4(vp), vp
-	lea	4(up), up
-	lea	4(rp), rp
 ifdef(`CPU_P6',`
 	adc	%edx, %edx ')		C save another carry bit in edx
 	decl	%ebx
 	jnz	L(oop)
-	movl	vp, VAR_TMP
 L(exact):
-	incl	VAR_COUNT
+	inc	%eax
 	jz	L(end)
+	mov	%eax, VAR_COUNT
+	movl	%ebp, SAVE_EBP
 
 	ALIGN(16)
 L(top):
 ifdef(`CPU_P6',`
 	shr	%edx ')			C restore 2nd saved carry bit
-	mov	(vp), %eax
+	mov	(up), %eax
 	adc	%eax, %eax
-	mov	4(vp), %ebx
+	mov	4(up), %ebx
 	adc	%ebx, %ebx
-	mov	8(vp), %ecx
+	mov	8(up), %ecx
 	adc	%ecx, %ecx
-
-	rcr	%edx			C restore 1st saved carry bit
-	
-	mov	(up), vp
-	sbb	%eax, vp
-	mov	4(up), %eax
-	mov	vp, (rp)
-	sbb	%ebx, %eax
-	movl	VAR_TMP, vp
-	mov	8(up), %ebx
-	mov	%eax, 4(rp)
-	sbb	%ecx, %ebx
-	mov	%ebx, 8(rp)
-
-	mov	12(vp), %eax
-	adc	%eax, %eax
-	mov	16(vp), %ebx
-	adc	%ebx, %ebx
-	mov	20(vp), %ecx
-	adc	%ecx, %ecx
-
-	adc	%edx, %edx		C save a carry bit in edx
-
-	mov	12(up), vp
-	sbb	%eax, vp
-	mov	16(up), %eax
-	mov	vp, 12(rp)
-	movl	VAR_TMP, vp
-	sbb	%ebx, %eax
-	mov	20(up), %ebx
-	mov	%eax, 16(rp)
-	sbb	%ecx, %ebx
-	mov	%ebx, 20(rp)
-
-	lea	24(vp), vp
-	lea	24(up), up
-	lea	24(rp), rp
-
-ifdef(`CPU_P6',`
-	adc	%edx, %edx ')		C save another carry bit in edx
-	incl	VAR_COUNT
-	movl	vp, VAR_TMP
-	jne	L(top)
-
-	jmp	L(end)
-
-L(inplace):
-	xorl	%edx, %edx
-	movl	%ebx, %eax
-
-	shr	$3, %eax
-
-	not	%eax			C count = -(size\8)-1
-	mov	%eax, VAR_COUNT
-
-	andl	$7, %ebx		C size % 8
-	jz	L(exactinplace)
-
-L(oopinplace):
-ifdef(`CPU_P6',`
-	shr	%edx ')			C restore 2nd saved carry bit
-	mov	(vp), %eax
-	adc	%eax, %eax
-	rcr	%edx			C restore 1st saved carry bit
-	sbb	%eax, (rp)
-	adc	%edx, %edx		C save a carry bit in edx
-	lea	4(vp), vp
-	lea	4(rp), rp
-ifdef(`CPU_P6',`
-	adc	%edx, %edx ')		C save another carry bit in edx
-	decl	%ebx
-	jnz	L(oopinplace)
-L(exactinplace):
-	incl	VAR_COUNT
-	jz	L(end)
-
-	ALIGN(16)
-L(unrolleight):
-ifdef(`CPU_P6',`
-	shr	%edx ')			C restore 2nd saved carry bit
-	mov	(vp), %eax
-	adc	%eax, %eax
-	mov	4(vp), %ebx
-	adc	%ebx, %ebx
-	mov	8(vp), %ecx
-	adc	%ecx, %ecx
-	mov	12(vp), %esi
-	adc	%esi, %esi
+	mov	12(up), %ebp
+	adc	%ebp, %ebp
 
 	rcr	%edx			C restore 1st saved carry bit
 	
 	sbb	%eax, (rp)
 	sbb	%ebx, 4(rp)
 	sbb	%ecx, 8(rp)
-	sbb	%esi, 12(rp)
+	sbb	%ebp, 12(rp)
 
-	mov	16(vp), %eax
+	mov	16(up), %eax
 	adc	%eax, %eax
-	mov	20(vp), %ebx
+	mov	20(up), %ebx
 	adc	%ebx, %ebx