[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Fri Mar 11 12:51:03 CET 2011
details: /var/hg/gmp/rev/71bfeee49abb
changeset: 14033:71bfeee49abb
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri Mar 11 12:36:46 2011 +0100
description:
Declare and use sublsh1_ip1.
details: /var/hg/gmp/rev/fbc7aba7ef17
changeset: 14034:fbc7aba7ef17
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri Mar 11 12:42:41 2011 +0100
description:
tests/devel/try support for some _ip1 and _ip2 functions.
details: /var/hg/gmp/rev/29a1f71e0aac
changeset: 14035:29a1f71e0aac
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri Mar 11 12:46:51 2011 +0100
description:
tune/speed support for some _ip1 and _ip2 functions.
details: /var/hg/gmp/rev/41fc97b31c4d
changeset: 14036:41fc97b31c4d
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri Mar 11 12:50:58 2011 +0100
description:
Replaced sublsh1 function with _ip1 version for k7 and atom.
diffstat:
ChangeLog | 16 +++
gmp-impl.h | 16 +++-
mpn/generic/toom_interpolate_5pts.c | 4 +-
mpn/x86/atom/sublsh1_n.asm | 2 +-
mpn/x86/k7/sublsh1_n.asm | 174 +++++++----------------------------
tests/devel/try.c | 80 ++++++++++++++++-
tests/refmpn.c | 60 ++++++++++++
tests/tests.h | 9 +
tune/common.c | 42 ++++++++
tune/speed.c | 18 +++
tune/speed.h | 6 +
11 files changed, 284 insertions(+), 143 deletions(-)
diffs (truncated from 686 to 300 lines):
diff -r 042d12fa4384 -r 41fc97b31c4d ChangeLog
--- a/ChangeLog Fri Mar 11 07:38:08 2011 +0100
+++ b/ChangeLog Fri Mar 11 12:50:58 2011 +0100
@@ -1,3 +1,19 @@
+2011-03-11 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * gmp-impl.h (mpn_sublsh1_n_ip1): Declare.
+ * mpn/generic/toom_interpolate_5pts.c: Use mpn_sublsh1_n_ip1.
+
+ * tests/devel/try.c: Tests for {add,sub}lsh*_n_ip[12].
+ * tests/refmpn.c: New reference for mpn_{add,sub}lsh*_n_ip[12].
+ * tests/tests.h: Declarations for reference functions above.
+
+ * tune/common.c: New speed_mpn_{add,sub}lsh*_n_ip[12] functions.
+ * tune/speed.h: Prototypes for functions above.
+ * tune/speed.c: Support for mpn_{add,sub}lsh*_n_ip[12].
+
+ * mpn/x86/k7/sublsh1_n.asm: Replaced generic sublsh1 code with faster _ip1.
+ * mpn/x86/atom/sublsh1_n.asm: Changed PROLOGUE accordingly.
+
2011-03-10 Marc Glisse <marc.glisse at inria.fr>
* tests/cxx/t-istream.cc: Explicit conversion to streampos.
diff -r 042d12fa4384 -r 41fc97b31c4d gmp-impl.h
--- a/gmp-impl.h Fri Mar 11 07:38:08 2011 +0100
+++ b/gmp-impl.h Fri Mar 11 12:50:58 2011 +0100
@@ -818,11 +818,25 @@
__GMP_DECLSPEC mp_limb_t mpn_addlsh_nc __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, unsigned int, mp_limb_t));
/* mpn_sublsh1_n(c,a,b,n), when it exists, sets {c,n} to {a,n}-2*{b,n}, and
- returns the borrow out (0, 1 or 2). */
+ returns the borrow out (0, 1 or 2). Use _ip1 when a=c. */
#define mpn_sublsh1_n __MPN(sublsh1_n)
__GMP_DECLSPEC mp_limb_t mpn_sublsh1_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
#define mpn_sublsh1_nc __MPN(sublsh1_nc)
__GMP_DECLSPEC mp_limb_t mpn_sublsh1_nc __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
+#if HAVE_NATIVE_mpn_sublsh1_n && ! HAVE_NATIVE_mpn_sublsh1_n_ip1
+#define mpn_sublsh1_n_ip1(dst,src,n) mpn_sublsh1_n(dst,dst,src,n)
+#define HAVE_NATIVE_mpn_sublsh1_n_ip1 1
+#else
+#define mpn_sublsh1_n_ip1 __MPN(sublsh1_n_ip1)
+__GMP_DECLSPEC mp_limb_t mpn_sublsh1_n_ip1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+#if HAVE_NATIVE_mpn_sublsh1_nc && ! HAVE_NATIVE_mpn_sublsh1_nc_ip1
+#define mpn_sublsh1_nc_ip1(dst,src,n,c) mpn_sublsh1_nc(dst,dst,src,n,c)
+#define HAVE_NATIVE_mpn_sublsh1_nc_ip1 1
+#else
+#define mpn_sublsh1_nc_ip1 __MPN(sublsh1_nc_ip1)
+__GMP_DECLSPEC mp_limb_t mpn_sublsh1_nc_ip1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+#endif
/* mpn_rsblsh1_n(c,a,b,n), when it exists, sets {c,n} to 2*{b,n}-{a,n}, and
returns the carry out (-1, 0, 1). */
diff -r 042d12fa4384 -r 41fc97b31c4d mpn/generic/toom_interpolate_5pts.c
--- a/mpn/generic/toom_interpolate_5pts.c Fri Mar 11 07:38:08 2011 +0100
+++ b/mpn/generic/toom_interpolate_5pts.c Fri Mar 11 12:50:58 2011 +0100
@@ -126,8 +126,8 @@
result is v2 >= 0 */
saved = vinf[0]; /* Remember v1's highest byte (will be overwritten). */
vinf[0] = vinf0; /* Set the right value for vinf0 */
-#ifdef HAVE_NATIVE_mpn_sublsh1_n
- cy = mpn_sublsh1_n (v2, v2, vinf, twor);
+#ifdef HAVE_NATIVE_mpn_sublsh1_n_ip1
+ cy = mpn_sublsh1_n_ip1 (v2, vinf, twor);
#else
/* Overwrite unused vm1 */
cy = mpn_lshift (vm1, vinf, twor, 1);
diff -r 042d12fa4384 -r 41fc97b31c4d mpn/x86/atom/sublsh1_n.asm
--- a/mpn/x86/atom/sublsh1_n.asm Fri Mar 11 07:38:08 2011 +0100
+++ b/mpn/x86/atom/sublsh1_n.asm Fri Mar 11 12:50:58 2011 +0100
@@ -19,5 +19,5 @@
include(`../config.m4')
-MULFUNC_PROLOGUE(mpn_sublsh1_n)
+MULFUNC_PROLOGUE(mpn_sublsh1_n_ip1)
include_mpn(`x86/k7/sublsh1_n.asm')
diff -r 042d12fa4384 -r 41fc97b31c4d mpn/x86/k7/sublsh1_n.asm
--- a/mpn/x86/k7/sublsh1_n.asm Fri Mar 11 07:38:08 2011 +0100
+++ b/mpn/x86/k7/sublsh1_n.asm Fri Mar 11 12:50:58 2011 +0100
@@ -1,4 +1,4 @@
-dnl AMD K7 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+dnl AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
dnl Copyright 2011 Free Software Foundation, Inc.
@@ -42,211 +42,109 @@
C AMD K8
C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
-C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately,
-C that means we need an initial magic multiply.
+C processors. It uses 2*4-way unrolling, for good reasons.
C
C Breaking carry recurrency might be a good idea. We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force us to 2*2-way unrolling.
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_DBLD, 12)
+defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
dnl re-use parameter space
define(VAR_COUNT,`PARAM_SIZE')
-define(VAR_TMP,`PARAM_DBLD')
define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_VP,`PARAM_DST')
+define(SAVE_EBP,`PARAM_DST')
ASM_START()
TEXT
ALIGN(8)
-PROLOGUE(mpn_sublsh1_n)
+PROLOGUE(mpn_sublsh1_n_ip1)
deflit(`FRAME',0)
define(`rp', `%edi')
define(`up', `%esi')
-define(`vp', `%ebp')
+ movl PARAM_SIZE, %eax C size
push up FRAME_pushl()
+ push rp FRAME_pushl()
+ xorl %edx, %edx
movl PARAM_SRC, up
- push rp FRAME_pushl()
movl PARAM_DST, rp
movl %ebx, SAVE_EBX
- movl PARAM_SIZE, %ebx C size
- movl vp, SAVE_VP
- movl PARAM_DBLD, vp
- cmp up, rp
- je L(inplace)
+ movl %eax, %ebx
+ shr $3, %eax
- mov $0x2aaaaaab, %eax
- mull %ebx
-
- not %edx C count = -(size\6)-1
- mov %edx, VAR_COUNT
-
- leal 3(%edx,%edx,2), %ecx C count*3+3 = -(size\6)*3
- xorl %edx, %edx
- leal (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6
- orl %ebx, %ebx
+ not %eax C count = -(size\8)-1
+ andl $7, %ebx C size % 8
jz L(exact)
L(oop):
ifdef(`CPU_P6',`
shr %edx ') C restore 2nd saved carry bit
- mov (vp), %eax
- adc %eax, %eax
+ mov (up), %ecx
+ adc %ecx, %ecx
rcr %edx C restore 1st saved carry bit
- mov (up), %ecx
- sbb %eax, %ecx
- mov %ecx, (rp)
+ lea 4(up), up
+ sbb %ecx, (rp)
+ lea 4(rp), rp
adc %edx, %edx C save a carry bit in edx
- lea 4(vp), vp
- lea 4(up), up
- lea 4(rp), rp
ifdef(`CPU_P6',`
adc %edx, %edx ') C save another carry bit in edx
decl %ebx
jnz L(oop)
- movl vp, VAR_TMP
L(exact):
- incl VAR_COUNT
+ inc %eax
jz L(end)
+ mov %eax, VAR_COUNT
+ movl %ebp, SAVE_EBP
ALIGN(16)
L(top):
ifdef(`CPU_P6',`
shr %edx ') C restore 2nd saved carry bit
- mov (vp), %eax
+ mov (up), %eax
adc %eax, %eax
- mov 4(vp), %ebx
+ mov 4(up), %ebx
adc %ebx, %ebx
- mov 8(vp), %ecx
+ mov 8(up), %ecx
adc %ecx, %ecx
-
- rcr %edx C restore 1st saved carry bit
-
- mov (up), vp
- sbb %eax, vp
- mov 4(up), %eax
- mov vp, (rp)
- sbb %ebx, %eax
- movl VAR_TMP, vp
- mov 8(up), %ebx
- mov %eax, 4(rp)
- sbb %ecx, %ebx
- mov %ebx, 8(rp)
-
- mov 12(vp), %eax
- adc %eax, %eax
- mov 16(vp), %ebx
- adc %ebx, %ebx
- mov 20(vp), %ecx
- adc %ecx, %ecx
-
- adc %edx, %edx C save a carry bit in edx
-
- mov 12(up), vp
- sbb %eax, vp
- mov 16(up), %eax
- mov vp, 12(rp)
- movl VAR_TMP, vp
- sbb %ebx, %eax
- mov 20(up), %ebx
- mov %eax, 16(rp)
- sbb %ecx, %ebx
- mov %ebx, 20(rp)
-
- lea 24(vp), vp
- lea 24(up), up
- lea 24(rp), rp
-
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- incl VAR_COUNT
- movl vp, VAR_TMP
- jne L(top)
-
- jmp L(end)
-
-L(inplace):
- xorl %edx, %edx
- movl %ebx, %eax
-
- shr $3, %eax
-
- not %eax C count = -(size\8)-1
- mov %eax, VAR_COUNT
-
- andl $7, %ebx C size % 8
- jz L(exactinplace)
-
-L(oopinplace):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (vp), %eax
- adc %eax, %eax
- rcr %edx C restore 1st saved carry bit
- sbb %eax, (rp)
- adc %edx, %edx C save a carry bit in edx
- lea 4(vp), vp
- lea 4(rp), rp
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- decl %ebx
- jnz L(oopinplace)
-L(exactinplace):
- incl VAR_COUNT
- jz L(end)
-
- ALIGN(16)
-L(unrolleight):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (vp), %eax
- adc %eax, %eax
- mov 4(vp), %ebx
- adc %ebx, %ebx
- mov 8(vp), %ecx
- adc %ecx, %ecx
- mov 12(vp), %esi
- adc %esi, %esi
+ mov 12(up), %ebp
+ adc %ebp, %ebp
rcr %edx C restore 1st saved carry bit
sbb %eax, (rp)
sbb %ebx, 4(rp)
sbb %ecx, 8(rp)
- sbb %esi, 12(rp)
+ sbb %ebp, 12(rp)
- mov 16(vp), %eax
+ mov 16(up), %eax
adc %eax, %eax
- mov 20(vp), %ebx
+ mov 20(up), %ebx
adc %ebx, %ebx
More information about the gmp-commit
mailing list