[Gmp-commit] /home/hgfiles/gmp: 4 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Sat Jan 22 18:10:06 CET 2011


details:   /home/hgfiles/gmp/rev/ea1884c2e91d
changeset: 13757:ea1884c2e91d
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Jan 22 14:19:31 2011 +0100
description:
Add new internal mpn routines addmul_2s and sqr_diag_addlsh1.

details:   /home/hgfiles/gmp/rev/4b6555afd6a4
changeset: 13758:4b6555afd6a4
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Jan 22 17:55:28 2011 +0100
description:
Rewrite mpn_sqr_basecase.

details:   /home/hgfiles/gmp/rev/42cadec09a14
changeset: 13759:42cadec09a14
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Jan 22 18:01:19 2011 +0100
description:
Define some shorter convenience mnemonics.

details:   /home/hgfiles/gmp/rev/7ba9324af20a
changeset: 13760:7ba9324af20a
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Jan 22 18:04:52 2011 +0100
description:
mpn/ia64/sqr_diag_addlsh1.asm: New file.

diffstat:

 ChangeLog                     |   17 +++++
 configure.in                  |    4 +-
 gmp-impl.h                    |    7 ++
 mpn/asm-defs.m4               |    2 +
 mpn/generic/sqr_basecase.c    |   55 ++++++++++------
 mpn/ia64/ia64-defs.m4         |   12 +++
 mpn/ia64/sqr_diag_addlsh1.asm |  133 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 208 insertions(+), 22 deletions(-)

diffs (truncated from 360 to 300 lines):

diff -r 33ef11b9a199 -r 7ba9324af20a ChangeLog
--- a/ChangeLog	Sat Jan 22 14:03:18 2011 +0100
+++ b/ChangeLog	Sat Jan 22 18:04:52 2011 +0100
@@ -1,3 +1,20 @@
+2011-01-22  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/ia64/sqr_diag_addlsh1.asm: New file.
+
+	* mpn/ia64/ia64-defs.m4: Define some shorter convenience mnemonics.
+
+	* mpn/generic/sqr_basecase.c (MPN_SQR_DIAG_ADDLSH1): New macro, using
+	new function mpn_sqr_diag_addlsh1 or defining its equivalent.
+
+	* gmp-impl.h (mpn_addmul_2s): Declare.
+	(mpn_sqr_diag_addlsh1): Declare.
+	* mpn/asm-defs.m4 (define_mpn): Add addmul_2s and sqr_diag_addlsh1.
+
+	* configure.in: Add HAVE_NATIVEs for mpn_sqr_diag_addlsh1 and
+	mpn_addmul_2s.
+	(gmp_mpn_functions_optional): Add sqr_diag_addlsh1.
+
 2011-01-21 Marco Bodrato <bodrato at mail.dm.unipi.it>
 
 	* tests/devel/try.c: Initial support for mpn_bdiv_q_1.
diff -r 33ef11b9a199 -r 7ba9324af20a configure.in
--- a/configure.in	Sat Jan 22 14:03:18 2011 +0100
+++ b/configure.in	Sat Jan 22 18:04:52 2011 +0100
@@ -2500,7 +2500,7 @@
 #       divrem_1 and pre_divrem_1.
 
 gmp_mpn_functions_optional="umul udiv					\
-  invert_limb sqr_diagonal						\
+  invert_limb sqr_diagonal sqr_diag_addlsh1				\
   mul_2 mul_3 mul_4 mul_5 mul_6						\
   addmul_2 addmul_3 addmul_4 addmul_5 addmul_6 addmul_7 addmul_8	\
   addlsh1_n sublsh1_n rsblsh1_n rsh1add_n rsh1sub_n			\
@@ -3011,6 +3011,7 @@
 #undef HAVE_NATIVE_mpn_addmul_6
 #undef HAVE_NATIVE_mpn_addmul_7
 #undef HAVE_NATIVE_mpn_addmul_8
+#undef HAVE_NATIVE_mpn_addmul_2s
 #undef HAVE_NATIVE_mpn_and_n
 #undef HAVE_NATIVE_mpn_andn_n
 #undef HAVE_NATIVE_mpn_bdiv_dbm1c
@@ -3065,6 +3066,7 @@
 #undef HAVE_NATIVE_mpn_rshift
 #undef HAVE_NATIVE_mpn_sqr_basecase
 #undef HAVE_NATIVE_mpn_sqr_diagonal
+#undef HAVE_NATIVE_mpn_sqr_diag_addlsh1
 #undef HAVE_NATIVE_mpn_sub_n
 #undef HAVE_NATIVE_mpn_sub_nc
 #undef HAVE_NATIVE_mpn_sublsh1_n
diff -r 33ef11b9a199 -r 7ba9324af20a gmp-impl.h
--- a/gmp-impl.h	Sat Jan 22 14:03:18 2011 +0100
+++ b/gmp-impl.h	Sat Jan 22 18:04:52 2011 +0100
@@ -792,6 +792,10 @@
 #define mpn_addmul_8 __MPN(addmul_8)
 __GMP_DECLSPEC mp_limb_t mpn_addmul_8 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr));
 
+/* Alternative entry point in mpn_addmul_2 for the benefit of mpn_sqr_basecase.  */
+#define mpn_addmul_2s __MPN(addmul_2s)
+__GMP_DECLSPEC mp_limb_t mpn_addmul_2s __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr));
+
 /* mpn_addlsh1_n(c,a,b,n), when it exists, sets {c,n} to {a,n}+2*{b,n}, and
    returns the carry out (0, 1 or 2).  */
 #define mpn_addlsh1_n __MPN(addlsh1_n)
@@ -1076,6 +1080,9 @@
 #define   mpn_sqr_diagonal __MPN(sqr_diagonal)
 __GMP_DECLSPEC void      mpn_sqr_diagonal __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
 
+#define mpn_sqr_diag_addlsh1 __MPN(sqr_diag_addlsh1)
+__GMP_DECLSPEC void      mpn_sqr_diag_addlsh1 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+
 #define   mpn_toom_interpolate_5pts __MPN(toom_interpolate_5pts)
 __GMP_DECLSPEC void      mpn_toom_interpolate_5pts __GMP_PROTO ((mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_size_t, int, mp_limb_t));
 
diff -r 33ef11b9a199 -r 7ba9324af20a mpn/asm-defs.m4
--- a/mpn/asm-defs.m4	Sat Jan 22 14:03:18 2011 +0100
+++ b/mpn/asm-defs.m4	Sat Jan 22 18:04:52 2011 +0100
@@ -1320,6 +1320,7 @@
 define_mpn(addmul_6)
 define_mpn(addmul_7)
 define_mpn(addmul_8)
+define_mpn(addmul_2s)
 define_mpn(add_n_sub_n)
 define_mpn(add_n_sub_nc)
 define_mpn(addaddmul_1msb0)
@@ -1404,6 +1405,7 @@
 define_mpn(set_str)
 define_mpn(sqr_basecase)
 define_mpn(sqr_diagonal)
+define_mpn(sqr_diag_addlsh1)
 define_mpn(sub_n)
 define_mpn(sublsh1_n)
 define_mpn(sublsh2_n)
diff -r 33ef11b9a199 -r 7ba9324af20a mpn/generic/sqr_basecase.c
--- a/mpn/generic/sqr_basecase.c	Sat Jan 22 14:03:18 2011 +0100
+++ b/mpn/generic/sqr_basecase.c	Sat Jan 22 18:04:52 2011 +0100
@@ -6,7 +6,7 @@
 
 
 Copyright 1991, 1992, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2003, 2004,
-2005, 2008 Free Software Foundation, Inc.
+2005, 2008, 2010, 2011 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -45,6 +45,35 @@
   } while (0)
 #endif
 
+/* MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n).  Expands to a call of
+   mpn_sqr_diag_addlsh1 when HAVE_NATIVE_mpn_sqr_diag_addlsh1 is set.
+   Otherwise it runs MPN_SQR_DIAGONAL (rp, up, n), adds 2*{tp,2n-2} into
+   {rp+1,2n-2}, and adds the resulting carry into rp[2n-1].  The variant
+   without mpn_addlsh1_n shifts {tp,2n-2} left one bit in place first.  */
+#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
+#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n)				\
+  mpn_sqr_diag_addlsh1 (rp, tp, up, n)
+#else
+#if HAVE_NATIVE_mpn_addlsh1_n
+#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n)				\
+  do {									\
+    mp_limb_t cy;							\
+    MPN_SQR_DIAGONAL (rp, up, n);					\
+    cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);			\
+    rp[2 * n - 1] += cy;						\
+  } while (0)
+#else
+#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n)				\
+  do {									\
+    mp_limb_t cy;							\
+    MPN_SQR_DIAGONAL (rp, up, n);					\
+    cy = mpn_lshift (tp, tp, 2 * n - 2, 1);				\
+    cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);			\
+    rp[2 * n - 1] += cy;						\
+  } while (0)
+#endif
+#endif
+
 
 #undef READY_WITH_mpn_sqr_basecase
 
@@ -84,9 +107,13 @@
     {
       if (n == 2)
 	{
+#if HAVE_NATIVE_mpn_mul_2
+	  rp[3] = mpn_mul_2 (rp, up, 2, up);
+#else
 	  rp[0] = 0;
 	  rp[1] = 0;
 	  rp[3] = mpn_addmul_2 (rp, up, 2, up);
+#endif
 	  return;
 	}
 
@@ -101,15 +128,7 @@
       tp[2 * n - 3] = cy;
     }
 
-  MPN_SQR_DIAGONAL (rp, up, n);
-
-#if HAVE_NATIVE_mpn_addlsh1_n
-  cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
-#else
-  cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
-  cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
-#endif
-  rp[2 * n - 1] += cy;
+  MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n);
 }
 #define READY_WITH_mpn_sqr_basecase
 #endif
@@ -194,9 +213,13 @@
 
       if (n == 2)
 	{
+#if HAVE_NATIVE_mpn_mul_2
+	  rp[3] = mpn_mul_2 (rp, up, 2, up);
+#else
 	  rp[0] = 0;
 	  rp[1] = 0;
 	  rp[3] = mpn_addmul_2 (rp, up, 2, up);
+#endif
 	  return;
 	}
 
@@ -283,18 +306,8 @@
 	  cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
 	  tp[n + i - 2] = cy;
 	}
-      MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);
 
-      {
-	mp_limb_t cy;
-#if HAVE_NATIVE_mpn_addlsh1_n
-	cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
-#else
-	cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
-	cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
-#endif
-	rp[2 * n - 1] += cy;
-      }
+      MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n);
     }
 }
 #endif
diff -r 33ef11b9a199 -r 7ba9324af20a mpn/ia64/ia64-defs.m4
--- a/mpn/ia64/ia64-defs.m4	Sat Jan 22 14:03:18 2011 +0100
+++ b/mpn/ia64/ia64-defs.m4	Sat Jan 22 18:04:52 2011 +0100
@@ -120,5 +120,17 @@
 ')')
 define(`ASSERT_label_counter',1)
 
+define(`getfsig', `getf.sig')
+define(`setfsig', `setf.sig')
+define(`cmpeq',   `cmp.eq')
+define(`cmpne',   `cmp.ne')
+define(`cmpltu',  `cmp.ltu')
+define(`cmpleu',  `cmp.leu')
+define(`cmpgtu',  `cmp.gtu')
+define(`cmpgeu',  `cmp.geu')
+define(`cmple',   `cmp.le')
+define(`cmpgt',   `cmp.gt')
+define(`cmpeqor', `cmp.eq.or')
+define(`cmpequc', `cmp.eq.unc')
 
 divert
diff -r 33ef11b9a199 -r 7ba9324af20a mpn/ia64/sqr_diag_addlsh1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/ia64/sqr_diag_addlsh1.asm	Sat Jan 22 18:04:52 2011 +0100
@@ -0,0 +1,133 @@
+dnl  IA-64 mpn_sqr_diag_addlsh1
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    2	Unrolling could bring it to 1.5 + epsilon
+
+C Exact performance table.  The 2nd line is this code, the 3rd line is ctop-
+C less code.  In an assembly sqr_basecase, the ctop-full numbers will become a
+C few cycles better since we can mitigate the many I0 instructions.
+C
+C 1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
+C -  20  22  24  26  28  30  32  34  36  38  40  42  44  46  48  50  52  54  56 Needs updating
+C -  13  16  17  18  20  21  23  25  26  30  31  31  33  34  36  38  39  42  43
+
+C We should keep in mind that this code takes linear time in a O(n^2) context
+C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
+C around 60.  Keeping overhead down for smallish operands (< 10) is more
+C important than optimal cycle counts.
+
+C TODO
+C  * Make sure we don't depend on uninitialised r-registers, f-registers, or
+C    p-registers.
+C  * Optimise by doing first two loop iterations in function header.
+
+C INPUT PARAMETERS
+define(`rp_param', `r32')  define(`rp', `r14')		C size: 2n
+define(`tp_param', `r33')  define(`tp', `r15')		C size: 2n - 2
+define(`up_param', `r34')  define(`up', `r31')		C size: n
+define(`n',  `r35')
+
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+
+	.prologue
+	.save	ar.pfs, r2
+	.save	ar.lc, r3
+	.body
+
+.mmi;		alloc	r2 = ar.pfs, 4,24,0,24	C			M
+		nop	4711
+		mov	r3 = ar.lc		C			I0
+.mmi;		mov	tp = tp_param		C			M I
+		mov	up = up_param		C			M I
+		mov	rp = rp_param		C			M I
+	;;
+.mmi;		ld8	r36 = [tp], 8		C			M
+		add	r20 = -2, n		C			M I
+		mov	r9 = ar.ec		C			I0
+	;;
+.mmi;		ld8	r32 = [tp], 8		C			M
+		mov	r16 = 0			C			M I


More information about the gmp-commit mailing list