[Gmp-commit] /var/hg/gmp: 3 new changesets

mercurial at gmplib.org
Mon Feb 28 22:58:40 CET 2011


details:   /var/hg/gmp/rev/ff573b48f6e9
changeset: 13947:ff573b48f6e9
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Feb 28 20:22:26 2011 +0100
description:
(mpn_rootrem): Combine two similar scalar divisions.  Misc minor cleanup.

details:   /var/hg/gmp/rev/74a7752cfd14
changeset: 13948:74a7752cfd14
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Feb 28 20:23:17 2011 +0100
description:
*** empty log message ***

details:   /var/hg/gmp/rev/e42801a3626b
changeset: 13949:e42801a3626b
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Feb 28 22:58:27 2011 +0100
description:
Trivial merge.

diffstat:

 ChangeLog              |   30 +++++++++
 mpn/generic/mod_1_1.c  |  156 +++++++++++++++++++++++++++++++++++++++++++++++++
 mpn/generic/rootrem.c  |   23 +++---
 mpn/x86_64/mod_1_1.asm |   31 +++------
 tune/Makefile.am       |    2 +-
 tune/common.c          |   10 +++
 tune/mod_1_1-1.c       |   28 ++++++++
 tune/mod_1_1-2.c       |   28 ++++++++
 tune/speed.c           |    2 +
 tune/speed.h           |    8 ++
 tune/tuneup.c          |   24 +++++++
 11 files changed, 309 insertions(+), 33 deletions(-)

diffs (truncated from 519 to 300 lines):
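
Among the omitted lines are the two new tune wrappers, tune/mod_1_1-1.c and tune/mod_1_1-2.c.  Following the pattern of the other per-variant wrappers in tune/, each presumably does little more than force one MOD_1_1P_METHOD value and rename the entry points; a hypothetical sketch, not the actual file contents:

/* Sketch of tune/mod_1_1-1.c: build the generic code as method 1 under names
   the tuner can benchmark side by side.  The method-2 wrapper would differ
   only in the method number and the _2 suffix. */
#define MOD_1_1P_METHOD 1

#undef  mpn_mod_1_1p
#undef  mpn_mod_1_1p_cps
#define mpn_mod_1_1p      mpn_mod_1_1p_1
#define mpn_mod_1_1p_cps  mpn_mod_1_1p_cps_1

#include "mpn/generic/mod_1_1.c"

tune/tuneup.c can then time mpn_mod_1_1_1 against mpn_mod_1_1_2 via the new speed_mpn_mod_1_1_1 and speed_mpn_mod_1_1_2 routines and report the faster method as the MOD_1_1P_METHOD setting for gmp-mparam.h.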

diff -r 4828d99fcfb3 -r e42801a3626b ChangeLog
--- a/ChangeLog	Mon Feb 28 16:54:52 2011 +0100
+++ b/ChangeLog	Mon Feb 28 22:58:27 2011 +0100
@@ -1,5 +1,35 @@
+2011-02-28  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/generic/rootrem.c (mpn_rootrem): Combine two similar scalar
+	divisions.  Misc minor cleanup.
+
 2011-02-28  Niels Möller  <nisse at lysator.liu.se>
 
+	* tune/tuneup.c (tune_mod_1): Measure mpn_mod_1_1_1 and
+	mpn_mod_1_1_2, to set MOD_1_1P_METHOD.
+
+	* tune/speed.c (routine): Added mpn_mod_1_1_1 and mpn_mod_1_1_2.
+
+	* tune/speed.h: Declare speed_mpn_mod_1_1_1, speed_mpn_mod_1_1_2,
+	mpn_mod_1_1p_1, mpn_mod_1_1p_2, mpn_mod_1_1p_cps_1, and
+	mpn_mod_1_1p_cps_2.
+
+	* tune/common.c (speed_mpn_mod_1_1_1): New function.
+	(speed_mpn_mod_1_1_2): New function.
+
+	* tune/Makefile.am (libspeed_la_SOURCES): Added mod_1_1-1.c
+	mod_1_1-2.c.
+
+	* tune/mod_1_1-1.c: New file.
+	* tune/mod_1_1-2.c: New file.
+
+	* mpn/generic/mod_1_1.c: Implemented an algorithm with fewer
+	multiplications, configured via MOD_1_1P_METHOD.
+
+	* mpn/x86_64/mod_1_1.asm (mpn_mod_1_1p_cps): Simplified
+	computation of B2modb, use B^2 mod (normalized b).
+	(mpn_mod_1_1p): Corresponding changes. Don't shift b.
+
 	* mpn/generic/pre_mod_1.c (mpn_preinv_mod_1): Use udiv_rnnd_preinv
 	rather than udiv_qrnnd_preinv.
 
diff -r 4828d99fcfb3 -r e42801a3626b mpn/generic/mod_1_1.c
--- a/mpn/generic/mod_1_1.c	Mon Feb 28 16:54:52 2011 +0100
+++ b/mpn/generic/mod_1_1.c	Mon Feb 28 22:58:27 2011 +0100
@@ -29,6 +29,61 @@
 #include "gmp-impl.h"
 #include "longlong.h"
 
+#ifndef MOD_1_1P_METHOD
+# define MOD_1_1P_METHOD 1
+#endif
+
+/* Define some longlong.h-style macros, but for wider operations.
+ * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
+ * carry out, in the form of a mask. */
+
+#if defined (__GNUC__)
+
+#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
+#define add_mssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tsbb\t%k0, %k0"               \
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+	   : "0"  ((USItype)(s2)),                                      \
+	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
+	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_mssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tsbb\t%q0, %q0"               \
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+	   : "0"  ((UDItype)(s2)),                                      \
+	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
+	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+/* FIXME: How to do carry -> mask on powerpc? */
+#if 0 && HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"        \
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+	   : "0"  ((UDItype)(s2)),                                      \
+	     "r"  ((UDItype)(a1)), "r" ((UDItype)(b1)),                 \
+	     "%2" ((UDItype)(a0)), "rI" ((UDItype)(b0)))
+#endif
+#endif /* defined (__GNUC__) */
+
+#ifndef add_mssaaaa
+#define add_mssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  do {                                                                  \
+    UWtype __s0, __s1, __c0, __c1;                                      \
+    __s0 = (a0) + (b0);                                                 \
+    __s1 = (a1) + (b1);                                                 \
+    __c0 = __s0 < (a0);                                                 \
+    __c1 = __s1 < (a1);                                                 \
+    (s0) = __s0;                                                        \
+    __s1 = __s1 + __c0;                                                 \
+    (s1) = __s1;                                                        \
+    (s2) = - (__c1 + (__s1 < __c0));					\
+  } while (0)
+#endif
+
+#if MOD_1_1P_METHOD == 1
 void
 mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
 {
@@ -103,3 +158,104 @@
 
   return r >> cnt;
 }
+#endif /* MOD_1_1P_METHOD == 1 */
+
+#if MOD_1_1P_METHOD == 2
+void
+mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
+{
+  mp_limb_t bi;
+  mp_limb_t B2modb;
+  int cnt;
+
+  count_leading_zeros (cnt, b);
+
+  b <<= cnt;
+  invert_limb (bi, b);
+
+  cps[0] = bi;
+  cps[1] = cnt;
+
+  if (LIKELY (cnt != 0))
+    {
+      mp_limb_t B1modb = -b;
+      B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
+      ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
+      cps[2] = B1modb >> cnt;
+    }
+  B2modb = - b * bi;
+  ASSERT (B2modb <= b);    /* NB: equality iff b = B/2 */
+  cps[3] = B2modb;
+}
+
+mp_limb_t
+mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
+{
+  int cnt;
+  mp_limb_t bi, B1modb;
+  mp_limb_t r0, r1;
+  mp_limb_t r;
+
+  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */
+
+  r0 = ap[n-2];
+  r1 = ap[n-1];
+
+  if (n > 2)
+    {
+      mp_limb_t B2modb, B2mb;
+      mp_limb_t p0, p1;
+      mp_limb_t r2;
+      mp_size_t j;
+
+      B2modb = bmodb[3];
+      B2mb = B2modb - b;
+
+      umul_ppmm (p1, p0, r1, B2modb);
+      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);
+
+      for (j = n-4; j >= 0; j--)
+	{
+	  mp_limb_t cy;
+	  /* mp_limb_t t = r0 + B2mb; */
+	  umul_ppmm (p1, p0, r1, B2modb);
+
+	  ADDC_LIMB (cy, r0, r0, r2 & B2modb);
+	  /* Alternative, for cmov: if (cy) r0 = t; */
+	  r0 -= (-cy) & b;
+	  add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
+	}
+
+      r1 -= (r2 & b);
+    }
+
+  cnt = bmodb[1];
+
+  if (LIKELY (cnt != 0))
+    {
+      mp_limb_t t;
+      mp_limb_t B1modb = bmodb[2];
+
+      umul_ppmm (r1, t, r1, B1modb);
+      r0 += t;
+      r1 += (r0 < t);
+
+      /* Normalize */
+      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
+      r0 <<= cnt;
+
+      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows
+	 that. */
+    }
+  else
+    {
+      mp_limb_t mask = - (r1 >= b);
+      r1 -= mask & b;
+    }
+
+  bi = bmodb[0];
+
+  udiv_rnnd_preinv (r, r1, r0, b, bi);
+  return r >> cnt;
+}
+#endif /* MOD_1_1P_METHOD == 2 */
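
The carry-as-mask convention of add_mssaaaa can be checked in isolation.  Below is a small standalone program, illustration only and not part of the commit, with plain unsigned long standing in for mp_limb_t and the portable fallback from above:

#include <stdio.h>

/* (s1,s0) = (a1,a0) + (b1,b0); s2 gets the carry out as a mask: 0 or ~0UL */
#define add_mssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    unsigned long __s0, __s1, __c0, __c1;                               \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                /* carry out of the low word */  \
    __c1 = __s1 < (a1);                /* carry out of the high word */ \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) = - (__c1 + (__s1 < __c0));                                    \
  } while (0)

int main (void)
{
  unsigned long s2, s1, s0;

  /* the two-word sum wraps around, so the mask is all ones */
  add_mssaaaa (s2, s1, s0, ~0UL, ~0UL, 0UL, 1UL);
  printf ("mask=%lx high=%lu low=%lu\n", s2, s1, s0);   /* mask=ff..f 0 0 */

  /* no carry out, so the mask is zero */
  add_mssaaaa (s2, s1, s0, 1UL, 2UL, 3UL, 4UL);
  printf ("mask=%lx high=%lu low=%lu\n", s2, s1, s0);   /* mask=0 4 6 */

  return 0;
}
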
diff -r 4828d99fcfb3 -r e42801a3626b mpn/generic/rootrem.c
--- a/mpn/generic/rootrem.c	Mon Feb 28 16:54:52 2011 +0100
+++ b/mpn/generic/rootrem.c	Mon Feb 28 22:58:27 2011 +0100
@@ -79,14 +79,15 @@
 mpn_rootrem (mp_ptr rootp, mp_ptr remp,
 	     mp_srcptr up, mp_size_t un, mp_limb_t k)
 {
+  mp_size_t m;
   ASSERT (un > 0);
   ASSERT (up[un - 1] != 0);
   ASSERT (k > 1);
 
-  if ((remp == NULL) && (un / k > 2))
-    /* call mpn_rootrem recursively, padding {up,un} with k zero limbs,
-       which will produce an approximate root with one more limb,
-       so that in most cases we can conclude. */
+  m = (un - 1) / k;		/* ceil(un/k) - 1 */
+  if (remp == NULL && m > 2)
+    /* Pad {up,un} with k zero limbs.  This will produce an approximate root
+       with one more limb, allowing us to compute the exact integral result. */
     {
       mp_ptr sp, wp;
       mp_size_t rn, sn, wn;
@@ -94,21 +95,21 @@
       TMP_MARK;
       wn = un + k;
       wp = TMP_ALLOC_LIMBS (wn); /* will contain the padded input */
-      sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */
+      sn = m + 2; /* ceil(un/k) + 1 */
       sp = TMP_ALLOC_LIMBS (sn); /* approximate root of padded input */
       MPN_COPY (wp + k, up, un);
       MPN_ZERO (wp, k);
       rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1);
-      /* the approximate root S = {sp,sn} is either the correct root of
-	 {sp,sn}, or one too large. Thus unless the least significant limb
-	 of S is 0 or 1, we can deduce the root of {up,un} is S truncated by
-	 one limb. (In case sp[0]=1, we can deduce the root, but not decide
+      /* The approximate root S = {sp,sn} is either the correct root of
+	 {sp,sn}, or 1 too large.  Thus unless the least significant limb of
+	 S is 0 or 1, we can deduce the root of {up,un} is S truncated by one
+	 limb.  (In case sp[0]=1, we can deduce the root, but not decide
 	 whether it is exact or not.) */
       MPN_COPY (rootp, sp + 1, sn - 1);
       TMP_FREE;
       return rn;
     }
-  else /* remp <> NULL */
+  else
     {
       return mpn_rootrem_internal (rootp, remp, up, un, k, 0);
     }
@@ -395,7 +396,7 @@
       ASSERT_ALWAYS (rn >= qn);
 
       /* R = R - Q = floor(U/2^kk) - S^k */
-      if ((i > 1) || (approx == 0))
+      if (i > 1 || approx == 0)
 	{
 	  mpn_sub (rp, rp, rn, qp, qn);
 	  MPN_NORMALIZE (rp, rn);
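
The padding trick above is easy to see in a toy base-10 analogue, illustration only and not GMP code: the k-th root of u*10^k is 10 times the k-th root of u, so an approximate root of the padded number carries one extra digit, and dropping that digit gives floor(u^(1/k)) except in the corner cases (low limb 0 or 1) noted in the comment above.

#include <math.h>
#include <stdio.h>

int main (void)
{
  /* k = 3: pad u = 26 with three zero digits (the analogue of k zero limbs) */
  unsigned long u = 26, padded = u * 1000;
  unsigned long s = (unsigned long) cbrt ((double) padded);

  /* correct any floating-point rounding in the approximation */
  while ((s + 1) * (s + 1) * (s + 1) <= padded)
    s++;
  while (s * s * s > padded)
    s--;

  printf ("root of padded input: %lu\n", s);        /* 29: 29^3 <= 26000 < 30^3 */
  printf ("floor(cbrt(%lu)) = %lu\n", u, s / 10);   /* 2, the exact truncated root */
  return 0;
}

mpn_rootrem does the same limb-wise: {up,un} is padded with k zero limbs, the approximate root S of the padded number is at most one too large, and unless its low limb is 0 or 1 the root of {up,un} is S with that limb dropped.
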
diff -r 4828d99fcfb3 -r e42801a3626b mpn/x86_64/mod_1_1.asm
--- a/mpn/x86_64/mod_1_1.asm	Mon Feb 28 16:54:52 2011 +0100
+++ b/mpn/x86_64/mod_1_1.asm	Mon Feb 28 22:58:27 2011 +0100
@@ -48,10 +48,6 @@
 C The pre array contains bi, cnt, B1modb, B2modb
 C Note: This implementation needs B1modb only when cnt > 0
 
-C Currently needs b to not be preshifted, we actually have to undo shift done
-C by caller.  Perhaps b shouldn't be passed at all, it should be in the pre
-C block where the cps function is free to store whatever is needed.
-
 C The iteration is almost as follows,
 C
 C   r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u
@@ -80,9 +76,6 @@
 	mov	%rdx, b
 	mov	%rcx, pre
 
-	mov	8(pre), R32(%rcx)
-	shr	R8(%rcx), b
-
 	mov	-8(ap, n, 8), %rax
 	cmp	$3, n
 	jnc	L(first)
@@ -128,7 +121,7 @@
 	test	R32(%rcx), R32(%rcx)
 	jz	L(normalized)
 
-	C Unnormalized, use B1modb to reduce to size < B b
+	C Unnormalized, use B1modb to reduce to size < B (b+1)
 	mulq	16(pre)
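
As a quick check of the "use B^2 mod (normalized b)" simplification, i.e. the  B2modb = - b * bi  line in the method-2 mpn_mod_1_1p_cps above: for normalized b, invert_limb gives bi = floor((B^2 - 1)/b) - B, so writing B^2 - 1 = (B + bi)*b + r with 0 <= r < b,

      B^2  =  (B + bi)*b + (r + 1)
      b*bi =  B^2 - B*b - (r + 1)
     -b*bi == r + 1                 (mod B)

and since 1 <= r + 1 <= b < B, the single-limb product - b * bi is exactly r + 1, which is B^2 mod b represented in the range (0, b].  It equals b precisely when b divides B^2, i.e. (for normalized b) when b = B/2, which is what the ASSERT (B2modb <= b) with its "equality iff b = B/2" note allows for.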

