[Gmp-commit] /var/hg/gmp: 6 new changesets

Tue Oct 11 00:08:37 CEST 2011

details:   /var/hg/gmp/rev/304715fdf683
changeset: 14302:304715fdf683
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Oct 10 20:19:48 2011 +0200
description:
(STCK): Use proper memory constraint.

details:   /var/hg/gmp/rev/6c9488cb9164
changeset: 14303:6c9488cb9164
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Oct 10 20:20:01 2011 +0200
description:
*** empty log message ***

details:   /var/hg/gmp/rev/775c88cc1414
changeset: 14304:775c88cc1414
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Oct 10 21:52:10 2011 +0200
description:
(add_mssaaaa): Add s390x variant.  Put arm code inside __GNUC__.

details:   /var/hg/gmp/rev/ef982b1e0db7
changeset: 14305:ef982b1e0db7
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Oct 11 00:05:18 2011 +0200
description:
New file.

details:   /var/hg/gmp/rev/f0023774d824
changeset: 14306:f0023774d824
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Oct 11 00:06:51 2011 +0200
description:
Add cycle numbers.

details:   /var/hg/gmp/rev/3acb09aa1091
changeset: 14307:3acb09aa1091
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Oct 11 00:08:33 2011 +0200
description:
Trivial merge.

diffstat:

 ChangeLog                  |   18 +
 gmp-impl.h                 |    4 +
 mpn/generic/hgcd_appr.c    |  476 +++++++++++++++++++++++++++++++++-----------
 mpn/generic/mod_1_1.c      |   13 +-
 mpn/s390_64/bdiv_dbm1c.asm |    2 +
 mpn/s390_64/copyd.asm      |  125 +++++++++++
 tests/mpn/t-hgcd_appr.c    |   26 ++-
 tune/time.c                |    2 +-
 8 files changed, 537 insertions(+), 129 deletions(-)

diffs (truncated from 820 to 300 lines):

diff -r 74dd752b960d -r 3acb09aa1091 ChangeLog

--- a/ChangeLog	Mon Oct 10 17:06:37 2011 +0200
+++ b/ChangeLog	Tue Oct 11 00:08:33 2011 +0200
@@ -1,10 +1,28 @@
+2011-10-10  Niels Möller  <nisse at lysator.liu.se>
+
+	* mpn/generic/hgcd_appr.c: Deleted debugging code.
+
+	* tests/mpn/t-hgcd_appr.c (main): Added -v flag.
+	(hgcd_appr_valid_p): Increased margin of non-minimality for
+	divide-and-conquer algorithm. Display bit counts only if
+	-v is used.
+
+	* mpn/generic/hgcd_appr.c (submul): New (static) function.
+	(hgcd_matrix_apply): New function.
+	(mpn_hgcd_appr_itch): Account for divide-and-conquer algorithm.
+	(mpn_hgcd_appr): Implemented divide-and-conquer.
+
 2011-10-10  Torbjorn Granlund  <tege at gmplib.org>
 
+	* tune/time.c (STCK): Use proper memory constraint.
+
 	From Marco Trudel:
 	* tests/mpz/t-scan.c (check_ref): Fix loop end bound.
 
 2011-10-10  Niels Möller  <nisse at lysator.liu.se>
 
+	* gmp-impl.h: (HGCD_APPR_THRESHOLD): New threshold.
+
 	* mpn/generic/hgcd_appr.c (mpn_hgcd_appr): Interface change.
 	Destroy inputs, let caller make working copies if needed.
 	(mpn_hgcd_appr_itch): Reduced scratch need.
diff -r 74dd752b960d -r 3acb09aa1091 gmp-impl.h
--- a/gmp-impl.h	Mon Oct 10 17:06:37 2011 +0200
+++ b/gmp-impl.h	Tue Oct 11 00:08:33 2011 +0200
@@ -4057,6 +4057,10 @@
 #define HGCD_THRESHOLD 400
 #endif
 
+#ifndef HGCD_APPR_THRESHOLD
+#define HGCD_APPR_THRESHOLD 300
+#endif
+
 #ifndef GCD_DC_THRESHOLD
 #define GCD_DC_THRESHOLD 1000
 #endif
diff -r 74dd752b960d -r 3acb09aa1091 mpn/generic/hgcd_appr.c
--- a/mpn/generic/hgcd_appr.c	Mon Oct 10 17:06:37 2011 +0200
+++ b/mpn/generic/hgcd_appr.c	Tue Oct 11 00:08:33 2011 +0200
@@ -21,20 +21,196 @@
 You should have received a copy of the GNU Lesser General Public License
 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 
-#include <stdio.h>
-
 #include "gmp.h"
 #include "gmp-impl.h"
 #include "longlong.h"
 
-#define TRACE 0
+/* Computes R -= A * B. Result must be non-negative. Normalized down
+   to size an, and resulting size is returned. */
+static mp_size_t
+submul (mp_ptr rp, mp_size_t rn,
+	mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
+{
+  mp_ptr tp;
+  TMP_DECL;
 
-/* Scratch need: Current Lehmer algorithm needs n limbs for
-   hgcd_appr_step. */
+  ASSERT (bn > 0);
+  ASSERT (an >= bn);
+  ASSERT (rn >= an);
+  ASSERT (an + bn <= rn + 1);
+  
+  TMP_MARK;
+  tp = TMP_ALLOC_LIMBS (an + bn);
+
+  mpn_mul (tp, ap, an, bp, bn);
+  if (an + bn > rn)
+    {
+      ASSERT (tp[rn] == 0);
+      bn--;
+    }
+  ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn));
+  TMP_FREE;
+
+  while (rn > an && (rp[rn-1] == 0))
+    rn--;
+
+  return rn;
+}
+
+/* Computes (a, b)  <--  M^{-1} (a; b) */
+/* FIXME:
+    x Take scratch parameter, and figure out scratch need.
+
+    x Use some fallback for small M->n?    
+*/
+static mp_size_t
+hgcd_matrix_apply (const struct hgcd_matrix *M,
+		   mp_ptr ap, mp_ptr bp,
+		   mp_size_t n)
+{
+  mp_size_t an, bn, un, vn, nn;
+  mp_size_t mn[2][2];
+  mp_size_t modn;
+  mp_ptr tp, sp, scratch;
+  mp_limb_t cy;
+  unsigned i, j;
+
+  TMP_DECL;
+
+  ASSERT ( (ap[n-1] | bp[n-1]) > 0);
+
+  an = n;
+  MPN_NORMALIZE (ap, an);
+  bn = n;
+  MPN_NORMALIZE (bp, bn);
+  
+  for (i = 0; i < 2; i++)
+    for (j = 0; j < 2; j++)
+      {
+	mp_size_t k;
+	k = M->n;
+	MPN_NORMALIZE (M->p[i][j], k);
+	mn[i][j] = k;
+      }
+
+  ASSERT (mn[0][0] > 0);
+  ASSERT (mn[1][1] > 0);
+  ASSERT ( (mn[0][1] | mn[1][0]) > 0);
+
+  TMP_MARK;
+
+  if (mn[0][1] == 0)
+    {
+      mp_size_t qn;
+      
+      /* A unchanged, M = (1, 0; q, 1) */
+      ASSERT (mn[0][0] == 1);
+      ASSERT (M->p[0][0][0] == 1);
+      ASSERT (mn[1][1] == 1);
+      ASSERT (M->p[1][1][0] == 1);
+
+      /* Put B <-- B - q A */
+      nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
+    }
+  else if (mn[1][0] == 0)
+    {
+      /* B unchanged, M = (1, q; 0, 1) */
+      ASSERT (mn[0][0] == 1);
+      ASSERT (M->p[0][0][0] == 1);
+      ASSERT (mn[1][1] == 1);
+      ASSERT (M->p[1][1][0] == 1);
+
+      /* Put A  <-- A - q * B */
+      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);      
+    }
+  else
+    {
+      /* A = m00 a + m01 b  ==> a <= A / m00, b <= A / m01.
+	 B = m10 a + m11 b  ==> a <= B / m10, b <= B / m11. */
+      un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
+      vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;
+
+      nn = MAX (un, vn);
+      /* In the range of interest, mulmod_bnm1 should always beat mullo. */
+      modn = mpn_mulmod_bnm1_next_size (nn + 1);
+
+      scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
+      tp = TMP_ALLOC_LIMBS (modn);
+      sp = TMP_ALLOC_LIMBS (modn);
+
+      ASSERT (n <= 2*modn);
+
+      if (n > modn)
+	{
+	  cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
+	  MPN_INCR_U (ap, modn, cy);
+
+	  cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
+	  MPN_INCR_U (bp, modn, cy);
+
+	  n = modn;
+	}
+
+      mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
+      mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);
+
+      /* FIXME: Handle the small n case in some better way. */
+      if (n + mn[1][1] < modn)
+	MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
+      if (n + mn[0][1] < modn)
+	MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);
+  
+      cy = mpn_sub_n (tp, tp, sp, modn);
+      MPN_DECR_U (tp, modn, cy);
+
+      ASSERT (mpn_zero_p (tp + nn, modn - nn));
+
+      mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
+      MPN_COPY (ap, tp, nn);
+      mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);
+
+      if (n + mn[1][0] < modn)
+	MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
+      if (n + mn[0][0] < modn)
+	MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);
+
+      cy = mpn_sub_n (tp, tp, sp, modn);
+      MPN_DECR_U (tp, modn, cy);
+
+      ASSERT (mpn_zero_p (tp + nn, modn - nn));
+      MPN_COPY (bp, tp, nn);
+
+      while ( (ap[nn-1] | bp[nn-1]) == 0)
+	{
+	  nn--;
+	  ASSERT (nn > 0);
+	}
+    }
+  TMP_FREE;
+
+  return nn;
+}
+
+/* Identical to mpn_hgcd_itch. FIXME: Do we really need to add
+   HGCD_THRESHOLD at the end? */
 mp_size_t
 mpn_hgcd_appr_itch (mp_size_t n)
 {
-  return n;
+  if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
+    return n;
+  else
+    {
+      unsigned k;
+      int count;
+      mp_size_t nscaled;
+
+      /* Get the recursion depth. */
+      nscaled = (n - 1) / (HGCD_APPR_THRESHOLD - 1);
+      count_leading_zeros (count, nscaled);
+      k = GMP_LIMB_BITS - count;
+
+      return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD;
+    }
 }
 
 /* Destroys inputs. */
@@ -42,10 +218,8 @@
 mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
 	       struct hgcd_matrix *M, mp_ptr tp)
 {
-  unsigned extra_bits;
   mp_size_t s;
   mp_limb_t mask;
-  int count;
   int success = 0;
 
   ASSERT (n > 0);
@@ -57,104 +231,18 @@
     /* Implies s = n. A fairly uninteresting case but exercised by the
        random inputs of the testsuite. */
     return 0;
-    
+
+  ASSERT ((n+1)/2 - 1 < M->alloc);
+
   /* We aim for reduction of to GMP_NUMB_BITS * s bits. But each time
      we discard some of the least significant limbs, we must keep one
      additional bit to account for the truncation error. We maintain
      the GMP_NUMB_BITS * s - extra_bits as the current target size. */
      
   s = n/2 + 1;
-  extra_bits = 0;
-
-#if TRACE
-  fprintf (stderr, "hgcd_appr: In: n = %u, s = %u\n",
-	   (unsigned) n, (unsigned) s);
-#endif
-  while (n > 2)
+  if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
     {
-      mp_size_t nn;
-
-#if TRACE
-      fprintf (stderr, "loop: n = %u\n", (unsigned) n);
-#endif
-      ASSERT (n > s);
-      ASSERT (n <= 2*s);
-
-      nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
-      if (!nn)
-	break;
-
-      n = nn;
-      success = 1;