[Gmp-commit] /var/hg/gmp: 6 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Fri May 15 06:15:12 UTC 2015
details: /var/hg/gmp/rev/339e72465081
changeset: 16623:339e72465081
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri May 15 07:52:56 2015 +0200
description:
mpn/generic/invertappr.c (mpn_ni_invertappr): Always USE_MUL_N
details: /var/hg/gmp/rev/fbeca2813724
changeset: 16624:fbeca2813724
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri May 15 07:57:20 2015 +0200
description:
mpn/generic/invertappr.c (mpn_ni_invertappr): Use sublsh1
details: /var/hg/gmp/rev/4986f13cb779
changeset: 16625:4986f13cb779
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri May 15 08:00:27 2015 +0200
description:
mpn/generic/invertappr.c (mpn_ni_invertappr): Reduce memory used (and related updates)
details: /var/hg/gmp/rev/84ee9d5cd0d3
changeset: 16626:84ee9d5cd0d3
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri May 15 08:05:37 2015 +0200
description:
mpn/generic/mu_div*: Pass the scratch parameter to mpn_invertappr
details: /var/hg/gmp/rev/6e856cdf492b
changeset: 16627:6e856cdf492b
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri May 15 08:06:24 2015 +0200
description:
mpn/generic/invertappr.c: Stop supporting scratch==NULL
details: /var/hg/gmp/rev/ff2b31ba754f
changeset: 16628:ff2b31ba754f
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri May 15 08:12:32 2015 +0200
description:
ChangeLog
diffstat:
ChangeLog | 9 ++++
gmp-impl.h | 2 +-
mpn/generic/invertappr.c | 92 ++++++++++++++++++++--------------------------
mpn/generic/mu_div_qr.c | 11 +++--
mpn/generic/mu_divappr_q.c | 11 +++--
tune/tuneup.c | 6 +-
6 files changed, 66 insertions(+), 65 deletions(-)
diffs (truncated from 321 to 300 lines):
diff -r f904b57a39d8 -r ff2b31ba754f ChangeLog
--- a/ChangeLog Tue May 12 21:43:24 2015 +0200
+++ b/ChangeLog Fri May 15 08:12:32 2015 +0200
@@ -1,3 +1,12 @@
+2015-05-15 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * mpn/generic/invertappr.c: Reduce memory usage.
+ * gmp-impl.h (mpn_invertappr_itch): Update accordingly.
+ * tune/tuneup.c (tune_invertappr, tune_invert): Update min_size.
+
+ * mpn/generic/mu_div_qr.c: Pass scratch memory to mpn_invertappr.
+ * mpn/generic/mu_divappr_q.c: Likewise.
+
2015-05-12 Felix Janda <felix.janda at posteo.de>
* mpn/powerpc32/elf.m4 (LEA): Adopt to new ABI.
diff -r f904b57a39d8 -r ff2b31ba754f gmp-impl.h
--- a/gmp-impl.h Tue May 12 21:43:24 2015 +0200
+++ b/gmp-impl.h Fri May 15 08:12:32 2015 +0200
@@ -1466,7 +1466,7 @@
__GMP_DECLSPEC mp_limb_t mpn_ni_invertappr (mp_ptr, mp_srcptr, mp_size_t, mp_ptr);
#define mpn_invertappr __MPN(invertappr)
__GMP_DECLSPEC mp_limb_t mpn_invertappr (mp_ptr, mp_srcptr, mp_size_t, mp_ptr);
-#define mpn_invertappr_itch(n) (2 * (n) + 4)
+#define mpn_invertappr_itch(n) (2 * (n))
#define mpn_binvert __MPN(binvert)
__GMP_DECLSPEC void mpn_binvert (mp_ptr, mp_srcptr, mp_size_t, mp_ptr);
diff -r f904b57a39d8 -r ff2b31ba754f mpn/generic/invertappr.c
--- a/mpn/generic/invertappr.c Tue May 12 21:43:24 2015 +0200
+++ b/mpn/generic/invertappr.c Fri May 15 08:12:32 2015 +0200
@@ -40,10 +40,6 @@
GNU Lesser General Public License along with the GNU MP Library. If not,
see https://www.gnu.org/licenses/. */
-/* FIXME: Remove NULL and TMP_*, as soon as all the callers properly
- allocate and pass the scratch to the function. */
-#include <stdlib.h> /* for NULL */
-
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
@@ -140,9 +136,12 @@
Some adaptations were introduced, to allow product mod B^m-1 and return the
value e.
- USE_MUL_N = 1 (default) introduces a correction in such a way that "the
- value of B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
- "2B^n-1"). This correction should not require to modify the proof.
+ We introduced a correction in such a way that "the value of
+ B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
+ "2B^n-1").
+
+ Maximum scratch needed by this branch <= 2*n, but have to fit 3*rn
+ in the scratch, i.e. 3*rn <= 2*n: we require n>4.
We use a wrapped product modulo B^m-1. NOTE: is there any normalisation
problem for the [0] class? It shouldn't: we compute 2*|A*X_h - B^{n+h}| <
@@ -152,22 +151,20 @@
incremented (because A < B^n).
FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch, it
- is allocated apart. */
-
-#define USE_MUL_N 1
+ is allocated apart.
+ */
mp_limb_t
mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
mp_limb_t cy;
- mp_ptr rp;
mp_size_t rn, mn;
mp_size_t sizes[NPOWS], *sizp;
mp_ptr tp;
TMP_DECL;
#define xp scratch
- ASSERT (n > 2);
+ ASSERT (n > 4);
ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
@@ -199,12 +196,7 @@
}
/* Use Newton's iterations to get the desired precision.*/
- /* define rp scratch; 2rn + 1 limbs <= 2(n>>1 + 1) + 1 <= n + 3 limbs */
- /* Maximum scratch needed by this branch <= 2*n + 4 - USE_MUL_N */
- rp = xp + n + 1 - USE_MUL_N; /* n + 3 limbs */
while (1) {
- mp_limb_t method;
-
n = *--sizp;
/*
v n v
@@ -218,7 +210,7 @@
/* FIXME: We do only need {xp,n+1}*/
mpn_mul (xp, dp - n, n, ip - rn, rn);
mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
- method = CNST_LIMB(1); /* Remember we truncated, Mod B^(n+1) */
+ cy = CNST_LIMB(1); /* Remember we truncated, Mod B^(n+1) */
/* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */
} else { /* Use B^mn-1 wraparound */
mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
@@ -232,62 +224,61 @@
xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
- method = CNST_LIMB(0); /* Remember we are working Mod B^mn-1 */
+ cy = CNST_LIMB(0); /* Remember we are working Mod B^mn-1 */
}
if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
cy = xp[n]; /* 0 <= cy <= 1 here. */
-#if ! USE_MUL_N
- xp[n] = CNST_LIMB (0);
-#endif
+#if HAVE_NATIVE_mpn_sublsh1_n
+ if (cy++) {
+ if (mpn_cmp (xp, dp - n, n) > 0) {
+ mp_limb_t chk;
+ chk = mpn_sublsh1_n (xp, xp, dp - n, n);
+ ASSERT (chk == xp[n]);
+ ++ cy;
+ } else
+ ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
+ }
+#else /* no mpn_sublsh1_n*/
if (cy++ && !mpn_sub_n (xp, xp, dp - n, n)) {
ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
++cy;
- } /* 1 <= cy <= 2 here. */
+ }
+#endif
+ /* 1 <= cy <= 3 here. */
#if HAVE_NATIVE_mpn_rsblsh1_n
if (mpn_cmp (xp, dp - n, n) > 0) {
- ASSERT_NOCARRY (mpn_rsblsh1_n (xp, xp, dp - n, n));
+ ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
++cy;
} else
- ASSERT_NOCARRY (mpn_sub_n (xp, dp - n, xp, n));
-#else
+ ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
+#else /* no mpn_rsblsh1_n*/
if (mpn_cmp (xp, dp - n, n) > 0) {
ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
++cy;
}
- ASSERT_NOCARRY (mpn_sub_n (xp, dp - n, xp, n));
+ ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#endif
- MPN_DECR_U(ip - rn, rn, cy); /* 1 <= cy <= 3 here. */
+ MPN_DECR_U(ip - rn, rn, cy); /* 1 <= cy <= 4 here. */
} else { /* "negative" residue class */
- MPN_DECR_U(xp, n + 1, method);
-#if USE_MUL_N
+ ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1));
+ MPN_DECR_U(xp, n + 1, cy);
if (xp[n] != GMP_NUMB_MAX) {
MPN_INCR_U(ip - rn, rn, CNST_LIMB (1));
ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
}
-#endif
- mpn_com (xp + n - rn, xp + n - rn, rn + 1 - USE_MUL_N);
- ASSERT (USE_MUL_N || xp[n] <= CNST_LIMB (1));
+ mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
}
- /* Compute x_ju_j. FIXME:We need {rp+rn,rn}, mulhi? */
-#if USE_MUL_N
- mpn_mul_n (rp, xp + n - rn, ip - rn, rn);
-#else
- rp[2*rn] = 0;
- mpn_mul (rp, xp + n - rn, rn + xp[n], ip - rn, rn);
-#endif
- cy = mpn_add_n (rp + rn, rp + rn, xp + n - rn, 2*rn - n);
- cy = mpn_add_nc (ip - n, rp + 3*rn - n, xp + rn, n - rn, cy);
-#if USE_MUL_N
+ /* Compute x_ju_j. FIXME:We need {xp+rn,rn}, mulhi? */
+ mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
+ cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
+ cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
MPN_INCR_U (ip - rn, rn, cy);
-#else
- MPN_INCR_U (ip - rn, rn, cy + rp[2*rn] + xp[n]);
-#endif
if (sizp == sizes) { /* Get out of the cycle */
/* Check for possible carry propagation from below. */
- cy = rp[3*rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
-/* cy = mpn_add_1 (rp + rn, rp + rn, 2*rn - n, 4); */
+ cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
+ /* cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
break;
}
rn = n;
@@ -295,7 +286,7 @@
TMP_FREE;
return cy;
-#undef rp
+#undef xp
}
mp_limb_t
@@ -306,9 +297,6 @@
TMP_MARK;
- if (scratch == NULL)
- scratch = TMP_ALLOC_LIMBS (mpn_invertappr_itch (n));
-
ASSERT (n > 0);
ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
diff -r f904b57a39d8 -r ff2b31ba754f mpn/generic/mu_div_qr.c
--- a/mpn/generic/mu_div_qr.c Tue May 12 21:43:24 2015 +0200
+++ b/mpn/generic/mu_div_qr.c Fri May 15 08:12:32 2015 +0200
@@ -187,7 +187,7 @@
{
MPN_COPY (tp + 1, dp, in);
tp[0] = 1;
- mpn_invertappr (ip, tp, in + 1, NULL);
+ mpn_invertappr (ip, tp, in + 1, tp + in + 1);
MPN_COPY_INCR (ip, ip + 1, in);
}
else
@@ -197,7 +197,7 @@
MPN_ZERO (ip, in);
else
{
- mpn_invertappr (ip, tp, in + 1, NULL);
+ mpn_invertappr (ip, tp, in + 1, tp + in + 1);
MPN_COPY_INCR (ip, ip + 1, in);
}
}
@@ -399,11 +399,12 @@
mp_size_t
mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
- mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k);
- mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
+ mp_size_t itch_preinv = mpn_preinv_mu_div_qr_itch (nn, dn, in);
+ mp_size_t itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
- return in + itch_local + itch_out;
+ ASSERT (itch_preinv >= itch_invapp);
+ return in + MAX (itch_invapp, itch_preinv);
}
mp_size_t
diff -r f904b57a39d8 -r ff2b31ba754f mpn/generic/mu_divappr_q.c
--- a/mpn/generic/mu_divappr_q.c Tue May 12 21:43:24 2015 +0200
+++ b/mpn/generic/mu_divappr_q.c Fri May 15 08:12:32 2015 +0200
@@ -113,7 +113,7 @@
{
MPN_COPY (tp + 1, dp, in);
tp[0] = 1;
- mpn_invertappr (ip, tp, in + 1, NULL);
+ mpn_invertappr (ip, tp, in + 1, tp + in + 1);
MPN_COPY_INCR (ip, ip + 1, in);
}
else
@@ -123,7 +123,7 @@
MPN_ZERO (ip, in);
else
{
- mpn_invertappr (ip, tp, in + 1, NULL);
+ mpn_invertappr (ip, tp, in + 1, tp + in + 1);
MPN_COPY_INCR (ip, ip + 1, in);
}
}
@@ -348,7 +348,7 @@
mp_size_t
mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
- mp_size_t qn, in, itch_local, itch_out;
+ mp_size_t qn, in, itch_local, itch_out, itch_invapp;
qn = nn - dn;
if (qn + 1 < dn)
@@ -359,5 +359,8 @@
itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
- return in + dn + itch_local + itch_out;
+ itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
+
+ ASSERT (dn + itch_local + itch_out >= itch_invapp);
+ return in + MAX (dn + itch_local + itch_out, itch_invapp);
}
diff -r f904b57a39d8 -r ff2b31ba754f tune/tuneup.c
--- a/tune/tuneup.c Tue May 12 21:43:24 2015 +0200
+++ b/tune/tuneup.c Fri May 15 08:12:32 2015 +0200
@@ -1694,12 +1694,12 @@
param.function = speed_mpn_ni_invertappr;
More information about the gmp-commit
mailing list