[Gmp-commit] /home/hgfiles/gmp: Save some memory when squaring. Long-needed partial cleanup.
mercurial at gmplib.org
Sat Jan 30 00:02:20 CET 2010
details: /home/hgfiles/gmp/rev/d2b1eba73eae
changeset: 13406:d2b1eba73eae
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Jan 30 00:02:13 2010 +0100
description:
Save some memory when squaring. Long-needed partial cleanup.
diffstat:
ChangeLog | 11 ++
mpn/generic/mul_fft.c | 203 +++++++++++++++++++++++--------------------------
2 files changed, 105 insertions(+), 109 deletions(-)
diffs (truncated from 366 to 300 lines):
diff -r c833ceac4f33 -r d2b1eba73eae ChangeLog
--- a/ChangeLog Thu Jan 28 08:41:23 2010 +0100
+++ b/ChangeLog Sat Jan 30 00:02:13 2010 +0100
@@ -1,3 +1,14 @@
+2010-01-29 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/generic/mul_fft.c (mpn_mul_fft_internal): Remove arguments n, m,
+ k and rec; add argument sqr. Don't call mpn_mul_fft_decompose here,
+ instead do that in all callers.
+ (mpn_mul_fft): Trim allocation when squaring, and use TMP_ALLOC*, not
+ explicit alloc/free.
+ (mpn_fft_div_2exp_modF): Avoid a scalar division.
+ (mpn_fft_mul_modF_K): Replace some multiplies by K with shifting by k.
+ (mpn_fft_mul_2exp_modF): Make function more symmetrical.
+
2010-01-27 Torbjorn Granlund <tege at gmplib.org>
* mpn/generic/mu_div_q.c (mpn_mu_div_q_itch): Rewrite.
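The memory saving advertised in the subject comes from the squaring case: when
computing n*n the two decompositions coincide, so one of the two coefficient
arrays is redundant. The exact allocation change in mpn_mul_fft falls in the
truncated part of this diff; the following is only a sketch of the idea,
reusing the names A, B, nprime, k and sqr from the code below:

    /* Hypothetical sketch, not the literal GMP code: when sqr is nonzero
       the second operand equals the first, so B can simply alias A
       instead of getting its own allocation.  */
    mp_size_t chunk = (nprime + 1) << k;  /* limbs per coefficient array */
    A = TMP_ALLOC_LIMBS (sqr ? chunk : 2 * chunk);
    B = sqr ? A : A + chunk;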
diff -r c833ceac4f33 -r d2b1eba73eae mpn/generic/mul_fft.c
--- a/mpn/generic/mul_fft.c Thu Jan 28 08:41:23 2010 +0100
+++ b/mpn/generic/mul_fft.c Sat Jan 30 00:02:13 2010 +0100
@@ -65,9 +65,10 @@
#endif
static mp_limb_t mpn_mul_fft_internal
-__GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, int, int, mp_ptr *, mp_ptr *,
- mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_size_t, int **, mp_ptr,
- int));
+__GMP_PROTO ((mp_ptr, mp_size_t, int, mp_ptr *, mp_ptr *,
+ mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_size_t, int **, mp_ptr, int));
+static void mpn_mul_fft_decompose
+__GMP_PROTO ((mp_ptr, mp_ptr *, int, int, mp_srcptr, mp_size_t, int, int, mp_ptr));
/* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n.
@@ -227,26 +228,25 @@
}
-/* r <- a*2^e mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1}
+/* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1}
Assumes a is semi-normalized, i.e. a[n] <= 1.
r and a must have n+1 limbs, and not overlap.
*/
static void
mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, unsigned int d, mp_size_t n)
{
- int sh, negate;
+ int sh;
mp_limb_t cc, rd;
sh = d % GMP_NUMB_BITS;
d /= GMP_NUMB_BITS;
- negate = d >= n;
- if (negate)
- d -= n;
- if (negate)
+ if (d >= n) /* negate */
{
/* r[0..d-1] <-- lshift(a[n-d]..a[n-1], sh)
r[d..n-1] <-- -lshift(a[0]..a[n-d-1], sh) */
+
+ d -= n;
if (sh != 0)
{
/* no out shift below since a[n] <= 1 */
@@ -271,54 +271,52 @@
cc++;
mpn_incr_u (r, cc);
- rd ++;
+ rd++;
/* rd might overflow when sh=GMP_NUMB_BITS-1 */
cc = (rd == 0) ? 1 : rd;
r = r + d + (rd == 0);
mpn_incr_u (r, cc);
-
- return;
- }
-
- /* if negate=0,
- r[0..d-1] <-- -lshift(a[n-d]..a[n-1], sh)
- r[d..n-1] <-- lshift(a[0]..a[n-d-1], sh)
- */
- if (sh != 0)
- {
- /* no out bits below since a[n] <= 1 */
- mpn_lshiftc (r, a + n - d, d + 1, sh);
- rd = ~r[d];
- /* {r, d+1} = {a+n-d, d+1} << sh */
- cc = mpn_lshift (r + d, a, n - d, sh); /* {r+d, n-d} = {a, n-d}<<sh */
}
else
{
- /* r[d] is not used below, but we save a test for d=0 */
- mpn_com (r, a + n - d, d + 1);
- rd = a[n];
- MPN_COPY (r + d, a, n - d);
- cc = 0;
+ /* r[0..d-1] <-- -lshift(a[n-d]..a[n-1], sh)
+ r[d..n-1] <-- lshift(a[0]..a[n-d-1], sh) */
+ if (sh != 0)
+ {
+ /* no out bits below since a[n] <= 1 */
+ mpn_lshiftc (r, a + n - d, d + 1, sh);
+ rd = ~r[d];
+ /* {r, d+1} = {a+n-d, d+1} << sh */
+ cc = mpn_lshift (r + d, a, n - d, sh); /* {r+d, n-d} = {a, n-d}<<sh */
+ }
+ else
+ {
+ /* r[d] is not used below, but we save a test for d=0 */
+ mpn_com (r, a + n - d, d + 1);
+ rd = a[n];
+ MPN_COPY (r + d, a, n - d);
+ cc = 0;
+ }
+
+ /* now complement {r, d}, subtract cc from r[0], subtract rd from r[d] */
+
+ /* if d=0 we just have r[0]=a[n] << sh */
+ if (d != 0)
+ {
+ /* now add 1 in r[0], subtract 1 in r[d] */
+ if (cc-- == 0) /* then add 1 to r[0] */
+ cc = mpn_add_1 (r, r, n, CNST_LIMB(1));
+ cc = mpn_sub_1 (r, r, d, cc) + 1;
+ /* add 1 to cc instead of rd since rd might overflow */
+ }
+
+ /* now subtract cc and rd from r[d..n] */
+
+ r[n] = -mpn_sub_1 (r + d, r + d, n - d, cc);
+ r[n] -= mpn_sub_1 (r + d, r + d, n - d, rd);
+ if (r[n] & GMP_LIMB_HIGHBIT)
+ r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1));
}
-
- /* now complement {r, d}, subtract cc from r[0], subtract rd from r[d] */
-
- /* if d=0 we just have r[0]=a[n] << sh */
- if (d != 0)
- {
- /* now add 1 in r[0], subtract 1 in r[d] */
- if (cc-- == 0) /* then add 1 to r[0] */
- cc = mpn_add_1 (r, r, n, CNST_LIMB(1));
- cc = mpn_sub_1 (r, r, d, cc) + 1;
- /* add 1 to cc instead of rd since rd might overflow */
- }
-
- /* now subtract cc and rd from r[d..n] */
-
- r[n] = -mpn_sub_1 (r + d, r + d, n - d, cc);
- r[n] -= mpn_sub_1 (r + d, r + d, n - d, rd);
- if (r[n] & GMP_LIMB_HIGHBIT)
- r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1));
}
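Both branches of the rewritten mpn_fft_mul_2exp_modF rely on the same two
arithmetic facts: modulo F = 2^(n*GMP_NUMB_BITS)+1 we have
2^(n*GMP_NUMB_BITS) == -1, so a shift past the top limb wraps around with a
negation, and multiplication by powers of 2 is periodic with period
2*n*GMP_NUMB_BITS. A self-contained toy version for a single small word
(illustrative only, nothing like GMP's limb-array code) makes the branch
structure visible:

    #include <assert.h>
    #include <stdint.h>

    /* Toy analogue of mpn_fft_mul_2exp_modF: a * 2^d mod F,
       where F = 2^N + 1 and N <= 31 so everything fits in 64 bits.  */
    static uint64_t
    mul_2exp_modF (uint64_t a, unsigned d, unsigned N)
    {
      uint64_t F = ((uint64_t) 1 << N) + 1;
      a %= F;
      d %= 2 * N;               /* 2^(2N) == 1 (mod F) */
      if (d >= N)               /* 2^N == -1 (mod F): negate */
        {
          a = a != 0 ? F - a : 0;
          d -= N;
        }
      return (a << d) % F;      /* a <= 2^N and d < N: no overflow */
    }

    int
    main (void)
    {
      /* 3 * 2^8 = 768 = 2*257 + 254, i.e. -3 mod 257.  */
      assert (mul_2exp_modF (3, 8, 8) == 254);
      return 0;
    }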
@@ -472,7 +470,7 @@
if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
{
int k, K2, nprime2, Nprime2, M2, maxLK, l, Mp2;
- int **_fft_l;
+ int **fft_l;
mp_ptr *Ap, *Bp, A, B, T;
k = mpn_fft_best_k (n, sqr);
@@ -505,23 +503,30 @@
Ap = TMP_ALLOC_MP_PTRS (K2);
Bp = TMP_ALLOC_MP_PTRS (K2);
- A = TMP_ALLOC_LIMBS (2 * K2 * (nprime2 + 1));
+ A = TMP_ALLOC_LIMBS (2 * (nprime2 + 1) << k);
T = TMP_ALLOC_LIMBS (2 * (nprime2 + 1));
- B = A + K2 * (nprime2 + 1);
- _fft_l = TMP_ALLOC_TYPE (k + 1, int *);
+ B = A + ((nprime2 + 1) << k);
+ fft_l = TMP_ALLOC_TYPE (k + 1, int *);
for (i = 0; i <= k; i++)
- _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
- mpn_fft_initl (_fft_l, k);
+ fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+ mpn_fft_initl (fft_l, k);
TRACE (printf ("recurse: %ldx%ld limbs -> %d times %dx%d (%1.2f)\n", n,
n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
for (i = 0; i < K; i++, ap++, bp++)
{
+ mp_limb_t cy;
mpn_fft_normalize (*ap, n);
if (!sqr)
mpn_fft_normalize (*bp, n);
- mpn_mul_fft_internal (*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
- l, Mp2, _fft_l, T, 1);
+
+ mpn_mul_fft_decompose (A, Ap, K2, nprime2, *ap, (l << k) + 1, l, Mp2, T);
+ if (!sqr)
+ mpn_mul_fft_decompose (B, Bp, K2, nprime2, *bp, (l << k) + 1, l, Mp2, T);
+
+ cy = mpn_mul_fft_internal (*ap, n, k, Ap, Bp, A, B, nprime2,
+ l, Mp2, fft_l, T, sqr);
+ (*ap)[n] = cy;
}
}
else
@@ -602,15 +607,14 @@
}
-/* A <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */
+/* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */
static void
mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, int k, mp_size_t n)
{
int i;
ASSERT (r != a);
- i = 2 * n * GMP_NUMB_BITS;
- i = (i - k) % i; /* FIXME: This % looks superfluous */
+ i = 2 * n * GMP_NUMB_BITS - k;
mpn_fft_mul_2exp_modF (r, a, i, n);
/* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */
/* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */
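The simplification in mpn_fft_div_2exp_modF rests on the periodicity noted in
the comment: with L = GMP_NUMB_BITS, 2^(2nL) == 1 (mod 2^(nL)+1), so for any
exponent 0 < k <= 2nL the inverse of 2^k is 2^(2nL-k), which is already in
range, and the old % was indeed superfluous. A standalone check of the
identity for a toy modulus (assumed small N; not GMP code):

    #include <assert.h>
    #include <stdint.h>

    /* Check 2^k * 2^(2N-k) == 1 (mod 2^N+1) for 0 < k < 2N, N = 8.  */
    int
    main (void)
    {
      unsigned N = 8, i, k;
      uint64_t F = ((uint64_t) 1 << N) + 1;     /* 257 */
      for (k = 1; k < 2 * N; k++)
        {
          uint64_t p = 1, q = 1;
          for (i = 0; i < k; i++)
            p = 2 * p % F;                      /* 2^k mod F */
          for (i = 0; i < 2 * N - k; i++)
            q = 2 * q % F;                      /* 2^(2N-k) mod F */
          assert (p * q % F == 1);
        }
      return 0;
    }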
@@ -738,48 +742,30 @@
}
/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*GMP_NUMB_BITS
- n and m have respectively nl and ml limbs
- op must have space for pl+1 limbs if rec=1 (and pl limbs if rec=0).
+ op is pl limbs, its high bit is returned.
One must have pl = mpn_fft_next_size (pl, k).
T must have space for 2 * (nprime + 1) limbs.
-
- If rec=0, then store only the pl low bits of the result, and return
- the out carry.
*/
static mp_limb_t
-mpn_mul_fft_internal (mp_ptr op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
- int k, int K,
- mp_ptr *Ap, mp_ptr *Bp,
- mp_ptr A, mp_ptr B,
+mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k,
+ mp_ptr *Ap, mp_ptr *Bp, mp_ptr A, mp_ptr B,
mp_size_t nprime, mp_size_t l, mp_size_t Mp,
- int **_fft_l,
- mp_ptr T, int rec)
+ int **fft_l, mp_ptr T, int sqr)
{
- int i, sqr, pla, lo, sh, j;
+ int K, i, pla, lo, sh, j;
mp_ptr p;
mp_limb_t cc;
- sqr = n == m;
-
- TRACE (printf ("pl=%ld k=%d K=%d np=%ld l=%ld Mp=%ld rec=%d sqr=%d\n",
- pl,k,K,nprime,l,Mp,rec,sqr));
-
- /* decomposition of inputs into arrays Ap[i] and Bp[i] */
- if (rec)
- {
- mpn_mul_fft_decompose (A, Ap, K, nprime, n, K * l + 1, l, Mp, T);
- if (!sqr)
- mpn_mul_fft_decompose (B, Bp, K, nprime, m, K * l + 1, l, Mp, T);
- }
+ K = 1 << k;
/* direct fft's */
- mpn_fft_fft (Ap, K, _fft_l + k, 2 * Mp, nprime, 1, T);
+ mpn_fft_fft (Ap, K, fft_l + k, 2 * Mp, nprime, 1, T);
if (!sqr)
- mpn_fft_fft (Bp, K, _fft_l + k, 2 * Mp, nprime, 1, T);
+ mpn_fft_fft (Bp, K, fft_l + k, 2 * Mp, nprime, 1, T);
/* term to term multiplications */
- mpn_fft_mul_modF_K (Ap, (sqr) ? Ap : Bp, nprime, K);
+ mpn_fft_mul_modF_K (Ap, sqr ? Ap : Bp, nprime, K);
/* inverse fft's */
mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T);
@@ -843,11 +829,7 @@
/* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
< K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
< K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
- i = mpn_fft_norm_modF (op, pl, p, pla);
- if (rec) /* store the carry out */
- op[pl] = i;
-
- return i;
+ return mpn_fft_norm_modF (op, pl, p, pla);
}
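With the decomposition hoisted out of mpn_mul_fft_internal, every caller now
follows the same three-step pattern, as the recursive branch earlier in this
diff already shows: normalize and split each input into K = 2^k coefficients
with mpn_mul_fft_decompose, run the transform/pointwise-multiply/inverse
pipeline, then place the returned carry. Schematically (a paraphrase with
placeholder operand names up/un and vp/vn, not a verbatim excerpt of the
truncated mpn_mul_fft body):

    /* Sketch of the new calling convention.  */
    mpn_mul_fft_decompose (A, Ap, K, nprime, up, un, l, Mp, T);
    if (!sqr)
      mpn_mul_fft_decompose (B, Bp, K, nprime, vp, vn, l, Mp, T);
    cy = mpn_mul_fft_internal (op, pl, k, Ap, Bp, A, B,
                               nprime, l, Mp, fft_l, T, sqr);
    op[pl] = cy;     /* carry out, formerly stored only when rec != 0 */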
/* return the lcm of a and 2^k */
@@ -874,7 +856,7 @@
int K, maxLK, i;
mp_size_t N, Nprime, nprime, M, Mp, l;
mp_ptr *Ap, *Bp, A, T, B;