[Gmp-commit] /home/hgfiles/gmp: Save some memory when squaring. Long-needed partial cleanup.
mercurial at gmplib.org
Sat Jan 30 00:02:20 CET 2010
details: /home/hgfiles/gmp/rev/d2b1eba73eae
changeset: 13406:d2b1eba73eae
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Jan 30 00:02:13 2010 +0100
description:
Save some memory when squaring. Long-needed partial cleanup.
diffstat:
ChangeLog | 11 ++
mpn/generic/mul_fft.c | 203 +++++++++++++++++++++++--------------------------
2 files changed, 105 insertions(+), 109 deletions(-)
diffs (truncated from 366 to 300 lines):
diff -r c833ceac4f33 -r d2b1eba73eae ChangeLog
--- a/ChangeLog Thu Jan 28 08:41:23 2010 +0100
+++ b/ChangeLog Sat Jan 30 00:02:13 2010 +0100
@@ -1,3 +1,14 @@
+2010-01-29 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/generic/mul_fft.c (mpn_mul_fft_internal): Remove arguments n, m,
+ k and rec; add argument sqr. Don't call mpn_mul_fft_decompose here,
+ instead do that in all callers.
+ (mpn_mul_fft): Trim allocation when squaring, and use TMP_ALLOC*, not
+ explicit alloc/free.
+ (mpn_fft_div_2exp_modF): Avoid a scalar division.
+ (mpn_fft_mul_modF_K): Replace some multiplies by K with shifting by k.
+ (mpn_fft_mul_2exp_modF): Make function more symmetrical.
+
2010-01-27 Torbjorn Granlund <tege at gmplib.org>
* mpn/generic/mu_div_q.c (mpn_mu_div_q_itch): Rewrite.
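The memory saving advertised in the subject comes from the squaring case: when
computing n*n the two decompositions coincide, so one of the two coefficient
arrays is redundant. The exact allocation change in mpn_mul_fft falls in the
truncated part of this diff; the following is only a sketch of the idea,
reusing the names A, B, nprime, k and sqr from the code below:

    /* Hypothetical sketch, not the literal GMP code: when sqr is nonzero
       the second operand equals the first, so B can simply alias A
       instead of getting its own allocation.  */
    mp_size_t chunk = (nprime + 1) << k;  /* limbs per coefficient array */
    A = TMP_ALLOC_LIMBS (sqr ? chunk : 2 * chunk);
    B = sqr ? A : A + chunk;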
diff -r c833ceac4f33 -r d2b1eba73eae mpn/generic/mul_fft.c
--- a/mpn/generic/mul_fft.c Thu Jan 28 08:41:23 2010 +0100
+++ b/mpn/generic/mul_fft.c Sat Jan 30 00:02:13 2010 +0100
@@ -65,9 +65,10 @@
#endif
static mp_limb_t mpn_mul_fft_internal
-__GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, int, int, mp_ptr *, mp_ptr *,
- mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_size_t, int **, mp_ptr,
- int));
+__GMP_PROTO ((mp_ptr, mp_size_t, int, mp_ptr *, mp_ptr *,
+ mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_size_t, int **, mp_ptr, int));
+static void mpn_mul_fft_decompose
+__GMP_PROTO ((mp_ptr, mp_ptr *, int, int, mp_srcptr, mp_size_t, int, int, mp_ptr));
/* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n.
@@ -227,26 +228,25 @@
}
-/* r <- a*2^e mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1}
+/* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1}
Assumes a is semi-normalized, i.e. a[n] <= 1.
r and a must have n+1 limbs, and not overlap.
*/
static void
mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, unsigned int d, mp_size_t n)
{
- int sh, negate;
+ int sh;
mp_limb_t cc, rd;
sh = d % GMP_NUMB_BITS;
d /= GMP_NUMB_BITS;
- negate = d >= n;
- if (negate)
- d -= n;
- if (negate)
+ if (d >= n) /* negate */
{
/* r[0..d-1] <-- lshift(a[n-d]..a[n-1], sh)
r[d..n-1] <-- -lshift(a[0]..a[n-d-1], sh) */
+
+ d -= n;
if (sh != 0)
{
/* no out shift below since a[n] <= 1 */
@@ -271,54 +271,52 @@
cc++;
mpn_incr_u (r, cc);
- rd ++;
+ rd++;
/* rd might overflow when sh=GMP_NUMB_BITS-1 */
cc = (rd == 0) ? 1 : rd;
r = r + d + (rd == 0);
mpn_incr_u (r, cc);
-
- return;
- }
-
- /* if negate=0,
- r[0..d-1] <-- -lshift(a[n-d]..a[n-1], sh)
- r[d..n-1] <-- lshift(a[0]..a[n-d-1], sh)
- */
- if (sh != 0)
- {
- /* no out bits below since a[n] <= 1 */
- mpn_lshiftc (r, a + n - d, d + 1, sh);
- rd = ~r[d];
- /* {r, d+1} = {a+n-d, d+1} << sh */
- cc = mpn_lshift (r + d, a, n - d, sh); /* {r+d, n-d} = {a, n-d}<<sh */
}
else
{
- /* r[d] is not used below, but we save a test for d=0 */
- mpn_com (r, a + n - d, d + 1);
- rd = a[n];
- MPN_COPY (r + d, a, n - d);
- cc = 0;
+ /* r[0..d-1] <-- -lshift(a[n-d]..a[n-1], sh)
+ r[d..n-1] <-- lshift(a[0]..a[n-d-1], sh) */
+ if (sh != 0)
+ {
+ /* no out bits below since a[n] <= 1 */
+ mpn_lshiftc (r, a + n - d, d + 1, sh);
+ rd = ~r[d];
+ /* {r, d+1} = {a+n-d, d+1} << sh */
+ cc = mpn_lshift (r + d, a, n - d, sh); /* {r+d, n-d} = {a, n-d}<<sh */
+ }
+ else
+ {
+ /* r[d] is not used below, but we save a test for d=0 */
+ mpn_com (r, a + n - d, d + 1);
+ rd = a[n];
+ MPN_COPY (r + d, a, n - d);
+ cc = 0;
+ }
+
+ /* now complement {r, d}, subtract cc from r[0], subtract rd from r[d] */
+
+ /* if d=0 we just have r[0]=a[n] << sh */
+ if (d != 0)
+ {
+ /* now add 1 in r[0], subtract 1 in r[d] */
+ if (cc-- == 0) /* then add 1 to r[0] */
+ cc = mpn_add_1 (r, r, n, CNST_LIMB(1));
+ cc = mpn_sub_1 (r, r, d, cc) + 1;
+ /* add 1 to cc instead of rd since rd might overflow */
+ }
+
+ /* now subtract cc and rd from r[d..n] */
+
+ r[n] = -mpn_sub_1 (r + d, r + d, n - d, cc);
+ r[n] -= mpn_sub_1 (r + d, r + d, n - d, rd);
+ if (r[n] & GMP_LIMB_HIGHBIT)
+ r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1));
}
-
- /* now complement {r, d}, subtract cc from r[0], subtract rd from r[d] */
-
- /* if d=0 we just have r[0]=a[n] << sh */
- if (d != 0)
- {
- /* now add 1 in r[0], subtract 1 in r[d] */
- if (cc-- == 0) /* then add 1 to r[0] */
- cc = mpn_add_1 (r, r, n, CNST_LIMB(1));
- cc = mpn_sub_1 (r, r, d, cc) + 1;
- /* add 1 to cc instead of rd since rd might overflow */
- }
-
- /* now subtract cc and rd from r[d..n] */
-
- r[n] = -mpn_sub_1 (r + d, r + d, n - d, cc);
- r[n] -= mpn_sub_1 (r + d, r + d, n - d, rd);
- if (r[n] & GMP_LIMB_HIGHBIT)
- r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1));
}
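Both branches of the rewritten mpn_fft_mul_2exp_modF rely on the same two
arithmetic facts: modulo F = 2^(n*GMP_NUMB_BITS)+1 we have
2^(n*GMP_NUMB_BITS) == -1, so a shift past the top limb wraps around with a
negation, and multiplication by powers of 2 is periodic with period
2*n*GMP_NUMB_BITS. A self-contained toy version for a single small word
(illustrative only, nothing like GMP's limb-array code) makes the branch
structure visible:

    #include <assert.h>
    #include <stdint.h>

    /* Toy analogue of mpn_fft_mul_2exp_modF: a * 2^d mod F,
       where F = 2^N + 1 and N <= 31 so everything fits in 64 bits.  */
    static uint64_t
    mul_2exp_modF (uint64_t a, unsigned d, unsigned N)
    {
      uint64_t F = ((uint64_t) 1 << N) + 1;
      a %= F;
      d %= 2 * N;               /* 2^(2N) == 1 (mod F) */
      if (d >= N)               /* 2^N == -1 (mod F): negate */
        {
          a = a != 0 ? F - a : 0;
          d -= N;
        }
      return (a << d) % F;      /* a <= 2^N and d < N: no overflow */
    }

    int
    main (void)
    {
      /* 3 * 2^8 = 768 = 2*257 + 254, i.e. -3 mod 257.  */
      assert (mul_2exp_modF (3, 8, 8) == 254);
      return 0;
    }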
@@ -472,7 +470,7 @@
if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
{
int k, K2, nprime2, Nprime2, M2, maxLK, l, Mp2;
- int **_fft_l;
+ int **fft_l;
mp_ptr *Ap, *Bp, A, B, T;
k = mpn_fft_best_k (n, sqr);
@@ -505,23 +503,30 @@
Ap = TMP_ALLOC_MP_PTRS (K2);
Bp = TMP_ALLOC_MP_PTRS (K2);
- A = TMP_ALLOC_LIMBS (2 * K2 * (nprime2 + 1));
+ A = TMP_ALLOC_LIMBS (2 * (nprime2 + 1) << k);
T = TMP_ALLOC_LIMBS (2 * (nprime2 + 1));
- B = A + K2 * (nprime2 + 1);
- _fft_l = TMP_ALLOC_TYPE (k + 1, int *);
+ B = A + ((nprime2 + 1) << k);
+ fft_l = TMP_ALLOC_TYPE (k + 1, int *);
for (i = 0; i <= k; i++)
- _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
- mpn_fft_initl (_fft_l, k);
+ fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+ mpn_fft_initl (fft_l, k);
TRACE (printf ("recurse: %ldx%ld limbs -> %d times %dx%d (%1.2f)\n", n,
n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
for (i = 0; i < K; i++, ap++, bp++)
{
+ mp_limb_t cy;
mpn_fft_normalize (*ap, n);
if (!sqr)
mpn_fft_normalize (*bp, n);
- mpn_mul_fft_internal (*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
- l, Mp2, _fft_l, T, 1);
+
+ mpn_mul_fft_decompose (A, Ap, K2, nprime2, *ap, (l << k) + 1, l, Mp2, T);
+ if (!sqr)
+ mpn_mul_fft_decompose (B, Bp, K2, nprime2, *bp, (l << k) + 1, l, Mp2, T);
+
+ cy = mpn_mul_fft_internal (*ap, n, k, Ap, Bp, A, B, nprime2,
+ l, Mp2, fft_l, T, sqr);
+ (*ap)[n] = cy;
}
}
else
@@ -602,15 +607,14 @@
}
-/* A <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */
+/* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */
static void
mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, int k, mp_size_t n)
{
int i;
ASSERT (r != a);
- i = 2 * n * GMP_NUMB_BITS;
- i = (i - k) % i; /* FIXME: This % looks superfluous */
+ i = 2 * n * GMP_NUMB_BITS - k;
mpn_fft_mul_2exp_modF (r, a, i, n);
/* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */
/* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */
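The simplification in mpn_fft_div_2exp_modF rests on the periodicity noted in
the comment: with L = GMP_NUMB_BITS, 2^(2nL) == 1 (mod 2^(nL)+1), so for any
exponent 0 < k <= 2nL the inverse of 2^k is 2^(2nL-k), which is already in
range, and the old % was indeed superfluous. A standalone check of the
identity for a toy modulus (assumed small N; not GMP code):

    #include <assert.h>
    #include <stdint.h>

    /* Check 2^k * 2^(2N-k) == 1 (mod 2^N+1) for 0 < k < 2N, N = 8.  */
    int
    main (void)
    {
      unsigned N = 8, i, k;
      uint64_t F = ((uint64_t) 1 << N) + 1;     /* 257 */
      for (k = 1; k < 2 * N; k++)
        {
          uint64_t p = 1, q = 1;
          for (i = 0; i < k; i++)
            p = 2 * p % F;                      /* 2^k mod F */
          for (i = 0; i < 2 * N - k; i++)
            q = 2 * q % F;                      /* 2^(2N-k) mod F */
          assert (p * q % F == 1);
        }
      return 0;
    }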
@@ -738,48 +742,30 @@
}
/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*GMP_NUMB_BITS
- n and m have respectively nl and ml limbs
- op must have space for pl+1 limbs if rec=1 (and pl limbs if rec=0).
+ op is pl limbs, its high bit is returned.
One must have pl = mpn_fft_next_size (pl, k).
T must have space for 2 * (nprime + 1) limbs.
-
- If rec=0, then store only the pl low bits of the result, and return
- the out carry.
*/
static mp_limb_t
-mpn_mul_fft_internal (mp_ptr op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
- int k, int K,
- mp_ptr *Ap, mp_ptr *Bp,
- mp_ptr A, mp_ptr B,
+mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k,
+ mp_ptr *Ap, mp_ptr *Bp, mp_ptr A, mp_ptr B,
mp_size_t nprime, mp_size_t l, mp_size_t Mp,
- int **_fft_l,
- mp_ptr T, int rec)
+ int **fft_l, mp_ptr T, int sqr)
{
- int i, sqr, pla, lo, sh, j;
+ int K, i, pla, lo, sh, j;
mp_ptr p;
mp_limb_t cc;
- sqr = n == m;
-
- TRACE (printf ("pl=%ld k=%d K=%d np=%ld l=%ld Mp=%ld rec=%d sqr=%d\n",
- pl,k,K,nprime,l,Mp,rec,sqr));
-
- /* decomposition of inputs into arrays Ap[i] and Bp[i] */
- if (rec)
- {
- mpn_mul_fft_decompose (A, Ap, K, nprime, n, K * l + 1, l, Mp, T);
- if (!sqr)
- mpn_mul_fft_decompose (B, Bp, K, nprime, m, K * l + 1, l, Mp, T);
- }
+ K = 1 << k;
/* direct fft's */
- mpn_fft_fft (Ap, K, _fft_l + k, 2 * Mp, nprime, 1, T);
+ mpn_fft_fft (Ap, K, fft_l + k, 2 * Mp, nprime, 1, T);
if (!sqr)
- mpn_fft_fft (Bp, K, _fft_l + k, 2 * Mp, nprime, 1, T);
+ mpn_fft_fft (Bp, K, fft_l + k, 2 * Mp, nprime, 1, T);
/* term to term multiplications */
- mpn_fft_mul_modF_K (Ap, (sqr) ? Ap : Bp, nprime, K);
+ mpn_fft_mul_modF_K (Ap, sqr ? Ap : Bp, nprime, K);
/* inverse fft's */
mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T);
@@ -843,11 +829,7 @@
/* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
< K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
< K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
- i = mpn_fft_norm_modF (op, pl, p, pla);
- if (rec) /* store the carry out */
- op[pl] = i;
-
- return i;
+ return mpn_fft_norm_modF (op, pl, p, pla);
}
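With the decomposition hoisted out of mpn_mul_fft_internal, every caller now
follows the same three-step pattern, as the recursive branch earlier in this
diff already shows: normalize and split each input into K = 2^k coefficients
with mpn_mul_fft_decompose, run the transform/pointwise-multiply/inverse
pipeline, then place the returned carry. Schematically (a paraphrase with
placeholder operand names up/un and vp/vn, not a verbatim excerpt of the
truncated mpn_mul_fft body):

    /* Sketch of the new calling convention.  */
    mpn_mul_fft_decompose (A, Ap, K, nprime, up, un, l, Mp, T);
    if (!sqr)
      mpn_mul_fft_decompose (B, Bp, K, nprime, vp, vn, l, Mp, T);
    cy = mpn_mul_fft_internal (op, pl, k, Ap, Bp, A, B,
                               nprime, l, Mp, fft_l, T, sqr);
    op[pl] = cy;     /* carry out, formerly stored only when rec != 0 */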
/* return the lcm of a and 2^k */
@@ -874,7 +856,7 @@
int K, maxLK, i;
mp_size_t N, Nprime, nprime, M, Mp, l;
mp_ptr *Ap, *Bp, A, T, B;