[Gmp-commit] /home/hgfiles/gmp: toom32 reorganization, to use less scratch sp...
mercurial at gmplib.org
Fri Dec 25 22:14:49 CET 2009
details: /home/hgfiles/gmp/rev/fc63792c7ba5
changeset: 13221:fc63792c7ba5
user: Niels Möller <nisse at lysator.liu.se>
date: Fri Dec 25 22:14:43 2009 +0100
description:
toom32 reorganization, to use less scratch space.
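
For context, the reorganized routine expects the caller to size the scratch area with mpn_toom32_mul_itch and hand it in. A minimal caller-side sketch follows; mpn_toom32_mul and mpn_toom32_mul_itch are internal interfaces from gmp-impl.h, so this only compiles inside a GMP build tree, and the operand sizes (an = 20, bn = 12) are arbitrary examples chosen to satisfy the new size constraints, not values from the commit:

#include <stdlib.h>
#include "gmp-impl.h"

/* Multiply {ap,20} by {bp,12} into {pp,32} via toom32, with
   caller-supplied scratch of the size the new itch formula requests. */
void
example_toom32 (mp_ptr pp, mp_srcptr ap, mp_srcptr bp)
{
  mp_size_t an = 20, bn = 12;      /* bn + 4 <= an and an + 14 <= 3*bn */
  mp_size_t itch = mpn_toom32_mul_itch (an, bn);   /* 2*n + 1 = 15 limbs */
  mp_limb_t *scratch = malloc (itch * sizeof (mp_limb_t));

  mpn_toom32_mul (pp, ap, an, bp, bn, scratch);
  free (scratch);
}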
diffstat:
ChangeLog | 10 +
gmp-impl.h | 5 +-
mpn/generic/toom32_mul.c | 275 +++++++++++++++++++++++++---------------------
tests/mpn/t-toom32.c | 6 +-
4 files changed, 165 insertions(+), 131 deletions(-)
diffs (truncated from 439 to 300 lines):
diff -r 988c5e49b995 -r fc63792c7ba5 ChangeLog
--- a/ChangeLog Fri Dec 25 17:19:46 2009 +0100
+++ b/ChangeLog Fri Dec 25 22:14:43 2009 +0100
@@ -1,3 +1,13 @@
+2009-12-25 Niels Möller <nisse at lysator.liu.se>
+
+ * tests/mpn/t-toom32.c (MIN_AN, MIN_BN, MAX_BN): Tightened requirements.
+ * gmp-impl.h (mpn_toom32_mul_itch): Updated. Less scratch needed
+ by toom32 itself, and also the pointwise multiplications are
+ currently mpn_mul_n with no supplied scratch.
+ * mpn/generic/toom32_mul.c (mpn_toom32_mul): Reorganized
+ interpolation to use less scratch space. No longer supports the
+ most extreme size ratios.
+
2009-12-25 Torbjorn Granlund <tege at gmplib.org>
* tune/tuneup.c (tune_preinv_mod_1): Purge.
diff -r 988c5e49b995 -r fc63792c7ba5 gmp-impl.h
--- a/gmp-impl.h Fri Dec 25 17:19:46 2009 +0100
+++ b/gmp-impl.h Fri Dec 25 22:14:43 2009 +0100
@@ -2745,7 +2745,6 @@
floor ((B^3 - 1) / (d0 + d1 B)) - B.
-
NOTE: Output variables are updated multiple times. Only some inputs
and outputs may overlap.
*/
@@ -4519,9 +4518,7 @@
mpn_toom32_mul_itch (mp_size_t an, mp_size_t bn)
{
mp_size_t n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
- mp_size_t itch = 4 * n + 2;
- if (ABOVE_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
- itch += mpn_toom22_mul_itch (n, n);
+ mp_size_t itch = 2 * n + 1;
return itch;
}
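
To make the saving concrete, the following standalone check mirrors the itch formula above in plain C arithmetic. The sizes are arbitrary examples; the "old" figure is the 4*n + 2 baseline reserved by the removed lines, before any additional toom22 scratch above the threshold:

#include <assert.h>
#include <stdio.h>

int
main (void)
{
  long an = 20, bn = 12;
  long n = 1 + (2 * an >= 3 * bn ? (an - 1) / 3 : (bn - 1) / 2);
  long s = an - 2 * n;             /* high part of a: 6 limbs here */
  long t = bn - n;                 /* high part of b: 5 limbs here */

  assert (s + t >= n + 2);         /* the new requirement in toom32_mul.c */
  printf ("n=%ld: new itch = %ld, old itch >= %ld\n",
          n, 2 * n + 1, 4 * n + 2);  /* n=7: 15 vs 30 */
  return 0;
}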
diff -r 988c5e49b995 -r fc63792c7ba5 mpn/generic/toom32_mul.c
--- a/mpn/generic/toom32_mul.c Fri Dec 25 17:19:46 2009 +0100
+++ b/mpn/generic/toom32_mul.c Fri Dec 25 22:14:43 2009 +0100
@@ -2,7 +2,7 @@
times as large as bn. Or more accurately, bn < an < 3bn.
Contributed to the GNU project by Torbjorn Granlund.
- Improvements by Marco Bodrato.
+ Improvements by Marco Bodrato and Niels Möller.
The idea of applying toom to unbalanced multiplication is due to Marco
Bodrato and Alberto Zanoni.
@@ -60,6 +60,7 @@
mp_size_t n, s, t;
int vm1_neg;
mp_limb_t cy;
+ int hi;
#define a0 ap
#define a1 (ap + n)
@@ -67,6 +68,15 @@
#define b0 bp
#define b1 (bp + n)
+ /* Required, to guarantee that s + t >= n + 2.
+
+ Also implies that bn >= 9
+
+ FIXME: Too strict? E.g., an = 15, bn = 9, ==> n = 5, s = 5, t = 4
+ seem to be ok. */
+
+ ASSERT (bn + 4 <= an && an + 14 <= 3*bn);
+
n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
s = an - 2 * n;
@@ -74,208 +84,225 @@
ASSERT (0 < s && s <= n);
ASSERT (0 < t && t <= n);
+ ASSERT (s + t >= n + 2);
-#define as1 (pp + n + 1) /* n+1 */
-#define asm1 (scratch + n) /* n+1 */
-#define bs1 (pp) /* n+1 */
-#define bsm1 (scratch) /* n */
-#define a0_a2 (scratch) /* n */
+ /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
+#define ap1 (pp) /* n + 1 */
+#define bp1 (pp + n + 1) /* n + 1 */
+#define am1 (pp + 2*n + 2) /* n, most significant bit stored elsewhere */
+#define bm1 (pp + 3*n + 2) /* n */
+#define v1 (scratch) /* 2n + 1 */
+#define vm1 (pp) /* 2n + 1 */
+#define scratch_out (scratch + 2*n + 1) /* Currently unused. */
- /* Compute as1 and asm1. */
- asm1[n] = mpn_add (a0_a2, a0, n, a2, s);
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (asm1[n] == 0 && mpn_cmp (a0_a2, a1, n) < 0)
+ /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */
+
+ /* FIXME: Keep ap1[n] and bp1[n] in scalar variables. */
+
+ /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */
+ ap1[n] = mpn_add (ap1, a0, n, a2, s);
+#if HAVE_NATIVE_mpn_addsub_n
+ if (ap1[n] == 0 && mpn_cmp (ap1, a1, n) < 0)
{
- cy = mpn_add_n_sub_n (as1, asm1, a1, a0_a2, n);
- as1[n] = cy >> 1;
+ ap1[n] = mpn_addsub_n (ap1, am1, a1, ap1, n) >> 1;
+ hi = 0;
vm1_neg = 1;
}
else
{
- cy = mpn_add_n_sub_n (as1, asm1, a0_a2, a1, n);
- as1[n] = asm1[n] + (cy >> 1);
- asm1[n]-= cy & 1;
+ cy = mpn_addsub_n (ap1, am1, ap1, a1, n);
+ hi = ap1[n] - (cy & 1);
+ ap1[n] += (cy >> 1);
vm1_neg = 0;
}
#else
- as1[n] = asm1[n] + mpn_add_n (as1, a0_a2, a1, n);
- if (asm1[n] == 0 && mpn_cmp (a0_a2, a1, n) < 0)
+ if (ap1[n] == 0 && mpn_cmp (ap1, a1, n) < 0)
{
- mpn_sub_n (asm1, a1, a0_a2, n);
+ ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
+ hi = 0;
vm1_neg = 1;
}
else
{
- cy = mpn_sub_n (asm1, a0_a2, a1, n);
- asm1[n]-= cy;
+ hi = ap1[n] - mpn_sub_n (am1, ap1, a1, n);
vm1_neg = 0;
}
+ ap1[n] += mpn_add_n (ap1, ap1, a1, n);
#endif
- /* Compute bs1 and bsm1. */
+ /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
if (t == n)
{
-#if HAVE_NATIVE_mpn_add_n_sub_n
+#if HAVE_NATIVE_mpn_addsub_n
if (mpn_cmp (b0, b1, n) < 0)
{
- cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
+ cy = mpn_addsub_n (bp1, bm1, b1, b0, n);
vm1_neg ^= 1;
}
else
{
- cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
+ cy = mpn_addsub_n (bp1, bm1, b0, b1, n);
}
- bs1[n] = cy >> 1;
+ bp1[n] = cy >> 1;
#else
- bs1[n] = mpn_add_n (bs1, b0, b1, n);
+ bp1[n] = mpn_add_n (bp1, b0, b1, n);
if (mpn_cmp (b0, b1, n) < 0)
{
- mpn_sub_n (bsm1, b1, b0, n);
+ ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
vm1_neg ^= 1;
}
else
{
- mpn_sub_n (bsm1, b0, b1, n);
+ ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
}
#endif
}
else
{
- bs1[n] = mpn_add (bs1, b0, n, b1, t);
+ /* FIXME: Should still use addsub for the main part. */
+ bp1[n] = mpn_add (bp1, b0, n, b1, t);
if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
{
- mpn_sub_n (bsm1, b1, b0, t);
- MPN_ZERO (bsm1 + t, n - t);
+ ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
+ MPN_ZERO (bm1 + t, n - t);
vm1_neg ^= 1;
}
else
{
- mpn_sub (bsm1, b0, n, b1, t);
+ ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
}
}
- ASSERT (as1[n] <= 2);
- ASSERT (bs1[n] <= 1);
- ASSERT (asm1[n] <= 1);
-/*ASSERT (bsm1[n] == 0); */
-
-#define v0 pp /* 2n */
-#define v1 (scratch) /* 2n+1 */
-#define vinf (pp + 3 * n) /* s+t */
-#define vm1 (scratch + 2 * n + 1) /* 2n+1 */
-#define scratch_out scratch + 4 * n + 2
-
- /* vm1, 2n+1 limbs */
- TOOM32_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
- cy = 0;
- if (asm1[n] != 0)
- cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
- vm1[2 * n] = cy;
-
- /* v1, 2n+1 limbs */
- TOOM32_MUL_N_REC (v1, as1, bs1, n, scratch_out);
- if (as1[n] == 1)
+ TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
+ if (ap1[n] == 1)
{
- cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
+ cy = bp1[n] + mpn_add_n (v1 + n, v1 + n, bp1, n);
}
- else if (as1[n] == 2)
+ else if (ap1[n] == 2)
{
#if HAVE_NATIVE_mpn_addlsh1_n
- cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
+ cy = 2 * bp1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
#else
- cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
+ cy = 2 * bp1[n] + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
#endif
}
else
cy = 0;
- if (bs1[n] != 0)
- cy += mpn_add_n (v1 + n, v1 + n, as1, n);
+ if (bp1[n] != 0)
+ cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
v1[2 * n] = cy;
+
+ TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
+ if (hi)
+ hi = mpn_add_n (vm1+n, vm1+n, bm1, n);
- /* vinf, s+t limbs. Use mpn_mul for now, to handle unbalanced operands */
- if (s > t) mpn_mul (vinf, a2, s, b1, t);
- else mpn_mul (vinf, b1, t, a2, s);
+ vm1[2*n] = hi;
- /* v0, 2n limbs */
- TOOM32_MUL_N_REC (v0, ap, bp, n, scratch_out);
-
- /* Interpolate */
-
+ /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
if (vm1_neg)
{
-#if HAVE_NATIVE_mpn_rsh1add_n
- mpn_rsh1add_n (vm1, v1, vm1, 2 * n + 1);
+#if HAVE_NATIVE_mpn_rsh1sub_n
+ mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
#else
- mpn_add_n (vm1, v1, vm1, 2 * n + 1);
- mpn_rshift (vm1, vm1, 2 * n + 1, 1);
+ mpn_sub_n (v1, v1, vm1, 2*n+1);
+ ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
}
else
{
-#if HAVE_NATIVE_mpn_rsh1sub_n
- mpn_rsh1sub_n (vm1, v1, vm1, 2 * n + 1);
+#if HAVE_NATIVE_mpn_rsh1add_n
+ mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
#else
- mpn_sub_n (vm1, v1, vm1, 2 * n + 1);
- mpn_rshift (vm1, vm1, 2 * n + 1, 1);
+ mpn_add_n (v1, v1, vm1, 2*n+1);
+ ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
}
- t += s; /* limbs in vinf */
- if (LIKELY(t>=n)) /* We should consider this branch only, even better t>n */
- s = n; /* limbs in Lvinf */
- else {
- s = t; /* limbs in Lvinf */
- v1[2 * n]=0;
- }
+ /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence
- mpn_sub_n (v1, v1, vm1, s + n + 1);
+ y = x1 + x3 + (x0 + x2) * B
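
The hunk is cut off in the middle of the interpolation comments, but the two identities already visible above (v1 <- (v1 + vm1)/2 = x0 + x2, and x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3)) can be sanity-checked with a toy computation. Small integers stand in for the multi-limb coefficients, and the plain vm1_neg = 0 case is assumed; this is illustrative arithmetic, not GMP code:

#include <assert.h>

int
main (void)
{
  long x0 = 5, x1 = 7, x2 = 11, x3 = 13;  /* product coefficients */
  long v1  = x0 + x1 + x2 + x3;           /* evaluation at +1 */
  long vm1 = x0 - x1 + x2 - x3;           /* evaluation at -1 */

  assert ((v1 + vm1) / 2 == x0 + x2);     /* the v1 update above */
  assert ((x0 + x2) - vm1 == x1 + x3);    /* identity in the last comment */
  return 0;
}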