[Gmp-commit] /home/hgfiles/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sun Dec 20 04:00:26 CET 2009
details: /home/hgfiles/gmp/rev/3fef4dbd4df0
changeset: 13138:3fef4dbd4df0
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Dec 20 03:56:18 2009 +0100
description:
Adjust defaults of just added threshold parameters.
details: /home/hgfiles/gmp/rev/13e7ee60e57c
changeset: 13139:13e7ee60e57c
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Dec 20 04:00:24 2009 +0100
description:
Major rewrite.
diffstat:
ChangeLog | 3 +
gmp-impl.h | 8 +-
mpn/generic/mul.c | 215 +++++++++++++++++++++++++++++++++--------------------
3 files changed, 139 insertions(+), 87 deletions(-)
diffs (truncated from 307 to 300 lines):
diff -r 018d0eeb7339 -r 13e7ee60e57c ChangeLog
--- a/ChangeLog Sun Dec 20 03:55:38 2009 +0100
+++ b/ChangeLog Sun Dec 20 04:00:24 2009 +0100
@@ -1,5 +1,8 @@
2009-12-20 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/generic/mul.c: Major rewrite. Use toom43, toom53, toom63.
+ Call mpn_nussbaumer_mul for largest operands.
+
* tune/speed.h (SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL): New macro.
(SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL): New macro.
(SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL): New macro.
diff -r 018d0eeb7339 -r 13e7ee60e57c gmp-impl.h
--- a/gmp-impl.h Sun Dec 20 03:55:38 2009 +0100
+++ b/gmp-impl.h Sun Dec 20 04:00:24 2009 +0100
@@ -1655,19 +1655,19 @@
#endif
#ifndef MUL_TOOM32_TO_TOOM43_THRESHOLD
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 140
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 100
#endif
#ifndef MUL_TOOM32_TO_TOOM53_THRESHOLD
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 170
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
#endif
#ifndef MUL_TOOM42_TO_TOOM53_THRESHOLD
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 190
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 100
#endif
#ifndef MUL_TOOM42_TO_TOOM63_THRESHOLD
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 230
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 110
#endif
/* MUL_TOOM22_THRESHOLD_LIMIT is the maximum for MUL_TOOM22_THRESHOLD. In a
diff -r 018d0eeb7339 -r 13e7ee60e57c mpn/generic/mul.c
--- a/mpn/generic/mul.c Sun Dec 20 03:55:38 2009 +0100
+++ b/mpn/generic/mul.c Sun Dec 20 04:00:24 2009 +0100
@@ -43,6 +43,37 @@
2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
the multiplier and the multiplicand. */
+/*
+ * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
+ ideal lines of the surrounding algorithms. Is that optimal?
+
+ * The toomX3 code now uses a structure similar to the one of toomX2, except
+ that it loops longer in the unbalanced case. The result is that the
+ remaining area might have un < vn. Should we fix the toomX2 code in a
+ similar way?
+
+ * The toomX3 code is used for the largest non-FFT unbalanced operands. It
+ therefore calls mpn_mul recursively for certain cases.
+
+ * Allocate static temp space using THRESHOLD variables (except for toom44
+ when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
+
+ * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
+ have the same vn threshold. This is not true, we should actually use
+ mul_basecase for slightly larger operands for toom32 than for toom22, and
+ even larger for toom42.
+
+ * That problem is even more prevalent for toomX3. We therefore have a
+ special fix for avoiding toom63; we call toom42 sometimes in the X3 code.
+ We have an uglier fix for toom53, but here we need to choose toom32 or
+ toom42...
+
+ * Is our ITCH allocation correct?
+*/
+
+#define ITCH (16*vn + 100)
+#define WSALLOC (4 * vn)
+
mp_limb_t
mpn_mul (mp_ptr prodp,
mp_srcptr up, mp_size_t un,
@@ -121,32 +152,31 @@
}
else
{
- ASSERT_ALWAYS (un > 0);
+ ASSERT (un > 0);
mpn_mul_basecase (prodp, vp, vn, up, un);
}
cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
mpn_incr_u (prodp + vn, cy);
}
}
- else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD) ||
- /* Also do larger unbalanced here, up to a (somewhat arbitrarily)
- larger vn limit, unless toom33 can do this product directly. */
- (!TOOM33_OK (un, vn) && BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD * 3 / 2)))
+ else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
{
- /* Loop over toom42, then choose toom42, toom32, or toom22 */
+ /* Use ToomX2 variants */
mp_ptr scratch;
TMP_DECL; TMP_MARK;
+ scratch = TMP_ALLOC_LIMBS (ITCH);
-#define ITCH ((un + vn) * 4 + 100)
- scratch = TMP_ALLOC_LIMBS (ITCH + 1);
-
+ /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
+ square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
+ wise; we would get better balance by slightly moving the bound. We
+ will sometimes end up with un < vn, like the the X3 arm below. */
if (un >= 3 * vn)
{
mp_limb_t cy;
mp_ptr ws;
-#define WSALL (4 * vn)
- ws = TMP_SALLOC_LIMBS (WSALL + 1);
+
+ ws = TMP_SALLOC_LIMBS (WSALLOC + 1);
mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
un -= 2 * vn;
@@ -186,92 +216,111 @@
}
TMP_FREE;
}
- else if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD))
+ else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) || un > 8 * vn)
{
+ /* Handle the largest operands that are not in the FFT range. The 2nd
+ condition makes very unbalanced operands avoid the FFT code (except
+ perhaps as coefficient products of the Toom code. */
+
TMP_DECL; TMP_MARK;
- if (TOOM33_OK (un, vn))
+
+ if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
{
- /* Apply toom33 directly, since operands are balanced enough. */
+ /* Use ToomX3 variants */
mp_ptr scratch;
- scratch = TMP_SALLOC_LIMBS (mpn_toom33_mul_itch (un, vn));
- mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
+
+ scratch = TMP_ALLOC_LIMBS (ITCH);
+
+ if (2 * un >= 5 * vn)
+ {
+ mp_limb_t cy;
+ mp_ptr ws;
+
+ ws = TMP_SALLOC_LIMBS (WSALLOC + 1);
+
+ if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
+ mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
+ else
+ mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
+ un -= 2 * vn;
+ up += 2 * vn;
+ prodp += 2 * vn;
+
+ while (2 * un >= 5 * vn)
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
+ mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
+ else
+ mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
+ un -= 2 * vn;
+ up += 2 * vn;
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, 2 * vn);
+ mpn_incr_u (prodp + vn, cy);
+ prodp += 2 * vn;
+ }
+
+ if (un < vn)
+ mpn_mul (ws, vp, vn, up, un);
+ else
+ mpn_mul (ws, up, un, vp, vn);
+
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, un);
+ mpn_incr_u (prodp + vn, cy);
+ }
+ else
+ {
+ if (6 * un < 7 * vn)
+ mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
+ else if (6 * un < 9 * vn)
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
+ mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
+ }
+ else if (6 * un < 11 * vn)
+ {
+ if (4 * un < 7 * vn)
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
+ mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
+ }
+ else
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
+ mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
+ }
+ }
+ else
+ {
+ if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
+ mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
+ }
+ }
}
else
{
- /* Apply toom33, recurse. FUTURE: toom63, toom53, toom43, toom33 */
- mp_ptr scratch, pp; /* FIXME: use same area for these */
- mp_limb_t cy;
- scratch = TMP_SALLOC_LIMBS (mpn_toom33_mul_itch (vn, vn));
- mpn_toom33_mul (prodp, up, vn, vp, vn, scratch);
- prodp += vn;
- up += vn;
- un -= vn;
- pp = TMP_SALLOC_LIMBS (2 * vn);
- while (un >= vn)
- {
- mpn_toom33_mul (pp, up, vn, vp, vn, scratch);
- cy = mpn_add_n (prodp, prodp, pp, vn);
- MPN_COPY (prodp + vn, pp + vn, vn);
- mpn_incr_u (prodp + vn, cy);
- prodp += vn;
- up += vn;
- un -= vn;
- }
- if (un != 0)
- {
- mpn_mul (pp, vp, vn, up, un);
- cy = mpn_add_n (prodp, prodp, pp, vn);
- MPN_COPY (prodp + vn, pp + vn, un);
- mpn_incr_u (prodp + vn, cy);
- }
- }
- TMP_FREE;
- }
- else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
- BELOW_THRESHOLD (vn, MUL_FFT_THRESHOLD / 3)) /* FIXME */
- {
- TMP_DECL; TMP_MARK;
- if (TOOM44_OK (un, vn))
- {
- /* Apply toom44 directly, since operands are balanced enough. */
mp_ptr scratch;
+
+ ASSERT (TOOM44_OK (un, vn));
+
scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
}
- else
- {
- /* Apply toom44, recurse. FUTURE: toom84, toom74, toom64, toom54, toom44 */
- mp_ptr scratch, pp; /* FIXME: use same area for these */
- mp_limb_t cy;
- scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (vn, vn));
- mpn_toom44_mul (prodp, up, vn, vp, vn, scratch);
- prodp += vn;
- up += vn;
- un -= vn;
- pp = TMP_SALLOC_LIMBS (2 * vn);
- while (un >= vn)
- {
- mpn_toom44_mul (pp, up, vn, vp, vn, scratch);
- cy = mpn_add_n (prodp, prodp, pp, vn);
- MPN_COPY (prodp + vn, pp + vn, vn);
- mpn_incr_u (prodp + vn, cy);
- prodp += vn;
- up += vn;
- un -= vn;
- }
- if (un != 0)
- {
- mpn_mul (pp, vp, vn, up, un);
- cy = mpn_add_n (prodp, prodp, pp, vn);
- MPN_COPY (prodp + vn, pp + vn, un);
- mpn_incr_u (prodp + vn, cy);
- }
- }
TMP_FREE;
}
else
{
More information about the gmp-commit
mailing list