What's a reasonable size ratio for toom32?
Niels Möller
nisse at lysator.liu.se
Tue Jul 25 09:13:59 CEST 2023
Niels Möller <nisse at lysator.liu.se> writes:
> Bottom line: If mpn_mul is fixed to not call toom32 in the above range
> (toom22 could be a reasonable replacement), and stricter ranges for
> toom22 and toom32 are properly documented, then we can change toom32 to
> not recurse using the top-level mpn_mul. And it should also get a lot
> easier to adapt toom22 and toom32 to the itch/scratch convention, if we
> want to go that way.
See patch below. Makes both toom22 and toom32 recurse only to themselves
(or to schoolbook), and use provided scratch space, without any
additional allocations. Passes make check (with asserts enabled) on my
machine, but I probably don't hit the case marked with a FIXME in
mpn_mul.
Not sure what would be the appropriate way to benchmark; could look at a
range of unbalancedness ratio, at one or a few fixed sizes relevant to
toom22 and toom32, or look at fixed ratio, perhaps 1/2, over a range of
sizes.
Ratio 1/2 is particularly interesting because we go into toom32 and stay
there, with the same ratio for the top product, until we get into
schoolbook range.
Regards,
/Niels
diff -r 6741673d1679 gmp-impl.h
--- a/gmp-impl.h Thu Jul 20 12:44:33 2023 +0200
+++ b/gmp-impl.h Tue Jul 25 08:34:58 2023 +0200
@@ -5238,13 +5238,14 @@
return mpn_toom8_mul_n_itch (estimatedN * 8);
}
+/* Always less than mpn_toom22_mul_itch (an). */
static inline mp_size_t
mpn_toom32_mul_itch (mp_size_t an, mp_size_t bn)
{
mp_size_t n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
mp_size_t itch = 2 * n + 1;
- return itch;
+ return itch + mpn_toom22_mul_itch (n, n);
}
static inline mp_size_t
diff -r 6741673d1679 mpn/generic/mul.c
--- a/mpn/generic/mul.c Thu Jul 20 12:44:33 2023 +0200
+++ b/mpn/generic/mul.c Tue Jul 25 08:34:58 2023 +0200
@@ -246,7 +246,7 @@
/* vn <= un < 3vn */
- if (4 * un < 5 * vn)
+ if (4 * un < 5 * vn + 3)
mpn_toom22_mul (ws, up, un, vp, vn, scratch);
else if (4 * un < 7 * vn)
mpn_toom32_mul (ws, up, un, vp, vn, scratch);
@@ -259,7 +259,7 @@
}
else
{
- if (4 * un < 5 * vn)
+ if (4 * un < 5 * vn + 3)
mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
else if (4 * un < 7 * vn)
mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
@@ -339,6 +339,7 @@
else if (2 * un < 3 * vn)
{
if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
+ /* FIXME: May call toom32 with out of range ratio? */
mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
else
mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
diff -r 6741673d1679 mpn/generic/toom22_mul.c
--- a/mpn/generic/toom22_mul.c Thu Jul 20 12:44:33 2023 +0200
+++ b/mpn/generic/toom22_mul.c Tue Jul 25 08:34:58 2023 +0200
@@ -78,7 +78,7 @@
if (! MAYBE_mul_toom22 \
|| BELOW_THRESHOLD (bn, MUL_TOOM22_THRESHOLD)) \
mpn_mul_basecase (p, a, an, b, bn); \
- else if (4 * an < 5 * bn) \
+ else if (4 * an < 5 * bn + 3) \
mpn_toom22_mul (p, a, an, b, bn, ws); \
else \
mpn_toom32_mul (p, a, an, b, bn, ws); \
@@ -106,7 +106,9 @@
n = an - s;
t = bn - n;
+ /* Ensure that t/s >= 1/2, for proper call to mpn_toom32_mul. */
ASSERT (an >= bn);
+ ASSERT (3*an <= 4*bn);
ASSERT (0 < s && s <= n && (n - s) == (an & 1));
ASSERT (0 < t && t <= s);
diff -r 6741673d1679 mpn/generic/toom32_mul.c
--- a/mpn/generic/toom32_mul.c Thu Jul 20 12:44:33 2023 +0200
+++ b/mpn/generic/toom32_mul.c Tue Jul 25 08:34:58 2023 +0200
@@ -58,7 +58,21 @@
#define TOOM32_MUL_N_REC(p, a, b, n, ws) \
do { \
- mpn_mul_n (p, a, b, n); \
+ if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
+ mpn_mul_basecase (p, a, n, b, n); \
+ else { \
+ mpn_toom22_mul (p, a, n, b, n, ws); \
+ } \
+ } while (0)
+
+#define TOOM32_MUL_REC(p, a, an, b, bn, ws) \
+ do { \
+ if (BELOW_THRESHOLD (bn, MUL_TOOM22_THRESHOLD)) \
+ mpn_mul_basecase (p, a, an, b, bn); \
+ else if (4 * an < 5 * bn + 3) \
+ mpn_toom22_mul (p, a, an, b, bn, ws); \
+ else \
+ mpn_toom32_mul (p, a, an, b, bn, ws); \
} while (0)
void
@@ -79,6 +93,15 @@
#define b0 bp
#define b1 (bp + n)
+ /* Require 1/2 <= bn/an <= 4/5, or more precisely,
+ ceil(5/4 bn) <= an <= 2 bn.
+
+ This ensures that min(s,t) / max(s,t) >= 1/2. */
+ ASSERT (an <= 2*bn);
+ /* Extra +3 required for odd bn, due to rounding in n = ceil(bn/2).
+ FIXME: +2 might be sufficient */
+ ASSERT (4*an >= 5*bn + 3);
+
/* Required, to ensure that s + t >= n. */
ASSERT (bn + 2 <= an && an + 6 <= 3*bn);
@@ -271,9 +294,9 @@
}
TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
- /* vinf, s+t limbs. Use mpn_mul for now, to handle unbalanced operands */
- if (s > t) mpn_mul (pp+3*n, a2, s, b1, t);
- else mpn_mul (pp+3*n, b1, t, a2, s);
+ /* vinf, s+t limbs. */
+ if (s > t) TOOM32_MUL_REC (pp+3*n, a2, s, b1, t, scratch_out);
+ else TOOM32_MUL_REC (pp+3*n, b1, t, a2, s, scratch_out);
/* Remaining interpolation.
diff -r 6741673d1679 tests/mpn/t-toom22.c
--- a/tests/mpn/t-toom22.c Thu Jul 20 12:44:33 2023 +0200
+++ b/tests/mpn/t-toom22.c Tue Jul 25 08:34:58 2023 +0200
@@ -2,9 +2,7 @@
#define mpn_toomMN_mul_itch mpn_toom22_mul_itch
#define MIN_AN MIN(MPN_TOOM22_MUL_MINSIZE,4)
-#define MIN_BN(an) \
- ((an) >= 2*MUL_TOOM22_THRESHOLD \
- ? (an) + 2 - MUL_TOOM22_THRESHOLD \
- : ((an)+1)/2 + 1 + (an & 1))
+/* Rule out an = 5, bn = 4. */
+#define MIN_BN(an) ((an) == 5 ? 5 : 3*((an) + 1) / 4)
#include "toom-shared.h"
diff -r 6741673d1679 tests/mpn/t-toom32.c
--- a/tests/mpn/t-toom32.c Thu Jul 20 12:44:33 2023 +0200
+++ b/tests/mpn/t-toom32.c Tue Jul 25 08:34:58 2023 +0200
@@ -2,7 +2,7 @@
#define mpn_toomMN_mul_itch mpn_toom32_mul_itch
#define MIN_AN 6
-#define MIN_BN(an) (((an) + 8) / (size_t) 3)
-#define MAX_BN(an) ((an) - 2)
+#define MIN_BN(an) (((an) <= 10) ? ((an) + 8) / (size_t) 3 : ((an)+1)/2)
+#define MAX_BN(an) ((4*(an)-3) / (size_t) 5)
#include "toom-shared.h"
> Regards,
> /Niels
--
Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
Internet email is subject to wholesale government surveillance.
More information about the gmp-devel
mailing list