Allocation for toom33

Sun Oct 25 23:33:17 CET 2009

Torbjorn Granlund <tg at gmplib.org> writes:

> nisse at lysator.liu.se (Niels Möller) writes:
>
>   Ok, it should be easy to increase the size claimed by
>   mpn_toom33_mul_itch by n+1 (and then hope that it still works with
>   current callers that bypass the itch function).

Below is a patch to do that. But due to the recursion, the increase is
actually 1.5 n (+ logterm which is supposedly still small).

> Those naughty callers deserve to have their wrists slapped!

I'll leave that to you... I seemed to remember that one offending
function was mpn_mul, but it looks like it does the right thing. 

Regards,
/Niels


diff -r bd0e76eee038 gmp-impl.h

--- a/gmp-impl.h	Sat Oct 24 23:03:26 2009 +0200
+++ b/gmp-impl.h	Sun Oct 25 22:48:37 2009 +0100
@@ -1021,35 +1021,7 @@
    For now, assume n is arbitrarily large.  */
 #define MPN_KARA_MUL_N_TSIZE(n)   (2*(n) + 2*GMP_LIMB_BITS)
 #define MPN_KARA_SQR_N_TSIZE(n)   (2*(n) + 2*GMP_LIMB_BITS)
-
-/* toom3 uses 2n + 2n/3 + o(n) limbs of temporary space if mpn_sublsh1_n is
-   unavailable, but just 2n + o(n) if mpn_sublsh1_n is available.  It is hard
-   to pin down the value of o(n), since it is a complex function of
-   MUL_TOOM3_THRESHOLD and n.  Normally toom3 is used between kara and fft; in
-   that case o(n) will be really limited.  If toom3 is used for arbitrarily
-   large operands, o(n) will be larger.  These definitions handle operands of
-   up to 8956264246117233 limbs.  A single multiplication using toom3 on the
-   fastest hardware currently (2008) would need 10 million years, which
-   suggests that these limits are acceptable.  */
-#if WANT_FFT
-#if HAVE_NATIVE_mpn_sublsh1_n
-#define MPN_TOOM3_MUL_N_TSIZE(n)  (2*(n) + 63)
-#define MPN_TOOM3_SQR_N_TSIZE(n)  (2*(n) + 63)
-#else
-#define MPN_TOOM3_MUL_N_TSIZE(n)  (2*(n) + 2*(n/3) + 63)
-#define MPN_TOOM3_SQR_N_TSIZE(n)  (2*(n) + 2*(n/3) + 63)
-#endif
-#else /* WANT_FFT */
-#if HAVE_NATIVE_mpn_sublsh1_n
-#define MPN_TOOM3_MUL_N_TSIZE(n)  (2*(n) + 255)
-#define MPN_TOOM3_SQR_N_TSIZE(n)  (2*(n) + 255)
-#else
-#define MPN_TOOM3_MUL_N_TSIZE(n)  (2*(n) + 2*(n/3) + 255)
-#define MPN_TOOM3_SQR_N_TSIZE(n)  (2*(n) + 2*(n/3) + 255)
-#endif
-#define MPN_TOOM44_MAX_N 285405
-#endif /* WANT_FFT */
-
+  
 /* need 2 so that n2>=1 */
 #define MPN_KARA_MUL_N_MINSIZE    2
 #define MPN_KARA_SQR_N_MINSIZE    2
@@ -4300,10 +4272,9 @@
 static inline mp_size_t
 mpn_toom33_mul_itch (mp_size_t an, mp_size_t bn)
 {
-  /* We could trim this to 4n+3 if HAVE_NATIVE_mpn_sublsh1_n, since
-     mpn_toom_interpolate_5pts only needs scratch otherwise.  */
+  /* Can probably be trimmed to 2 an + O(log an). */
   mp_size_t n = (an + 2) / (size_t) 3;
-  return 6 * n + GMP_NUMB_BITS;
+  return 15 * n / 2 + GMP_NUMB_BITS;
 }
 
 static inline mp_size_t
@@ -4372,10 +4343,10 @@
 static inline mp_size_t
 mpn_toom3_sqr_itch (mp_size_t an)
 {
-  /* We could trim this to 4n+3 if HAVE_NATIVE_mpn_sublsh1_n, since
-     mpn_toom_interpolate_5pts only needs scratch otherwise.  */
+  /* Same as mpn_toom33_mul_itch. Can probably be trimmed to 2 an +
+     O(log an). */
   mp_size_t n = (an + 2) / (size_t) 3;
-  return 6 * n + GMP_NUMB_BITS;
+  return 15 * n / 2 + GMP_NUMB_BITS;
 }
 
 static inline mp_size_t
diff -r bd0e76eee038 mpn/generic/mul_n.c
--- a/mpn/generic/mul_n.c	Sat Oct 24 23:03:26 2009 +0200
+++ b/mpn/generic/mul_n.c	Sun Oct 25 22:48:37 2009 +0100
@@ -713,7 +713,7 @@
       mp_ptr ws;
       TMP_SDECL;
       TMP_SMARK;
-      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
+      ws = TMP_SALLOC_LIMBS (mpn_toom33_mul_itch (n, n));
       mpn_toom33_mul (p, a, n, b, n, ws);
       TMP_SFREE;
     }
@@ -782,7 +782,7 @@
       mp_ptr ws;
       TMP_SDECL;
       TMP_SMARK;
-      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
+      ws = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n));
       mpn_toom3_sqr_n (p, a, n, ws);
       TMP_SFREE;
     }
diff -r bd0e76eee038 mpn/generic/toom33_mul.c
--- a/mpn/generic/toom33_mul.c	Sat Oct 24 23:03:26 2009 +0200
+++ b/mpn/generic/toom33_mul.c	Sun Oct 25 22:48:37 2009 +0100
@@ -1,4 +1,4 @@
-/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
+/* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in
    size.  Or more accurately, bn <= an < (3/2)bn.
 
    Contributed to the GNU project by Torbjorn Granlund.
@@ -78,7 +78,6 @@
   mp_ptr gp;
   mp_ptr as1, asm1, as2;
   mp_ptr bs1, bsm1, bs2;
-  TMP_DECL;
 
 #define a0  ap
 #define a1  (ap + n)
@@ -97,9 +96,7 @@
   ASSERT (0 < s && s <= n);
   ASSERT (0 < t && t <= n);
 
-  TMP_MARK;
-
-  as1 = TMP_SALLOC_LIMBS (n + 1); /* Should be (pp + 4 * n + 4), but we need 5n+5<=4n+s+t i.e. s+t>n+4*/
+  as1 = scratch + 4*n + 4;
   asm1 = scratch + 2 * n + 2;
   as2 = pp + n + 1;
 
@@ -217,7 +214,7 @@
 #define vinf  (pp + 4 * n)			/* s+t */
 #define vm1   scratch				/* 2n+1 */
 #define v2    (scratch + 2 * n + 1)		/* 2n+2 */
-#define scratch_out  (scratch + 4 * n + 4)
+#define scratch_out  (scratch + 5 * n + 5)
 
   /* vm1, 2n+1 limbs */
 #ifdef SMALLER_RECURSION
@@ -279,6 +276,4 @@
   TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */
 
   mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, 1^vm1_neg, vinf0);
-
-  TMP_FREE;
 }
diff -r bd0e76eee038 mpn/generic/toom3_sqr.c
--- a/mpn/generic/toom3_sqr.c	Sat Oct 24 23:03:26 2009 +0200
+++ b/mpn/generic/toom3_sqr.c	Sun Oct 25 22:48:37 2009 +0100
@@ -72,7 +72,6 @@
   mp_limb_t cy, vinf0;
   mp_ptr gp;
   mp_ptr as1, asm1, as2;
-  TMP_DECL;
 
 #define a0  ap
 #define a1  (ap + n)
@@ -84,9 +83,7 @@
 
   ASSERT (0 < s && s <= n);
 
-  TMP_MARK;
-
-  as1 = TMP_SALLOC_LIMBS (n + 1); /* Should be (pp + 4 * n + 4), but we need 5n+5<=4n+s+t i.e. s+t>n+4*/
+  as1 = scratch + 4 * n + 4;
   asm1 = scratch + 2 * n + 2;
   as2 = pp + n + 1;
 
@@ -145,7 +142,7 @@
 #define vinf  (pp + 4 * n)			/* s+s */
 #define vm1   scratch				/* 2n+1 */
 #define v2    (scratch + 2 * n + 1)		/* 2n+2 */
-#define scratch_out  (scratch + 4 * n + 4)
+#define scratch_out  (scratch + 5 * n + 5)
 
   /* vm1, 2n+1 limbs */
 #ifdef SMALLER_RECURSION
@@ -205,6 +202,4 @@
   TOOM3_SQR_N_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */
 
   mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 1, vinf0);
-
-  TMP_FREE;
 }