# mpz_mul core dump - attempt 2

Torbjorn Granlund tege at swox.com
Sat Sep 18 02:05:53 CEST 2004

```Please try this patch for the FFT crash.  (Thanks to Paul
Zimmermann for debugging this and providing the patch.)

--
Torbjörn

--- mul_fft.c.orig	2004-09-16 16:10:43.000000000 +0200
+++ mul_fft.c	2004-09-17 17:05:02.339802415 +0200
@@ -72,8 +72,10 @@
mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_size_t, int **, mp_ptr,int));

-/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT.
-   sqr==0 if for a multiply, sqr==1 for a square */
+/* Find the best k to use for a mod 2^(m*BITS_PER_MP_LIMB)+1 FFT
+   with m >= n.
+   sqr==0 if for a multiply, sqr==1 for a square.
+*/
int
mpn_fft_best_k (mp_size_t n, int sqr)
{
@@ -91,26 +93,17 @@
}

-/* Returns smallest possible number of limbs >= pl for a fft of size 2^k.
-
-   FIXME: Is this N rounded up to the next multiple of (2^k)*BITS_PER_MP_LIMB
-   bits and therefore simply pl rounded up to a multiple of 2^k? */
+/* Returns smallest possible number of limbs >= pl for a fft of size 2^k,
+   i.e. smallest multiple of 2^k >= pl. */

mp_size_t
mpn_fft_next_size (mp_size_t pl, int k)
{
-  mp_size_t N, M;
-  int K;
+  unsigned long K;

-  /*  if (k==0) k = mpn_fft_best_k (pl, sqr); */
-  N = pl * BITS_PER_MP_LIMB;
K = 1 << k;
-  if (N % K)
-    N = (N / K + 1) * K;
-  M = N / K;
-  if (M % BITS_PER_MP_LIMB)
-    N = ((M / BITS_PER_MP_LIMB) + 1) * BITS_PER_MP_LIMB * K;
-  return N / BITS_PER_MP_LIMB;
+  pl = 1 + (pl - 1) / K; /* ceil(pl/K) */
+  return pl * K;
}

@@ -205,15 +198,18 @@
{
if (K == 2)
{
+      mp_limb_t cy;
-      if (mpn_addsub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1)
-	Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, CNST_LIMB(1));
+      cy = mpn_addsub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1;
#else
MPN_COPY (tp, Ap[0], n + 1);
-      mpn_add_n (Ap[0], Ap[0], Ap[inc],n + 1);
-      if (mpn_sub_n (Ap[inc], tp, Ap[inc],n + 1))
-	Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, CNST_LIMB(1));
+      mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1);
+      cy = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1);
#endif
+      if (Ap[0][n] > CNST_LIMB(1)) /* can be 2 or 3 */
+        Ap[0][n] = CNST_LIMB(1) - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - CNST_LIMB(1));
+      if (cy) /* Ap[inc][n] can be -1 or -2 */
+        Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + CNST_LIMB(1));
}
else
{
@@ -252,24 +248,26 @@
{
if (K == 2)
{
+      mp_limb_t ca, cb;
-      if (mpn_addsub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1)
-	Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, CNST_LIMB(1));
+      ca = mpn_addsub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1;
+      cb = mpn_addsub_n (Bp[0], Bp[inc], Bp[0], Bp[inc], n + 1) & 1;
#else
MPN_COPY (tp, Ap[0], n + 1);
-      mpn_add_n (Ap[0], Ap[0], Ap[inc],n + 1);
-      if (mpn_sub_n (Ap[inc], tp, Ap[inc],n + 1))
-	Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, CNST_LIMB(1));
-#endif
-      if (mpn_addsub_n (Bp[0], Bp[inc], Bp[0], Bp[inc], n + 1) & 1)
-	Bp[inc][n] = mpn_add_1 (Bp[inc], Bp[inc], n, CNST_LIMB(1));
-#else
+      mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1);
+      ca = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1);
MPN_COPY (tp, Bp[0], n + 1);
-      mpn_add_n (Bp[0], Bp[0], Bp[inc],n + 1);
-      if (mpn_sub_n (Bp[inc], tp, Bp[inc],n + 1))
-	Bp[inc][n] = mpn_add_1 (Bp[inc], Bp[inc], n, CNST_LIMB(1));
+      mpn_add_n (Bp[0], Bp[0], Bp[inc], n + 1);
+      cb = mpn_sub_n (Bp[inc], tp, Bp[inc], n + 1);
#endif
+      if (Ap[0][n] > CNST_LIMB(1)) /* can be 2 or 3 */
+        Ap[0][n] = CNST_LIMB(1) - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - CNST_LIMB(1));
+      if (ca) /* Ap[inc][n] can be -1 or -2 */
+        Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + CNST_LIMB(1));
+      if (Bp[0][n] > CNST_LIMB(1)) /* can be 2 or 3 */
+        Bp[0][n] = CNST_LIMB(1) - mpn_sub_1 (Bp[0], Bp[0], n, Bp[0][n] - CNST_LIMB(1));
+      if (cb) /* Bp[inc][n] can be -1 or -2 */
+        Bp[inc][n] = mpn_add_1 (Bp[inc], Bp[inc], n, ~Bp[inc][n] + CNST_LIMB(1));
}
else
{
@@ -339,13 +337,29 @@
mp_ptr *Ap,*Bp,A,B,T;

k = mpn_fft_best_k (n, sqr);
-      K2 = 1<<k;
+      K2 = 1 << k;
+      ASSERT_ALWAYS(n % K2 == 0);
maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB;
M2 = n*BITS_PER_MP_LIMB/K2;
-      l = n/K2;
-      Nprime2 = ((2 * M2+k+2+maxLK)/maxLK)*maxLK; /* ceil()(2*M2+k+3)/maxLK)*maxLK*/
-      nprime2 = Nprime2/BITS_PER_MP_LIMB;
-      Mp2 = Nprime2/K2;
+      l = n / K2;
+      Nprime2 = ((2 * M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/
+      nprime2 = Nprime2 / BITS_PER_MP_LIMB;
+
+      /* we should ensure that nprime2 is a multiple of the next K */
+      if (nprime2 >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+        {
+          unsigned long K3;
+          while (nprime2 % (K3 = 1 << mpn_fft_best_k (nprime2, sqr)))
+            {
+              nprime2 = ((nprime2 + K3 - 1) / K3) * K3;
+              Nprime2 = nprime2 * BITS_PER_MP_LIMB;
+              /* warning: since nprime2 changed, K3 may change too! */
+            }
+          ASSERT(nprime2 % K3 == 0);
+        }
+      ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */
+
+      Mp2 = Nprime2 / K2;

Ap = TMP_ALLOC_MP_PTRS (K2);
Bp = TMP_ALLOC_MP_PTRS (K2);
@@ -402,22 +416,28 @@

/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
-   output: K*A[0] K*A[K-1] ... K*A[1] */
+   output: K*A[0] K*A[K-1] ... K*A[1].
+   Assumes the Ap[] are pseudo-normalized, i.e. 0 <= Ap[][n] <= 1.
+   This condition is also fulfilled at exit.
+*/

static void
mpn_fft_fftinv (mp_ptr *Ap, int K, mp_size_t omega, mp_size_t n, mp_ptr tp)
{
if (K == 2)
{
+      mp_limb_t cy;
-      if (mpn_addsub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1)
-	Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, CNST_LIMB(1));
+      cy = mpn_addsub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1;
#else
MPN_COPY (tp, Ap[0], n + 1);
mpn_add_n (Ap[0], Ap[0], Ap[1], n + 1);
-      if (mpn_sub_n (Ap[1], tp, Ap[1], n + 1))
-	Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, CNST_LIMB(1));
+      cy = mpn_sub_n (Ap[1], tp, Ap[1], n + 1);
#endif
+      if (Ap[0][n] > CNST_LIMB(1)) /* can be 2 or 3 */
+        Ap[0][n] = CNST_LIMB(1) - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - CNST_LIMB(1));
+      if (cy) /* Ap[1][n] can be -1 or -2 */
+        Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, ~Ap[1][n] + CNST_LIMB(1));
}
else
{
@@ -459,12 +479,13 @@
}

-/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */
+/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n <= an <= 3*n */
static void
mpn_fft_norm_modF (mp_ptr rp, mp_ptr ap, mp_size_t n, mp_size_t an)
{
mp_size_t l;

+  ASSERT (n <= an && an <= 3 * n);
if (an > 2 * n)
{
l = n;
@@ -595,7 +616,7 @@
/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB
n and m have respectively nl and ml limbs
op must have space for pl+1 limbs
-   One must have pl = mpn_fft_next_size (pl, k).
+   Assumes pl is multiple of 2^k.
*/

void
@@ -612,7 +633,6 @@
TMP_DECL(marker);

TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", pl, nl, ml, k));
-  ASSERT_ALWAYS (mpn_fft_next_size (pl, k) == pl);

TMP_MARK(marker);
N = pl * BITS_PER_MP_LIMB;
@@ -620,25 +640,31 @@
for (i = 0; i <= k; i++)
_fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
mpn_fft_initl (_fft_l, k);
-  K = 1<<k;
-  M = N/K;	/* N = 2^k M */
-  l = M/BITS_PER_MP_LIMB;
+  K = 1 << k;
+  ASSERT_ALWAYS (pl % K == 0);
+  M = N/K;	/* exact: N = 2^k M */
+  l = M / BITS_PER_MP_LIMB; /* l = pl / K also */
maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB;

Nprime = ((2 * M + k + 2 + maxLK) / maxLK) * maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */
nprime = Nprime / BITS_PER_MP_LIMB;
+  /* with B := BITS_PER_MP_LIMB, nprime >= 2*M/B = 2*N/(K*B) = 2*pl/K = 2*l */
TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n",
N, K, M, l, maxLK, Nprime, nprime));
+  /* we should ensure that recursively, nprime is a multiple of the next K */
if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
{
-      maxLK = (1 << mpn_fft_best_k (nprime,n == m)) * BITS_PER_MP_LIMB;
-      if (Nprime % maxLK)
-	{
-	  Nprime = ((Nprime / maxLK) + 1) * maxLK;
-	  nprime = Nprime / BITS_PER_MP_LIMB;
-	}
+      unsigned long K2;
+      while (nprime % (K2 = 1 << mpn_fft_best_k (nprime, sqr)))
+        {
+          nprime = ((nprime + K2 - 1) / K2) * K2;
+          Nprime = nprime * BITS_PER_MP_LIMB;
+          /* warning: since nprime changed, K2 may change too! */
+        }
TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime));
+      ASSERT(nprime % K2 == 0);
}
+  ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */

T = TMP_ALLOC_LIMBS (nprime + 1);
Mp = Nprime/K;
```