Niels Möller nisse at lysator.liu.se
Tue Oct 11 14:17:21 CEST 2011

nisse at lysator.liu.se (Niels Möller) writes:

> So one should really benchmark hgcd + hgcd_adjust vs hgcd_appr +
> hgcd_matrix_apply, to see which one is fastest, and then *both*
> functions should use the fastest variant for each size. The theory is
> that hgcd_appr ought to win for large enough sizes.

I wrote the below function (any suggestion on a better name?)

/* One "big" hgcd step on (ap, bp; n), ignoring the low p limbs.
   Updates M and returns the new size, or 0 on failure.  Chooses
   between plain hgcd + matrix_adjust (small n) and hgcd_appr +
   matrix_apply (large n), governed by HGCD_BIG_STEP_THRESHOLD.  */
mp_size_t
mpn_hgcd_big_step (struct hgcd_matrix *M,
		   mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t p,
		   mp_ptr tp)
{
  /* Small case: run hgcd on the high n - p limbs in place, then fix up
     the full numbers using the resulting matrix.  */
  if (BELOW_THRESHOLD (n, HGCD_BIG_STEP_THRESHOLD))
    {
      mp_size_t hn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
      if (hn == 0)
	return 0;
      /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
	 = 2 (n - 1) */
      return mpn_hgcd_matrix_adjust (M, p + hn, ap, bp, p, tp);
    }

  /* Large case: copy the high parts out, run the approximate hgcd on
     the copies, and apply M^{-1} to the full operands.  */
  MPN_COPY (tp, ap + p, n - p);
  MPN_COPY (tp + n - p, bp + p, n - p);
  if (! mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p)))
    return 0;
  return hgcd_matrix_apply (M, ap, bp, n);
}

I then use this for the first recursive call in mpn_hgcd and mpn_hgcd_appr (it
could be used also for the second call in mpn_hgcd, but it might need
more scratch space, and also the tuning might be a bit different).

I tried some tuning, and then HGCD_BIG_STEP_THRESHOLD seems to be
around 1000 limbs, with HGCD_THRESHOLD at 117 and HGCD_APPR_THRESHOLD at
161 limbs. The saving appears to be about 8% at large sizes.

Below I also append the hgcd_matrix_apply function. I wonder if it would
be useful to make this function use other strategies for smaller n.
Alternatives:

1. mullo (typical size 3n/4 x n/4, so it would really be one n/2 x n/4
regular multiplication and one n/4 mullo.)

2. mulmid (typical size also 3n/4 x n/4). This would be useful when
called from hgcd_appr, but not if called from hgcd, because in the
latter case also the low limbs are needed.

The current strategy uses n x n/4 multiplication mod (B^{3n/4} - 1).
Maybe mpn_mulmod_bnm1 could be better optimized for such unbalanced
operands, or maybe one can't get any larger saving from wraparound.

Regards,
/Niels

/* Computes R -= A * B. Result must be non-negative. Normalized down
to size an, and resulting size is returned. */
static mp_size_t
submul (mp_ptr rp, mp_size_t rn,
mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
{ [...] }

/* Computes (a, b)  <--  M^{-1} (a; b) */
/* FIXME:
   x Take scratch parameter, and figure out scratch need.

   x Use some fallback for small M->n?
*/
/* NOTE(review): the mailing-list archive stripped the literal matrix
   subscripts ([0][0], [0][1], [1][0], [1][1]) from this function,
   leaving e.g. "if (mn == 0)" with scalar "mn".  They have been
   reconstructed below from the in-code bound comments, the mn[i][j]
   normalization loop, and the structure of the algorithm — verify
   against the original patch / GMP's hgcd_reduce.c.  */
static mp_size_t
hgcd_matrix_apply (const struct hgcd_matrix *M,
		   mp_ptr ap, mp_ptr bp,
		   mp_size_t n)
{
  mp_size_t an, bn, un, vn, nn;
  mp_size_t mn[2][2];		/* Normalized sizes of the entries of M.  */
  mp_size_t modn;
  mp_ptr tp, sp, scratch;
  mp_limb_t cy;
  unsigned i, j;

  TMP_DECL;

  /* At least one of A, B must have a non-zero high limb.  */
  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  bn = n;
  MPN_NORMALIZE (bp, bn);

  /* Record the normalized size of each matrix entry.  */
  for (i = 0; i < 2; i++)
    for (j = 0; j < 2; j++)
      {
	mp_size_t k;
	k = M->n;
	MPN_NORMALIZE (M->p[i][j], k);
	mn[i][j] = k;
      }

  /* The diagonal entries are always non-zero; at least one
     off-diagonal entry must be non-zero as well.  */
  ASSERT (mn[0][0] > 0);
  ASSERT (mn[1][1] > 0);
  ASSERT ( (mn[0][1] | mn[1][0]) > 0);

  TMP_MARK;

  if (mn[0][1] == 0)
    {
      /* A unchanged, M = (1, 0; q, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put B <-- B - q A */
      nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
    }
  else if (mn[1][0] == 0)
    {
      /* B unchanged, M = (1, q; 0, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put A  <-- A - q * B */
      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
    }
  else
    {
      /* General case: a = m11 A - m01 B, b = m00 B - m10 A, computed
	 mod B^modn - 1, where modn is large enough that the results
	 are recovered exactly.

	 A = m00 a + m01 b  ==> a <= A / m00, b <= A / m01.
	 B = m10 a + m11 b  ==> a <= B / m10, b <= B / m11. */
      un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
      vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;

      nn = MAX (un, vn);
      /* In the range of interest, mulmod_bnm1 should always beat mullo. */
      modn = mpn_mulmod_bnm1_next_size (nn + 1);

      scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
      tp = TMP_ALLOC_LIMBS (modn);
      sp = TMP_ALLOC_LIMBS (modn);

      ASSERT (n <= 2*modn);

      /* Reduce A and B mod B^modn - 1 by folding the high part onto
	 the low part.  */
      if (n > modn)
	{
	  cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
	  MPN_INCR_U (ap, modn, cy);

	  cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
	  MPN_INCR_U (bp, modn, cy);

	  n = modn;
	}

      /* a <-- m11 A - m01 B (mod B^modn - 1) */
      mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
      mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);

      /* FIXME: Handle the small n case in some better way. */
      if (n + mn[1][1] < modn)
	MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
      if (n + mn[0][1] < modn)
	MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);

      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);

      ASSERT (mpn_zero_p (tp + nn, modn - nn));

      /* b <-- m00 B - m10 A (mod B^modn - 1); compute m10 A before
	 overwriting ap with the new a.  */
      mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
      MPN_COPY (ap, tp, nn);
      mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);

      if (n + mn[1][0] < modn)
	MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
      if (n + mn[0][0] < modn)
	MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);

      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);

      ASSERT (mpn_zero_p (tp + nn, modn - nn));
      MPN_COPY (bp, tp, nn);

      /* Normalize: drop common zero high limbs of the results.  */
      while ( (ap[nn-1] | bp[nn-1]) == 0)
	{
	  nn--;
	  ASSERT (nn > 0);
	}
    }
  TMP_FREE;

  return nn;
}

--
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.