Niels Möller nisse at lysator.liu.se
Tue Oct 11 14:17:21 CEST 2011

nisse at lysator.liu.se (Niels Möller) writes:

> So one should really benchmark hgcd + hgcd_adjust vs hgcd_appr +
> hgcd_matrix_apply, to see which one is fastest, and then *both*
> functions should use the fastest variant for each size. The theory is
> that hgcd_appr ought to win for large enough sizes.

I wrote the below function (any suggestion on a better name?)

/* One "big" hgcd step on (ap, bp; n), ignoring the low p limbs.
   Updates M and returns the new size, or 0 on failure.  Chooses
   between plain hgcd + matrix_adjust (small n) and hgcd_appr +
   matrix_apply (large n), governed by HGCD_BIG_STEP_THRESHOLD.  */
mp_size_t
mpn_hgcd_big_step (struct hgcd_matrix *M,
		   mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t p,
		   mp_ptr tp)
{
  /* Small case: run hgcd on the high n - p limbs in place, then fix up
     the full numbers using the resulting matrix.  */
  if (BELOW_THRESHOLD (n, HGCD_BIG_STEP_THRESHOLD))
    {
      mp_size_t hn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
      if (hn == 0)
	return 0;
      /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
	 = 2 (n - 1) */
      return mpn_hgcd_matrix_adjust (M, p + hn, ap, bp, p, tp);
    }

  /* Large case: copy the high parts out, run the approximate hgcd on
     the copies, and apply M^{-1} to the full operands.  */
  MPN_COPY (tp, ap + p, n - p);
  MPN_COPY (tp + n - p, bp + p, n - p);
  if (! mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p)))
    return 0;
  return hgcd_matrix_apply (M, ap, bp, n);
}

I then use this for the first recursive call in mpn_hgcd and mpn_hgcd_appr (it
could be used also for the second call in mpn_hgcd, but it might need
more scratch space, and also the tuning might be a bit different).

I tried some tuning, and then HGCD_BIG_STEP_THRESHOLD seems to be
around 1000 limbs, with HGCD_THRESHOLD at 117 and HGCD_APPR_THRESHOLD at
161 limbs. The saving appears to be about 8% at large sizes.

Below I also append the hgcd_matrix_apply function. I wonder if it would
be useful to make this function use other strategies for smaller n.
Alternatives:

1. mullo (typical size 3n/4 x n/4, so it would really be one n/2 x n/4
regular multiplication and one n/4 mullo.)

2. mulmid (typical size also 3n/4 x n/4). This would be useful when
called from hgcd_appr, but not if called from hgcd, because in the
latter case also the low limbs are needed.

The current strategy uses n x n/4 multiplication mod (B^{3n/4} - 1).
Maybe mpn_mulmod_bnm1 could be better optimized for such unbalanced
operands, or maybe one can't get any larger saving from wraparound.

Regards,
/Niels

/* Computes R -= A * B. Result must be non-negative. Normalized down
to size an, and resulting size is returned. */
static mp_size_t
submul (mp_ptr rp, mp_size_t rn,
mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
{ [...] }

/* Computes (a, b)  <--  M^{-1} (a; b) */
/* FIXME:
   x Take scratch parameter, and figure out scratch need.

   x Use some fallback for small M->n?
*/
/* NOTE(review): the mailing-list archive stripped the literal matrix
   subscripts ([0][0], [0][1], [1][0], [1][1]) from this function,
   leaving e.g. "if (mn == 0)" with scalar "mn".  They have been
   reconstructed below from the in-code bound comments, the mn[i][j]
   normalization loop, and the structure of the algorithm — verify
   against the original patch / GMP's hgcd_reduce.c.  */
static mp_size_t
hgcd_matrix_apply (const struct hgcd_matrix *M,
		   mp_ptr ap, mp_ptr bp,
		   mp_size_t n)
{
  mp_size_t an, bn, un, vn, nn;
  mp_size_t mn[2][2];		/* Normalized sizes of the entries of M.  */
  mp_size_t modn;
  mp_ptr tp, sp, scratch;
  mp_limb_t cy;
  unsigned i, j;

  TMP_DECL;

  /* At least one of A, B must have a non-zero high limb.  */
  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  bn = n;
  MPN_NORMALIZE (bp, bn);

  /* Record the normalized size of each matrix entry.  */
  for (i = 0; i < 2; i++)
    for (j = 0; j < 2; j++)
      {
	mp_size_t k;
	k = M->n;
	MPN_NORMALIZE (M->p[i][j], k);
	mn[i][j] = k;
      }

  /* The diagonal entries are always non-zero; at least one
     off-diagonal entry must be non-zero as well.  */
  ASSERT (mn[0][0] > 0);
  ASSERT (mn[1][1] > 0);
  ASSERT ( (mn[0][1] | mn[1][0]) > 0);

  TMP_MARK;

  if (mn[0][1] == 0)
    {
      /* A unchanged, M = (1, 0; q, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put B <-- B - q A */
      nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
    }
  else if (mn[1][0] == 0)
    {
      /* B unchanged, M = (1, q; 0, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put A  <-- A - q * B */
      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
    }
  else
    {
      /* General case: a = m11 A - m01 B, b = m00 B - m10 A, computed
	 mod B^modn - 1, where modn is large enough that the results
	 are recovered exactly.

	 A = m00 a + m01 b  ==> a <= A / m00, b <= A / m01.
	 B = m10 a + m11 b  ==> a <= B / m10, b <= B / m11. */
      un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
      vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;

      nn = MAX (un, vn);
      /* In the range of interest, mulmod_bnm1 should always beat mullo. */
      modn = mpn_mulmod_bnm1_next_size (nn + 1);

      scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
      tp = TMP_ALLOC_LIMBS (modn);
      sp = TMP_ALLOC_LIMBS (modn);

      ASSERT (n <= 2*modn);

      /* Reduce A and B mod B^modn - 1 by folding the high part onto
	 the low part.  */
      if (n > modn)
	{
	  cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
	  MPN_INCR_U (ap, modn, cy);

	  cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
	  MPN_INCR_U (bp, modn, cy);

	  n = modn;
	}

      /* a <-- m11 A - m01 B (mod B^modn - 1) */
      mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
      mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);

      /* FIXME: Handle the small n case in some better way. */
      if (n + mn[1][1] < modn)
	MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
      if (n + mn[0][1] < modn)
	MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);

      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);

      ASSERT (mpn_zero_p (tp + nn, modn - nn));

      /* b <-- m00 B - m10 A (mod B^modn - 1); compute m10 A before
	 overwriting ap with the new a.  */
      mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
      MPN_COPY (ap, tp, nn);
      mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);

      if (n + mn[1][0] < modn)
	MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
      if (n + mn[0][0] < modn)
	MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);

      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);

      ASSERT (mpn_zero_p (tp + nn, modn - nn));
      MPN_COPY (bp, tp, nn);

      /* Normalize: drop common zero high limbs of the results.  */
      while ( (ap[nn-1] | bp[nn-1]) == 0)
	{
	  nn--;
	  ASSERT (nn > 0);
	}
    }
  TMP_FREE;

  return nn;
}

--
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.