subquadratic mpn_hgcd_appr
Niels Möller
nisse at lysator.liu.se
Tue Oct 11 14:17:21 CEST 2011
nisse at lysator.liu.se (Niels Möller) writes:
> So one should really benchmark hgcd + hgcd_adjust vs hgcd_appr +
> hgcd_matrix_apply, to see which one is fastest, and then *both*
> functions should use the fastest variant for each size. The theory is
> that hgcd_appr ought to win for large enough sizes.
I wrote the function below (any suggestions for a better name?):
/* Performs one reduction step on (a, b) of size n, based on an hgcd
   computation on the high n - p limbs. Returns the new size, or 0 if
   no reduction was possible. */
mp_size_t
mpn_hgcd_big_step (struct hgcd_matrix *M,
		   mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t p,
		   mp_ptr tp)
{
  mp_size_t nn;

  if (BELOW_THRESHOLD (n, HGCD_BIG_STEP_THRESHOLD))
    {
      /* Exact hgcd on the high part, then back-substitute. */
      nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
      if (nn > 0)
	/* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
	   = 2 (n - 1) */
	return mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
    }
  else
    {
      /* Approximate hgcd on a copy of the high part, then apply the
	 inverse matrix to the full numbers. */
      MPN_COPY (tp, ap + p, n - p);
      MPN_COPY (tp + n - p, bp + p, n - p);
      if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p)))
	return hgcd_matrix_apply (M, ap, bp, n);
    }
  return 0;
}
I then use this for the first recursive call in mpn_hgcd and
mpn_hgcd_appr (it could also be used for the second call in mpn_hgcd,
but that might need more scratch space, and the tuning might be a bit
different).
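Roughly, the change at the first recursive call looks as follows
(sketch only; the surrounding hgcd.c details are omitted and the
actual code may differ slightly):

  /* Before: recursive hgcd on the high part, then adjust. */
  p = n/2;
  nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
  if (nn > 0)
    n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);

  /* After: the big step encapsulates the recursion and the
     back-substitution, choosing hgcd or hgcd_appr by size. */
  p = n/2;
  nn = mpn_hgcd_big_step (M, ap, bp, n, p, tp);
  if (nn > 0)
    n = nn;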
I tried some tuning; HGCD_BIG_STEP_THRESHOLD seems to be around 1000
limbs, with HGCD_THRESHOLD at 117 limbs and HGCD_APPR_THRESHOLD at 161
limbs. The saving appears to be about 8% at large sizes.
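The new threshold would be wired up following the usual gmp-impl.h
conventions, something like this (sketch; 1000 is just the measured
ballpark):

/* Sketch, following the usual threshold conventions; the default is
   the ballpark value from the tuning runs above. */
#ifndef HGCD_BIG_STEP_THRESHOLD
#define HGCD_BIG_STEP_THRESHOLD 1000
#endif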
Below I also append the hgcd_matrix_apply function. I wonder if it would
be useful to make this function use other strategies for smaller n.
Alternatives:
1. mullo (typical size 3n/4 x n/4, so it would really be one n/2 x n/4
regular multiplication and one n/4 mullo; see the sketch after this
list).
2. mulmid (typical size also 3n/4 x n/4). This would be useful when
called from hgcd_appr, but not when called from hgcd, because in the
latter case the low limbs are needed too.
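To make alternative 1 concrete, here is a minimal sketch (mul_lo_3k is
a hypothetical helper, named for illustration only, with k = n/4 and k
limbs of scratch at tp):

/* Sketch: low 3k limbs of (3k-limb u) * (k-limb q), split as one
   2k x k full multiplication plus one k x k mullo. Hypothetical
   helper, for illustration only. */
static void
mul_lo_3k (mp_ptr rp, mp_srcptr up, mp_srcptr qp, mp_size_t k, mp_ptr tp)
{
  /* Full 2k x k product of the low part; writes rp[0 .. 3k-1]. */
  mpn_mul (rp, up, 2*k, qp, k);
  /* Low k limbs of u[2k .. 3k-1] * q. */
  mpn_mullo_n (tp, up + 2*k, qp, k);
  /* Add in at offset 2k; the carry out falls outside the low 3k
     limbs and is discarded. */
  mpn_add_n (rp + 2*k, rp + 2*k, tp, k);
}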
The current strategy uses n x n/4 multiplication mod (B^{3n/4} - 1).
Since the exact results m11 A - m01 B and m00 B - m10 A both fit in
about 3n/4 limbs, nothing is lost to the wraparound. Maybe
mpn_mulmod_bnm1 could be better optimized for such unbalanced
operands, or maybe one can't get any larger saving from wraparound.
Comments appreciated.
Regards,
/Niels
/* Computes R -= A * B. Result must be non-negative. Normalized down
   to size an, and resulting size is returned. */
static mp_size_t
submul (mp_ptr rp, mp_size_t rn,
	mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
{ [...] }
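The elided body is just the obvious multiply-and-subtract; a sketch of
what it looks like (not necessarily the exact code):

/* Sketch of the elided submul body: compute the product, subtract it,
   then normalize down to (at least) an limbs. */
{
  mp_ptr tp;
  TMP_DECL;

  ASSERT (rn >= an);
  ASSERT (an >= bn);
  ASSERT (an + bn <= rn + 1);

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (an + bn);

  mpn_mul (tp, ap, an, bp, bn);
  /* The product can be one limb longer than R; then its top limb must
     be zero, since the result is non-negative. */
  ASSERT (an + bn <= rn || tp[rn] == 0);
  ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn - (an + bn > rn)));
  TMP_FREE;

  while (rn > an && rp[rn-1] == 0)
    rn--;

  return rn;
}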
/* Computes (a, b) <-- M^{-1} (a; b) */

/* FIXME:
   x Take scratch parameter, and figure out scratch need.

   x Use some fallback for small M->n?
*/
static mp_size_t
hgcd_matrix_apply (const struct hgcd_matrix *M,
		   mp_ptr ap, mp_ptr bp, mp_size_t n)
{
  mp_size_t an, bn, un, vn, nn;
  mp_size_t mn[2][2];
  mp_size_t modn;
  mp_ptr tp, sp, scratch;
  mp_limb_t cy;
  unsigned i, j;

  TMP_DECL;

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  bn = n;
  MPN_NORMALIZE (bp, bn);

  for (i = 0; i < 2; i++)
    for (j = 0; j < 2; j++)
      {
	mp_size_t k;
	k = M->n;
	MPN_NORMALIZE (M->p[i][j], k);
	mn[i][j] = k;
      }

  ASSERT (mn[0][0] > 0);
  ASSERT (mn[1][1] > 0);
  ASSERT ( (mn[0][1] | mn[1][0]) > 0);

  TMP_MARK;

  if (mn[0][1] == 0)
    {
      /* A unchanged, M = (1, 0; q, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put B <-- B - q A */
      nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
    }
  else if (mn[1][0] == 0)
    {
      /* B unchanged, M = (1, q; 0, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put A <-- A - q * B */
      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
    }
  else
    {
      /* A = m00 a + m01 b  ==>  a <= A / m00, b <= A / m01.
	 B = m10 a + m11 b  ==>  a <= B / m10, b <= B / m11. */
      un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
      vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;

      nn = MAX (un, vn);

      /* In the range of interest, mulmod_bnm1 should always beat mullo. */
      modn = mpn_mulmod_bnm1_next_size (nn + 1);

      scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
      tp = TMP_ALLOC_LIMBS (modn);
      sp = TMP_ALLOC_LIMBS (modn);

      ASSERT (n <= 2*modn);

      if (n > modn)
	{
	  /* Fold A and B mod B^modn - 1, to get operands of size modn. */
	  cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
	  MPN_INCR_U (ap, modn, cy);

	  cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
	  MPN_INCR_U (bp, modn, cy);

	  n = modn;
	}

      mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
      mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);

      /* FIXME: Handle the small n case in some better way. */
      if (n + mn[1][1] < modn)
	MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
      if (n + mn[0][1] < modn)
	MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);

      /* a <-- m11 A - m01 B; the exact value fits in nn <= modn limbs,
	 so it is recovered exactly despite the wraparound. */
      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);
      ASSERT (mpn_zero_p (tp + nn, modn - nn));

      mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
      MPN_COPY (ap, tp, nn);
      mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);

      if (n + mn[1][0] < modn)
	MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
      if (n + mn[0][0] < modn)
	MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);

      /* b <-- m00 B - m10 A, by the same wraparound argument. */
      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);
      ASSERT (mpn_zero_p (tp + nn, modn - nn));

      MPN_COPY (bp, tp, nn);

      while ( (ap[nn-1] | bp[nn-1]) == 0)
	{
	  nn--;
	  ASSERT (nn > 0);
	}
    }
  TMP_FREE;
  return nn;
}
--
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.