Fast mpz_bin_ui

Thu Oct 5 22:25:37 UTC 2017

Hello GMP developers,

The current mpz_bin_ui is not asymptotically fast. I propose replacing it
with a call to mpz_bin_uiui when n fits one word (as already suggested in a
code comment) and a plain divide and conquer product otherwise.

Sample code implementing a divide and conquer product is included at the
end of this mail. The following table gives the speedup ratio (measured on
a Haswell) over the current mpz_bin_ui for computing bin(2^e+1,k), with
increments e *= 2, k = max(k+1, k*1.9) subject to e*k < 1e7. The table
starts at e = 64 since e < 64 would be in the mpz_bin_uiui range.

e / k  1 2 3 5 9 17 32 60 114 216 410 779 1480 2812 5342 10149 19283 36637
69610 132259
64 | 17 2 1.1 1.2 1.1 0.93 0.84 0.8 0.84 1 1.3 1.8 2.3 3.2 4.5 7 12 17 32
50
128 | 31 3.4 1.1 1 1.1 1.1 0.86 0.96 1.1 1.4 1.8 2.4 3.3 4.6 6.4 11 18 28
46 -
256 | 33 3.3 1 1.1 1.1 1 0.96 1.1 1.4 1.9 2.8 3.8 5.6 7.3 10 17 28 43 - -
512 | 22 2.2 1 1.1 1.1 1.2 1.1 1.2 1.6 2.2 2.9 4.4 6.2 9.9 15 27 46 - - -
1024 | 27 2.2 1 1.2 1.3 1.5 1.3 1.4 1.7 2.4 3.3 4.9 7.1 12 18 - - - - -
2048 | 29 2.1 1.1 1.4 1.7 2 1.8 1.7 1.9 2.5 3.7 5.6 8.3 15 - - - - - -
4096 | 34 1.8 1.1 1.4 1.6 2.2 1.9 1.8 2 2.8 4.1 6.3 9.9 - - - - - - -
8192 | 37 1.7 1.1 1.4 1.8 2.3 2 1.9 2.2 3.2 4.5 7.4 - - - - - - - -
16384 | 31 1.7 1.1 1.4 1.8 2.4 2.1 2 2.1 3.2 4.9 - - - - - - - - -
32768 | 30 1.6 1.1 1.4 1.9 2.5 2.2 2.1 2.1 3 - - - - - - - - - -
65536 | 30 1.5 1.1 1.4 2 2.8 2.5 2.3 2.2 - - - - - - - - - - -
131072 | 23 1.5 1.1 1.5 2 3.5 2.6 2.3 - - - - - - - - - - - -
262144 | 14 1.4 1.1 1.5 1.9 3 2.5 - - - - - - - - - - - - -
524288 | 15 1.6 1.1 1.5 2 3 - - - - - - - - - - - - - -
1048576 | 12 1.5 1.1 1.5 2 - - - - - - - - - - - - - - -

The divide and conquer product is consistently faster than the current
mpz_bin_ui, except for a minor slowdown (<20%) for a limited range of k
when n is small (2-4 limbs). It might be possible to avoid the minor
slowdown with low level hacks. The only trick I did implement is to
precompute n^2 for large n which is slightly faster (but it is slightly
slower for small n, at least with this code).

Related to this, it would be useful to have a public function that computes
rising factorials (say mpz_rising_ui). Then mpz_bin_ui could be implemented
very simply by calling either mpz_bin_uiui or
mpz_rising_ui+mpz_fac_ui+mpz_divexact.

I'm not planning to submit a patch myself, but feel free to steal this
(trivial) code. Or better yet do something even more clever (however,
mpz_bin_ui for multi-limb n is probably not worth the effort to optimize as
much as mpz_bin_uiui...).

Fredrik

static void
prod1(mpz_t r, mpz_t t, mpz_srcptr n, unsigned long a, unsigned long b)
{
    if (b - a <= 50)
    {
        mpz_sub_ui(r, n, a);
        a++;
        while (a < b)
        {
            mpz_sub_ui(t, n, a);
            mpz_mul(r, r, t);
            a++;
        }
    }
    else
    {
        mpz_t u;
        mpz_init(u);
        prod1(r, t, n, a, a + (b - a) / 2);
        prod1(u, t, n, a + (b - a) / 2, b);
        mpz_mul(r, r, u);
        mpz_clear(u);
    }
}

static void
prod2(mpz_t r, mpz_t t, mpz_srcptr n, mpz_srcptr n2,
    unsigned long a, unsigned long b)
{
    if (b - a <= 5)
    {
        if ((b - a) % 2)
        {
            mpz_sub_ui(r, n, a);
            a += 1;
        }
        else
        {
            mpz_add_ui(r, n2, a * (a + 1));
            mpz_submul_ui(r, n, 2 * a + 1);
            a += 2;
        }

        while (a < b)
        {
            mpz_add_ui(t, n2, a * (a + 1));
            mpz_submul_ui(t, n, 2 * a + 1);
            mpz_mul(r, r, t);
            a += 2;
        }
    }
    else
    {
        mpz_t u;
        mpz_init(u);
        prod2(r, t, n, n2, a, a + (b - a) / 2);
        prod2(u, t, n, n2, a + (b - a) / 2, b);
        mpz_mul(r, r, u);
        mpz_clear(u);
    }
}

void
mpz_bin_ui2(mpz_t r, mpz_srcptr n, unsigned long k)
{
    if (k == 0)
    {
        mpz_set_ui(r, 1);
    }
    else if (k == 1)
    {
        mpz_set(r, n);
    }
    else if (k == 2)
    {
        mpz_t t;
        mpz_init(t);
        mpz_mul(t, n, n);
        mpz_sub(r, t, n);
        mpz_tdiv_q_2exp(r, r, 1);
        mpz_clear(t);
    }
    else
    {
        mpz_t t, u, v;
        mpz_init(t);
        mpz_init(u);

        if (mpz_sizeinbase(n, 2) < 256)
        {
            prod1(t, u, n, 0, k);
        }
        else
        {
            mpz_init(v);
            mpz_mul(v, n, n);
            prod2(t, u, n, v, 0, k);
            mpz_clear(v);
        }
        mpz_fac_ui(u, k);
        mpz_divexact(r, t, u);
        mpz_clear(t);
        mpz_clear(u);
    }
}

(And yes, there is a small bug in this code: a * (a + 1) can overflow if k
is very large, which is not too hard to fix.)