Fast constant-time gcd computation and modular inversion

Niels Möller nisse at lysator.liu.se
Tue Aug 30 21:57:05 CEST 2022

```Torbjörn Granlund <tg at gmplib.org> writes:

> nisse at lysator.liu.se (Niels Möller) writes:
>
>     count = (49 * bits + 57) / 17;
>
> Odd.

For sure. This isn't based on local progress of the algorithm (there
ain't no guaranteed progress for a short sequence of reduction steps),
but based on rather complex analysis of the number of steps needed for

> I think your measurements are a bit optimistic, though.  My measruments
> from slightly modified timing code suggest 4.5x slowdown, and more for
> really small operands.

Maybe, I tried to keep the timing really simple and portable.

> This will still completely outclass the current sec code.

I've now implemented inverse too. See updated code below. When I run it,
I get

invert size 1, ref 0.293 (us), djb 1.145 (us)
invert size 2, ref 0.721 (us), djb 1.659 (us)
invert size 3, ref 0.994 (us), djb 2.375 (us)
[...]
invert size 17, ref 5.157 (us), djb 18.538 (us)
invert size 18, ref 5.207 (us), djb 19.064 (us)
invert size 19, ref 5.702 (us), djb 21.286 (us)

so a slowdown of 3--4 compared to mpz_invert. This timing excludes the
precomputation of a few needed per-modulo constants.

I haven't digged deeper into the performance, but I would expect that
the "steps" function is significantly faster than hgcd2, since it's
straight and simple code, with no unpredictable branches. But since
average progress for each computed transformation is less, we'll need a
larger number of iterations in the outer loop (and for side-channel
silence, we always go for the worst case, no data-dependent condition to
exit early as soon as g reaches 0). That implies that the constant for
the O(n^2) part is larger, even if the O(n) part might be same or smaller.

I think the inverse flavor is still rather neat, main loop is

for (delta = 1;count > STEPS;count -= STEPS)
{
delta = steps (&M, STEPS, delta, fp[0], gp[0]);
matrix_vector_mul (&M, STEPS, n+1, fp, gp, tp);
matrix_vector_mul_mod (&M, Bp, n+2, up, vp, tp);
}

I.e., process low limbs to get a (signed) transform matrix. Apply matrix
to numbers and cofactors. That's all. The case of large quotients, which
has been messy in all previous variants, is handled incrementally by the
"delta" counter, which will grow larger whenever we have to process a
larger number of trailing zeros, and in that case, we'll get more
progress sometimes later.

This version represents the cofactors too as two's complement. They are
reduced mod m for each matrix multiply, so they don't grow large, and
I'm thinking that it might be simpler and/or more efficient to instead
keep them in unsigned representation.

I'm thinking I'll try to implement and benchmark this for Nettle's ecc
functions first, before attempting to update GMP function. The reason is
that (i) I really want to do needed precomputations for all moduli of
interest for Nettle at compile time, which would complicate the GMP
interface if it is supported at all, and (ii) in some cases I want
inversion to operate on numbers in montgomery representation, and then
I'd like to fold any needed additional power of two into the precomputed
constant.

Regards,
/Niels

-----8<-------

/* Side channel silent gcd (work in progress)

This is is free software; you can redistribute it and/or modify
it under the terms of either:

Software Foundation; either version 3 of the License, or (at your
option) any later version.

or

Foundation; either version 2 of the License, or (at your option) any
later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <gmp.h>

#if GMP_NUMB_BITS < GMP_LIMB_BITS
#error Nails not supported.
#endif

#define BITCNT_BITS (sizeof(mp_bitcnt_t) * CHAR_BIT)

#define STEPS (GMP_LIMB_BITS - 2)

#define MAX(a,b) ((a) >= (b) ? (a) : (b))

struct matrix {
/* Matrix elements interpreted as signed two's complement. Absolute
value of elements is at most 2^STEPS. */
mp_limb_t a[2][2];
};

/* Conditionally set (a, b) <-- (b, -a) */
#define CND_NEG_SWAP_LIMB(cnd, a, b) do {\
mp_limb_t __cnd_sum = (a) + (b);	 \
mp_limb_t __cnd_diff = (a) - (b);	 \
(a) -= __cnd_diff & -(cnd);		 \
(b) -= __cnd_sum & -(cnd);		 \
} while (0)

/* Perform STEPS elementary reduction steps on (delta, f, g). In the
least significant STEPS bits of f, g matter. Note that mp_bitcnt_t
is an unsigned type, but is used as two's complement. */
static mp_bitcnt_t
steps(struct matrix *M, unsigned k, mp_bitcnt_t delta, mp_limb_t f, mp_limb_t g)
{
mp_limb_t a00, a01, a10, a11;
assert (f & 1);

/* Identity matrix. */
a00 = a11 = 1;
a01 = a10 = 0;

/* Preserve invariant (f ; g) = 2^{-i} M (f_orig, g_orig) */
for (; k-- > 0; delta++)
{
mp_limb_t odd = g & 1;
mp_limb_t swap = odd & ~(delta >> (BITCNT_BITS-1));

/* Swap; (f'; g') = (g; -f) = (0,1;-1,0) (g;f) */
CND_NEG_SWAP_LIMB(swap, f, g);
CND_NEG_SWAP_LIMB(swap, a00, a10);
CND_NEG_SWAP_LIMB(swap, a01, a11);

/* Conditional negation. */
delta = (delta ^ - (mp_bitcnt_t) swap) + swap;

/* Cancel low bit and shift. */
g += f & -odd;
a10 += a00 & -odd;
a11 += a01 & -odd;

g >>= 1;
a00 <<= 1;
a01 <<= 1;
}
M->a[0][0] = a00; M->a[0][1] = a01;
M->a[1][0] = a10; M->a[1][1] = a11;
return delta;
}

/* Set R = (u * F + v * G), treating all numbers as two's complement.
No overlap allowed. */
static mp_limb_t
mp_limb_t u, mp_limb_t v) {
mp_limb_t f_sign = fp[n-1] >> (GMP_LIMB_BITS - 1);
mp_limb_t g_sign = gp[n-1] >> (GMP_LIMB_BITS - 1);
mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1);
mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1);
mp_limb_t hi = mpn_mul_1 (rp, fp, n, u) - ((-f_sign) & u);
hi -= ((-u_sign) & fp[n-1]) + mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n-1);

hi += mpn_addmul_1 (rp, gp, n, v) - ((-g_sign) & v);
hi -= ((-v_sign) & gp[n-1]) + mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n-1);

return hi;
}

/* Update (f'; g') = M (f; g) / 2^{shift}, where all numbers are two's complement. */
static void
matrix_vector_mul (const struct matrix *M, unsigned shift,
mp_size_t n, mp_limb_t *fp, mp_limb_t *gp, mp_limb_t *tp)
{
mp_limb_t g_hi = add_add_mul (tp + n, fp, gp, n, M->a[1][0], M->a[1][1]);
mp_limb_t lo = mpn_rshift (fp, tp, n, shift);
assert (lo == 0);
fp[n-1] += (f_hi << (GMP_LIMB_BITS - shift));
lo = mpn_rshift (gp, tp + n, n, shift);
assert (lo == 0);
gp[n-1] += (g_hi << (GMP_LIMB_BITS - shift));
}

static void __attribute__((noreturn))
die (const char *msg)
{
fprintf(stderr, "%s\n", msg);
abort();
}

static mp_limb_t *
xalloc_limbs(mp_size_t n)
{
mp_limb_t *p = malloc(n * sizeof(*p));
if (!p)
die("Out of memory");

mpn_zero (p, n);
return p;
}

static void
cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n,
mp_limb_t* tp)
{
mpn_lshift (tp, ap, n, 1);
mpn_cnd_sub_n (cnd, rp, ap, tp, n);
}

static mp_bitcnt_t
iterations (mp_size_t n)
{
mp_bitcnt_t bits = n * GMP_LIMB_BITS;
assert (bits >= 46);
return (49 * bits + 57) / 17;
}

static void
djb_gcd (mpz_t g, const mpz_t a, const mpz_t b)
{
mp_size_t n = MAX(mpz_size(a), mpz_size(b));
mp_bitcnt_t count;
mp_limb_t *fp, *gp, *tp;
mp_bitcnt_t delta;
struct matrix M;

assert (mpz_odd_p (a));
count = iterations (n);

/* Extra limb needed for sign bit. */
fp = xalloc_limbs (n+1);
gp = xalloc_limbs (n+1);
tp = xalloc_limbs (2*(n+1));

mpn_copyi (fp, mpz_limbs_read (a), mpz_size (a));
mpn_copyi (gp, mpz_limbs_read (b), mpz_size (b));

for (delta = 1;count > STEPS;count -= STEPS)
{
delta = steps (&M, STEPS, delta, fp[0], gp[0]);
matrix_vector_mul (&M, STEPS, n+1, fp, gp, tp);
}
delta = steps (&M, count, delta, fp[0], gp[0]);
matrix_vector_mul (&M, count, n+1, fp, gp, tp);

assert (mpn_zero_p (gp, n+1));
cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), fp, fp, n+1, tp);
assert (fp[n] == 0);
mpn_copyi (mpz_limbs_write (g, n), fp, n);
mpz_limbs_finish (g, n);

free (fp);
free (gp);
free (tp);
}

struct invert_info {
/* B^{n+1} / 2 mod m, for reductions mod m, where n is the (unsigned) size of m. */
mp_limb_t *Bp;
/* 2^{-k} mod m */
mp_limb_t *ip;
};

void
invert_precompute (struct invert_info *info, const mpz_t m)
{
mp_size_t n = mpz_size(m);
mp_bitcnt_t k;
mpz_t t;

info->Bp = xalloc_limbs (n);
info->ip = xalloc_limbs (n);

mpz_init(t);
mpz_setbit (t, (n+1) * GMP_LIMB_BITS);
mpz_mod (t, t, m);
mpn_copyi (info->Bp, mpz_limbs_read (t), mpz_size (t));

k = iterations (n);
mpz_set_ui (t, 0);
mpz_setbit (t, k);
/* FIXME: Using mpz_invert is cheating. Instead, first compute m' =
m^-1 (mod 2^k) via Newton/Hensel. We can then get the inverse via

2^{-k} (mod m) = (2^k - m') * m + 1)/2^k. */
mpz_invert (t, t, m);
mpn_copyi (info->ip, mpz_limbs_read (t), mpz_size (t));

mpz_clear (t);
}

/* Update (u'; v') = M (u; v) (mod m), where all numbers (but Bp) are
two's complement. The precomputed constant Bp is n-2 limbs. */
static void
matrix_vector_mul_mod (const struct matrix *M, const mp_limb_t *Bp,
mp_size_t n, mp_limb_t *up, mp_limb_t *vp, mp_limb_t *tp)
{
mp_limb_t v_hi = add_add_mul (tp + n, up, vp, n, M->a[1][0], M->a[1][1]);

mp_limb_t u_sign = tp[n-1] >> (GMP_LIMB_BITS - 1);
mp_limb_t v_sign = tp[2*n-1] >> (GMP_LIMB_BITS - 1);

/* Products are expected to fit in n limbs, since row sums of the
matrix have absolute value <= B/4, while the most significant
limbs of u, v, interpreted as two's complement, have absolute
value < 2. */
assert (u_hi == - u_sign);
assert (v_hi == - v_sign);

up[n-2] = mpn_mul_1 (up, Bp, n-2, tp[n-1]);
u_hi = -mpn_cnd_sub_n (u_sign, up + 1, up + 1, Bp, n-2);
up[n-1] = u_hi + mpn_add_n (up, up, tp, n-1);

vp[n-2] = mpn_mul_1 (vp, Bp, n-2, tp[2*n-1]);
v_hi = -mpn_cnd_sub_n (v_sign, vp + 1, vp + 1, Bp, n-2);
vp[n-1] = v_hi + mpn_add_n (vp, vp, tp + n, n-1);
}

static int
one_p (const mp_limb_t *p, mp_size_t n)
{
mp_limb_t diff = p[0] ^ 1;
mp_size_t i;
for (i = 1; i < n; i++)
diff |= p[i];

return diff == 0;
}

/* Inverse of a modulo m, m odd */
static int
djb_invert(mpz_t r, const mpz_t a, const mpz_t m,
const mp_limb_t *Bp, const mp_limb_t *ip)
{
mp_size_t n = mpz_size(m);
mp_limb_t *fp, *gp, *up, *vp, *tp, *scratch;
mp_bitcnt_t count;
mp_bitcnt_t delta;
struct matrix M;
int success;

assert (mpz_size (a) <= n);
assert (mpz_odd_p (m));

count = iterations (n);

/* Extra limb needed for sign bit. */
fp = xalloc_limbs (n+1);
gp = xalloc_limbs (n+1);
/* Cofactors are represented with one extra limb and two bits, |u,v| < 2 B^{n+1} */
up = xalloc_limbs (n+2);
vp = xalloc_limbs (n+2);
tp = xalloc_limbs (2*(n+2));
scratch = xalloc_limbs (MAX(mpn_sec_mul_itch (n+2, n), mpn_sec_div_r_itch (2*n+2, n)));

mpn_copyi (fp, mpz_limbs_read (m), mpz_size (m));
mpn_copyi (gp, mpz_limbs_read (a), mpz_size (a));

/* Maintain invariant a * u = 2^i f (mod m), a * v = 2^i g (mod m) */
vp[0] = 1;

for (delta = 1;count > STEPS;count -= STEPS)
{
delta = steps (&M, STEPS, delta, fp[0], gp[0]);
matrix_vector_mul (&M, STEPS, n+1, fp, gp, tp);
matrix_vector_mul_mod (&M, Bp, n+2, up, vp, tp);
}
delta = steps (&M, count, delta, fp[0], gp[0]);
matrix_vector_mul (&M, count, n+1, fp, gp, tp);
matrix_vector_mul_mod (&M, Bp, n+2, up, vp, tp);

assert (mpn_zero_p (gp, n+1));

/* Now f = ±1 (if the inverse exists), and a * u = 2^k f (mod m) */
cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), up, up, n + 2, tp);
cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), fp, fp, n + 1, tp);

success = one_p (fp, n+1);

/* Make u non-negative */

mpn_sec_mul (tp, up, n+2, ip, n, scratch);
mpn_sec_div_r (tp, 2*n+2, mpz_limbs_read (m), n, scratch);

mpn_copyi (mpz_limbs_write (r, n), tp, n);
mpz_limbs_finish (r, n);

free (fp);
free (gp);
free (up);
free (vp);
free (tp);
free (scratch);

return success;
}

static void
test_gcd(gmp_randstate_t rands)
{
mp_size_t n;
mpz_t a, b, g, ref;

mpz_inits (a, b, g, ref, NULL);

for (n = 1; n < 20; n++)
{
unsigned count;
clock_t clocks_ref = 0;
clock_t clocks_djb = 0;
for (count = 0; count < 1000; count++)
{
clock_t time_start, time_ref, time_djb;
if (count & 1)
{
mpz_rrandomb (a, rands, GMP_LIMB_BITS * n);
mpz_rrandomb (b, rands, GMP_LIMB_BITS * n);
}
else
{
mpz_urandomb (a, rands, GMP_LIMB_BITS * n);
mpz_urandomb (b, rands, GMP_LIMB_BITS * n);
}
mpz_setbit (a, 0);

time_start = clock();
mpz_gcd (ref, a, b);
time_ref = clock();
djb_gcd (g, a, b);
time_djb = clock();

if (mpz_cmp (g, ref) != 0)
{
gmp_fprintf (stderr,
"size = %zd\na   = 0x%Zx\nb   = 0x%Zx\nref = 0x%Zx\ng   = 0x%Zx\n",
(size_t) n, a, b, ref, g);
die("Gcd failed");
}
clocks_ref += (time_ref - time_start);
clocks_djb += (time_djb - time_ref);
}
fprintf (stderr, "gcd size %zd, ref %.3f (us), djb %.3f (us)\n", (size_t) n,
clocks_ref * (1e6 / CLOCKS_PER_SEC) / count,
clocks_djb * (1e6 / CLOCKS_PER_SEC) / count);

}
mpz_clears (a, b, g, ref, NULL);
}

static void
test_invert(gmp_randstate_t rands)
{
mp_size_t n;
mpz_t a, m, x, ref;

mpz_inits (a, m, x, ref, NULL);
for (n = 1; n < 20; n++)
{
unsigned count, success_count;
clock_t clocks_ref = 0;
clock_t clocks_djb = 0;
for (count = 0, success_count = 0; count < 1000; count++)
{
struct invert_info info;
clock_t time_start, time_ref, time_djb;
int success, ref_success;

if (count & 1)
{
mpz_rrandomb (a, rands, GMP_LIMB_BITS * n);
mpz_rrandomb (m, rands, GMP_LIMB_BITS * n);
}
else
{
mpz_urandomb (a, rands, GMP_LIMB_BITS * n);
mpz_urandomb (m, rands, GMP_LIMB_BITS * n);
}
mpz_setbit (m, 0);

invert_precompute (&info, m);

time_start = clock();
ref_success = mpz_invert (ref, a, m);
time_ref = clock();
success = djb_invert (x, a, m, info.Bp, info.ip);
time_djb = clock();
if (ref_success != success || (success && mpz_cmp (x, ref) != 0))
{
gmp_fprintf (stderr,
"size = %zd ref_success = %d, success = %d\n"
"a   = 0x%Zx\nm   = 0x%Zx\nref = 0x%Zx\nx   = 0x%Zx\n",
(size_t) n, ref_success, success, a, m, ref, x);
die("Invert failed");
}
if (success)
{
success_count++;
clocks_ref += (time_ref - time_start);
clocks_djb += (time_djb - time_ref);
}
free (info.Bp);
free (info.ip);
}
fprintf (stderr, "invert size %zd, ref %.3f (us), djb %.3f (us)\n", (size_t) n,
clocks_ref * (1e6 / CLOCKS_PER_SEC) / count,
clocks_djb * (1e6 / CLOCKS_PER_SEC) / count);
}

mpz_clears (a, m, x, ref, NULL);
}

int
main (int argc, char **argv)
{
gmp_randstate_t rands;
gmp_randinit_default (rands);
test_gcd (rands);
test_invert (rands);
gmp_randclear (rands);
}

--
Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
Internet email is subject to wholesale government surveillance.
```