Small gcdext_1

Fri Oct 11 05:43:13 UTC 2019

nisse at lysator.liu.se (Niels Möller) writes:

> It turns out the explicit division is easily eliminated, v_div_g = s0 +
> s1 at the end of the binary loop (when u == v). And in the euclid loop,
> u == 0 implies v_div_g = s0.

Updated version below, without input_v and corresponding divisions. Also
one of the binary variants use binv. While debugging, I found that the
accumulated shift may well exceed GMP_LIMB_BITS (shouldn't be so
surprising), but I think it must be < 2 * GMP_LIMB_BITS. And there are a
few corner cases.

I guess it's possible to do all cofactor shifts as part of the loop, by
shifting only one bit per iteration, and making the subtraction
conditional on both nubmers being odd. That would be like a single-limb
variant of mpn_sec_invert (but with a stop condition to exit early).
Unclear if that can be a winner, with less progress per iteration in the
main loop. And it may produce a cofactor that's unnecessarily large in
the g > 1 case.

BTW, the binv/redc part uses an umulhi operation. Maybe we should add
that to longlong.h?

Regards,
/Niels

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* v must be odd. Returns d0 = g = gcd(u,v).
   If u = 0 (mod v), returns d1 = 0.
   Otherwise, returns d1 = (u/g)^-1 mod (v/g)
*/
static mp_double_limb_t
modinvert_euclid(mp_limb_t u, mp_limb_t v)
{
  mp_limb_t s0, s1;

  s0 = 1; s1 = 0;

  assert (v & 1);

  u %= v;

  /* Maintain

     U = t1 u + t0 v
     V = s1 u + s0 v = s1 (u - q*v) + s1 q v + s0 v = s1(u - qv) + (s0 + q s1) v

     where U, V are the inputs and the matrix has determinant 1.
     Inverting gives

     u =  s0 U - t0 V
     v = -s1 U + t1 V
  */
  for (;;)
    {
      mp_limb_t q;
      if (u == 0)
	{
	  mp_double_limb_t r;
	  r.d0 = v;
	  /* s0 == V / v */
	  if (s1 > 0)
	    s1 = s0 - s1;
	  r.d1 = s1;

	  return r;
	}

      q = v / u;
      v -= q*u;
      s1 += q*s0;

      if (v == 0)
	{
	  mp_double_limb_t r;
	  r.d0 = u;
	  r.d1 = s0;
	  return r;
	}
      q = u / v;
      u -= q*v;
      s0 += q*s1;
    }
}

static mp_double_limb_t
modinvert_binary (mp_limb_t u, mp_limb_t v)
{
  mp_limb_t s0, s1, v_div_g;
  mp_double_limb_t r;
  int shift;

  assert (v & 1);
  if (u == 0)
    {
      mp_double_limb_t r;
      r.d0 = v;
      r.d1 = 0;
      return r;
    }
  count_trailing_zeros (shift, u);
  u >>= shift;
  s1 = 0;
  s0 = 1;

  /* Maintain

     U = t1 u + t0 v
     V = s1 u + s0 v = [2^k s1 (u-v)/ 2^k + (s0 + s1) v]

     where U, V are the inputs and the matrix has determinant 2^{shift}.
  */

  while (u != v)
    {
      int count;
      if (u > v)
	{
	  u -= v;
	  count_trailing_zeros (count, u);
	  u >>= count;
	  s0 += s1;
	  s1 <<= count;
	  shift += count;
	}
      else
	{
	  v -= u;
	  count_trailing_zeros (count, v);
	  v >>= count;
	  s1 += s0;
	  s0 <<= count;
	  shift += count;
	}
    }
  /* 2^{shift} g = s0 U */
  v_div_g = s0 + s1;

  if (v_div_g == 1)
    /* u == 0 (mod v) */
    s0 = 0;
  else
    {
      for (; shift > 0; shift--)
	{
	  if (s0 & 1)
	    s0 = s0 / 2 + (v_div_g / 2) + 1;
	  else
	    s0 /= 2;
	}
    }

  r.d0 = v;
  r.d1 = s0;

  return r;
}

static mp_double_limb_t
modinvert_binary_mask (mp_limb_t u, mp_limb_t v)
{
  mp_limb_t s0, s1, v_div_g, sign;
  mp_double_limb_t r;
  int shift;

  assert (v & 1);
  if (u == 0)
    {
      mp_double_limb_t r;
      r.d0 = v;
      r.d1 = 0;
      return r;
    }
  count_trailing_zeros (shift, u);
  u >>= shift;
  s1 = 0;
  s0 = 1;

  /* Maintain

     U = t1 u + t0 v
     V = s1 u + s0 v = [2^k s1 (u-v)/ 2^k + (s0 + s1) v]

     where U, V are the inputs and the matrix has determinant 2^{shift}.
  */
  u >>= 1;
  v >>= 1;
  sign = 0;

  while (u != v)
    {
      int count;
      mp_limb_t d =  u - v;
      mp_limb_t vgtu = LIMB_HIGHBIT_TO_MASK (d);
      mp_limb_t sx;

      /* When v < u (vgtu == 0), the updates are:

	   (u; v)   <-- ( (u - v) >> count; v)    (det = +(1<<count) for corr. M factor)
	   (s1, s0) <-- (s1 << count, s0 + s1)

	 and when v > 0, the updates are

	   (u; v)   <-- ( (v - u) >> count; u)    (det = -(1<<count))
	   (s1, s0) <-- (s0 << count, s0 + s1)
      */

      /* v <-- min (u, v) */
      v += (vgtu & d);

      /* u <-- |u - v| */
      u = (d ^ vgtu) - vgtu;

      count_trailing_zeros (count, d);

      sign ^= vgtu;

      sx = vgtu & (s0 - s1);
      s0 += s1;
      s1 += sx;

      count++;
      u >>= count;
      s1 <<= count;
      shift += count;
    }
  v = (v<<1) | 1;

  /* 2^{shift} g = s0 U */
  v_div_g = s0 + s1;

  if (v_div_g == 1)
    /* u == 0 (mod v) */
    s0 = 0;
  else
    {
      mp_limb_t vinv, q, p1, p0;
      assert (s0 > 0);
      assert (shift > 0);
      if (shift >= GMP_LIMB_BITS) 
	{
	  binvert_limb (vinv, v_div_g);
	  q = -s0 * vinv;
	  umul_ppmm (p1, p0, q, v_div_g);
	  s0 = p1 + 1;
	  shift -= GMP_LIMB_BITS;
	  if (shift == 0)
	    goto done;
	}
      else
	{
	  int vinv_bits;

	  vinv = binvert_limb_table[(v_div_g/2) & 0x7f];
	  for (vinv_bits = 8; vinv_bits < shift; vinv_bits *= 2)
	    vinv = 2*vinv - vinv * vinv * v_div_g;
	}

      q = (-s0 * vinv) << (GMP_LIMB_BITS - shift);
      umul_ppmm (p1, p0, q, v_div_g);

      assert ((s0 << (GMP_LIMB_BITS - shift)) + p0 == 0);
      s0 = (s0 >> shift) + p1 + (q > 0);
    done:
      s0 = (sign & (v_div_g + 1)) + (sign ^ s0);
    }

  r.d0 = v;
  r.d1 = s0;

  return r;
}

static int
modinvert_validate (mp_limb_t u, mp_limb_t v, mp_limb_t g, mp_limb_t s)
{
  mp_limb_t p1, p0, q, r;
  u %= v;
  if (u == 0)
    return g == v && s == 0;
  if (u % g || v %g)
    return 0;

  u /= g;
  v /= g;
  if (s >= v)
    return 0;

  umul_ppmm (p1, p0, s, u);
  udiv_qrnnd (q, r, p1, p0, v);
  return r == 1;
}

#define COUNT 1000000
int
main (int argc, char **argv)
{
  gmp_randstate_t rands;
  int i;
  gmp_randinit_default (rands);
  for (i = 0; i < COUNT; i++)
    {
      unsigned u_bits = 1 + gmp_urandomm_ui (rands, GMP_NUMB_BITS);
      unsigned v_bits = 1 + gmp_urandomm_ui (rands, GMP_NUMB_BITS);
      mp_limb_t u = gmp_urandomb_ui(rands, u_bits);
      mp_limb_t v = gmp_urandomb_ui(rands, v_bits) | 1;
      mp_double_limb_t ref, r;
      ref = modinvert_euclid (u, v);
      if (!modinvert_validate (u, v, ref.d0, ref.d1))
	{
	  gmp_printf ("modinvert_euclid failed: u = %Mx v = %Mx\n"
		      "  g = %Mx, s = %Mx\n", u, v, ref.d0, ref.d1);
	  exit(EXIT_FAILURE);
	}
      r = modinvert_binary (u,v);
      if (r.d0 != ref.d0 || r.d1 != ref.d1)
	{
	  gmp_printf ("modinvert_binary failed: u = %Mx v = %Mx\n"
		      "  g = %Mx, s = %Mx\n", u, v, r.d0, r.d1);
	  exit(EXIT_FAILURE);
	}
      r = modinvert_binary_mask (u,v);
      if (r.d0 != ref.d0 || r.d1 != ref.d1)
	{
	  gmp_printf ("modinvert_binary_mask failed: u = %Mx v = %Mx\n"
		      "  g = %Mx, s = %Mx\n", u, v, r.d0, r.d1);
	  exit(EXIT_FAILURE);
	}
    }
  return EXIT_SUCCESS;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.