hgcd1/2

Sun Sep 1 08:39:43 UTC 2019

I've been plaing a bit with table based hgcd2, but to keep things
simple, I started with hgcd1 (reducing full limbs to half limbs, with an
associated matrix of half limb elements). Complete code below, the main
loop is

  for (;;) 
    {
      mp_limb_t a_hi;
      mp_limb_t b_hi;
      mp_limb_t q;
      int c;
      if (a < b) 
	{
	  MP_LIMB_T_SWAP (a,b);
	  MP_LIMB_T_SWAP (u00, u01);
	  MP_LIMB_T_SWAP (u10, u11);
	  swapped ^= 1;
	}
      count_leading_zeros (c, a);
      assert (c + TAB_BITS <= GMP_LIMB_BITS);
      a_hi = a >> (GMP_LIMB_BITS - TAB_BITS - c);
      b_hi = (b >> (GMP_LIMB_BITS - TAB_BITS - c));
      if (UNLIKELY(b_hi == 0))
	{
	  q = a / b;
	}
      else 
	q = q_tab[b_hi - 1][a_hi - (1U << (TAB_BITS - 1))];

      assert (q > 0);
      assert (a >= q*b);
      a -= q*b;
      u01 += q*u00;
      u11 += q*u10;
      if (a < REDUCED_LIMIT)
	{
	  u01 -= u00;
	  u11 -= u10;
	  break;
	}      
    }

I think it's promising, and also a bit simpler than the current div1
code. Some concerns:

1. I think the approximate q means that we will often reduce from a > b
   to a close to but larger than b, so we need two iterations to get to
   a < b. Might improve progress to add some kind of adjustment step
   after a - q*b, checking either if result >= b or < 0.

2. There are diferent options for the UNLIKELY branch. One could count
   leading zeros of b, shift, and look up a quotient, and apply

   a -= q * b << k;

   for suitable q and k. Likely cheaper than a division, but we'll then
   do multiple iterations if size difference is large (which I think is
   ok).

3. We get poor accuracy for q when b_hi == 1. One could make that less
   likely by using a somewhat asymmetric table, say looking up 3 bits of
   a but 6 bits of b.

4. One could also go in the other direction and do something closer to
   left-to-right binary, and in each step always use q = 2^k for
   suitable k, based on count_leading_zeros on both a and b.

Regards,
/Niels

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#define REDUCED_BITS (GMP_NUMB_BITS / 2 + 1)
#define REDUCED_LIMIT ((mp_limb_t) 1 << (GMP_NUMB_BITS / 2 + 1))
#define MATRIX_LIMIT ((mp_limb_t) 1 << (GMP_NUMB_BITS / 2 - 1))

#define TAB_BITS 4
#define ABS_DIFF(a,b) ((a < b) ? b - a : a - b)

static int 
hgcd1_ref(struct hgcd_matrix1 *m, mp_limb_t a, mp_limb_t b) 
{
  mp_limb_t u00, u01, u10, u11;

  if (a < REDUCED_LIMIT || b < REDUCED_LIMIT || ABS_DIFF(a,b) < REDUCED_LIMIT)
    return 0;

  u00 = u11 = 1;
  u01 = u10 = 0;

  if (a < b)
    goto a_less_b;
  for (;;)
    {
      mp_limb_t q;

      assert(a > b);
      q = a / b;
      a -= q*b;
      u01 += q*u00;
      u11 += q*u10;
      if (a < REDUCED_LIMIT)
	{
	  u01 -= u00;
	  u11 -= u10;
	  break;
	}      
    a_less_b:
      assert (a < b);
      q = b / a;
      b -= q * a;
      u00 += q * u01;
      u10 += q * u11;

      if (b < REDUCED_LIMIT)
	{
	  u00 -= u01;
	  u10 -= u11;
	  break;
	}
    }
  m->u[0][0] = u00;
  m->u[0][1] = u01;
  m->u[1][0] = u10;
  m->u[1][1] = u11;
  return 1;
}

#if TAB_BITS == 3
/* 1,a1,a0.xxx / b2,b1,b0.xxx > 1,a1,a0 / b2,b1,b0 + 1 
 * Need to round b up, except when a_hi == b_hi, since we always have a > b.
 */
unsigned char q_tab[7][4] = {
  /* a    = 4, 5, 6, 7
   /  b */
  { /* 1 */ 2, 2, 3, 3 },
  { /* 2 */ 1, 1, 2, 2 },
  { /* 3 */ 1, 1, 1, 1 },
  { /* 4 */ 1, 1, 1, 1 },
  { /* 5 */ 0, 1, 1, 1 },
  { /* 6 */ 0, 0, 1, 1 },
  { /* 7 */ 0, 0, 0, 1 },
};
#elif TAB_BITS == 4
unsigned char q_tab[15][8] = {
  /* a    = 8, 9,10,11,12,13,14,15
   /  b */
  { /* 1 */ 4, 4, 5, 5, 6, 6, 7, 7 },
  { /* 2 */ 2, 3, 3, 3, 4, 4, 4, 5 },
  { /* 3 */ 2, 2, 2, 2, 3, 3, 3, 3 },
  { /* 4 */ 1, 1, 2, 2, 2, 2, 2, 3 },
  { /* 5 */ 1, 1, 1, 1, 2, 2, 2, 2 },
  { /* 6 */ 1, 1, 1, 1, 1, 1, 2, 2 },
  { /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1 },
  { /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1 },
  { /* 9 */ 0, 1, 1, 1, 1, 1, 1, 1 },
  { /*10 */ 0, 0, 1, 1, 1, 1, 1, 1 },
  { /*11 */ 0, 0, 0, 1, 1, 1, 1, 1 },
  { /*12 */ 0, 0, 0, 0, 1, 1, 1, 1 },
  { /*13 */ 0, 0, 0, 0, 0, 1, 1, 1 },
  { /*14 */ 0, 0, 0, 0, 0, 0, 1, 1 },
  { /*15 */ 0, 0, 0, 0, 0, 0, 0, 1 },
};
#else
#error unsupported table size
#endif

static int 
hgcd1_tab(struct hgcd_matrix1 *m, mp_limb_t a, mp_limb_t b) 
{
  mp_limb_t u00, u01, u10, u11;
  int swapped;
  if (a < REDUCED_LIMIT || b < REDUCED_LIMIT || ABS_DIFF(a,b) < REDUCED_LIMIT)
    return 0;

  u00 = u11 = 1;
  u01 = u10 = 0;

  swapped = 0;
  for (;;) 
    {
      mp_limb_t a_hi;
      mp_limb_t b_hi;
      mp_limb_t q;
      int c;
      if (a < b) 
	{
	  MP_LIMB_T_SWAP (a,b);
	  MP_LIMB_T_SWAP (u00, u01);
	  MP_LIMB_T_SWAP (u10, u11);
	  swapped ^= 1;
	}
      count_leading_zeros (c, a);
      assert (c + TAB_BITS <= GMP_LIMB_BITS);
      a_hi = a >> (GMP_LIMB_BITS - TAB_BITS - c);
      b_hi = (b >> (GMP_LIMB_BITS - TAB_BITS - c));
      if (UNLIKELY(b_hi == 0))
	{
	  q = a / b;
	}
      else 
	q = q_tab[b_hi - 1][a_hi - (1U << (TAB_BITS - 1))];

      assert (q > 0);
      assert (a >= q*b);
      a -= q*b;
      u01 += q*u00;
      u11 += q*u10;
      if (a < REDUCED_LIMIT)
	{
	  u01 -= u00;
	  u11 -= u10;
	  break;
	}      
    }
  if (swapped) 
    {
      MP_LIMB_T_SWAP (u00, u01);
      MP_LIMB_T_SWAP (u10, u11);
    }
  m->u[0][0] = u00;
  m->u[0][1] = u01;
  m->u[1][0] = u10;
  m->u[1][1] = u11;
  return 1;
}

static int
hgcd1_valid (const struct hgcd_matrix1 *m, mp_limb_t a, mp_limb_t b)
{
  mp_limb_t u00, u01, u10, u11;
  mp_limb_t s0, s1, t0, t1;
  mp_limb_t alpha, beta;
  u00 = m->u[0][0];
  u01 = m->u[0][1];
  u10 = m->u[1][0];
  u11 = m->u[1][1];
  if (u00 >= MATRIX_LIMIT
      || u01 >= MATRIX_LIMIT
      || u10 >= MATRIX_LIMIT
      || u11 >= MATRIX_LIMIT)
    return 0;

  /* M^-1 = (u11, -u01; -u10,u00) */
  umul_ppmm (s1, s0, u11, a);
  umul_ppmm (t1, t0, u01, b);
  if (t1 > s1 || (t1 == s1 && t0 > s0))
    return 0;
  sub_ddmmss (s1, alpha, s1, s0, t1, t0);
  if (s1 != 0)
    return 0;
  if (alpha < REDUCED_LIMIT)
    return 0;

  umul_ppmm (s1, s0, u00, b);
  umul_ppmm (t1, t0, u10, a);
  if (t1 > s1 || (t1 == s1 && t0 > s0))
    return 0;
  sub_ddmmss (s1, beta, s1, s0, t1, t0);
  if (s1 != 0)
    return 0;
  if (beta < REDUCED_LIMIT)
    return 0;

  return ABS_DIFF(alpha, beta) < REDUCED_LIMIT;
}

int main (int argc, char **argv)
{
  gmp_randstate_t rands;
  unsigned i;
  gmp_randinit_default (rands);
  gmp_randseed_ui (rands, 17);

  for (i = 0; i < 10000; i++) 
    {
      mp_limb_t a, b;
      struct hgcd_matrix1 m;
      a = gmp_urandomb_ui (rands, GMP_NUMB_BITS);
      b = gmp_urandomb_ui (rands, GMP_NUMB_BITS);
      if (hgcd1_ref (&m, a, b) && !hgcd1_valid (&m, a, b))
	{
	  gmp_printf("hgcd1 failed: i = %u, a = %Mx b = %Mx\n"
		     "  M = (%Mx, %Mx;%Mx, %Mx)\n",
		     i, a, b, m.u[0][0], m.u[0][1], m.u[1][0], m.u[1][1]);
	  abort();
	}
      if (hgcd1_tab (&m, a, b) && !hgcd1_valid (&m, a, b))
	{
	  gmp_printf("hgcd1 failed: i = %u, a = %Mx b = %Mx\n"
		     "  M = (%Mx, %Mx;%Mx, %Mx)\n",
		     i, a, b, m.u[0][0], m.u[0][1], m.u[1][0], m.u[1][1]);
	  abort();
	}
    }
  return 0;  
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.