# hgcd1/2

Niels Möller nisse at lysator.liu.se
Sun Sep 1 08:39:43 UTC 2019

```I've been plaing a bit with table based hgcd2, but to keep things
simple, I started with hgcd1 (reducing full limbs to half limbs, with an
associated matrix of half limb elements). Complete code below, the main
loop is

for (;;)
{
mp_limb_t a_hi;
mp_limb_t b_hi;
mp_limb_t q;
int c;
if (a < b)
{
MP_LIMB_T_SWAP (a,b);
MP_LIMB_T_SWAP (u00, u01);
MP_LIMB_T_SWAP (u10, u11);
swapped ^= 1;
}
assert (c + TAB_BITS <= GMP_LIMB_BITS);
a_hi = a >> (GMP_LIMB_BITS - TAB_BITS - c);
b_hi = (b >> (GMP_LIMB_BITS - TAB_BITS - c));
if (UNLIKELY(b_hi == 0))
{
q = a / b;
}
else
q = q_tab[b_hi - 1][a_hi - (1U << (TAB_BITS - 1))];

assert (q > 0);
assert (a >= q*b);
a -= q*b;
u01 += q*u00;
u11 += q*u10;
if (a < REDUCED_LIMIT)
{
u01 -= u00;
u11 -= u10;
break;
}
}

I think it's promising, and also a bit simpler than the current div1
code. Some concerns:

1. I think the approximate q means that we will often reduce from a > b
to a close to but larger than b, so we need two iterations to get to
a < b. Might improve progress to add some kind of adjustment step
after a - q*b, checking either if result >= b or < 0.

2. There are diferent options for the UNLIKELY branch. One could count
leading zeros of b, shift, and look up a quotient, and apply

a -= q * b << k;

for suitable q and k. Likely cheaper than a division, but we'll then
do multiple iterations if size difference is large (which I think is
ok).

3. We get poor accuracy for q when b_hi == 1. One could make that less
likely by using a somewhat asymmetric table, say looking up 3 bits of
a but 6 bits of b.

4. One could also go in the other direction and do something closer to
left-to-right binary, and in each step always use q = 2^k for
suitable k, based on count_leading_zeros on both a and b.

Regards,
/Niels

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#define REDUCED_BITS (GMP_NUMB_BITS / 2 + 1)
#define REDUCED_LIMIT ((mp_limb_t) 1 << (GMP_NUMB_BITS / 2 + 1))
#define MATRIX_LIMIT ((mp_limb_t) 1 << (GMP_NUMB_BITS / 2 - 1))

#define TAB_BITS 4
#define ABS_DIFF(a,b) ((a < b) ? b - a : a - b)

static int
hgcd1_ref(struct hgcd_matrix1 *m, mp_limb_t a, mp_limb_t b)
{
mp_limb_t u00, u01, u10, u11;

if (a < REDUCED_LIMIT || b < REDUCED_LIMIT || ABS_DIFF(a,b) < REDUCED_LIMIT)
return 0;

u00 = u11 = 1;
u01 = u10 = 0;

if (a < b)
goto a_less_b;
for (;;)
{
mp_limb_t q;

assert(a > b);
q = a / b;
a -= q*b;
u01 += q*u00;
u11 += q*u10;
if (a < REDUCED_LIMIT)
{
u01 -= u00;
u11 -= u10;
break;
}
a_less_b:
assert (a < b);
q = b / a;
b -= q * a;
u00 += q * u01;
u10 += q * u11;

if (b < REDUCED_LIMIT)
{
u00 -= u01;
u10 -= u11;
break;
}
}
m->u = u00;
m->u = u01;
m->u = u10;
m->u = u11;
return 1;
}

#if TAB_BITS == 3
/* 1,a1,a0.xxx / b2,b1,b0.xxx > 1,a1,a0 / b2,b1,b0 + 1
* Need to round b up, except when a_hi == b_hi, since we always have a > b.
*/
unsigned char q_tab = {
/* a    = 4, 5, 6, 7
/  b */
{ /* 1 */ 2, 2, 3, 3 },
{ /* 2 */ 1, 1, 2, 2 },
{ /* 3 */ 1, 1, 1, 1 },
{ /* 4 */ 1, 1, 1, 1 },
{ /* 5 */ 0, 1, 1, 1 },
{ /* 6 */ 0, 0, 1, 1 },
{ /* 7 */ 0, 0, 0, 1 },
};
#elif TAB_BITS == 4
unsigned char q_tab = {
/* a    = 8, 9,10,11,12,13,14,15
/  b */
{ /* 1 */ 4, 4, 5, 5, 6, 6, 7, 7 },
{ /* 2 */ 2, 3, 3, 3, 4, 4, 4, 5 },
{ /* 3 */ 2, 2, 2, 2, 3, 3, 3, 3 },
{ /* 4 */ 1, 1, 2, 2, 2, 2, 2, 3 },
{ /* 5 */ 1, 1, 1, 1, 2, 2, 2, 2 },
{ /* 6 */ 1, 1, 1, 1, 1, 1, 2, 2 },
{ /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1 },
{ /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1 },
{ /* 9 */ 0, 1, 1, 1, 1, 1, 1, 1 },
{ /*10 */ 0, 0, 1, 1, 1, 1, 1, 1 },
{ /*11 */ 0, 0, 0, 1, 1, 1, 1, 1 },
{ /*12 */ 0, 0, 0, 0, 1, 1, 1, 1 },
{ /*13 */ 0, 0, 0, 0, 0, 1, 1, 1 },
{ /*14 */ 0, 0, 0, 0, 0, 0, 1, 1 },
{ /*15 */ 0, 0, 0, 0, 0, 0, 0, 1 },
};
#else
#error unsupported table size
#endif

static int
hgcd1_tab(struct hgcd_matrix1 *m, mp_limb_t a, mp_limb_t b)
{
mp_limb_t u00, u01, u10, u11;
int swapped;
if (a < REDUCED_LIMIT || b < REDUCED_LIMIT || ABS_DIFF(a,b) < REDUCED_LIMIT)
return 0;

u00 = u11 = 1;
u01 = u10 = 0;

swapped = 0;
for (;;)
{
mp_limb_t a_hi;
mp_limb_t b_hi;
mp_limb_t q;
int c;
if (a < b)
{
MP_LIMB_T_SWAP (a,b);
MP_LIMB_T_SWAP (u00, u01);
MP_LIMB_T_SWAP (u10, u11);
swapped ^= 1;
}
assert (c + TAB_BITS <= GMP_LIMB_BITS);
a_hi = a >> (GMP_LIMB_BITS - TAB_BITS - c);
b_hi = (b >> (GMP_LIMB_BITS - TAB_BITS - c));
if (UNLIKELY(b_hi == 0))
{
q = a / b;
}
else
q = q_tab[b_hi - 1][a_hi - (1U << (TAB_BITS - 1))];

assert (q > 0);
assert (a >= q*b);
a -= q*b;
u01 += q*u00;
u11 += q*u10;
if (a < REDUCED_LIMIT)
{
u01 -= u00;
u11 -= u10;
break;
}
}
if (swapped)
{
MP_LIMB_T_SWAP (u00, u01);
MP_LIMB_T_SWAP (u10, u11);
}
m->u = u00;
m->u = u01;
m->u = u10;
m->u = u11;
return 1;
}

static int
hgcd1_valid (const struct hgcd_matrix1 *m, mp_limb_t a, mp_limb_t b)
{
mp_limb_t u00, u01, u10, u11;
mp_limb_t s0, s1, t0, t1;
mp_limb_t alpha, beta;
u00 = m->u;
u01 = m->u;
u10 = m->u;
u11 = m->u;
if (u00 >= MATRIX_LIMIT
|| u01 >= MATRIX_LIMIT
|| u10 >= MATRIX_LIMIT
|| u11 >= MATRIX_LIMIT)
return 0;

/* M^-1 = (u11, -u01; -u10,u00) */
umul_ppmm (s1, s0, u11, a);
umul_ppmm (t1, t0, u01, b);
if (t1 > s1 || (t1 == s1 && t0 > s0))
return 0;
sub_ddmmss (s1, alpha, s1, s0, t1, t0);
if (s1 != 0)
return 0;
if (alpha < REDUCED_LIMIT)
return 0;

umul_ppmm (s1, s0, u00, b);
umul_ppmm (t1, t0, u10, a);
if (t1 > s1 || (t1 == s1 && t0 > s0))
return 0;
sub_ddmmss (s1, beta, s1, s0, t1, t0);
if (s1 != 0)
return 0;
if (beta < REDUCED_LIMIT)
return 0;

return ABS_DIFF(alpha, beta) < REDUCED_LIMIT;
}

int main (int argc, char **argv)
{
gmp_randstate_t rands;
unsigned i;
gmp_randinit_default (rands);
gmp_randseed_ui (rands, 17);

for (i = 0; i < 10000; i++)
{
mp_limb_t a, b;
struct hgcd_matrix1 m;
a = gmp_urandomb_ui (rands, GMP_NUMB_BITS);
b = gmp_urandomb_ui (rands, GMP_NUMB_BITS);
if (hgcd1_ref (&m, a, b) && !hgcd1_valid (&m, a, b))
{
gmp_printf("hgcd1 failed: i = %u, a = %Mx b = %Mx\n"
"  M = (%Mx, %Mx;%Mx, %Mx)\n",
i, a, b, m.u, m.u, m.u, m.u);
abort();
}
if (hgcd1_tab (&m, a, b) && !hgcd1_valid (&m, a, b))
{
gmp_printf("hgcd1 failed: i = %u, a = %Mx b = %Mx\n"
"  M = (%Mx, %Mx;%Mx, %Mx)\n",
i, a, b, m.u, m.u, m.u, m.u);
abort();
}
}
return 0;
}

--
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.

```