# gcd_11

Torbjörn Granlund tg at gmplib.org
Sat Aug 17 08:59:00 UTC 2019

nisse at lysator.liu.se (Niels Möller) writes:

After discussing several interesting gcd algorithms with Torbjörn, I've
made this observation that I think is kind-of interesting.

Let me show the most algorithm I came up with in this discussion will
Niels!

I have long thought that table-driven GCD should be feasible.  Every
time I have failed to construct an algorithm with small enough table.

Now, by starting at the simple observation that, for the odd a and b in
Stein's "binary algorithm", either a-b or a+b is divisible by at least 4
(while the other is always divisible by 2 but not 4), I made some
progress.

How about choosing not just between a-b and a+b, but between a+mb for
small m \in Z?  For every 2nd m we have 4 | a+mb, for every 4th m we 8 |
a+mb have, etc.  That means that some small m will allow us to divide
out many zero bit.

Fortunately, such divisibility depends on just the low few bits of a and
b, i.e., we can pre-compute suitable m and table them, and then retrieve
a great m by indexing with the few low bits of each a and b we come
across!

Here is the main algorithm, with a given table and k, the latter being
log_2 of the table dimension:

static mp_limb_t
gcd_bin_tabkm (mp_limb_t x, mp_limb_t y, signed char *tab, size_t k)
{
int cnt;

// The main algorithm cannot handle full words, use plain Stein in order to
// chop away a few bits.
while ((x | y) >> (62 - k))
{
if (x > y)
{
x -= y;
count_trailing_zeros (cnt, x);
x >>= cnt;
}
else if (y > x)
{
y -= x;
count_trailing_zeros (cnt, y);
y >>= cnt;
}
else
return x;
}

mp_limb_signed_t a = x;
mp_limb_signed_t b = y;
size_t mask = (2 << k) - 2;

// Main loop.
while (1)
{
mp_limb_signed_t m;
if (a > b)
{
// a = a + bm
m = tab[((a & mask) << (k - 1)) + ((b & mask) >> 1)];

a = a + b * m;
count_trailing_zeros (cnt, a);
a >>= cnt;
if (a < 0) a = -a;
if (a == 0)
return b;
}
else
{
// b = b + am
m = tab[((b & mask) << (k - 1)) + ((a & mask) >> 1)];

b = b + a * m;
count_trailing_zeros (cnt, b);
b >>= cnt;
if (b < 0) b = -b;
if (b == 0)
return a;
}
}
}

Here are some variants which pass gradually large tables:

static mp_limb_t
gcd_bin_tab4 (mp_limb_t a, mp_limb_t b, size_t *np)
{
static signed char tab[4] = {
-1,  1,
1, -1,
};
return gcd_bin_tabkm (a, b, np, tab, 1);
}

static mp_limb_t
gcd_bin_tab16 (mp_limb_t a, mp_limb_t b, size_t *np)
{
static signed char tab[16] = {
#if ! defined (TAB_VARIANT) || TAB_VARIANT == 4 || TAB_VARIANT > 4
-1, -3, -5, -7,
-3, -1, -7, -5,
-5, -7, -1, -3,
-7, -5, -3, -1,
#endif
};
return gcd_bin_tabkm (a, b, np, tab, 2);
}

static mp_limb_t
gcd_bin_tab64 (mp_limb_t a, mp_limb_t b, size_t *np)
{
static signed char tab[64] = {
#if ! defined (TAB_VARIANT) || TAB_VARIANT == 7 || TAB_VARIANT > 8
-1,-11,-13, -7, -9, -3, -5,  1,
-3, -1, -7, -5,-11, -9,  1,-13,
-5, -7, -1, -3,-13,  1, -9,-11,
-7,-13,-11, -1,  1, -5, -3, -9,
-9, -3, -5,  1, -1,-11,-13, -7,
-11, -9,  1,-13, -3, -1, -7, -5,
-13,  1, -9,-11, -5, -7, -1, -3,
1, -5, -3, -9, -7,-13,-11, -1,
#endif
};
return gcd_bin_tabkm (a, b, np, tab, 3);
}

static mp_limb_t
gcd_bin_tab256 (mp_limb_t a, mp_limb_t b, size_t *np)
{
static signed char tab[256] = {
#if ! defined (TAB_VARIANT) || TAB_VARIANT == 14 || TAB_VARIANT > 16
-1,-11,-13,-23,-25, -3, -5,-15,-17,-27,  3, -7, -9,-19,-21,  1,
-3, -1, -7, -5,-11, -9,-15,-13,-19,-17,-23,-21,-27,-25,  1,  3,
-5,-23, -1,-19,  3,-15,-25,-11,-21, -7,-17, -3,-13,  1, -9,-27,
-7,-13,-27, -1,-15,-21, -3, -9,-23,  3,-11,-17,  1, -5,-19,-25,
-9, -3,-21,-15, -1,-27,-13, -7,-25,-19, -5,  1,-17,-11,  3,-23,
-11,-25,-15,  3,-19, -1,-23, -5,-27, -9,  1,-13, -3,-17, -7,-21,
-13,-15, -9,-11, -5, -7, -1, -3,  3,  1,-25,-27,-21,-23,-17,-19,
-15, -5, -3,-25,-23,-13,-11, -1,  1,-21,-19, -9, -7,  3,-27,-17,
-17,-27,  3, -7, -9,-19,-21,  1, -1,-11,-13,-23,-25, -3, -5,-15,
-19,-17,-23,-21,-27,-25,  1,  3, -3, -1, -7, -5,-11, -9,-15,-13,
-21, -7,-17, -3,-13,  1, -9,-27, -5,-23, -1,-19,  3,-15,-25,-11,
-23,  3,-11,-17,  1, -5,-19,-25, -7,-13,-27, -1,-15,-21, -3, -9,
-25,-19, -5,  1,-17,-11,  3,-23, -9, -3,-21,-15, -1,-27,-13, -7,
-27, -9,  1,-13, -3,-17, -7,-21,-11,-25,-15,  3,-19, -1,-23, -5,
3,  1,-25,-27,-21,-23,-17,-19,-13,-15, -9,-11, -5, -7, -1, -3,
1,-21,-19, -9, -7,  3,-27,-17,-15, -5, -3,-25,-23,-13,-11, -1,
#endif
};
return gcd_bin_tabkm (a, b, np, tab, 4);
}

What about progress of the algorithm?

Algorithm                         Iterations  Iter/Bit  Bit/Iter  Max iter
plain binary                   :   7587840328   0.697    1.435       60
binary+-                       :   6611310442   0.607    1.647       59
binary tab4                    :   6657355138   0.611    1.636       59
binary tab16                   :   4803839583   0.441    2.267       42
binary tab64                   :   4364358404   0.401    2.495       43
binary tab256                  :   4148274655   0.381    2.625       44
binary tab1024                 :   3951334269   0.363    2.756       37
binary tab4096                 :   3884273290   0.357    2.803       38

With the largest table above "table256", we get about 2.6
bits/iteration, compared to 1.4 for the plain binary algorithm.

So is this practical?  We haven't made a good implementation yet.  Such
an implementation would need to eliminate all data-dependent branches.

--
Torbjörn
Please encrypt, key id 0xC8601622