Jacobi symbol using Lehmer's algorithm. (Was: Re: asymptotically fast Jacobi symbol)

Niels Möller nisse at lysator.liu.se
Sun Jan 24 22:16:27 CET 2010

I wrote:

> I'm looking into left-to-right Jacobi now, although I'm not sure how
> much time I will have for it. I think it should run at almost the same
> speed as gcd (either Lehmer or sub-quadratic). Just one additional table
> lookup per quotient.

Below is my current code. So far still quadratic (based on Lehmer's
algorithm). In my intitial benchmark, it seems to beat mpz_jacobi in
current GMP for n >= 3.

The jacobi logic is implemented using a state of 6 bits (current
exponent for (-1)^e, two least significant bits of the current
remainders, and one bit saying which remainder we're currently
subtracting from the other.

Each time a multiple of one remainder in the sequence subtracted from
the other (that would be the quotient or some number smaller than the
quotient), the state is updated using a table lookup based on the
current state, the least two bits of the subtracted multiple/quotient,
and one bit saying which remainder is subtracted from which. I.e, the
table consists of 2^9 values of 6 bits each. I have special code for the
small cases n = 1 and 2 (which it might be better to rewrite using the
binary algorithm), and then the lehmer code is basically a copy of the
corresponding gcd code with appropriate calls to jacobi_update added,

Should be a fairly easy exercise to extend to a subquadratic algorithm,
using hgcd rather than hgcd2 for inputs above some threshold.

Regards,
/Niels

/* Jacobi symbol calculation based on Lehmer's GCD algorithm. Using a
method suggested by Schönhage, the Jacobi symbol is computed from
the least significant two bits of the quotients and the
coresponding remainders. */

/* Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2008 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

/* Written by Niels Möller. The Lehmer code copied from GMP, with just
appropriate calls to jacobi_update added. */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <gmp.h>
#include <gmp-impl.h>
#include <longlong.h>

#define USE_JACOBI_TABLE 1

static inline mp_limb_t
div1 (mp_ptr rp,
mp_limb_t n0,
mp_limb_t d0)
{
mp_limb_t q = 0;

if ((mp_limb_signed_t) n0 < 0)
{
int cnt;
for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++)
{
d0 = d0 << 1;
}

q = 0;
while (cnt)
{
q <<= 1;
if (n0 >= d0)
{
n0 = n0 - d0;
q |= 1;
}
d0 = d0 >> 1;
cnt--;
}
}
else
{
int cnt;
for (cnt = 0; n0 >= d0; cnt++)
{
d0 = d0 << 1;
}

q = 0;
while (cnt)
{
d0 = d0 >> 1;
q <<= 1;
if (n0 >= d0)
{
n0 = n0 - d0;
q |= 1;
}
cnt--;
}
}
*rp = n0;
return q;
}

/* Two-limb division optimized for small quotients.  */
static inline mp_limb_t
div2 (mp_ptr rp,
mp_limb_t nh, mp_limb_t nl,
mp_limb_t dh, mp_limb_t dl)
{
mp_limb_t q = 0;

if ((mp_limb_signed_t) nh < 0)
{
int cnt;
for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
{
dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
dl = dl << 1;
}

while (cnt)
{
q <<= 1;
if (nh > dh || (nh == dh && nl >= dl))
{
sub_ddmmss (nh, nl, nh, nl, dh, dl);
q |= 1;
}
dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
dh = dh >> 1;
cnt--;
}
}
else
{
int cnt;
for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
{
dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
dl = dl << 1;
}

while (cnt)
{
dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
dh = dh >> 1;
q <<= 1;
if (nh > dh || (nh == dh && nl >= dl))
{
sub_ddmmss (nh, nl, nh, nl, dh, dl);
q |= 1;
}
cnt--;
}
}

rp = nl;
rp = nh;

return q;
}

/* Schönhage's rules:
*
* Assume r0 = r1 q1 + r2, with r0 odd, and r1 = q2 r2 + r3
*
* If r1 is odd, then
*
*   (r1 | r0) = s(r1, r0) (r0 | r1) = s(r1, r0) (r2, r1)
*
* where s(x,y) = (-1)^{(x-1)(y-1)/4} = (-1)^[x = y = 3 (mod 4)].
*
* If r1 is even, r2 must be odd. We have
*
*   (r1 | r0) = (r1 - r0 | r0) = (-1)^(r0-1)/2 (r0 - r1 | r0)
*             = (-1)^(r0-1)/2 s(r0, r0 - r1) (r0 | r0 - r1)
*             = (-1)^(r0-1)/2 s(r0, r0 - r1) (r1 | r0 - r1)
*
* Now, if r1 = 0 (mod 4), then the sign factor is +1, and repeating
* q1 times gives
*
*   (r1 | r0) = (r1 | r2) = (r3 | r2)
*
* On the other hand, if r1 = 2 (mod 4), the sign factor is
* (-1)^{(r0-1)/2}, and repeating q1 times gives the exponent
*
*   (r0-1)/2 + (r0-r1-1)/2 + ... + (r0 - (q1-1) r1)/2
*   = q1 (r0-1)/2 + q1 (q1-1)/2
*
* and we can summarize the even case as
*
*   (r1 | r0) = t(r1, r0, q1) (r3 | r2)
*
* where t(x,y,q) = (-1)^{[x = 2 (mod 4)] (q(y-1)/2 + y(q-1)/2)}
*
* What about termination? The remainder sequence ends with (0|1) = 1
* (or (0 | r) = 0 if r != 1). What are the possible cases? If r1 is
* odd, r2 may be zero. If r1 is even, then r2 = r0 - q1 r1 is odd and
* hence non-zero. We may have r3 = r1 - q2 r2 = 0.
*
* Examples: (11|15) = - (15|11) = - (4|11)
*            (4|11) =    (4| 3) =   (1| 3)
*            (1| 3) = (3|1) = (0|1) = 1
*
*             (2|7) = (2|1) = (0|1) = 1
*
* Detail:     (2|7) = (2-7|7) = (-1|7)(5|7) = -(7|5) = -(2|5)
*             (2|5) = (2-5|5) = (-1|5)(3|5) =  (5|3) =  (2|3)
*             (2|3) = (2-3|3) = (-1|3)(1|3) = -(3|1) = -(2|1)
*
*/

#if USE_JACOBI_TABLE

typedef unsigned jacobi_bits;

/* Layout:

5  3  1 0
+-+--+--+-+
|d| b| a|e|
+-+--+--+-+

Collected factors are (-1)^e. a and b are the least significant
bits of the current remainders. d (denominator) is 0 if we're
currently subtracting multiplies of a from b, and 1 if we're
subtracting b from a.
*/

#define JACOBI_E(bits) ((bits) & 1)
#define JACOBI_A(bits) (((bits) >> 1) & 3)
#define JACOBI_B(bits) (((bits) >> 3) & 3)
#define JACOBI_D(bits) (((bits) >> 5) & 1)

unsigned jacobi_init (unsigned a, unsigned b)
{
ASSERT (b & 1);

return ((a & 3) << 1) | ((b & 3) << 3) | 0x20;
}

int
jacobi_finish (unsigned bits)
{
if (JACOBI_D (bits))
{
ASSERT (JACOBI_A (bits) == 0);
ASSERT (JACOBI_B (bits) == 1);
}
else
{
ASSERT (JACOBI_A (bits) == 1);
ASSERT (JACOBI_B (bits) == 0);
}

return JACOBI_E(bits) ? -1 : 1;
}

/* q should be two bits. */
unsigned
jacobi_update (unsigned bits, unsigned denominator, unsigned q)
{
static const unsigned char table =
{
0x00,0x00,0x02,0x03,0x00,0x00,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
0x00,0x00,0x12,0x13,0x00,0x00,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
0x00,0x00,0x02,0x03,0x00,0x00,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
0x00,0x00,0x12,0x13,0x00,0x00,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1f,0x1e,
0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x00,0x00,0x32,0x33,0x00,0x00,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3f,0x3e,
0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x00,0x00,0x32,0x33,0x00,0x00,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
0x00,0x00,0x1a,0x1b,0x00,0x00,0x0e,0x0f,0x08,0x09,0x02,0x03,0x1c,0x1d,0x16,0x17,
0x00,0x00,0x0a,0x0b,0x00,0x00,0x1e,0x1f,0x18,0x19,0x12,0x13,0x0d,0x0c,0x06,0x07,
0x00,0x00,0x1a,0x1b,0x00,0x00,0x0e,0x0f,0x08,0x09,0x02,0x03,0x1c,0x1d,0x16,0x17,
0x00,0x00,0x0a,0x0b,0x00,0x00,0x1e,0x1f,0x18,0x19,0x12,0x13,0x0d,0x0c,0x07,0x06,
0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2e,0x2f,0x28,0x29,0x2a,0x2b,0x2c,0x2d,
0x00,0x00,0x36,0x37,0x00,0x00,0x33,0x32,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,0x39,0x38,
0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2e,0x2f,0x28,0x29,0x2a,0x2b,0x2c,0x2d,
0x00,0x00,0x36,0x37,0x00,0x00,0x33,0x32,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,0x38,0x39,
0x00,0x00,0x12,0x13,0x00,0x00,0x16,0x17,0x08,0x09,0x1a,0x1b,0x0d,0x0c,0x1e,0x1f,
0x00,0x00,0x02,0x03,0x00,0x00,0x06,0x07,0x18,0x19,0x0a,0x0b,0x1d,0x1c,0x0e,0x0f,
0x00,0x00,0x12,0x13,0x00,0x00,0x16,0x17,0x08,0x09,0x1a,0x1b,0x0d,0x0c,0x1e,0x1f,
0x00,0x00,0x02,0x03,0x00,0x00,0x06,0x07,0x18,0x19,0x0a,0x0b,0x1d,0x1c,0x0f,0x0e,
0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2c,0x2d,0x2e,0x2f,0x28,0x29,0x2a,0x2b,
0x00,0x00,0x33,0x32,0x00,0x00,0x37,0x36,0x3c,0x3d,0x3e,0x3f,0x38,0x39,0x3b,0x3a,
0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2c,0x2d,0x2e,0x2f,0x28,0x29,0x2a,0x2b,
0x00,0x00,0x33,0x32,0x00,0x00,0x37,0x36,0x3c,0x3d,0x3e,0x3f,0x38,0x39,0x3a,0x3b,
0x00,0x00,0x0a,0x0b,0x00,0x00,0x1e,0x1f,0x08,0x09,0x12,0x13,0x1d,0x1c,0x06,0x07,
0x00,0x00,0x1a,0x1b,0x00,0x00,0x0e,0x0f,0x18,0x19,0x02,0x03,0x0c,0x0d,0x16,0x17,
0x00,0x00,0x0a,0x0b,0x00,0x00,0x1e,0x1f,0x08,0x09,0x12,0x13,0x1d,0x1c,0x06,0x07,
0x00,0x00,0x1a,0x1b,0x00,0x00,0x0e,0x0f,0x18,0x19,0x02,0x03,0x0c,0x0d,0x17,0x16,
0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x28,0x29,
0x00,0x00,0x37,0x36,0x00,0x00,0x32,0x33,0x3e,0x3f,0x38,0x39,0x3a,0x3b,0x3d,0x3c,
0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x28,0x29,
0x00,0x00,0x37,0x36,0x00,0x00,0x32,0x33,0x3e,0x3f,0x38,0x39,0x3a,0x3b,0x3c,0x3d,
};

/* At least one odd. */
ASSERT ((JACOBI_A (bits) | JACOBI_B (bits)) & 1);

bits = table[bits | (denominator << 6) | ((q & 3) << 7)];

ASSERT ((JACOBI_A (bits) | JACOBI_B (bits)) & 1);
return bits;
}

#else /* ! USE_JACOBI_TABLE */

/* We need a suitable abstraction where we push one (partial) quotient
at a time and reduce accordingly. */
typedef struct {
/* Only least significant two bits of remainders. */
unsigned a : 2;
unsigned b : 2;
/* zero, if we're dividing b / a (i.e., subtracting multiples of
a), and one if we're dividing a / b. */
unsigned denominator : 1;
/* Collected factors are (-1)^e */
unsigned e : 1;
} jacobi_bits;

#define JACOBI_A(bits) ((bits).a)
#define JACOBI_B(bits) ((bits).b)

jacobi_bits
jacobi_init (unsigned a, unsigned b)
{
jacobi_bits bits;
ASSERT (b & 1);

bits.a = a;
bits.b = b;
bits.denominator = 1;
bits.e = 0;
return bits;
}

int
jacobi_finish (jacobi_bits bits)
{
if (bits.denominator == 1)
{
ASSERT (bits.b == 1);
ASSERT (bits.a == 0);
}
else
{
ASSERT (bits.a == 1);
ASSERT (bits.b == 0);
}

return bits.e ? -1 : 1;
}

/* q should be two bits. */
jacobi_bits
jacobi_update (jacobi_bits bits, unsigned denominator, unsigned q)
{
unsigned r;

if (LIKELY (denominator != bits.denominator))
{
/* Quotient is complete. */
if (bits.a & bits.b & 1)
{
/* Invoke reciprocity. */
bits.e ^= (bits.a & bits.b) >> 1;
}
else
{
/* We've been subtracting an even number, so do nothing. */
}
bits.denominator = denominator;
}

/* Let the denominator be r1, i.e., we're up to now subtracting multiples of r1 from r0. */
r[bits.denominator] = bits.b;
r[1-bits.denominator] = bits.a;

ASSERT ( (r | r) & 1);

if ((r & 3) == 2)
{
ASSERT (r & 1);

bits.e ^= (q & (r >> 1)) ^ (q >> 1);
}

r -= q * r;
if (bits.denominator)
bits.a = r;
else
bits.b = r;

return bits;
}
#endif /* !USE_JACOBI_TABLE */

static int
mpn_jacobi_1 (mp_limb_t a, mp_limb_t b, jacobi_bits bits)
{
if (a == 0)
return b == 1 ? jacobi_finish (bits) : 0;
else if (b == 0)
return a == 1 ? jacobi_finish (bits) : 0;

if (a < b)
goto subtract_a;

for (;;)
{
mp_limb_t r;
mp_limb_t q;

ASSERT ( (a & 3) == JACOBI_A (bits));
ASSERT ( (b & 3) == JACOBI_B (bits));

ASSERT (a >= b);
q = div1 (&r, a, b);
a = r;

bits = jacobi_update (bits, 1, q);

if (a == 0)
return b == 1 ? jacobi_finish (bits) : 0;

subtract_a:
ASSERT ( (a & 3) == JACOBI_A (bits));
ASSERT ( (b & 3) == JACOBI_B (bits));

ASSERT (a < b);
q = div1 (&r, b, a);
b = r;
bits = jacobi_update (bits, 0, q);

if (b == 0)
return a == 1 ? jacobi_finish (bits) : 0;
}
}

static int
mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, jacobi_bits bits)
{
mp_limb_t ah, al, bh, bl;

al = ap;
ah = ap;
bl = bp;
bh = bp;

if (ah == 0 && al == 0)
return (bh == 0 && bl == 1) ? jacobi_finish (bits) : 0;
else if (bh == 0 && bl == 0)
return (ah == 0 && al == 1) ? jacobi_finish (bits) : 0;

if (ah < bh || (ah == bh && al < bl))
goto subtract_a;

while (ah | bh)
{
mp_limb_t r;
mp_limb_t q;

ASSERT ( (al & 3) == JACOBI_A (bits));
ASSERT ( (bl & 3) == JACOBI_B (bits));

ASSERT (ah > bh || (ah == bh && al >= bl));
q = div2 (r, ah, al, bh, bl);
al = r; ah = r;

bits = jacobi_update (bits, 1, q);

if (ah == 0 && al == 0)
return (bh == 0 && bl == 1) ? jacobi_finish (bits) : 0;

subtract_a:
ASSERT ( (al & 3) == JACOBI_A (bits));
ASSERT ( (bl & 3) == JACOBI_B (bits));

ASSERT (ah < bh || (ah == bh && al < bl));
q = div2 (r, bh, bl, ah, al);
bl = r; bh = r;

bits = jacobi_update (bits, 0, q);

if (bh == 0 && bl == 0)
return (ah == 0 && al == 1) ? jacobi_finish (bits) : 0;
}
return mpn_jacobi_1 (al, bl, bits);
}

int
mpn_hgcd2_jacobi (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
struct hgcd_matrix1 *M, jacobi_bits *bitsp)
{
mp_limb_t u00, u01, u10, u11;
jacobi_bits bits = *bitsp;

if (ah < 2 || bh < 2)
return 0;

if (ah > bh || (ah == bh && al > bl))
{
sub_ddmmss (ah, al, ah, al, bh, bl);
if (ah < 2)
return 0;

u00 = u01 = u11 = 1;
u10 = 0;
bits = jacobi_update (bits, 1, 1);
}
else
{
sub_ddmmss (bh, bl, bh, bl, ah, al);
if (bh < 2)
return 0;

u00 = u10 = u11 = 1;
u01 = 0;
bits = jacobi_update (bits, 0, 1);
}

if (ah < bh)
goto subtract_a;

for (;;)
{
ASSERT (ah >= bh);
if (ah == bh)
goto done;

if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
{
ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

break;
}

/* Subtract a -= q b, and multiply M from the right by (1 q ; 0
1), affecting the second column of M. */
ASSERT (ah > bh);
sub_ddmmss (ah, al, ah, al, bh, bl);

if (ah < 2)
goto done;

if (ah <= bh)
{
/* Use q = 1 */
u01 += u00;
u11 += u10;
bits = jacobi_update (bits, 1, 1);
}
else
{
mp_limb_t r;
mp_limb_t q = div2 (r, ah, al, bh, bl);
al = r; ah = r;
if (ah < 2)
{
/* A is too small, but q is correct. */
u01 += q * u00;
u11 += q * u10;
bits = jacobi_update (bits, 1, q);
goto done;
}
q++;
u01 += q * u00;
u11 += q * u10;
bits = jacobi_update (bits, 1, q);
}
subtract_a:
ASSERT (bh >= ah);
if (ah == bh)
goto done;

if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
{
ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

goto subtract_a1;
}

/* Subtract b -= q a, and multiply M from the right by (1 0 ; q
1), affecting the first column of M. */
sub_ddmmss (bh, bl, bh, bl, ah, al);

if (bh < 2)
goto done;

if (bh <= ah)
{
/* Use q = 1 */
u00 += u01;
u10 += u11;
bits = jacobi_update (bits, 0, 1);
}
else
{
mp_limb_t r;
mp_limb_t q = div2 (r, bh, bl, ah, al);
bl = r; bh = r;
if (bh < 2)
{
/* B is too small, but q is correct. */
u00 += q * u01;
u10 += q * u11;
bits = jacobi_update (bits, 0, q);
goto done;
}
q++;
u00 += q * u01;
u10 += q * u11;
bits = jacobi_update (bits, 0, q);
}
}

/* NOTE: Since we discard the least significant half limb, we don't
get a truly maximal M (corresponding to |a - b| <
2^{GMP_LIMB_BITS +1}). */
/* Single precision loop */
for (;;)
{
ASSERT (ah >= bh);
if (ah == bh)
break;

ah -= bh;
if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
break;

if (ah <= bh)
{
/* Use q = 1 */
u01 += u00;
u11 += u10;
bits = jacobi_update (bits, 1, 1);
}
else
{
mp_limb_t r;
mp_limb_t q = div1 (&r, ah, bh);
ah = r;
if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
{
/* A is too small, but q is correct. */
u01 += q * u00;
u11 += q * u10;
bits = jacobi_update (bits, 1, q);
break;
}
q++;
u01 += q * u00;
u11 += q * u10;
bits = jacobi_update (bits, 1, q);
}
subtract_a1:
ASSERT (bh >= ah);
if (ah == bh)
break;

bh -= ah;
if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
break;

if (bh <= ah)
{
/* Use q = 1 */
u00 += u01;
u10 += u11;
bits = jacobi_update (bits, 0, 1);
}
else
{
mp_limb_t r;
mp_limb_t q = div1 (&r, bh, ah);
bh = r;
if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
{
/* B is too small, but q is correct. */
u00 += q * u01;
u10 += q * u11;
bits = jacobi_update (bits, 0, q);
break;
}
q++;
u00 += q * u01;
u10 += q * u11;
bits = jacobi_update (bits, 0, q);
}
}

done:
M->u = u00; M->u = u01;
M->u = u10; M->u = u11;
*bitsp = bits;

return 1;
}

mp_size_t
mpn_jacobi_subdiv_step (jacobi_bits *bitsp,
mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
mp_size_t an, bn;
unsigned denominator;

ASSERT (n > 0);
ASSERT (ap[n-1] > 0 || bp[n-1] > 0);

an = bn = n;
MPN_NORMALIZE (ap, an);
MPN_NORMALIZE (bp, bn);

if (UNLIKELY (an == 0))
return 0;

else if (UNLIKELY (bn == 0))
return 0;

/* Arrange so that a > b, subtract an -= bn, and maintain
normalization. */
denominator = 1;
if (an < bn)
{
MPN_PTR_SWAP (ap, an, bp, bn);
denominator = 0;
}
else if (an == bn)
{
int c;
MPN_CMP (c, ap, bp, an);
if (UNLIKELY (c == 0))
return 0;
else if (c < 0)
{
MP_PTR_SWAP (ap, bp);
denominator = 0;
}
}

ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn));
MPN_NORMALIZE (ap, an);
ASSERT (an > 0);

*bitsp = jacobi_update (*bitsp, denominator, 1);

/* Arrange so that a > b, and divide a = q b + r */
/* FIXME: an < bn happens when we have cancellation. If that is the
common case, then we could reverse the roles of a and b to avoid
the swap. */
if (an < bn)
{
MPN_PTR_SWAP (ap, an, bp, bn);
denominator ^= 1;
}
else if (an == bn)
{
int c;
MPN_CMP (c, ap, bp, an);
if (UNLIKELY (c == 0))
return 0;
else if (c < 0)
{
MP_PTR_SWAP (ap, bp);
denominator ^= 1;
}
}

mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
*bitsp = jacobi_update (*bitsp, denominator, tp);

if (mpn_zero_p (ap, bn))
return 0;

return bn;
}

/* Computes (a | b), where b is odd and normalized. */
static int
mpn_jacobi_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
jacobi_bits bits;

ASSERT (n > 0);
ASSERT (bp[n-1] > 0);
ASSERT (bp & 1);

bits = jacobi_init (ap, bp);

while (n > 2)
{
struct hgcd_matrix1 M;
mp_limb_t ah, al, bh, bl;

{
ah = ap[n-1]; al = ap[n-2];
bh = bp[n-1]; bl = bp[n-2];
}
else
{
int shift;

ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
}

/* Try an mpn_nhgcd2 step */
if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M, &bits))
{
n = mpn_hgcd_mul_matrix1_inverse_vector (&M, tp, ap, bp, n);
MP_PTR_SWAP (ap, tp);
}
else
{
/* mpn_hgcd2 has failed. Then either one of a or b is very
small, or the difference is very small. Perform one
subtraction followed by one division. */
n = mpn_jacobi_subdiv_step (&bits, ap, bp, n, tp);
if (!n)
return jacobi_finish (bits);
}
}
if (n == 1)
return mpn_jacobi_1 (ap, bp, bits);
else
return mpn_jacobi_2 (ap, bp, bits);
}

#define MAX_TEST_SIZE 100

static void
test_jacobi (gmp_randstate_ptr rands)
{
mp_limb_t a[MAX_TEST_SIZE];
mp_limb_t b[MAX_TEST_SIZE];
mp_limb_t t[MAX_TEST_SIZE];
mpz_t az, bz;
unsigned i;

mpz_init (az); mpz_init (bz);

for (i = 0; i < 200; i++)
{
mp_limb_t n;
int res, ref;
n = 1 + gmp_urandomm_ui (rands, MAX_TEST_SIZE);
_mpz_realloc (az, n);
_mpz_realloc (bz, n);
mpn_random (a, n);
mpn_random (b, n);
b |= 1;

MPN_COPY (PTR (az), a, n);
MPN_COPY (PTR (bz), b, n);
SIZ (az) = SIZ (bz) = n;

MPN_NORMALIZE (PTR(az), SIZ(az));
MPN_NORMALIZE (PTR(bz), SIZ(bz));

ref = mpz_jacobi (az, bz);
res = mpn_jacobi_lehmer (a, b, n, t);
if (ref != res)
{
gmp_printf ("mpn_jacobi_lehmer failed: n = %ld\n"
"  a = %Zd\n"
"  b = %Zd\n"
"  ref = %d (expected)\n"
(long) n, az, bz, ref, res);
abort();
}
#if 0
gmp_printf ("(%Zd|%Zd) = %d\n", az, bz, res);
#endif
}
mpz_clear (az); mpz_clear (bz);
}

#include "speed.h"

static int
compare_double(const void *ap, const void *bp)
{
double a = * (const double *) ap;
double b = * (const double *) bp;

if (a < b)
return -1;
else if (a > b)
return 1;
else
return 0;
}

static double
median (double *v, size_t n)
{
qsort(v, n, sizeof(*v), compare_double);

return v[n/2];
}

#define MEDIAN_N 5
#define TIME(res, code) do {				\
double time_measurement[MEDIAN_N];			\
unsigned time_i;					\
\
for (time_i = 0; time_i < MEDIAN_N; time_i++)		\
{							\
speed_starttime();				\
code;						\
time_measurement[time_i] = speed_endtime();	\
}							\
res = median(time_measurement, MEDIAN_N);		\
} while (0)

static void
time_jacobi (gmp_randstate_ptr rands, mp_size_t n)
{
mpz_t az;
mpz_t bz;
mp_ptr ap;
mp_ptr bp;
mp_ptr tp;
mp_ptr at;
mp_ptr bt;
int res;
int ref;

double t_ref, t_lehmer;

TMP_DECL;
TMP_MARK;

mpz_init (az); mpz_init (bz);
ap = TMP_ALLOC_LIMBS (n);
bp = TMP_ALLOC_LIMBS (n);
at = TMP_ALLOC_LIMBS (n);
bt = TMP_ALLOC_LIMBS (n);
tp = TMP_ALLOC_LIMBS (n);

_mpz_realloc (az, n);
_mpz_realloc (bz, n);
mpn_random (ap, n);
mpn_random (bp, n);
bp |= 1;

MPN_COPY (PTR (az), ap, n);
MPN_COPY (PTR (bz), bp, n);
SIZ (az) = SIZ (bz) = n;

MPN_NORMALIZE (PTR(az), SIZ(az));
MPN_NORMALIZE (PTR(bz), SIZ(bz));

TIME (t_ref, ref = mpz_jacobi (az, bz));
TIME (t_lehmer, MPN_COPY (at, ap, n); MPN_COPY (bt, bp, n);
res = mpn_jacobi_lehmer (at, bt, n, tp));

printf ("%4ld %5.3f %5.3f %5.3f\n",
(long) n, 1e6*t_ref, 1e6*t_lehmer, t_lehmer / t_ref);
if (ref != res)
{
gmp_printf ("mpn_jacobi_lehmer failed: n = %ld\n"
"  a = %Zd\n"
"  b = %Zd\n"
"  ref = %d (expected)\n"
(long) n, az, bz, ref, res);
abort();
}
TMP_FREE;
mpz_clear (az); mpz_clear (bz);
}

int
main (int argc, char **argv)
{
gmp_randstate_ptr rands;

rands = RANDS;

if (argc == 2)
{
mp_size_t n = atol (argv);
if (n < 1)
return 1;

time_jacobi (rands, n);
}
else
test_jacobi (rands);

return 0;
}

--
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.