Jacobi symbol using Lehmer's algorithm. (Was: Re: asymptotically fast Jacobi symbol)

Sun Jan 24 22:16:27 CET 2010

I wrote:

> I'm looking into left-to-right Jacobi now, although I'm not sure how
> much time I will have for it. I think it should run at almost the same
> speed as gcd (either Lehmer or sub-quadratic). Just one additional table
> lookup per quotient.

Below is my current code. So far still quadratic (based on Lehmer's
algorithm). In my intitial benchmark, it seems to beat mpz_jacobi in
current GMP for n >= 3.

The jacobi logic is implemented using a state of 6 bits (current
exponent for (-1)^e, two least significant bits of the current
remainders, and one bit saying which remainder we're currently
subtracting from the other.

Each time a multiple of one remainder in the sequence subtracted from
the other (that would be the quotient or some number smaller than the
quotient), the state is updated using a table lookup based on the
current state, the least two bits of the subtracted multiple/quotient,
and one bit saying which remainder is subtracted from which. I.e, the
table consists of 2^9 values of 6 bits each. I have special code for the
small cases n = 1 and 2 (which it might be better to rewrite using the
binary algorithm), and then the lehmer code is basically a copy of the
corresponding gcd code with appropriate calls to jacobi_update added,

Should be a fairly easy exercise to extend to a subquadratic algorithm,
using hgcd rather than hgcd2 for inputs above some threshold.

Regards,
/Niels

/* Jacobi symbol calculation based on Lehmer's GCD algorithm. Using a
   method suggested by Schönhage, the Jacobi symbol is computed from
   the least significant two bits of the quotients and the
   coresponding remainders. */

/* Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2008 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

/* Written by Niels Möller. The Lehmer code copied from GMP, with just
   appropriate calls to jacobi_update added. */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <gmp.h>
#include <gmp-impl.h>
#include <longlong.h>

#define USE_JACOBI_TABLE 1

static inline mp_limb_t
div1 (mp_ptr rp,
      mp_limb_t n0,
      mp_limb_t d0)
{
  mp_limb_t q = 0;

  if ((mp_limb_signed_t) n0 < 0)
    {
      int cnt;
      for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++)
	{
	  d0 = d0 << 1;
	}

      q = 0;
      while (cnt)
	{
	  q <<= 1;
	  if (n0 >= d0)
	    {
	      n0 = n0 - d0;
	      q |= 1;
	    }
	  d0 = d0 >> 1;
	  cnt--;
	}
    }
  else
    {
      int cnt;
      for (cnt = 0; n0 >= d0; cnt++)
	{
	  d0 = d0 << 1;
	}

      q = 0;
      while (cnt)
	{
	  d0 = d0 >> 1;
	  q <<= 1;
	  if (n0 >= d0)
	    {
	      n0 = n0 - d0;
	      q |= 1;
	    }
	  cnt--;
	}
    }
  *rp = n0;
  return q;
}

/* Two-limb division optimized for small quotients.  */
static inline mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t nh, mp_limb_t nl,
      mp_limb_t dh, mp_limb_t dl)
{
  mp_limb_t q = 0;

  if ((mp_limb_signed_t) nh < 0)
    {
      int cnt;
      for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
	{
	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
	  dl = dl << 1;
	}

      while (cnt)
	{
	  q <<= 1;
	  if (nh > dh || (nh == dh && nl >= dl))
	    {
	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
	      q |= 1;
	    }
	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
	  dh = dh >> 1;
	  cnt--;
	}
    }
  else
    {
      int cnt;
      for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
	{
	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
	  dl = dl << 1;
	}

      while (cnt)
	{
	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
	  dh = dh >> 1;
	  q <<= 1;
	  if (nh > dh || (nh == dh && nl >= dl))
	    {
	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
	      q |= 1;
	    }
	  cnt--;
	}
    }

  rp[0] = nl;
  rp[1] = nh;

  return q;
}

/* Schönhage's rules:
 *
 * Assume r0 = r1 q1 + r2, with r0 odd, and r1 = q2 r2 + r3
 *
 * If r1 is odd, then
 *
 *   (r1 | r0) = s(r1, r0) (r0 | r1) = s(r1, r0) (r2, r1)
 *
 * where s(x,y) = (-1)^{(x-1)(y-1)/4} = (-1)^[x = y = 3 (mod 4)].
 *
 * If r1 is even, r2 must be odd. We have
 *
 *   (r1 | r0) = (r1 - r0 | r0) = (-1)^(r0-1)/2 (r0 - r1 | r0)
 *             = (-1)^(r0-1)/2 s(r0, r0 - r1) (r0 | r0 - r1) 
 *             = (-1)^(r0-1)/2 s(r0, r0 - r1) (r1 | r0 - r1)
 *
 * Now, if r1 = 0 (mod 4), then the sign factor is +1, and repeating
 * q1 times gives
 *
 *   (r1 | r0) = (r1 | r2) = (r3 | r2)
 *
 * On the other hand, if r1 = 2 (mod 4), the sign factor is
 * (-1)^{(r0-1)/2}, and repeating q1 times gives the exponent
 *
 *   (r0-1)/2 + (r0-r1-1)/2 + ... + (r0 - (q1-1) r1)/2
 *   = q1 (r0-1)/2 + q1 (q1-1)/2  
 *
 * and we can summarize the even case as
 *
 *   (r1 | r0) = t(r1, r0, q1) (r3 | r2)
 *
 * where t(x,y,q) = (-1)^{[x = 2 (mod 4)] (q(y-1)/2 + y(q-1)/2)}
 *
 * What about termination? The remainder sequence ends with (0|1) = 1
 * (or (0 | r) = 0 if r != 1). What are the possible cases? If r1 is
 * odd, r2 may be zero. If r1 is even, then r2 = r0 - q1 r1 is odd and
 * hence non-zero. We may have r3 = r1 - q2 r2 = 0.
 *
 * Examples: (11|15) = - (15|11) = - (4|11)
 *            (4|11) =    (4| 3) =   (1| 3)
 *            (1| 3) = (3|1) = (0|1) = 1
 *
 *             (2|7) = (2|1) = (0|1) = 1
 *
 * Detail:     (2|7) = (2-7|7) = (-1|7)(5|7) = -(7|5) = -(2|5)
 *             (2|5) = (2-5|5) = (-1|5)(3|5) =  (5|3) =  (2|3)
 *             (2|3) = (2-3|3) = (-1|3)(1|3) = -(3|1) = -(2|1)
 *
 */

#if USE_JACOBI_TABLE

typedef unsigned jacobi_bits;

/* Layout:

     5  3  1 0
   +-+--+--+-+
   |d| b| a|e|
   +-+--+--+-+

   Collected factors are (-1)^e. a and b are the least significant
   bits of the current remainders. d (denominator) is 0 if we're
   currently subtracting multiplies of a from b, and 1 if we're
   subtracting b from a.   
*/

#define JACOBI_E(bits) ((bits) & 1)
#define JACOBI_A(bits) (((bits) >> 1) & 3)
#define JACOBI_B(bits) (((bits) >> 3) & 3)
#define JACOBI_D(bits) (((bits) >> 5) & 1)

unsigned jacobi_init (unsigned a, unsigned b)
{
  ASSERT (b & 1);

  return ((a & 3) << 1) | ((b & 3) << 3) | 0x20;
}

int
jacobi_finish (unsigned bits)
{
  if (JACOBI_D (bits))
    {
      ASSERT (JACOBI_A (bits) == 0);
      ASSERT (JACOBI_B (bits) == 1);
    }
  else
    {
      ASSERT (JACOBI_A (bits) == 1);
      ASSERT (JACOBI_B (bits) == 0);
    }

  return JACOBI_E(bits) ? -1 : 1;
}

/* q should be two bits. */
unsigned
jacobi_update (unsigned bits, unsigned denominator, unsigned q)
{
  static const unsigned char table[512] =
    {
      0x00,0x00,0x02,0x03,0x00,0x00,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
      0x00,0x00,0x12,0x13,0x00,0x00,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
      0x00,0x00,0x02,0x03,0x00,0x00,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
      0x00,0x00,0x12,0x13,0x00,0x00,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1f,0x1e,
      0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
      0x00,0x00,0x32,0x33,0x00,0x00,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3f,0x3e,
      0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
      0x00,0x00,0x32,0x33,0x00,0x00,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
      0x00,0x00,0x1a,0x1b,0x00,0x00,0x0e,0x0f,0x08,0x09,0x02,0x03,0x1c,0x1d,0x16,0x17,
      0x00,0x00,0x0a,0x0b,0x00,0x00,0x1e,0x1f,0x18,0x19,0x12,0x13,0x0d,0x0c,0x06,0x07,
      0x00,0x00,0x1a,0x1b,0x00,0x00,0x0e,0x0f,0x08,0x09,0x02,0x03,0x1c,0x1d,0x16,0x17,
      0x00,0x00,0x0a,0x0b,0x00,0x00,0x1e,0x1f,0x18,0x19,0x12,0x13,0x0d,0x0c,0x07,0x06,
      0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2e,0x2f,0x28,0x29,0x2a,0x2b,0x2c,0x2d,
      0x00,0x00,0x36,0x37,0x00,0x00,0x33,0x32,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,0x39,0x38,
      0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2e,0x2f,0x28,0x29,0x2a,0x2b,0x2c,0x2d,
      0x00,0x00,0x36,0x37,0x00,0x00,0x33,0x32,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,0x38,0x39,
      0x00,0x00,0x12,0x13,0x00,0x00,0x16,0x17,0x08,0x09,0x1a,0x1b,0x0d,0x0c,0x1e,0x1f,
      0x00,0x00,0x02,0x03,0x00,0x00,0x06,0x07,0x18,0x19,0x0a,0x0b,0x1d,0x1c,0x0e,0x0f,
      0x00,0x00,0x12,0x13,0x00,0x00,0x16,0x17,0x08,0x09,0x1a,0x1b,0x0d,0x0c,0x1e,0x1f,
      0x00,0x00,0x02,0x03,0x00,0x00,0x06,0x07,0x18,0x19,0x0a,0x0b,0x1d,0x1c,0x0f,0x0e,
      0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2c,0x2d,0x2e,0x2f,0x28,0x29,0x2a,0x2b,
      0x00,0x00,0x33,0x32,0x00,0x00,0x37,0x36,0x3c,0x3d,0x3e,0x3f,0x38,0x39,0x3b,0x3a,
      0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2c,0x2d,0x2e,0x2f,0x28,0x29,0x2a,0x2b,
      0x00,0x00,0x33,0x32,0x00,0x00,0x37,0x36,0x3c,0x3d,0x3e,0x3f,0x38,0x39,0x3a,0x3b,
      0x00,0x00,0x0a,0x0b,0x00,0x00,0x1e,0x1f,0x08,0x09,0x12,0x13,0x1d,0x1c,0x06,0x07,
      0x00,0x00,0x1a,0x1b,0x00,0x00,0x0e,0x0f,0x18,0x19,0x02,0x03,0x0c,0x0d,0x16,0x17,
      0x00,0x00,0x0a,0x0b,0x00,0x00,0x1e,0x1f,0x08,0x09,0x12,0x13,0x1d,0x1c,0x06,0x07,
      0x00,0x00,0x1a,0x1b,0x00,0x00,0x0e,0x0f,0x18,0x19,0x02,0x03,0x0c,0x0d,0x17,0x16,
      0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x28,0x29,
      0x00,0x00,0x37,0x36,0x00,0x00,0x32,0x33,0x3e,0x3f,0x38,0x39,0x3a,0x3b,0x3d,0x3c,
      0x00,0x00,0x22,0x23,0x00,0x00,0x26,0x27,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x28,0x29,
      0x00,0x00,0x37,0x36,0x00,0x00,0x32,0x33,0x3e,0x3f,0x38,0x39,0x3a,0x3b,0x3c,0x3d,
    };

  /* At least one odd. */
  ASSERT ((JACOBI_A (bits) | JACOBI_B (bits)) & 1);

  bits = table[bits | (denominator << 6) | ((q & 3) << 7)];

  ASSERT ((JACOBI_A (bits) | JACOBI_B (bits)) & 1);
  return bits;  
}

#else /* ! USE_JACOBI_TABLE */

/* We need a suitable abstraction where we push one (partial) quotient
   at a time and reduce accordingly. */ 
typedef struct {
  /* Only least significant two bits of remainders. */
  unsigned a : 2;
  unsigned b : 2;
  /* zero, if we're dividing b / a (i.e., subtracting multiples of
     a), and one if we're dividing a / b. */
  unsigned denominator : 1;
  /* Collected factors are (-1)^e */
  unsigned e : 1;
} jacobi_bits;

#define JACOBI_A(bits) ((bits).a)
#define JACOBI_B(bits) ((bits).b)

jacobi_bits
jacobi_init (unsigned a, unsigned b)
{
  jacobi_bits bits;
  ASSERT (b & 1);

  bits.a = a;
  bits.b = b;
  bits.denominator = 1;
  bits.e = 0;
  return bits;
}

int
jacobi_finish (jacobi_bits bits)
{
  if (bits.denominator == 1)
    {
      ASSERT (bits.b == 1);
      ASSERT (bits.a == 0);
    }
  else
    {
      ASSERT (bits.a == 1);
      ASSERT (bits.b == 0);
    }

  return bits.e ? -1 : 1;
}

/* q should be two bits. */
jacobi_bits
jacobi_update (jacobi_bits bits, unsigned denominator, unsigned q)
{
  unsigned r[2];

  if (LIKELY (denominator != bits.denominator))
    {
      /* Quotient is complete. */
      if (bits.a & bits.b & 1)
	{
	  /* Invoke reciprocity. */
	  bits.e ^= (bits.a & bits.b) >> 1; 
	}
      else
	{
	  /* We've been subtracting an even number, so do nothing. */
	}
      bits.denominator = denominator;      
    }

  /* Let the denominator be r1, i.e., we're up to now subtracting multiples of r1 from r0. */
  r[bits.denominator] = bits.b;
  r[1-bits.denominator] = bits.a;

  ASSERT ( (r[0] | r[1]) & 1);

  if ((r[1] & 3) == 2)
    {
      ASSERT (r[0] & 1);

      bits.e ^= (q & (r[0] >> 1)) ^ (q >> 1);
    }

  r[0] -= q * r[1];
  if (bits.denominator)
    bits.a = r[0];
  else
    bits.b = r[0];

  return bits;
}
#endif /* !USE_JACOBI_TABLE */

static int
mpn_jacobi_1 (mp_limb_t a, mp_limb_t b, jacobi_bits bits)
{
  if (a == 0)
    return b == 1 ? jacobi_finish (bits) : 0;
  else if (b == 0)
    return a == 1 ? jacobi_finish (bits) : 0;

  if (a < b)
    goto subtract_a;

  for (;;)
    {
      mp_limb_t r;
      mp_limb_t q;

      ASSERT ( (a & 3) == JACOBI_A (bits));
      ASSERT ( (b & 3) == JACOBI_B (bits));

      ASSERT (a >= b);
      q = div1 (&r, a, b);
      a = r;

      bits = jacobi_update (bits, 1, q);

      if (a == 0)
	return b == 1 ? jacobi_finish (bits) : 0;

    subtract_a:
      ASSERT ( (a & 3) == JACOBI_A (bits));
      ASSERT ( (b & 3) == JACOBI_B (bits));

      ASSERT (a < b);
      q = div1 (&r, b, a);
      b = r;
      bits = jacobi_update (bits, 0, q);

      if (b == 0)
	return a == 1 ? jacobi_finish (bits) : 0;
    }
}

static int
mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, jacobi_bits bits)
{
  mp_limb_t ah, al, bh, bl;

  al = ap[0];
  ah = ap[1];
  bl = bp[0];
  bh = bp[1];

  if (ah == 0 && al == 0)
    return (bh == 0 && bl == 1) ? jacobi_finish (bits) : 0;
  else if (bh == 0 && bl == 0)
    return (ah == 0 && al == 1) ? jacobi_finish (bits) : 0;

  if (ah < bh || (ah == bh && al < bl))
    goto subtract_a;

  while (ah | bh)
    {
      mp_limb_t r[2];
      mp_limb_t q;

      ASSERT ( (al & 3) == JACOBI_A (bits));
      ASSERT ( (bl & 3) == JACOBI_B (bits));

      ASSERT (ah > bh || (ah == bh && al >= bl));
      q = div2 (r, ah, al, bh, bl);
      al = r[0]; ah = r[1];

      bits = jacobi_update (bits, 1, q);

      if (ah == 0 && al == 0)
	return (bh == 0 && bl == 1) ? jacobi_finish (bits) : 0;

    subtract_a:
      ASSERT ( (al & 3) == JACOBI_A (bits));
      ASSERT ( (bl & 3) == JACOBI_B (bits));

      ASSERT (ah < bh || (ah == bh && al < bl));
      q = div2 (r, bh, bl, ah, al);
      bl = r[0]; bh = r[1];

      bits = jacobi_update (bits, 0, q);

      if (bh == 0 && bl == 0)
	return (ah == 0 && al == 1) ? jacobi_finish (bits) : 0;
    }
  return mpn_jacobi_1 (al, bl, bits);
}

int
mpn_hgcd2_jacobi (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
		  struct hgcd_matrix1 *M, jacobi_bits *bitsp)
{
  mp_limb_t u00, u01, u10, u11;
  jacobi_bits bits = *bitsp;

  if (ah < 2 || bh < 2)
    return 0;

  if (ah > bh || (ah == bh && al > bl))
    {
      sub_ddmmss (ah, al, ah, al, bh, bl);
      if (ah < 2)
	return 0;

      u00 = u01 = u11 = 1;
      u10 = 0;
      bits = jacobi_update (bits, 1, 1);
    }
  else
    {
      sub_ddmmss (bh, bl, bh, bl, ah, al);
      if (bh < 2)
	return 0;

      u00 = u10 = u11 = 1;
      u01 = 0;
      bits = jacobi_update (bits, 0, 1);
    }

  if (ah < bh)
    goto subtract_a;

  for (;;)
    {
      ASSERT (ah >= bh);
      if (ah == bh)
	goto done;

      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
	{
	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

	  break;
	}

      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
	 1), affecting the second column of M. */
      ASSERT (ah > bh);
      sub_ddmmss (ah, al, ah, al, bh, bl);

      if (ah < 2)
	goto done;

      if (ah <= bh)
	{
	  /* Use q = 1 */
	  u01 += u00;
	  u11 += u10;
	  bits = jacobi_update (bits, 1, 1);
	}
      else
	{
	  mp_limb_t r[2];
	  mp_limb_t q = div2 (r, ah, al, bh, bl);
	  al = r[0]; ah = r[1];
	  if (ah < 2)
	    {
	      /* A is too small, but q is correct. */
	      u01 += q * u00;
	      u11 += q * u10;
	      bits = jacobi_update (bits, 1, q);
	      goto done;
	    }
	  q++;
	  u01 += q * u00;
	  u11 += q * u10;
	  bits = jacobi_update (bits, 1, q);
	}
    subtract_a:
      ASSERT (bh >= ah);
      if (ah == bh)
	goto done;

      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
	{
	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

	  goto subtract_a1;
	}

      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
	 1), affecting the first column of M. */
      sub_ddmmss (bh, bl, bh, bl, ah, al);

      if (bh < 2)
	goto done;

      if (bh <= ah)
	{
	  /* Use q = 1 */
	  u00 += u01;
	  u10 += u11;
	  bits = jacobi_update (bits, 0, 1);
	}
      else
	{
	  mp_limb_t r[2];
	  mp_limb_t q = div2 (r, bh, bl, ah, al);
	  bl = r[0]; bh = r[1];
	  if (bh < 2)
	    {
	      /* B is too small, but q is correct. */
	      u00 += q * u01;
	      u10 += q * u11;
	      bits = jacobi_update (bits, 0, q);
	      goto done;
	    }
	  q++;
	  u00 += q * u01;
	  u10 += q * u11;
	  bits = jacobi_update (bits, 0, q);
	}
    }

  /* NOTE: Since we discard the least significant half limb, we don't
     get a truly maximal M (corresponding to |a - b| <
     2^{GMP_LIMB_BITS +1}). */
  /* Single precision loop */
  for (;;)
    {
      ASSERT (ah >= bh);
      if (ah == bh)
	break;

      ah -= bh;
      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
	break;

      if (ah <= bh)
	{
	  /* Use q = 1 */
	  u01 += u00;
	  u11 += u10;
	  bits = jacobi_update (bits, 1, 1);
	}
      else
	{
	  mp_limb_t r;
	  mp_limb_t q = div1 (&r, ah, bh);
	  ah = r;
	  if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
	    {
	      /* A is too small, but q is correct. */
	      u01 += q * u00;
	      u11 += q * u10;
	      bits = jacobi_update (bits, 1, q);
	      break;
	    }
	  q++;
	  u01 += q * u00;
	  u11 += q * u10;
	  bits = jacobi_update (bits, 1, q);
	}
    subtract_a1:
      ASSERT (bh >= ah);
      if (ah == bh)
	break;

      bh -= ah;
      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
	break;

      if (bh <= ah)
	{
	  /* Use q = 1 */
	  u00 += u01;
	  u10 += u11;
	  bits = jacobi_update (bits, 0, 1);
	}
      else
	{
	  mp_limb_t r;
	  mp_limb_t q = div1 (&r, bh, ah);
	  bh = r;
	  if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
	    {
	      /* B is too small, but q is correct. */
	      u00 += q * u01;
	      u10 += q * u11;
	      bits = jacobi_update (bits, 0, q);	  
	      break;
	    }
	  q++;
	  u00 += q * u01;
	  u10 += q * u11;
	  bits = jacobi_update (bits, 0, q);	  
	}
    }

 done:
  M->u[0][0] = u00; M->u[0][1] = u01;
  M->u[1][0] = u10; M->u[1][1] = u11;
  *bitsp = bits;

  return 1;
}

mp_size_t
mpn_jacobi_subdiv_step (jacobi_bits *bitsp,
			mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_size_t an, bn;
  unsigned denominator;

  ASSERT (n > 0);
  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);

  an = bn = n;
  MPN_NORMALIZE (ap, an);
  MPN_NORMALIZE (bp, bn);

  if (UNLIKELY (an == 0))
    return 0;

  else if (UNLIKELY (bn == 0))
    return 0;

  /* Arrange so that a > b, subtract an -= bn, and maintain
     normalization. */
  denominator = 1;
  if (an < bn)
    {
      MPN_PTR_SWAP (ap, an, bp, bn);
      denominator = 0;
    }
  else if (an == bn)
    {
      int c;
      MPN_CMP (c, ap, bp, an);
      if (UNLIKELY (c == 0))
	return 0;
      else if (c < 0)
	{
	  MP_PTR_SWAP (ap, bp);
	  denominator = 0;
	}
    }

  ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn));
  MPN_NORMALIZE (ap, an);
  ASSERT (an > 0);

  *bitsp = jacobi_update (*bitsp, denominator, 1);

  /* Arrange so that a > b, and divide a = q b + r */
  /* FIXME: an < bn happens when we have cancellation. If that is the
     common case, then we could reverse the roles of a and b to avoid
     the swap. */
  if (an < bn)
    {
      MPN_PTR_SWAP (ap, an, bp, bn);
      denominator ^= 1;
    }
  else if (an == bn)
    {
      int c;
      MPN_CMP (c, ap, bp, an);
      if (UNLIKELY (c == 0))
	return 0;
      else if (c < 0)
	{
	  MP_PTR_SWAP (ap, bp);
	  denominator ^= 1;
	}
    }

  mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
  *bitsp = jacobi_update (*bitsp, denominator, tp[0]);

  if (mpn_zero_p (ap, bn))
    return 0;

  return bn;
}

/* Computes (a | b), where b is odd and normalized. */
static int
mpn_jacobi_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  jacobi_bits bits;

  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);
  ASSERT (bp[0] & 1);

  bits = jacobi_init (ap[0], bp[0]);

  while (n > 2)
    {
      struct hgcd_matrix1 M;
      mp_limb_t ah, al, bh, bl;
      mp_limb_t mask;

      mask = ap[n-1] | bp[n-1];
      ASSERT (mask > 0);

      if (mask & GMP_NUMB_HIGHBIT)
	{
	  ah = ap[n-1]; al = ap[n-2];
	  bh = bp[n-1]; bl = bp[n-2];
	}
      else
	{
	  int shift;

	  count_leading_zeros (shift, mask);
	  ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
	  al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
	  bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
	  bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
	}

      /* Try an mpn_nhgcd2 step */
      if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M, &bits))
	{
	  n = mpn_hgcd_mul_matrix1_inverse_vector (&M, tp, ap, bp, n);
	  MP_PTR_SWAP (ap, tp);
	}
      else
	{
	  /* mpn_hgcd2 has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  n = mpn_jacobi_subdiv_step (&bits, ap, bp, n, tp);
	  if (!n)
	    return jacobi_finish (bits);
	}
    }
  if (n == 1)
    return mpn_jacobi_1 (ap[0], bp[0], bits);
  else
    return mpn_jacobi_2 (ap, bp, bits);
}

#define MAX_TEST_SIZE 100

static void
test_jacobi (gmp_randstate_ptr rands)
{
  mp_limb_t a[MAX_TEST_SIZE];
  mp_limb_t b[MAX_TEST_SIZE];
  mp_limb_t t[MAX_TEST_SIZE];
  mpz_t az, bz;
  unsigned i;

  mpz_init (az); mpz_init (bz);

  for (i = 0; i < 200; i++)
    {
      mp_limb_t n;
      int res, ref;
      n = 1 + gmp_urandomm_ui (rands, MAX_TEST_SIZE);
      _mpz_realloc (az, n);
      _mpz_realloc (bz, n);
      mpn_random (a, n);
      mpn_random (b, n);
      b[0] |= 1;

      MPN_COPY (PTR (az), a, n);
      MPN_COPY (PTR (bz), b, n);
      SIZ (az) = SIZ (bz) = n;

      MPN_NORMALIZE (PTR(az), SIZ(az));
      MPN_NORMALIZE (PTR(bz), SIZ(bz));

      ref = mpz_jacobi (az, bz);
      res = mpn_jacobi_lehmer (a, b, n, t);
      if (ref != res)
	{
	  gmp_printf ("mpn_jacobi_lehmer failed: n = %ld\n"
		      "  a = %Zd\n"
		      "  b = %Zd\n"
		      "  ref = %d (expected)\n"
		      "  res = %d (bad)\n",
		      (long) n, az, bz, ref, res);
	  abort();
	}
#if 0
      gmp_printf ("(%Zd|%Zd) = %d\n", az, bz, res);
#endif
    }
  mpz_clear (az); mpz_clear (bz);
}

#include "speed.h"

static int
compare_double(const void *ap, const void *bp)
{
  double a = * (const double *) ap;
  double b = * (const double *) bp;

  if (a < b)
    return -1;
  else if (a > b)
    return 1;
  else
    return 0;
}

static double
median (double *v, size_t n)
{
  qsort(v, n, sizeof(*v), compare_double);

  return v[n/2];
}

#define MEDIAN_N 5
#define TIME(res, code) do {				\
  double time_measurement[MEDIAN_N];			\
  unsigned time_i;					\
							\
  for (time_i = 0; time_i < MEDIAN_N; time_i++)		\
    {							\
      speed_starttime();				\
      code;						\
      time_measurement[time_i] = speed_endtime();	\
    }							\
  res = median(time_measurement, MEDIAN_N);		\
} while (0)

static void
time_jacobi (gmp_randstate_ptr rands, mp_size_t n)
{
  mpz_t az;
  mpz_t bz;
  mp_ptr ap;
  mp_ptr bp;
  mp_ptr tp;
  mp_ptr at;
  mp_ptr bt;
  int res;
  int ref;

  double t_ref, t_lehmer;

  TMP_DECL;
  TMP_MARK;

  mpz_init (az); mpz_init (bz);
  ap = TMP_ALLOC_LIMBS (n);
  bp = TMP_ALLOC_LIMBS (n);
  at = TMP_ALLOC_LIMBS (n);
  bt = TMP_ALLOC_LIMBS (n);
  tp = TMP_ALLOC_LIMBS (n);

  _mpz_realloc (az, n);
  _mpz_realloc (bz, n);
  mpn_random (ap, n);
  mpn_random (bp, n);
  bp[0] |= 1;

  MPN_COPY (PTR (az), ap, n);
  MPN_COPY (PTR (bz), bp, n);
  SIZ (az) = SIZ (bz) = n;

  MPN_NORMALIZE (PTR(az), SIZ(az));
  MPN_NORMALIZE (PTR(bz), SIZ(bz));

  TIME (t_ref, ref = mpz_jacobi (az, bz));
  TIME (t_lehmer, MPN_COPY (at, ap, n); MPN_COPY (bt, bp, n);
	res = mpn_jacobi_lehmer (at, bt, n, tp));

  printf ("%4ld %5.3f %5.3f %5.3f\n",
	  (long) n, 1e6*t_ref, 1e6*t_lehmer, t_lehmer / t_ref);
  if (ref != res)
    {
      gmp_printf ("mpn_jacobi_lehmer failed: n = %ld\n"
		  "  a = %Zd\n"
		  "  b = %Zd\n"
		  "  ref = %d (expected)\n"
		  "  res = %d (bad)\n",
		  (long) n, az, bz, ref, res);
      abort();      
    }
  TMP_FREE;
  mpz_clear (az); mpz_clear (bz);
}

int
main (int argc, char **argv)
{
  gmp_randstate_ptr rands;

  rands = RANDS;

  if (argc == 2)
    {
      mp_size_t n = atol (argv[1]);
      if (n < 1)
	return 1;

      time_jacobi (rands, n);
    }
  else
    test_jacobi (rands);

  return 0;  
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.