sqrt algorithm

Sat Jul 25 19:56:13 UTC 2015

nisse at lysator.liu.se (Niels Möller) writes:

> For the divisions in the sqrt algorithms, I'm not sure of exactly how
> the sizes of the numerator E, the denominator H, and the quotient,
> relate, but they ought to all be pretty close to k.

Below is an updated version of the code, which seems to work correctly.
The following division sizes are used:

  en   hn  fn  qn
  k+2 k+1 k-1 k+1
  k+1 k   k-1 k+1
  k+1 k+1 k   k+1
  k+1 k+1 k-1 k

Here, en is the size of the residual, hn is the size of the high
"half" of the root, fn is the number of fractional limbs. qn is the
size of the quotient, as computed by mpn_tdiv_qr, i.e., qn = en - hn + 1
+ fn.

And the division is Q = floor (B^{fn} E / H), where E = {ep, en}, H =
{hp, hn}, and E is at most a few bits larger than H.

I'll look into how these cases fit approximate division. I suspect that
in a few of the cases we use one more limb of E and H than we really
need.

Regards,
/Niels

---------8<------------------

#include <stdio.h>
#include <stdlib.h>

#define WANT_ASSERT 1

#include <gmp.h>
#include <gmp-impl.h>
#include <longlong.h>

#if GMP_NUMB_BITS != 64
#error Unsupported limb size
#endif

static int verbose = 0;

static mp_limb_t *
xalloc_limbs (mp_size_t n)
{
  mp_limb_t *p = malloc (n * sizeof (*p));
  if (!p)
    {
      fprintf (stderr, "Virtual memory exhasuted!\n");
      abort ();
    }
  return p;
}

/* Borrowed from factor.c */
static mp_limb_t
sqrt_1 (mp_limb_t a)
{
  mp_limb_t x;
  unsigned c;
  ASSERT (a > 0);
  count_leading_zeros (c, a);
  x = (mp_limb_t) 1 << ((65 - c)/2);
  for (;;)
    {
      mp_limb_t y = (x + a/x) / 2;
      if (y >= x)
	return x;

      x = y;
    }
}

static mp_limb_t
sqrt_2 (mp_limb_t ah, mp_limb_t al)
{
  mp_limb_t x;
  unsigned c;

  ASSERT (ah > 0);

  count_leading_zeros (c, ah);
  c &= ~1;

  if (c)
    {
      x = sqrt_1 ( (ah << c) + (al >> (64 - c))) + 1;
      x <<= (64 - c) / 2;
    }
  /* Need to handle these cases separately, to exclude x == ah in the
     loop */
  else if (ah == GMP_NUMB_MAX)
    return GMP_NUMB_MAX;
  else if (ah == GMP_NUMB_MAX - 1)
    return GMP_NUMB_MAX - (al == 0);
  else
    x = (sqrt_1 (ah) << 32) | 0xffffffffUL;

  /* Do we need more than one iteration? */
  for (;;)
    {
      mp_limb_t q, r, y;
      udiv_qrnnd (q, r, ah, al, x);
      y = x + q;
      if (y < x)
	y = (y / 2) | ((mp_limb_t) 1 << 63);
      else
	y = y / 2;

      if (y >= x)
	return x;

      x = y;
    }
}

/* Inputs: A, size an, approximative square root X, size ceil(an/2).
   Output: residual R = A - X^2, size floor (an/2) + 1. Usually
   returns zero, but in case the subtraction underflows, returns 1 and
   sets R = A - (X-1)^2 = A - X^2 + 2X - 1. */
static int
sqrt_residual (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t an,
	       const mp_limb_t *xp)
{
  mp_size_t xn = (an+1)/2;
  mp_size_t rn = an/2 + 1;
  mp_ptr tp = xalloc_limbs (2*xn);
  /* FIXME: Exploit cancellation, using mullo or sqrmod_bnm1. */
  mpn_sqr (tp, xp, xn);
  mpn_sub_n (rp, ap, tp, rn);

  free (tp);
  if (rp[rn-1] & GMP_NUMB_HIGHBIT)
    {
      /* Underflow */ 
      mp_limb_t cy = mpn_addmul_1 (rp, xp, xn, 2);
      if (rn > xn)
	{
	  ASSERT (rn == xn + 1);
	  rp[rn-1] += cy;
	}
      mpn_sub_1 (rp, rp, rn, 1);
      ASSERT (! (rp[rn-1] & GMP_NUMB_HIGHBIT));
      return 1;
    }
  return 0;
}

/* Inputs: A, size an, X, an approximative square root of B A, size
   floor(an/2) + 1. Output: residual R = B A - X^2, size ceil (an/2) +
   1. Usually returns zero, but in case the subtraction underflows,
   returns 1 and sets R = B A - (X-1)^2 = B A - X^2 + 2X - 1. */

static int
sqrtB_residual (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t an,
		const mp_limb_t *xp)
{
  mp_size_t xn = an/2 + 1;
  mp_size_t rn = (an+1)/2 + 1;
  mp_ptr tp = xalloc_limbs (2*xn);
  /* FIXME: Exploit cancellation, using mullo or sqrmod_bnm1. */
  mpn_sqr (tp, xp, xn);
  mpn_add_1 (tp+1, tp+1, rn-1, tp[0] != 0);
  mpn_sub_n (rp+1, ap, tp+1, rn-1);
  rp[0] = - tp[0];

  free (tp);
  if (rp[rn-1] & GMP_NUMB_HIGHBIT)
    {
      /* Underflow */
      mp_limb_t cy = mpn_addmul_1 (rp, xp, xn, 2);
      if (rn > xn)
	{
	  ASSERT (rn == xn + 1);
	  rp[rn-1] += cy;
	}
      mpn_sub_1 (rp, rp, rn, 1);
      ASSERT (! (rp[rn-1] & GMP_NUMB_HIGHBIT));
      return 1;
    }
  return 0;
}

static void
saturate_add_1 (mp_ptr rp, mp_size_t n, mp_size_t k, mp_limb_t b)
{
  if (mpn_add_1 (rp + k, rp + k, n - k, b))
    {
      /* Unlikely overflow, saturate to maximum value. */
      mp_size_t i;
      for (i = 0; i < n; i++)
	rp[i] = GMP_NUMB_MAX;
    }
}

/* Computes floor(sqrt(B^{n-2} A), an (n-1)-limb number. */
static void
sqrt_nm1 (mp_limb_t *xp, const mp_limb_t *ap, mp_size_t n)
{
  ASSERT (n >= 2);
  ASSERT (ap[n-1] > 0);
  if (n == 2)
    xp[0] = sqrt_2 (ap[1], ap[0]);
  else if (n == 3)
    {
      unsigned c;
      mp_limb_t ah, al, h, eh, el;
      mp_limb_t t[3];
      ah = ap[2];
      al = ap[1];
      c = __builtin_clzl (ah) & ~1;
      if (c)
	{
	  ah = (ah << c) | (al >> (64 - c));
	  al = (al << c) | (ap[0] >> (64 - c));
	}
      h = sqrt_2 (ah, al);
      umul_ppmm (eh, el, h, h);
      sub_ddmmss (eh, el, ah, al, eh, el);
      t[0] = ap[0] << c;
      t[1] = el;
      t[2] = eh;
      mpn_divmod_1 (t, t, 3, h);
      c /= 2;
      mpn_rshift (t, t, 3, c + 1);
      ASSERT (t[2] == 0);
      ASSERT (t[1] <= 1);
      if (c)
	add_ssaaaa (xp[1], xp[0], t[1], t[0], h >> c, h << (64 - c));
      else if (t[1] && h == GMP_NUMB_MAX)
	xp[0] = xp[1] = GMP_NUMB_MAX;
      else
	{
	  xp[0] = t[0];
	  xp[1] = h + t[1];
	  ASSERT (xp[1] >= h);
	}
    }
  else
    {
      mp_limb_t *ep;
      mp_limb_t *qp;
      mp_size_t k = n/2;

      sqrt_nm1 (xp + k-1, ap + k-1, n-k+1);

      ep = xalloc_limbs (n + k + 1); /* n */
      qp = ep + n; /* k+1 */

      mpn_zero (ep, k-1);

      if (n & 1)
	{
	  /* E = B A - H^2, residual of k+2 limbs, quotient of size k+1. */
	  if (sqrtB_residual (ep + k-1, ap, n, xp + k-1))
	    /* Unlikely overflow, high part of root too large */
	    ASSERT_NOCARRY (mpn_sub_1 (xp + k-1, xp + k-1, k+1, 1));

	  /*  Q = B^{k-1} E / H
             k+1         k+2 k+1    sizes, but Q ought to fit in k limbs
	   */
	  mpn_tdiv_qr (qp, ep, 0, ep, n, xp + k-1, k+1);
	}
      else
	{
	  /* E = A - H^2, residual and quotient both of k+1 limbs. */
	  if (sqrt_residual (ep + k-1, ap, n, xp + k-1))
	    /* Unlikely overflow, high part of root too large */
	    ASSERT_NOCARRY (mpn_sub_1 (xp + k-1, xp + k-1, k, 1));

	  /*  Q = B^{k-1} E / H 
             k+1         k+1  k    sizes, but Q ought to fit in k limbs
	   */
	  mpn_tdiv_qr (qp, ep, 0, ep, n, xp + k-1, k);
	}

      ASSERT (qp[k] == 0);
      mpn_rshift (qp, qp, k, 1);
      mpn_copyi (xp, qp, k-1);

      saturate_add_1 (xp, n-1, k-1, qp[k-1]);

      free (ep);
    }
}

/* Computes floor(sqrt(B^{n-1} A), an n-limb bumber. */
static void
sqrt_n (mp_limb_t *xp, const mp_limb_t *ap, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (ap[n-1] > 0);

  if (n == 1)
    xp[0] = sqrt_1 (ap[0]);

  else if (n == 2)
    {
      mp_limb_t h = sqrt_2 (ap[1], ap[0]);
      mp_limb_t xh, xl, q, r;
      mp_limb_t eh, el;
      umul_ppmm (eh, el, h, h);
      sub_ddmmss (eh, el, ap[1], ap[0], eh, el);
      ASSERT (eh <= 1);

      /* A single Newton step may produce a root which is one too large */
      udiv_qrnnd (q, r, (el >> 33) | (eh << 31), el << 31, h);
      xh = h >> 32;
      xl = h << 32;
      xl += q;
      xh += xl < q;

      xp[0] = xl;
      xp[1] = xh;
    }
  else
    {
      mp_limb_t *ep;
      mp_limb_t *qp;
      mp_size_t k = n/2;

      sqrt_n (xp + n-k-1, ap + n-k-1, k+1);
      ep = xalloc_limbs (2*n - k); /* n */
      qp = ep + n; /* n - k */

      mpn_zero (ep, n-k-1);

      if (n & 1)
	{
	  /* E = A - H^2, residual of k+1 limbs, quotient also k+1 limbs. */
	  if (sqrt_residual (ep + k, ap, n, xp + k))
	    /* Unlikely overflow, high part of root too large */
	    ASSERT_NOCARRY (mpn_sub_1 (xp + k, xp + k, k+1, 1));

	  /*  Q = B^k E / H
             k+1     k+1 k+1    sizes
	   */
	  mpn_tdiv_qr (qp, ep, 0, ep, n, xp + k, k+1);
	}
      else
	{
	  /* E = B A - H^2, residual of k+1 limbs, and quotient k limbs. */
	  if (sqrtB_residual (ep + k - 1, ap, n, xp + k - 1))
	    /* Unlikely overflow, high part of root too large */
	    ASSERT_NOCARRY (mpn_sub_1 (xp + k-1, xp + k-1, k+1, 1));

	  /*  Q = B^{k-1} E / H
              k          k+1 k+1    sizes
	   */
	  mpn_tdiv_qr (qp, ep, 0, ep, n, xp + k - 1, k + 1);
	}
      mpn_rshift (qp, qp, n-k, 1);
      mpn_copyi (xp, qp, n-k-1);
      saturate_add_1 (xp, n, n-k-1, qp[n-k-1]);

      free (ep);
    }
}

/* Computes X = floor (sqrt ({ap, n})), ceil(n/2) limbs, and remainder
   R = A - X^2 */
static mp_size_t
sqrtrem (mp_limb_t *xp, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n)
{
  ASSERT (n > 0);
  if (n == 1)
    {
      mp_limb_t x = sqrt_1 (ap[0]);
      mp_limb_t r = ap[0] - x*x;
      xp[0] = x;
      rp[0] = r;
      return (r > 0);
    }
  else if (n == 2)
    {
      mp_limb_t x = sqrt_2 (ap[1], ap[0]);
      mp_limb_t rh, rl;
      xp[0] = x;
      umul_ppmm (rh, rl, x, x);
      sub_ddmmss (rh, rl, ap[1], ap[0], rh, rl);
      rp[0] = rl;
      rp[1] = rh;
      if (!rh)
	return (rl > 0);
      return 2;
    }
  else
    {
      mp_size_t k = n/2;
      mp_size_t rn = k + 1;
      mp_size_t xn = (n+1)/2;
      mp_limb_t cy;

      if (n & 1)
	/* Compute sqrt based on the (n+1)/2 high limbs */
	sqrt_n (xp, ap + k, k+1);
      else
	/* Compute sqrt based on the n/2+1 high limbs */
	sqrt_nm1 (xp, ap + k-1, k+1);

      if (sqrt_residual (rp, ap, n, xp))
	{
	  /* x is one too large */
	  if (verbose)
	      fprintf (stderr, "-");

	  ASSERT_NOCARRY (mpn_sub_1 (xp, xp, xn, 1));
	}
      else
	{
	  /* T = 2X + 1 */
	  mp_ptr tp = xalloc_limbs (rn);
	  int adjust = 0;

	  cy = mpn_lshift (tp, xp, xn, 1);
	  if (n & 1)
	    ASSERT (!cy);
	  else
	    {
	      ASSERT (rn == xn + 1);
	      tp[rn - 1] = cy;
	    }
	  tp[0] |= 1;
	  while (mpn_cmp (rp, tp, rn) >= 0)
	    {
	      if (verbose)
		{
		  fprintf (stderr, "+");
		  adjust++;
		  if (adjust > 10)
		    abort ();
		}

	      ASSERT_NOCARRY (mpn_sub_n (rp, rp, tp, rn));
	      ASSERT_NOCARRY (mpn_add_1 (xp, xp, xn, 1));
	      ASSERT_NOCARRY (mpn_add_1 (tp, tp, rn, 2));
	    }
	  free (tp);
	}

      while (rn > 0 && !rp[rn-1])
	rn--;
      return rn;
    }
}

static int
check_sqrt (const mpz_t a, const mpz_t x, const mpz_t r)
{
  mpz_t t;

  mpz_init (t);
  mpz_add_ui (t, x, 1);
  mpz_mul (t, t, t);
  if (mpz_cmp (t, a) < 0)
    {
    fail:
      mpz_clear (t);
      return 0;
    }

  mpz_mul (t, x, x);
  if (mpz_cmp (t, a) > 0)
    goto fail;

  mpz_sub (t, a, t);
  if (mpz_cmp (t, r) != 0)
    goto fail;

  mpz_clear (t);
  return 1;
}

#define MAX_SIZE 20

int
main (int argc, char **argv)
{
  gmp_randstate_t rands;
  mp_limb_t x[(MAX_SIZE+1)/2];
  mp_limb_t r[MAX_SIZE/2+1];
  mpz_t a;

  unsigned c;

  if (argc > 1)
    verbose = 1;

  mpz_init (a);
  gmp_randinit_default (rands);
  for (c = 0; c < 100000; c++)
    {
      unsigned bits = 1 + gmp_urandomm_ui (rands, GMP_NUMB_BITS * MAX_SIZE);
      mpz_t t1, t2;
      mp_size_t an, xn, rn;

      mpz_rrandomb (a, rands, bits);
      an = mpz_size (a);
      if (verbose)
	gmp_fprintf (stderr, "%u, size %u, A = %Zd", c, (unsigned) an, a);
      xn = (an + 1)/2;
      rn = sqrtrem (x, r, mpz_limbs_read (a), an);
      ASSERT (rn == 0 || r[rn-1]);

      if (verbose)
	fprintf (stderr, "\n");

      if (!check_sqrt (a, mpz_roinit_n (t1, x, xn), mpz_roinit_n (t2, r, rn)))
	{
	  gmp_fprintf (stderr, "Test %u failed, size %u:\n"
		       "  a: %Zd\n"
		       "  x: %Nd\n"
		       "  r: %Nd\n",
		       c, (unsigned) an, a, x, xn, r, rn);
	  abort ();
	}
    }

  mpz_clear (a);
  gmp_randclear (rands);

  return EXIT_SUCCESS;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.