sqrt algorithm

Wed Aug 26 16:36:05 UTC 2015

tg at gmplib.org (Torbjörn Granlund) writes:

> I'd suggest that you make ytour code require normalisation, then see if
> it can be mad faster.

Below is a variant using normalization, and with the 2n=n+n split, which
provides less margin.

The intermediate results (passed to sqrt_residual and sqrtB_residual)
are up to 4 units too large, compared to at most 1 too large in the
previous version. I haven't tried to really analyze the error.

Regards,
/Niels

-----8<-------

#include <stdio.h>
#include <stdlib.h>

#define WANT_ASSERT 1

#include <gmp.h>
#include <gmp-impl.h>
#include <longlong.h>

#if GMP_NUMB_BITS != 64
#error Unsupported limb size
#endif

static int verbose = 0;

static mp_limb_t *
xalloc_limbs (mp_size_t n)
{
  mp_limb_t *p = malloc (n * sizeof (*p));
  if (!p)
    {
      fprintf (stderr, "Virtual memory exhasuted!\n");
      abort ();
    }
  return p;
}

/* Borrowed from factor.c */
static mp_limb_t
sqrt_1 (mp_limb_t a)
{
  mp_limb_t x;
  unsigned c;
  ASSERT (a > 0);
  count_leading_zeros (c, a);
  x = (mp_limb_t) 1 << ((65 - c)/2);
  for (;;)
    {
      mp_limb_t y = (x + a/x) / 2;
      if (y >= x)
	return x;

      x = y;
    }
}

static mp_limb_t
sqrt_2 (mp_limb_t ah, mp_limb_t al)
{
  mp_limb_t x;
  unsigned c;

  ASSERT (ah > 0);

  count_leading_zeros (c, ah);
  c &= ~1;

  if (c)
    {
      x = sqrt_1 ( (ah << c) + (al >> (64 - c))) + 1;
      x <<= (64 - c) / 2;
    }
  /* Need to handle these cases separately, to exclude x == ah in the
     loop */
  else if (ah == GMP_NUMB_MAX)
    return GMP_NUMB_MAX;
  else if (ah == GMP_NUMB_MAX - 1)
    return GMP_NUMB_MAX - (al == 0);
  else
    x = (sqrt_1 (ah) << 32) | 0xffffffffUL;

  /* Do we need more than one iteration? */
  for (;;)
    {
      mp_limb_t q, r, y;
      udiv_qrnnd (q, r, ah, al, x);
      y = x + q;
      if (y < x)
	y = (y / 2) | ((mp_limb_t) 1 << 63);
      else
	y = y / 2;

      if (y >= x)
	return x;

      x = y;
    }
}

/* Inputs: A, size an, approximative square root X, size ceil(an/2).
   Output: residual R = A - X^2, size floor (an/2) + 1. Usually
   returns zero, but in case the subtraction underflows, returns 1 and
   sets R = A - (X-1)^2 = A - X^2 + 2X - 1 or returns 2 and sets
   R = A - (X-2)^2 = A - X^2 + 4X - 2. */
static mp_limb_t
sqrt_residual (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t an,
	       const mp_limb_t *xp)
{
  mp_size_t xn = (an+1)/2;
  mp_size_t rn = an/2 + 1;
  mp_ptr tp = xalloc_limbs (2*xn);
  /* FIXME: Exploit cancellation, using mullo or sqrmod_bnm1. */
  mpn_sqr (tp, xp, xn);
  mpn_sub_n (rp, ap, tp, rn);

  free (tp);
  if (rp[rn-1] & GMP_NUMB_HIGHBIT)
    {
      /* Underflow */ 
      mp_size_t e;
      mp_limb_t cy;
      for (e = 1; e < 5; e++)
	{
	  cy = mpn_addmul_1 (rp, xp, xn, 2);
	  if (rn > xn)
	    {
	      ASSERT (rn == xn + 1);
	      rp[rn-1] += cy;
	    }
	  mpn_sub_1 (rp, rp, rn, 2*e-1);
	  if (! (rp[rn-1] & GMP_NUMB_HIGHBIT))
	    return e;
	}
      abort ();
    }
  return 0;
}

/* Inputs: A, size an, X, an approximative square root of B A, size
   floor(an/2) + 1. Output: residual R = B A - X^2, size ceil (an/2) +
   1. Usually returns zero, but in case the subtraction underflows,
   returns 1 and sets R = B A - (X-1)^2 = B A - X^2 + 2X - 1. */

static int
sqrtB_residual (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t an,
		const mp_limb_t *xp)
{
  mp_size_t xn = an/2 + 1;
  mp_size_t rn = (an+1)/2 + 1;
  mp_ptr tp = xalloc_limbs (2*xn);
  /* FIXME: Exploit cancellation, using mullo or sqrmod_bnm1. */
  mpn_sqr (tp, xp, xn);
  mpn_add_1 (tp+1, tp+1, rn-1, tp[0] != 0);
  mpn_sub_n (rp+1, ap, tp+1, rn-1);
  rp[0] = - tp[0];

  free (tp);
  if (rp[rn-1] & GMP_NUMB_HIGHBIT)
    {
      /* Underflow */
      mp_size_t e;
      mp_limb_t cy;
      for (e = 1; e < 5; e++)
	{
	  cy = mpn_addmul_1 (rp, xp, xn, 2);
	  if (rn > xn)
	    {
	      ASSERT (rn == xn + 1);
	      rp[rn-1] += cy;
	    }
	  mpn_sub_1 (rp, rp, rn, 2*e-1);
	  if (! (rp[rn-1] & GMP_NUMB_HIGHBIT))
	    return e;
	}
      abort ();
    }
  return 0;
}

static void
saturate_add_1 (mp_ptr rp, mp_size_t n, mp_size_t k, mp_limb_t b)
{
  if (mpn_add_1 (rp + k, rp + k, n - k, b))
    {
      /* Unlikely overflow, saturate to maximum value. */
      mp_size_t i;
      for (i = 0; i < n; i++)
	rp[i] = GMP_NUMB_MAX;
    }
}

/* Computes floor(sqrt(B^n A), an n-limb bumber. */
static void
sqrt_n (mp_limb_t *xp, const mp_limb_t *ap, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (ap[n-1] >> (GMP_NUMB_BITS - 2));

  if (n == 1)
    xp[0] = sqrt_2 (ap[0], 0);

#if 0
  else if (n == 2)
    {
      mp_limb_t h = sqrt_2 (ap[1], ap[0]);
      mp_limb_t xh, xl, q, r;
      mp_limb_t eh, el;
      umul_ppmm (eh, el, h, h);
      sub_ddmmss (eh, el, ap[1], ap[0], eh, el);
      ASSERT (eh <= 1);

      /* A single Newton step may produce a root which is one too large */
      udiv_qrnnd (q, r, (el >> 33) | (eh << 31), el << 31, h);
      xh = h >> 32;
      xl = h << 32;
      xl += q;
      xh += xl < q;

      xp[0] = xl;
      xp[1] = xh;
    }
#endif
  else
    {
      mp_limb_t *ep;
      mp_limb_t *qp;
      mp_size_t k = n/2;

      sqrt_n (xp + k, ap + k, n - k);
      ep = xalloc_limbs (n + k + 3); /* n+1 */
      qp = ep + n+1; /* k+2 */

      mpn_zero (ep, k);

      if (n & 1)
	{
	  /* E = B A - H^2, residual of k+2 limbs, quotient also k+1 limbs. */
	  mp_limb_t cy = sqrtB_residual (ep + k, ap, n, xp + k);
	  /* cy > 0 is unlikely, high part of root too large */
	  ASSERT_NOCARRY (mpn_sub_1 (xp + k, xp + k, k+1, cy));

	  /*  Q = B^k E / H
             k+2     k+2 k+1    sizes, but Q should fit in k+1 limbs
	   */
	  mpn_tdiv_qr (qp, ep, 0, ep, n+1, xp + k, k+1);
	  ASSERT (qp[k+1] == 0);
	}
      else
	{
	  /* E = A - H^2, residual of k+1 limbs, and quotient k limbs. */
	  mp_limb_t cy = sqrt_residual (ep + k, ap, n, xp + k);

	  /* cy > 0 is unlikely, high part of root too large */
	  ASSERT_NOCARRY (mpn_sub_1 (xp + k, xp + k, k, cy));

	  /*  Q = B^k E / H
             k+2      k+1 k    sizes, but Q ought to fit in k+1 limbs
	   */
	  mpn_tdiv_qr (qp, ep, 0, ep, n+1, xp + k, k);
	}
      ASSERT (qp[k+1] == 0);
      mpn_rshift (qp, qp, k+1, 1);
      mpn_copyi (xp, qp, k);
      saturate_add_1 (xp, n, k, qp[k]);

      free (ep);
    }
}

/* Computes X = floor (sqrt ({ap, n})), ceil(n/2) limbs, and remainder
   R = A - X^2 */
static mp_size_t
sqrtrem (mp_limb_t *xp, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n)
{
  ASSERT (n > 0);
  ASSERT (ap[n-1] > 0);
  if (n == 1)
    {
      mp_limb_t x = sqrt_1 (ap[0]);
      mp_limb_t r = ap[0] - x*x;
      xp[0] = x;
      rp[0] = r;
      return (r > 0);
    }
  else if (n == 2)
    {
      mp_limb_t x = sqrt_2 (ap[1], ap[0]);
      mp_limb_t rh, rl;
      xp[0] = x;
      umul_ppmm (rh, rl, x, x);
      sub_ddmmss (rh, rl, ap[1], ap[0], rh, rl);
      rp[0] = rl;
      rp[1] = rh;
      if (!rh)
	return (rl > 0);
      return 2;
    }
  else
    {
      mp_size_t k = n/2;
      mp_size_t rn = k + 1;
      mp_size_t xn = (n+1)/2;
      mp_limb_t cy;
      mp_ptr tp;
      unsigned shift;
      if (ap[n-1] >> (GMP_NUMB_BITS - 2))
	{
	  shift = 0;
	  sqrt_n (xp, ap + k, n - k);
	}
      else
	{
	  count_leading_zeros (shift, ap[n-1]);
	  shift >>= 1;
	  ASSERT (shift > 0);
	  tp = xalloc_limbs (n-k+1);
	  mpn_lshift (tp, ap + k-1, n-k+1, 2*shift);
	  sqrt_n (xp, tp + 1, n-k);
	  free (tp);
	}
      if (n & 1)
	shift += GMP_NUMB_BITS / 2;
      mpn_rshift (xp, xp, xn, shift);

      cy = sqrt_residual (rp, ap, n, xp);
      if (cy)
	{
	  /* x is too large */
	  ASSERT_NOCARRY (mpn_sub_1 (xp, xp, xn, cy));
	  if (verbose)
	    for (; cy; cy--)
	      fprintf (stderr, "-");
	}
      else
	{
	  /* T = 2X + 1 */
	  mp_ptr tp = xalloc_limbs (rn);
	  int adjust = 0;

	  cy = mpn_lshift (tp, xp, xn, 1);
	  if (n & 1)
	    ASSERT (!cy);
	  else
	    {
	      ASSERT (rn == xn + 1);
	      tp[rn - 1] = cy;
	    }
	  tp[0] |= 1;
	  while (mpn_cmp (rp, tp, rn) >= 0)
	    {
	      if (verbose)
		{
		  fprintf (stderr, "+");
		  adjust++;
		  if (adjust > 10)
		    abort ();
		}

	      ASSERT_NOCARRY (mpn_sub_n (rp, rp, tp, rn));
	      ASSERT_NOCARRY (mpn_add_1 (xp, xp, xn, 1));
	      ASSERT_NOCARRY (mpn_add_1 (tp, tp, rn, 2));
	    }
	  free (tp);
	}

      while (rn > 0 && !rp[rn-1])
	rn--;
      return rn;
    }
}

static int
check_sqrt (const mpz_t a, const mpz_t x, const mpz_t r)
{
  mpz_t t;

  mpz_init (t);
  mpz_add_ui (t, x, 1);
  mpz_mul (t, t, t);
  if (mpz_cmp (t, a) < 0)
    {
    fail:
      mpz_clear (t);
      return 0;
    }

  mpz_mul (t, x, x);
  if (mpz_cmp (t, a) > 0)
    goto fail;

  mpz_sub (t, a, t);
  if (mpz_cmp (t, r) != 0)
    goto fail;

  mpz_clear (t);
  return 1;
}

#define MAX_SIZE 20

int
main (int argc, char **argv)
{
  gmp_randstate_t rands;
  mp_limb_t x[(MAX_SIZE+1)/2];
  mp_limb_t r[MAX_SIZE/2+1];
  mpz_t a;

  unsigned c;

  if (argc > 1)
    verbose = 1;

  mpz_init (a);
  gmp_randinit_default (rands);
  for (c = 0; c < 100000; c++)
    {
      unsigned bits = 1 + gmp_urandomm_ui (rands, GMP_NUMB_BITS * MAX_SIZE);
      mpz_t t1, t2;
      mp_size_t an, xn, rn;

      mpz_rrandomb (a, rands, bits);
      an = mpz_size (a);
      if (verbose)
	gmp_fprintf (stderr, "%u, size %u, A = %Zd", c, (unsigned) an, a);
      xn = (an + 1)/2;
      rn = sqrtrem (x, r, mpz_limbs_read (a), an);
      ASSERT (rn == 0 || r[rn-1]);

      if (verbose)
	fprintf (stderr, "\n");

      if (!check_sqrt (a, mpz_roinit_n (t1, x, xn), mpz_roinit_n (t2, r, rn)))
	{
	  gmp_fprintf (stderr, "Test %u failed, size %u:\n"
		       "  a: %Zd\n"
		       "  x: %Nd\n"
		       "  r: %Nd\n",
		       c, (unsigned) an, a, x, xn, r, rn);
	  abort ();
	}
    }

  mpz_clear (a);
  gmp_randclear (rands);

  return EXIT_SUCCESS;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.