sqrt algorithm
Niels Möller
nisse at lysator.liu.se
Wed Aug 26 16:36:05 UTC 2015
tg at gmplib.org (Torbjörn Granlund) writes:
> I'd suggest that you make ytour code require normalisation, then see if
> it can be mad faster.
Below is a variant using normalization, and with the 2n=n+n split, which
provides less margin.
The intermediate results (passed to sqrt_residual and sqrtB_residual)
are up to 4 units too large, compared to at most 1 too large in the
previous version. I haven't tried to really analyze the error.
Regards,
/Niels
-----8<-------
#include <stdio.h>
#include <stdlib.h>
#define WANT_ASSERT 1
#include <gmp.h>
#include <gmp-impl.h>
#include <longlong.h>
#if GMP_NUMB_BITS != 64
#error Unsupported limb size
#endif
static int verbose = 0;
static mp_limb_t *
xalloc_limbs (mp_size_t n)
{
mp_limb_t *p = malloc (n * sizeof (*p));
if (!p)
{
fprintf (stderr, "Virtual memory exhasuted!\n");
abort ();
}
return p;
}
/* Borrowed from factor.c */
static mp_limb_t
sqrt_1 (mp_limb_t a)
{
mp_limb_t x;
unsigned c;
ASSERT (a > 0);
count_leading_zeros (c, a);
x = (mp_limb_t) 1 << ((65 - c)/2);
for (;;)
{
mp_limb_t y = (x + a/x) / 2;
if (y >= x)
return x;
x = y;
}
}
static mp_limb_t
sqrt_2 (mp_limb_t ah, mp_limb_t al)
{
mp_limb_t x;
unsigned c;
ASSERT (ah > 0);
count_leading_zeros (c, ah);
c &= ~1;
if (c)
{
x = sqrt_1 ( (ah << c) + (al >> (64 - c))) + 1;
x <<= (64 - c) / 2;
}
/* Need to handle these cases separately, to exclude x == ah in the
loop */
else if (ah == GMP_NUMB_MAX)
return GMP_NUMB_MAX;
else if (ah == GMP_NUMB_MAX - 1)
return GMP_NUMB_MAX - (al == 0);
else
x = (sqrt_1 (ah) << 32) | 0xffffffffUL;
/* Do we need more than one iteration? */
for (;;)
{
mp_limb_t q, r, y;
udiv_qrnnd (q, r, ah, al, x);
y = x + q;
if (y < x)
y = (y / 2) | ((mp_limb_t) 1 << 63);
else
y = y / 2;
if (y >= x)
return x;
x = y;
}
}
/* Inputs: A, size an, approximative square root X, size ceil(an/2).
Output: residual R = A - X^2, size floor (an/2) + 1. Usually
returns zero, but in case the subtraction underflows, returns 1 and
sets R = A - (X-1)^2 = A - X^2 + 2X - 1 or returns 2 and sets
R = A - (X-2)^2 = A - X^2 + 4X - 2. */
static mp_limb_t
sqrt_residual (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t an,
const mp_limb_t *xp)
{
mp_size_t xn = (an+1)/2;
mp_size_t rn = an/2 + 1;
mp_ptr tp = xalloc_limbs (2*xn);
/* FIXME: Exploit cancellation, using mullo or sqrmod_bnm1. */
mpn_sqr (tp, xp, xn);
mpn_sub_n (rp, ap, tp, rn);
free (tp);
if (rp[rn-1] & GMP_NUMB_HIGHBIT)
{
/* Underflow */
mp_size_t e;
mp_limb_t cy;
for (e = 1; e < 5; e++)
{
cy = mpn_addmul_1 (rp, xp, xn, 2);
if (rn > xn)
{
ASSERT (rn == xn + 1);
rp[rn-1] += cy;
}
mpn_sub_1 (rp, rp, rn, 2*e-1);
if (! (rp[rn-1] & GMP_NUMB_HIGHBIT))
return e;
}
abort ();
}
return 0;
}
/* Inputs: A, size an, X, an approximative square root of B A, size
floor(an/2) + 1. Output: residual R = B A - X^2, size ceil (an/2) +
1. Usually returns zero, but in case the subtraction underflows,
returns 1 and sets R = B A - (X-1)^2 = B A - X^2 + 2X - 1. */
static int
sqrtB_residual (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t an,
const mp_limb_t *xp)
{
mp_size_t xn = an/2 + 1;
mp_size_t rn = (an+1)/2 + 1;
mp_ptr tp = xalloc_limbs (2*xn);
/* FIXME: Exploit cancellation, using mullo or sqrmod_bnm1. */
mpn_sqr (tp, xp, xn);
mpn_add_1 (tp+1, tp+1, rn-1, tp[0] != 0);
mpn_sub_n (rp+1, ap, tp+1, rn-1);
rp[0] = - tp[0];
free (tp);
if (rp[rn-1] & GMP_NUMB_HIGHBIT)
{
/* Underflow */
mp_size_t e;
mp_limb_t cy;
for (e = 1; e < 5; e++)
{
cy = mpn_addmul_1 (rp, xp, xn, 2);
if (rn > xn)
{
ASSERT (rn == xn + 1);
rp[rn-1] += cy;
}
mpn_sub_1 (rp, rp, rn, 2*e-1);
if (! (rp[rn-1] & GMP_NUMB_HIGHBIT))
return e;
}
abort ();
}
return 0;
}
static void
saturate_add_1 (mp_ptr rp, mp_size_t n, mp_size_t k, mp_limb_t b)
{
if (mpn_add_1 (rp + k, rp + k, n - k, b))
{
/* Unlikely overflow, saturate to maximum value. */
mp_size_t i;
for (i = 0; i < n; i++)
rp[i] = GMP_NUMB_MAX;
}
}
/* Computes floor(sqrt(B^n A), an n-limb bumber. */
static void
sqrt_n (mp_limb_t *xp, const mp_limb_t *ap, mp_size_t n)
{
ASSERT (n >= 1);
ASSERT (ap[n-1] >> (GMP_NUMB_BITS - 2));
if (n == 1)
xp[0] = sqrt_2 (ap[0], 0);
#if 0
else if (n == 2)
{
mp_limb_t h = sqrt_2 (ap[1], ap[0]);
mp_limb_t xh, xl, q, r;
mp_limb_t eh, el;
umul_ppmm (eh, el, h, h);
sub_ddmmss (eh, el, ap[1], ap[0], eh, el);
ASSERT (eh <= 1);
/* A single Newton step may produce a root which is one too large */
udiv_qrnnd (q, r, (el >> 33) | (eh << 31), el << 31, h);
xh = h >> 32;
xl = h << 32;
xl += q;
xh += xl < q;
xp[0] = xl;
xp[1] = xh;
}
#endif
else
{
mp_limb_t *ep;
mp_limb_t *qp;
mp_size_t k = n/2;
sqrt_n (xp + k, ap + k, n - k);
ep = xalloc_limbs (n + k + 3); /* n+1 */
qp = ep + n+1; /* k+2 */
mpn_zero (ep, k);
if (n & 1)
{
/* E = B A - H^2, residual of k+2 limbs, quotient also k+1 limbs. */
mp_limb_t cy = sqrtB_residual (ep + k, ap, n, xp + k);
/* cy > 0 is unlikely, high part of root too large */
ASSERT_NOCARRY (mpn_sub_1 (xp + k, xp + k, k+1, cy));
/* Q = B^k E / H
k+2 k+2 k+1 sizes, but Q should fit in k+1 limbs
*/
mpn_tdiv_qr (qp, ep, 0, ep, n+1, xp + k, k+1);
ASSERT (qp[k+1] == 0);
}
else
{
/* E = A - H^2, residual of k+1 limbs, and quotient k limbs. */
mp_limb_t cy = sqrt_residual (ep + k, ap, n, xp + k);
/* cy > 0 is unlikely, high part of root too large */
ASSERT_NOCARRY (mpn_sub_1 (xp + k, xp + k, k, cy));
/* Q = B^k E / H
k+2 k+1 k sizes, but Q ought to fit in k+1 limbs
*/
mpn_tdiv_qr (qp, ep, 0, ep, n+1, xp + k, k);
}
ASSERT (qp[k+1] == 0);
mpn_rshift (qp, qp, k+1, 1);
mpn_copyi (xp, qp, k);
saturate_add_1 (xp, n, k, qp[k]);
free (ep);
}
}
/* Computes X = floor (sqrt ({ap, n})), ceil(n/2) limbs, and remainder
R = A - X^2 */
static mp_size_t
sqrtrem (mp_limb_t *xp, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n)
{
ASSERT (n > 0);
ASSERT (ap[n-1] > 0);
if (n == 1)
{
mp_limb_t x = sqrt_1 (ap[0]);
mp_limb_t r = ap[0] - x*x;
xp[0] = x;
rp[0] = r;
return (r > 0);
}
else if (n == 2)
{
mp_limb_t x = sqrt_2 (ap[1], ap[0]);
mp_limb_t rh, rl;
xp[0] = x;
umul_ppmm (rh, rl, x, x);
sub_ddmmss (rh, rl, ap[1], ap[0], rh, rl);
rp[0] = rl;
rp[1] = rh;
if (!rh)
return (rl > 0);
return 2;
}
else
{
mp_size_t k = n/2;
mp_size_t rn = k + 1;
mp_size_t xn = (n+1)/2;
mp_limb_t cy;
mp_ptr tp;
unsigned shift;
if (ap[n-1] >> (GMP_NUMB_BITS - 2))
{
shift = 0;
sqrt_n (xp, ap + k, n - k);
}
else
{
count_leading_zeros (shift, ap[n-1]);
shift >>= 1;
ASSERT (shift > 0);
tp = xalloc_limbs (n-k+1);
mpn_lshift (tp, ap + k-1, n-k+1, 2*shift);
sqrt_n (xp, tp + 1, n-k);
free (tp);
}
if (n & 1)
shift += GMP_NUMB_BITS / 2;
mpn_rshift (xp, xp, xn, shift);
cy = sqrt_residual (rp, ap, n, xp);
if (cy)
{
/* x is too large */
ASSERT_NOCARRY (mpn_sub_1 (xp, xp, xn, cy));
if (verbose)
for (; cy; cy--)
fprintf (stderr, "-");
}
else
{
/* T = 2X + 1 */
mp_ptr tp = xalloc_limbs (rn);
int adjust = 0;
cy = mpn_lshift (tp, xp, xn, 1);
if (n & 1)
ASSERT (!cy);
else
{
ASSERT (rn == xn + 1);
tp[rn - 1] = cy;
}
tp[0] |= 1;
while (mpn_cmp (rp, tp, rn) >= 0)
{
if (verbose)
{
fprintf (stderr, "+");
adjust++;
if (adjust > 10)
abort ();
}
ASSERT_NOCARRY (mpn_sub_n (rp, rp, tp, rn));
ASSERT_NOCARRY (mpn_add_1 (xp, xp, xn, 1));
ASSERT_NOCARRY (mpn_add_1 (tp, tp, rn, 2));
}
free (tp);
}
while (rn > 0 && !rp[rn-1])
rn--;
return rn;
}
}
static int
check_sqrt (const mpz_t a, const mpz_t x, const mpz_t r)
{
mpz_t t;
mpz_init (t);
mpz_add_ui (t, x, 1);
mpz_mul (t, t, t);
if (mpz_cmp (t, a) < 0)
{
fail:
mpz_clear (t);
return 0;
}
mpz_mul (t, x, x);
if (mpz_cmp (t, a) > 0)
goto fail;
mpz_sub (t, a, t);
if (mpz_cmp (t, r) != 0)
goto fail;
mpz_clear (t);
return 1;
}
#define MAX_SIZE 20
int
main (int argc, char **argv)
{
gmp_randstate_t rands;
mp_limb_t x[(MAX_SIZE+1)/2];
mp_limb_t r[MAX_SIZE/2+1];
mpz_t a;
unsigned c;
if (argc > 1)
verbose = 1;
mpz_init (a);
gmp_randinit_default (rands);
for (c = 0; c < 100000; c++)
{
unsigned bits = 1 + gmp_urandomm_ui (rands, GMP_NUMB_BITS * MAX_SIZE);
mpz_t t1, t2;
mp_size_t an, xn, rn;
mpz_rrandomb (a, rands, bits);
an = mpz_size (a);
if (verbose)
gmp_fprintf (stderr, "%u, size %u, A = %Zd", c, (unsigned) an, a);
xn = (an + 1)/2;
rn = sqrtrem (x, r, mpz_limbs_read (a), an);
ASSERT (rn == 0 || r[rn-1]);
if (verbose)
fprintf (stderr, "\n");
if (!check_sqrt (a, mpz_roinit_n (t1, x, xn), mpz_roinit_n (t2, r, rn)))
{
gmp_fprintf (stderr, "Test %u failed, size %u:\n"
" a: %Zd\n"
" x: %Nd\n"
" r: %Nd\n",
c, (unsigned) an, a, x, xn, r, rn);
abort ();
}
}
mpz_clear (a);
gmp_randclear (rands);
return EXIT_SUCCESS;
}
--
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.
More information about the gmp-devel
mailing list