[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Tue Aug 18 20:29:35 UTC 2015
details: /var/hg/gmp/rev/ed9694d3007a
changeset: 16769:ed9694d3007a
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Tue Aug 18 20:44:54 2015 +0200
description:
mpn/generic/rootrem.c: Rearrange loop to enable an initial seed.
details: /var/hg/gmp/rev/838a3505a0b9
changeset: 16770:838a3505a0b9
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Tue Aug 18 20:56:38 2015 +0200
description:
mpn/generic/rootrem.c: Use logbased_root for a 9-bits initial estimate.
details: /var/hg/gmp/rev/55bad9424949
changeset: 16771:55bad9424949
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Tue Aug 18 21:06:03 2015 +0200
description:
ChangeLog
details: /var/hg/gmp/rev/150fe66d26cf
changeset: 16772:150fe66d26cf
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Tue Aug 18 22:28:46 2015 +0200
description:
...
diffstat:
ChangeLog | 10 +
mpn/generic/rootrem.c | 342 ++++++++++++++++++++++++++++++-------------------
2 files changed, 219 insertions(+), 133 deletions(-)
diffs (truncated from 477 to 300 lines):
diff -r 7850a6d24896 -r 150fe66d26cf ChangeLog
--- a/ChangeLog Mon Aug 17 21:47:23 2015 +0200
+++ b/ChangeLog Tue Aug 18 22:28:46 2015 +0200
@@ -1,3 +1,8 @@
+2015-08-04 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * mpn/generic/rootrem.c (logbased_root): New function.
+ (mpn_rootrem_internal): Use it to estimate highest 9 bits of the root.
+
2015-08-17 Torbjörn Granlund <torbjorng at google.com>
* acinclude.m4 (X86_64_PATTERN): Add skylake.
@@ -29,6 +34,11 @@
* tune/speed.h (SPEED_ROUTINE_MPN_MULLO_BASECASE): Update.
(SPEED_ROUTINE_MPN_SQRLO): New macro.
+ * mpn/generic/rootrem.c: Avoid divisions if not needed.
+
+ * tests/mpn/t-broot.c: Test also k=1.
+ * mpz/aorsmul_i.c: Move branches out of main line.
+
2015-07-28 Marco Bodrato <bodrato at mail.dm.unipi.it>
* mpn/generic/sqrtrem.c (mpn_dc_sqrt): Support odd sizes.
diff -r 7850a6d24896 -r 150fe66d26cf mpn/generic/rootrem.c
--- a/mpn/generic/rootrem.c Mon Aug 17 21:47:23 2015 +0200
+++ b/mpn/generic/rootrem.c Tue Aug 18 22:28:46 2015 +0200
@@ -50,14 +50,13 @@
static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t,
mp_limb_t, int);
-#define MPN_RSHIFT(cy,rp,up,un,cnt) \
+#define MPN_RSHIFT(rp,up,un,cnt) \
do { \
if ((cnt) != 0) \
- cy = mpn_rshift (rp, up, un, cnt); \
+ mpn_rshift (rp, up, un, cnt); \
else \
{ \
MPN_COPY_INCR (rp, up, un); \
- cy = 0; \
} \
} while (0)
@@ -125,6 +124,82 @@
}
}
+#define LOGROOT_USED_BITS 8
+#define LOGROOT_NEEDS_TWO_CORRECTIONS 1
+#define LOGROOT_RETURNED_BITS (LOGROOT_USED_BITS + LOGROOT_NEEDS_TWO_CORRECTIONS)
+/* Puts in *rootp some bits of the k^nt root of the number
+ 2^bitn * 1.op ; where op represents the "fractional" bits.
+
+ The returned value is the number of bits of the root minus one;
+ i.e. an approximation of the root will be
+ (*rootp) * 2^(retval-LOGROOT_RETURNED_BITS+1).
+
+ Currently, only LOGROOT_USED_BITS bits of op are used (the implicit
+ one is not counted).
+ */
+static unsigned
+logbased_root (mp_ptr rootp, mp_limb_t op, mp_bitcnt_t bitn, mp_limb_t k)
+{
+ /* vlog=vector(256,i,floor((log(256+i)/log(2)-8)*256)-(i>255)) */
+ static const
+ unsigned char vlog[] = {1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22,
+ 23, 25, 26, 27, 29, 30, 31, 33, 34, 35, 37, 38, 39, 40, 42, 43,
+ 44, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 61, 62, 63,
+ 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 80, 81, 82,
+ 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100,
+ 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
+ 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 181, 182, 183, 184, 185, 186, 187, 188, 188, 189, 190, 191, 192, 193,
+ 194, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 204, 205, 205, 206,
+ 207, 208, 209, 209, 210, 211, 212, 213, 214, 214, 215, 216, 217, 218, 218, 219,
+ 220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 228, 229, 229, 230, 231, 232,
+ 232, 233, 234, 235, 235, 236, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244,
+ 245, 245, 246, 247, 247, 248, 249, 250, 250, 251, 252, 253, 253, 254, 255, 255};
+
+ /* vexp=vector(256,i,floor(2^(8+i/256)-256)-(i>255)) */
+ static const
+ unsigned char vexp[] = {0, 1, 2, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 9, 10, 11,
+ 12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23,
+ 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35,
+ 36, 37, 37, 38, 39, 40, 41, 41, 42, 43, 44, 45, 45, 46, 47, 48,
+ 49, 50, 50, 51, 52, 53, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61,
+ 62, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75,
+ 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
+ 107, 108, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122,
+ 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
+ 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156,
+ 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174,
+ 175, 176, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 191, 192, 193,
+ 194, 196, 197, 198, 199, 200, 202, 203, 204, 205, 207, 208, 209, 210, 212, 213,
+ 214, 216, 217, 218, 219, 221, 222, 223, 225, 226, 227, 229, 230, 231, 232, 234,
+ 235, 236, 238, 239, 240, 242, 243, 245, 246, 247, 249, 250, 251, 253, 254, 255};
+ mp_bitcnt_t retval;
+
+ if (UNLIKELY (bitn > (~ (mp_bitcnt_t) 0) >> LOGROOT_USED_BITS))
+ {
+ /* In the unlikely case, we use two divisions and a modulo. */
+ retval = bitn / k;
+ bitn %= k;
+ bitn = (bitn << LOGROOT_USED_BITS |
+ vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
+ }
+ else
+ {
+ bitn = (bitn << LOGROOT_USED_BITS |
+ vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
+ retval = bitn >> LOGROOT_USED_BITS;
+ bitn &= (CNST_LIMB (1) << LOGROOT_USED_BITS) - 1;
+ }
+ ASSERT(bitn < CNST_LIMB (1) << LOGROOT_USED_BITS);
+ *rootp = CNST_LIMB(1) << (LOGROOT_USED_BITS - ! LOGROOT_NEEDS_TWO_CORRECTIONS)
+ | vexp[bitn] >> ! LOGROOT_NEEDS_TWO_CORRECTIONS;
+ return retval;
+}
+
/* if approx is non-zero, does not compute the final remainder */
static mp_size_t
mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
@@ -132,21 +207,26 @@
{
mp_ptr qp, rp, sp, wp, scratch;
mp_size_t qn, rn, sn, wn, nl, bn;
- mp_limb_t save, save2, cy;
+ mp_limb_t save, save2, cy, uh;
mp_bitcnt_t unb; /* number of significant bits of {up,un} */
mp_bitcnt_t xnb; /* number of significant bits of the result */
mp_bitcnt_t b, kk;
mp_bitcnt_t sizes[GMP_NUMB_BITS + 1];
- int ni, i;
- int c, perf_pow;
- int logk;
+ int ni;
+ int perf_pow;
+ unsigned ulz, snb, c, logk;
TMP_DECL;
- MPN_SIZEINBASE_2EXP(unb, up, un, 1);
- /* unb is the number of bits of the input U */
+ /* MPN_SIZEINBASE_2EXP(unb, up, un, 1); --unb; */
+ uh = up[un - 1];
+ count_leading_zeros (ulz, uh);
+ ulz = ulz - GMP_NAIL_BITS + 1; /* Ignore the first 1. */
+ unb = (mp_bitcnt_t) un * GMP_NUMB_BITS - ulz;
+ /* unb is the (truncated) logarithm of the input U in base 2*/
- if (unb <= k) /* root is 1 */
+ if (unb < k) /* root is 1 */
{
+ rootp[0] = 1;
if (remp == NULL)
un -= (*up == CNST_LIMB (1)); /* Non-zero iif {up,un} > 1 */
else
@@ -155,43 +235,48 @@
un -= (remp [un - 1] == 0); /* There should be at most one zero limb,
if we demand u to be normalized */
}
- rootp[0] = 1;
return un;
}
- /* if (unb - k <= k/2) // root is 2 */
+ /* if (unb - k < k/2 + k/16) // root is 2 */
- xnb = (unb - 1) / k + 1; /* ceil (unb / k) */
- /* xnb is the number of bits of the root R */
+ if (ulz == GMP_NUMB_BITS)
+ uh = up[un - 2];
+ else
+ uh = (uh << ulz & GMP_NUMB_MASK) | up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz);
+ ASSERT (un != 1 || up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz) == 1);
- /* We initialize the algorithm with a 1-bit approximation to zero: since we
- know the root has exactly xnb bits, we write r0 = 2^(xnb-1), so that
- r0^k = 2^(k*(xnb-1)), that we subtract to the input. */
- kk = k * (xnb - 1); /* number of truncated bits in the input */
- --kk;
- rn = un - kk / GMP_NUMB_BITS; /* number of limbs of the non-truncated part */
+ xnb = logbased_root (rootp, uh, unb, k);
+ snb = LOGROOT_RETURNED_BITS - 1;
+ /* xnb+1 is the number of bits of the root R */
+ /* snb+1 is the number of bits of the current approximation S */
- for (logk = 1; ((k - 1) >> logk) != 0; logk++)
+ kk = k * xnb; /* number of truncated bits in the input */
+
+ /* FIXME: Should we skipp all loops when xnb > snb ? */
+ for (uh = k - 1, logk = 2; (uh >>= 1) != 0; ++logk )
;
- /* logk = ceil(log(k)/log(2)) */
+ /* logk = ceil(log(k)/log(2)) + 1 */
+
+ /* xnb is the number of remaining bits to determine in the kth root */
+ for (ni = 0; (sizes[ni] = xnb) > snb; ++ni)
+ {
+ /* invariant: here we want xnb+1 total bits for the kth root */
- b = xnb - 1; /* number of remaining bits to determine in the kth root */
- ni = 0;
- do
- {
- /* invariant: here we want b+1 total bits for the kth root */
- sizes[ni] = b;
- /* if c is the new value of b, this means that we'll go from a root
- of c+1 bits (say s') to a root of b+1 bits.
- It is proved in the book "Modern Computer Arithmetic" from Brent
+ /* if c is the new value of xnb, this means that we'll go from a
+ root of c+1 bits (say s') to a root of xnb+1 bits.
+ It is proved in the book "Modern Computer Arithmetic" by Brent
and Zimmermann, Chapter 1, that
if s' >= k*beta, then at most one correction is necessary.
- Here beta = 2^(b-c), and s' >= 2^c, thus it suffices that
- c >= ceil((b + log2(k))/2). */
- b = (b + logk + 1) / 2;
- if (b >= sizes[ni])
- b = sizes[ni] - 1; /* add just one bit at a time */
- ni++;
- } while (b != 0);
+ Here beta = 2^(xnb-c), and s' >= 2^c, thus it suffices that
+ c >= ceil((xnb + log2(k))/2). */
+ if (xnb > logk)
+ xnb = (xnb + logk) / 2;
+ else
+ --xnb; /* add just one bit at a time */
+ }
+
+ *rootp >>= snb - xnb;
+ kk -= xnb;
ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1);
/* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with
@@ -215,24 +300,14 @@
and temporary for mpn_pow_1 */
if (remp == NULL)
- rp = scratch; /* will contain the remainder */
+ rp = scratch; /* will contain the remainder */
else
rp = remp;
sp = rootp;
- MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS);
- MPN_DECR_U (rp, rn, 2); /* subtract the initial approximation: since
- the non-truncated part is less than 2^k, it
- is <= k bits: rn <= ceil(k/GMP_NUMB_BITS) */
- rn -= rp [rn - 1] == 0;
- sp[0] = save = 2; /* initial approximation */
- sn = 1; /* it has one limb */
+ sn = 1; /* Initial approximation has one limb */
- wp[0] = k; /* k * {sp,sn}^(k-1) */
- wn = 1;
- i = ni;
- b = bn = 1;
- do
+ for (b = xnb; ni != 0; --ni)
{
/* 1: loop invariant:
{sp, sn} is the current approximation of the root, which has
@@ -242,58 +317,13 @@
kk = number of truncated bits of the input
*/
- /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
-
- /* now divide {rp, rn} by {wp, wn} to get the low part of the root */
- if (UNLIKELY (rn < wn))
- {
- MPN_ZERO (sp, bn);
- }
- else
- {
- qn = rn - wn; /* expected quotient size */
- if (qn <= bn) { /* Divide only if result is not too big. */
- mpn_div_q (qp, rp, rn, wp, wn, scratch);
- qn += qp[qn] != 0;
- }
-
- /* 5: current buffers: {sp,sn}, {qp,qn}.
- Note: {rp,rn} is not needed any more since we'll compute it from
- scratch at the end of the loop.
- */
-
- /* the quotient should be smaller than 2^b, since the previous
- approximation was correctly rounded toward zero */
- if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) &&
- qp[qn - 1] >= (CNST_LIMB (1) << (b % GMP_NUMB_BITS))))
- {
- for (qn = 1; qn < bn; ++qn)
- sp[qn - 1] = GMP_NUMB_MAX;
- sp[qn - 1] = GMP_NUMB_MAX >> (GMP_NUMB_BITS-1 - ((b-1) % GMP_NUMB_BITS));
- }
- else
More information about the gmp-commit
mailing list