[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Jul 1 21:33:57 UTC 2015
details: /var/hg/gmp/rev/dfc1c11202be
changeset: 16729:dfc1c11202be
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 22:24:22 2015 +0200
description:
mpn/generic/sqrtrem.c: micro-opt
details: /var/hg/gmp/rev/3a3488117909
changeset: 16730:3a3488117909
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 23:07:48 2015 +0200
description:
gmp-impl.h (MPN_FILL): New macro.
details: /var/hg/gmp/rev/980816b20da6
changeset: 16731:980816b20da6
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 23:08:37 2015 +0200
description:
mpn/generic/sqrtrem.c (mpn_dc_sqrt): New function.
details: /var/hg/gmp/rev/0b69a2d8f080
changeset: 16732:0b69a2d8f080
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 23:21:44 2015 +0200
description:
mpn/generic/sqrtrem.c(mpn_dc_sqrt): Use divappr.
details: /var/hg/gmp/rev/0c83a1337406
changeset: 16733:0c83a1337406
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 23:33:36 2015 +0200
description:
ChangeLog
diffstat:
ChangeLog | 6 +
gmp-impl.h | 48 ++++++------
mpn/generic/sqrtrem.c | 187 ++++++++++++++++++++++++++++++++++++++++++-------
3 files changed, 188 insertions(+), 53 deletions(-)
diffs (truncated from 310 to 300 lines):
diff -r c55534b5d5aa -r 0c83a1337406 ChangeLog
--- a/ChangeLog Mon Jun 29 07:04:04 2015 +0200
+++ b/ChangeLog Wed Jul 01 23:33:36 2015 +0200
@@ -1,3 +1,9 @@
+2015-07-01 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * gmp-impl.h(MPN_FILL): New macro, generalise MPN_ZERO.
+
+ * mpn/generic/sqrtrem.c(mpn_dc_sqrt): New function not computing remainder.
+
2015-06-24 Torbjörn Granlund <torbjorng at google.com>
* mpn/x86_64/fastsse/com.asm: Disalllow zero size operands.
diff -r c55534b5d5aa -r 0c83a1337406 gmp-impl.h
--- a/gmp-impl.h Mon Jun 29 07:04:04 2015 +0200
+++ b/gmp-impl.h Wed Jul 01 23:33:36 2015 +0200
@@ -1825,35 +1825,35 @@
would be good when on a GNU system. */
#if HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc
+#define MPN_FILL(dst, n, f) \
+ do { \
+ mp_ptr __dst = (dst) - 1; \
+ mp_size_t __n = (n); \
+ ASSERT (__n > 0); \
+ do \
+ *++__dst = (f); \
+ while (--__n); \
+ } while (0)
+#endif
+
+#ifndef MPN_FILL
+#define MPN_FILL(dst, n, f) \
+ do { \
+ mp_ptr __dst = (dst); \
+ mp_size_t __n = (n); \
+ ASSERT (__n > 0); \
+ do \
+ *__dst++ = (f); \
+ while (--__n); \
+ } while (0)
+#endif
+
#define MPN_ZERO(dst, n) \
do { \
ASSERT ((n) >= 0); \
if ((n) != 0) \
- { \
- mp_ptr __dst = (dst) - 1; \
- mp_size_t __n = (n); \
- do \
- *++__dst = 0; \
- while (--__n); \
- } \
+ MPN_FILL (dst, n, CNST_LIMB (0)); \
} while (0)
-#endif
-
-#ifndef MPN_ZERO
-#define MPN_ZERO(dst, n) \
- do { \
- ASSERT ((n) >= 0); \
- if ((n) != 0) \
- { \
- mp_ptr __dst = (dst); \
- mp_size_t __n = (n); \
- do \
- *__dst++ = 0; \
- while (--__n); \
- } \
- } while (0)
-#endif
-
/* On the x86s repe/scasl doesn't seem useful, since it takes many cycles to
start up and would need to strip a lot of zeros before it'd be faster
diff -r c55534b5d5aa -r 0c83a1337406 mpn/generic/sqrtrem.c
--- a/mpn/generic/sqrtrem.c Mon Jun 29 07:04:04 2015 +0200
+++ b/mpn/generic/sqrtrem.c Wed Jul 01 23:33:36 2015 +0200
@@ -47,6 +47,7 @@
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
+#define USE_DIVAPPR_Q 1
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
@@ -236,10 +237,7 @@
mpn_rshift (sp, sp, l, 1);
sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
- {
- sp[0] &= ~(approx | 1);
- return 1; /* Remainder is non-zero */
- }
+ return 1; /* Remainder is non-zero */
q >>= 1;
if (c != 0)
c = mpn_add_n (np + l, np + l, sp + l, h);
@@ -250,8 +248,8 @@
if (c < 0)
{
q = mpn_add_1 (sp + l, sp + l, h, q);
-#if HAVE_NATIVE_mpn_addlsh1_n
- c += mpn_addlsh1_n (np, np, sp, n) + 2 * q;
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
+ c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
@@ -263,6 +261,143 @@
return c;
}
+#if USE_DIVAPPR_Q
+static void
+mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
+{
+ gmp_pi1_t inv;
+ mp_limb_t qh;
+ ASSERT (dn > 2);
+ ASSERT (nn >= dn);
+ ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+
+ MPN_COPY (scratch, np, nn);
+ invert_pi1 (inv, dp[dn-1], dp[dn-2]);
+ if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
+ qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
+ else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
+ qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
+ else
+ {
+ mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
+ TMP_DECL;
+ TMP_MARK;
+ /* Sadly, scratch is too small. */
+ qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
+ TMP_FREE;
+ }
+ qp [nn - dn] = qh;
+}
+#endif
+
+/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
+ returns zero if the operand was a perfect square, one otherwise.
+ Assumes {np, 2n} is semi-normalized, i.e. np[2n-1] != 0
+ where B=2^GMP_NUMB_BITS. */
+static int
+mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, int s)
+{
+ mp_limb_t q; /* carry out of {sp, n} */
+ int c; /* carry out of remainder */
+ mp_size_t l, h;
+ mp_ptr qp, tp, scratch;
+ TMP_DECL;
+ TMP_MARK;
+
+ ASSERT (np[2 * n - 1] != 0);
+ ASSERT (n > 2);
+
+ l = (n - 1) / 2;
+ h = n - l;
+ ASSERT (n >= l + 2 && l + 2 >= h && h > l && l > 0);
+ scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
+ tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
+ if (s != 0)
+ {
+ int o = 1; /* Should be o = (l > 1) */;
+ ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o, n + h + 1 + o, 2 * s));
+ }
+ else
+ MPN_COPY (tp, np + l - 1, n + h + 1);
+ q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0);
+ if (q != 0)
+ ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
+ qp = tp + n + 1; /* n + 2 */
+#if USE_DIVAPPR_Q
+ mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
+#else
+ mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
+#endif
+ q += qp [l + 1];
+ c = 1;
+ if (q > 1)
+ {
+ /* FIXME: if s!=0 we will shift later, a noop on this area. */
+ MPN_FILL (sp, l, GMP_NUMB_MAX);
+ }
+ else
+ {
+ /* FIXME: if s!=0 we will shift again later, shift just once. */
+ mpn_rshift (sp, qp + 1, l, 1);
+ sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
+ if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
+ (qp[1] & ((CNST_LIMB(2) << s) - 1))) == 0)
+ {
+ mp_limb_t cy;
+ /* Approximation is not good enough, the extra limb(+ s bits)
+ is smaller than 2. */
+ /* {qp + 1, l + 1} equals 2*{sp, l} */
+ /* FIXME: use mullo or wrap-around. */
+ ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
+ /* Compute the remainder of the previous mpn_div(appr)_q. */
+ cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
+#if USE_DIVAPPR_Q || WANT_ASSERT
+ MPN_DECR_U (tp + 1 + h, l, cy);
+#if USE_DIVAPPR_Q
+ ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
+ if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
+ {
+ /* May happen only if div result was not exact. */
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
+ cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
+#else
+ cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
+#endif
+ ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
+ MPN_DECR_U (sp, l, 1);
+ }
+#else /* WANT_ASSERT */
+ ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
+#endif
+#endif
+ if (mpn_zero_p (tp + l + 1, h - l))
+ {
+ mpn_sqr (scratch, sp, l);
+ c = mpn_cmp (tp + 1, scratch + l, l);
+ if (c == 0)
+ {
+ if (s != 0)
+ {
+ mpn_lshift (tp, np, l, 2 * s);
+ np = tp;
+ }
+ c = mpn_cmp (np, scratch, l);
+ }
+ if (c < 0)
+ {
+ MPN_DECR_U (sp, l, 1);
+ c = 1;
+ }
+ }
+ }
+ }
+ TMP_FREE;
+
+ if (s != 0)
+ mpn_rshift (sp, sp, n, s);
+ return c;
+}
+
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
@@ -310,38 +445,32 @@
TMP_MARK;
if ((rp == NULL) && (nn > 9))
- {
+ if ((nn & 1) == 0)
+ return mpn_dc_sqrt (sp, np, tn, c);
+ else
+ {
mp_limb_t mask;
- TMP_ALLOC_LIMBS_2 (rp, nn + 2 - (nn & 1),
- tp, tn + 2 - (nn & 1));
- MPN_ZERO (rp, 2);
+ rp = TMP_ALLOC_LIMBS (nn + 1);
+ MPN_ZERO (rp, 1);
if (c != 0)
- mpn_lshift (rp + 2 - (nn & 1), np, nn, 2 * c);
+ mpn_lshift (rp + 1, np, nn, 2 * c);
else
- MPN_COPY (rp + 2 - (nn & 1), np, nn);
- if (nn & 1)
- {
- c += GMP_NUMB_BITS / 2; /* c now represents k */
- mask = (CNST_LIMB (1) << c) - 2;
- }
- else
- mask = GMP_NUMB_MAX - 1;
- rn = tn + 1 - (nn & 1);
- rn += (rp[rn] = mpn_dc_sqrtrem (tp, rp, rn, mask));
- if (c != 0)
- mpn_rshift (sp, tp + 1 - (nn & 1), tn, c);
- else
- MPN_COPY (sp, tp + 1, tn);
- }
- else if (nn % 2 != 0 || c > 0)
+ MPN_COPY (rp + 1, np, nn);
+ c += GMP_NUMB_BITS / 2; /* c now represents k */
+ mask = (CNST_LIMB (1) << c) - 2;
+ rn = tn;
+ rn += (rp[rn] = mpn_dc_sqrtrem (sp, rp, rn, mask));
+ mpn_rshift (sp, sp, tn, c);
+ }
+ else if ((nn & 1) != 0 || c > 0)
{
mp_limb_t mask;
tp = TMP_ALLOC_LIMBS (2 * tn);
More information about the gmp-commit mailing list