[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Jul 1 21:33:57 UTC 2015
details: /var/hg/gmp/rev/dfc1c11202be
changeset: 16729:dfc1c11202be
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 22:24:22 2015 +0200
description:
mpn/generic/sqrtrem.c: micro-opt
details: /var/hg/gmp/rev/3a3488117909
changeset: 16730:3a3488117909
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 23:07:48 2015 +0200
description:
gmp-impl.h (MPN_FILL): New macro.
details: /var/hg/gmp/rev/980816b20da6
changeset: 16731:980816b20da6
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 23:08:37 2015 +0200
description:
mpn/generic/sqrtrem.c (mpn_dc_sqrt): New function.
details: /var/hg/gmp/rev/0b69a2d8f080
changeset: 16732:0b69a2d8f080
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 23:21:44 2015 +0200
description:
mpn/generic/sqrtrem.c(mpn_dc_sqrt): Use divappr.
details: /var/hg/gmp/rev/0c83a1337406
changeset: 16733:0c83a1337406
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jul 01 23:33:36 2015 +0200
description:
ChangeLog
diffstat:
ChangeLog | 6 +
gmp-impl.h | 48 ++++++------
mpn/generic/sqrtrem.c | 187 ++++++++++++++++++++++++++++++++++++++++++-------
3 files changed, 188 insertions(+), 53 deletions(-)
diffs (truncated from 310 to 300 lines):
diff -r c55534b5d5aa -r 0c83a1337406 ChangeLog
--- a/ChangeLog Mon Jun 29 07:04:04 2015 +0200
+++ b/ChangeLog Wed Jul 01 23:33:36 2015 +0200
@@ -1,3 +1,9 @@
+2015-07-01 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * gmp-impl.h(MPN_FILL): New macro, generalise MPN_ZERO.
+
+ * mpn/generic/sqrtrem.c(mpn_dc_sqrt): New function not computing remainder.
+
2015-06-24 Torbjörn Granlund <torbjorng at google.com>
* mpn/x86_64/fastsse/com.asm: Disalllow zero size operands.
diff -r c55534b5d5aa -r 0c83a1337406 gmp-impl.h
--- a/gmp-impl.h Mon Jun 29 07:04:04 2015 +0200
+++ b/gmp-impl.h Wed Jul 01 23:33:36 2015 +0200
@@ -1825,35 +1825,35 @@
would be good when on a GNU system. */
#if HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc
+#define MPN_FILL(dst, n, f) \
+ do { \
+ mp_ptr __dst = (dst) - 1; \
+ mp_size_t __n = (n); \
+ ASSERT (__n > 0); \
+ do \
+ *++__dst = (f); \
+ while (--__n); \
+ } while (0)
+#endif
+
+#ifndef MPN_FILL
+#define MPN_FILL(dst, n, f) \
+ do { \
+ mp_ptr __dst = (dst); \
+ mp_size_t __n = (n); \
+ ASSERT (__n > 0); \
+ do \
+ *__dst++ = (f); \
+ while (--__n); \
+ } while (0)
+#endif
+
#define MPN_ZERO(dst, n) \
do { \
ASSERT ((n) >= 0); \
if ((n) != 0) \
- { \
- mp_ptr __dst = (dst) - 1; \
- mp_size_t __n = (n); \
- do \
- *++__dst = 0; \
- while (--__n); \
- } \
+ MPN_FILL (dst, n, CNST_LIMB (0)); \
} while (0)
-#endif
-
-#ifndef MPN_ZERO
-#define MPN_ZERO(dst, n) \
- do { \
- ASSERT ((n) >= 0); \
- if ((n) != 0) \
- { \
- mp_ptr __dst = (dst); \
- mp_size_t __n = (n); \
- do \
- *__dst++ = 0; \
- while (--__n); \
- } \
- } while (0)
-#endif
-
/* On the x86s repe/scasl doesn't seem useful, since it takes many cycles to
start up and would need to strip a lot of zeros before it'd be faster
diff -r c55534b5d5aa -r 0c83a1337406 mpn/generic/sqrtrem.c
--- a/mpn/generic/sqrtrem.c Mon Jun 29 07:04:04 2015 +0200
+++ b/mpn/generic/sqrtrem.c Wed Jul 01 23:33:36 2015 +0200
@@ -47,6 +47,7 @@
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
+#define USE_DIVAPPR_Q 1
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
@@ -236,10 +237,7 @@
mpn_rshift (sp, sp, l, 1);
sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
- {
- sp[0] &= ~(approx | 1);
- return 1; /* Remainder is non-zero */
- }
+ return 1; /* Remainder is non-zero */
q >>= 1;
if (c != 0)
c = mpn_add_n (np + l, np + l, sp + l, h);
@@ -250,8 +248,8 @@
if (c < 0)
{
q = mpn_add_1 (sp + l, sp + l, h, q);
-#if HAVE_NATIVE_mpn_addlsh1_n
- c += mpn_addlsh1_n (np, np, sp, n) + 2 * q;
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
+ c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
@@ -263,6 +261,143 @@
return c;
}
+#if USE_DIVAPPR_Q
+static void
+mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
+{
+ gmp_pi1_t inv;
+ mp_limb_t qh;
+ ASSERT (dn > 2);
+ ASSERT (nn >= dn);
+ ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+
+ MPN_COPY (scratch, np, nn);
+ invert_pi1 (inv, dp[dn-1], dp[dn-2]);
+ if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
+ qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
+ else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
+ qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
+ else
+ {
+ mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
+ TMP_DECL;
+ TMP_MARK;
+ /* Sadly, scratch is too small. */
+ qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
+ TMP_FREE;
+ }
+ qp [nn - dn] = qh;
+}
+#endif
+
+/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
+ returns zero if the operand was a perfect square, one otherwise.
+ Assumes {np, 2n} is semi-normalized, i.e. np[2n-1] != 0
+ where B=2^GMP_NUMB_BITS. */
+static int
+mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, int s)
+{
+ mp_limb_t q; /* carry out of {sp, n} */
+ int c; /* carry out of remainder */
+ mp_size_t l, h;
+ mp_ptr qp, tp, scratch;
+ TMP_DECL;
+ TMP_MARK;
+
+ ASSERT (np[2 * n - 1] != 0);
+ ASSERT (n > 2);
+
+ l = (n - 1) / 2;
+ h = n - l;
+ ASSERT (n >= l + 2 && l + 2 >= h && h > l && l > 0);
+ scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
+ tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
+ if (s != 0)
+ {
+ int o = 1; /* Should be o = (l > 1) */;
+ ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o, n + h + 1 + o, 2 * s));
+ }
+ else
+ MPN_COPY (tp, np + l - 1, n + h + 1);
+ q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0);
+ if (q != 0)
+ ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
+ qp = tp + n + 1; /* n + 2 */
+#if USE_DIVAPPR_Q
+ mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
+#else
+ mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
+#endif
+ q += qp [l + 1];
+ c = 1;
+ if (q > 1)
+ {
+ /* FIXME: if s!=0 we will shift later, a noop on this area. */
+ MPN_FILL (sp, l, GMP_NUMB_MAX);
+ }
+ else
+ {
+ /* FIXME: if s!=0 we will shift again later, shift just once. */
+ mpn_rshift (sp, qp + 1, l, 1);
+ sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
+ if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
+ (qp[1] & ((CNST_LIMB(2) << s) - 1))) == 0)
+ {
+ mp_limb_t cy;
+ /* Approximation is not good enough, the extra limb(+ s bits)
+ is smaller than 2. */
+ /* {qp + 1, l + 1} equals 2*{sp, l} */
+ /* FIXME: use mullo or wrap-around. */
+ ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
+ /* Compute the remainder of the previous mpn_div(appr)_q. */
+ cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
+#if USE_DIVAPPR_Q || WANT_ASSERT
+ MPN_DECR_U (tp + 1 + h, l, cy);
+#if USE_DIVAPPR_Q
+ ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
+ if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
+ {
+ /* May happen only if div result was not exact. */
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
+ cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
+#else
+ cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
+#endif
+ ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
+ MPN_DECR_U (sp, l, 1);
+ }
+#else /* WANT_ASSERT */
+ ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
+#endif
+#endif
+ if (mpn_zero_p (tp + l + 1, h - l))
+ {
+ mpn_sqr (scratch, sp, l);
+ c = mpn_cmp (tp + 1, scratch + l, l);
+ if (c == 0)
+ {
+ if (s != 0)
+ {
+ mpn_lshift (tp, np, l, 2 * s);
+ np = tp;
+ }
+ c = mpn_cmp (np, scratch, l);
+ }
+ if (c < 0)
+ {
+ MPN_DECR_U (sp, l, 1);
+ c = 1;
+ }
+ }
+ }
+ }
+ TMP_FREE;
+
+ if (s != 0)
+ mpn_rshift (sp, sp, n, s);
+ return c;
+}
+
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
@@ -310,38 +445,32 @@
TMP_MARK;
if ((rp == NULL) && (nn > 9))
- {
+ if ((nn & 1) == 0)
+ return mpn_dc_sqrt (sp, np, tn, c);
+ else
+ {
mp_limb_t mask;
- TMP_ALLOC_LIMBS_2 (rp, nn + 2 - (nn & 1),
- tp, tn + 2 - (nn & 1));
- MPN_ZERO (rp, 2);
+ rp = TMP_ALLOC_LIMBS (nn + 1);
+ MPN_ZERO (rp, 1);
if (c != 0)
- mpn_lshift (rp + 2 - (nn & 1), np, nn, 2 * c);
+ mpn_lshift (rp + 1, np, nn, 2 * c);
else
- MPN_COPY (rp + 2 - (nn & 1), np, nn);
- if (nn & 1)
- {
- c += GMP_NUMB_BITS / 2; /* c now represents k */
- mask = (CNST_LIMB (1) << c) - 2;
- }
- else
- mask = GMP_NUMB_MAX - 1;
- rn = tn + 1 - (nn & 1);
- rn += (rp[rn] = mpn_dc_sqrtrem (tp, rp, rn, mask));
- if (c != 0)
- mpn_rshift (sp, tp + 1 - (nn & 1), tn, c);
- else
- MPN_COPY (sp, tp + 1, tn);
- }
- else if (nn % 2 != 0 || c > 0)
+ MPN_COPY (rp + 1, np, nn);
+ c += GMP_NUMB_BITS / 2; /* c now represents k */
+ mask = (CNST_LIMB (1) << c) - 2;
+ rn = tn;
+ rn += (rp[rn] = mpn_dc_sqrtrem (sp, rp, rn, mask));
+ mpn_rshift (sp, sp, tn, c);
+ }
+ else if ((nn & 1) != 0 || c > 0)
{
mp_limb_t mask;
tp = TMP_ALLOC_LIMBS (2 * tn);
More information about the gmp-commit mailing list