[Gmp-commit] /var/hg/gmp: 4 new changesets

Tue Jul 28 18:53:43 UTC 2015

details:   /var/hg/gmp/rev/ad9c5bcb7b50
changeset: 16747:ad9c5bcb7b50
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Jul 28 20:17:26 2015 +0200
description:
mpn/generic/sqrtrem.c (mpn_dc_sqrt): Support odd sizes.

details:   /var/hg/gmp/rev/4f60cf2d9fc9
changeset: 16748:4f60cf2d9fc9
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Jul 28 20:21:49 2015 +0200
description:
ChangeLog

details:   /var/hg/gmp/rev/92f57b7be5cb
changeset: 16749:92f57b7be5cb
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Jul 28 20:22:30 2015 +0200
description:
Mention in NEWS for sqrt speedup.

details:   /var/hg/gmp/rev/e40e30414801
changeset: 16750:e40e30414801
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Jul 28 20:24:53 2015 +0200
description:
mpn/generic/sqrtrem.c: Add TRACEs for further development.

diffstat:

 ChangeLog             |   6 ++++-
 NEWS                  |   2 +
 mpn/generic/sqrtrem.c |  59 ++++++++++++++++++++++----------------------------
 3 files changed, 33 insertions(+), 34 deletions(-)

diffs (192 lines):

diff -r b9a904a2a7a7 -r e40e30414801 ChangeLog

--- a/ChangeLog	Thu Jul 16 17:23:06 2015 +0200
+++ b/ChangeLog	Tue Jul 28 20:24:53 2015 +0200
@@ -1,3 +1,7 @@
+2015-07-28 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+	* mpn/generic/sqrtrem.c (mpn_dc_sqrt): Support odd sizes.
+
 2015-07-16  TorbjÃ¶rn Granlund  <torbjorng at google.com>
 
 	* tune/speed.c: Remove now redundant MPN_FILL.
@@ -22,7 +26,7 @@
 
 	* gmp-impl.h (MPN_FILL): New macro, generalise MPN_ZERO.
 
-	* mpn/generic/sqrtrem.c(mpn_dc_sqrt): New function not computing remainder.
+	* mpn/generic/sqrtrem.c (mpn_dc_sqrt): New function not computing remainder.
 	(mpn_dc_sqrtrem): Use tdiv_q instead of divrem, use given scratch space.
 	(mpn_sqrtrem): Use mpn_dc_sqrt for both even and odd sizes.
 
diff -r b9a904a2a7a7 -r e40e30414801 NEWS
--- a/NEWS	Thu Jul 16 17:23:06 2015 +0200
+++ b/NEWS	Tue Jul 28 20:24:53 2015 +0200
@@ -12,6 +12,8 @@
   SPEEDUPS
   * Speedup for Intel Broadwell.
 
+  * Square root is now faster when the remainder is not needed.
+
   FEATURES
   * New C++ functions gcd and lcm for mpz_class.
 
diff -r b9a904a2a7a7 -r e40e30414801 mpn/generic/sqrtrem.c
--- a/mpn/generic/sqrtrem.c	Thu Jul 16 17:23:06 2015 +0200
+++ b/mpn/generic/sqrtrem.c	Tue Jul 28 20:24:53 2015 +0200
@@ -1,7 +1,7 @@
 /* mpn_sqrtrem -- square root and remainder
 
-   Contributed to the GNU project by Paul Zimmermann (most code) and
-   Torbjorn Granlund (mpn_sqrtrem1).
+   Contributed to the GNU project by Paul Zimmermann (most code),
+   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).
 
    THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
    MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
@@ -48,6 +48,7 @@
 #include "gmp-impl.h"
 #include "longlong.h"
 #define USE_DIVAPPR_Q 1
+#define TRACE(x)
 
 static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
 {
@@ -233,6 +234,7 @@
       q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
       if (q != 0)
 	ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
+      TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
       mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
       q += scratch[l];
       c = scratch[0] & 1;
@@ -243,6 +245,7 @@
       q >>= 1;
       if (c != 0)
 	c = mpn_add_n (np + l, np + l, sp + l, h);
+      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
       mpn_sqr (np + n, sp, l);
       b = q + mpn_sub_n (np, np, np + n, 2 * l);
       c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
@@ -292,12 +295,12 @@
 }
 #endif
 
-/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
+/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
    returns zero if the operand was a perfect square, one otherwise.
-   Assumes {np, 2n} is semi-normalized, i.e. np[2n-1] != 0
-   where B=2^GMP_NUMB_BITS.  */
+   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
+   where B=2^GMP_NUMB_BITS. */
 static int
-mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, int nsh)
+mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
 {
   mp_limb_t q;			/* carry out of {sp, n} */
   int c;			/* carry out of remainder */
@@ -306,26 +309,27 @@
   TMP_DECL;
   TMP_MARK;
 
-  ASSERT (np[2 * n - 1] != 0);
+  ASSERT (np[2 * n - 1 - odd] != 0);
   ASSERT (n > 4);
   ASSERT (nsh < GMP_NUMB_BITS / 2);
 
   l = (n - 1) / 2;
   h = n - l;
-  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l > 1);
+  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
   scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
   tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
   if (nsh != 0)
     {
-      int o = 1; /* Should be o = (l > 1) */;
-      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o, n + h + 1 + o, 2 * nsh));
+      int o = l > (1 + odd); /* */
+      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
     }
   else
-    MPN_COPY (tp, np + l - 1, n + h + 1);
+    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
   q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
   if (q != 0)
     ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
   qp = tp + n + 1; /* l + 2 */
+  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
 #if USE_DIVAPPR_Q
   mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
 #else
@@ -344,13 +348,14 @@
       mpn_rshift (sp, qp + 1, l, 1);
       sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
       if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
-	   (qp[1] & ((CNST_LIMB(2) << nsh) - 1))) == 0)
+	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
 	{
 	  mp_limb_t cy;
-	  /* Approximation is not good enough, the extra limb(+ s bits)
-	     is smaller than 2. */
+	  /* Approximation is not good enough, the extra limb(+ nsh bits)
+	     is smaller than needed to absorb the possible error. */
 	  /* {qp + 1, l + 1} equals 2*{sp, l} */
 	  /* FIXME: use mullo or wrap-around. */
+	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
 	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
 	  /* Compute the remainder of the previous mpn_div(appr)_q. */
 	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
@@ -379,6 +384,7 @@
 #endif
 	  if (mpn_zero_p (tp + l + 1, h - l))
 	    {
+	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
 	      mpn_sqr (scratch, sp, l);
 	      c = mpn_cmp (tp + 1, scratch + l, l);
 	      if (c == 0)
@@ -388,7 +394,7 @@
 		      mpn_lshift (tp, np, l, 2 * nsh);
 		      np = tp;
 		    }
-		  c = mpn_cmp (np, scratch, l);
+		  c = mpn_cmp (np, scratch + odd, l - odd);
 		}
 	      if (c < 0)
 		{
@@ -400,8 +406,8 @@
     }
   TMP_FREE;
 
-  if (nsh != 0)
-    mpn_rshift (sp, sp, n, nsh);
+  if ((odd | nsh) != 0)
+    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
   return c;
 }
 
@@ -450,23 +456,10 @@
   }
   tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
 
+  if ((rp == NULL) && (nn > 8))
+    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
   TMP_MARK;
-  if ((rp == NULL) && (nn > 8))
-    if ((nn & 1) == 0)
-      return mpn_dc_sqrt (sp, np, tn, c);
-    else
-      {
-	rp = TMP_ALLOC_LIMBS (nn + 1);
-	rp[0] = 0;
-	MPN_COPY (rp + 1, np, nn);
-	/* FIXME: mpn_dc_sqrt already does copies and shifts
-	   internally, it should support c > GMP_NUMB_BITS / 2 ...*/ 
-	rn = mpn_dc_sqrt (sp, rp, tn, c);
-	TMP_FREE;
-	mpn_rshift (sp, sp, tn, GMP_NUMB_BITS / 2);
-	return rn;
-      }
-  else if ((nn & 1) != 0 || c > 0)
+  if (((nn & 1) | c) != 0)
     {
       mp_limb_t mask;
       mp_ptr scratch;