[Gmp-commit] /var/hg/gmp: 4 new changesets

Tue Aug 18 20:29:35 UTC 2015

details:   /var/hg/gmp/rev/ed9694d3007a
changeset: 16769:ed9694d3007a
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Aug 18 20:44:54 2015 +0200
description:
mpn/generic/rootrem.c: Rearrange loop to enable an initial seed.

details:   /var/hg/gmp/rev/838a3505a0b9
changeset: 16770:838a3505a0b9
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Aug 18 20:56:38 2015 +0200
description:
mpn/generic/rootrem.c: Use logbased_root for a 9-bits initial estimate.

details:   /var/hg/gmp/rev/55bad9424949
changeset: 16771:55bad9424949
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Aug 18 21:06:03 2015 +0200
description:
ChangeLog

details:   /var/hg/gmp/rev/150fe66d26cf
changeset: 16772:150fe66d26cf
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Aug 18 22:28:46 2015 +0200
description:
...

diffstat:

 ChangeLog             |   10 +
 mpn/generic/rootrem.c |  342 ++++++++++++++++++++++++++++++-------------------
 2 files changed, 219 insertions(+), 133 deletions(-)

diffs (truncated from 477 to 300 lines):

diff -r 7850a6d24896 -r 150fe66d26cf ChangeLog

--- a/ChangeLog	Mon Aug 17 21:47:23 2015 +0200
+++ b/ChangeLog	Tue Aug 18 22:28:46 2015 +0200
@@ -1,3 +1,8 @@
+2015-08-04 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+	* mpn/generic/rootrem.c (logbased_root): New function.
+	(mpn_rootrem_internal): Use it to estimate highest 9 bits of the root.
+
 2015-08-17  TorbjÃ¶rn Granlund  <torbjorng at google.com>
 
 	* acinclude.m4 (X86_64_PATTERN): Add skylake.
@@ -29,6 +34,11 @@
 	* tune/speed.h (SPEED_ROUTINE_MPN_MULLO_BASECASE): Update.
 	(SPEED_ROUTINE_MPN_SQRLO): New macro.
 
+	* mpn/generic/rootrem.c: Avoid divisions if not needed.
+
+	* tests/mpn/t-broot.c: Test also k=1.
+	* mpz/aorsmul_i.c: Move branches out of main line.
+
 2015-07-28 Marco Bodrato <bodrato at mail.dm.unipi.it>
 
 	* mpn/generic/sqrtrem.c (mpn_dc_sqrt): Support odd sizes.
diff -r 7850a6d24896 -r 150fe66d26cf mpn/generic/rootrem.c
--- a/mpn/generic/rootrem.c	Mon Aug 17 21:47:23 2015 +0200
+++ b/mpn/generic/rootrem.c	Tue Aug 18 22:28:46 2015 +0200
@@ -50,14 +50,13 @@
 static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t,
 				       mp_limb_t, int);
 
-#define MPN_RSHIFT(cy,rp,up,un,cnt) \
+#define MPN_RSHIFT(rp,up,un,cnt) \
   do {									\
     if ((cnt) != 0)							\
-      cy = mpn_rshift (rp, up, un, cnt);				\
+      mpn_rshift (rp, up, un, cnt);					\
     else								\
       {									\
 	MPN_COPY_INCR (rp, up, un);					\
-	cy = 0;								\
       }									\
   } while (0)
 
@@ -125,6 +124,82 @@
     }
 }
 
+#define LOGROOT_USED_BITS 8
+#define LOGROOT_NEEDS_TWO_CORRECTIONS 1
+#define LOGROOT_RETURNED_BITS (LOGROOT_USED_BITS + LOGROOT_NEEDS_TWO_CORRECTIONS)
+/* Puts in *rootp some bits of the k^nt root of the number
+   2^bitn * 1.op ; where op represents the "fractional" bits.
+
+   The returned value is the number of bits of the root minus one;
+   i.e. an approximation of the root will be
+   (*rootp) * 2^(retval-LOGROOT_RETURNED_BITS+1).
+
+   Currently, only LOGROOT_USED_BITS bits of op are used (the implicit
+   one is not counted).
+ */
+static unsigned
+logbased_root (mp_ptr rootp, mp_limb_t op, mp_bitcnt_t bitn, mp_limb_t k)
+{
+  /* vlog=vector(256,i,floor((log(256+i)/log(2)-8)*256)-(i>255)) */
+  static const
+  unsigned char vlog[] = {1,   2,   4,   5,   7,   8,   9,  11,  12,  14,  15,  16,  18,  19,  21,  22,
+			 23,  25,  26,  27,  29,  30,  31,  33,  34,  35,  37,  38,  39,  40,  42,  43,
+			 44,  46,  47,  48,  49,  51,  52,  53,  54,  56,  57,  58,  59,  61,  62,  63,
+			 64,  65,  67,  68,  69,  70,  71,  73,  74,  75,  76,  77,  78,  80,  81,  82,
+			 83,  84,  85,  87,  88,  89,  90,  91,  92,  93,  94,  96,  97,  98,  99, 100,
+			101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
+			118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134,
+			135, 136, 137, 138, 139, 140, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+			150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 162, 163, 164,
+			165, 166, 167, 168, 169, 170, 171, 172, 173, 173, 174, 175, 176, 177, 178, 179,
+			180, 181, 181, 182, 183, 184, 185, 186, 187, 188, 188, 189, 190, 191, 192, 193,
+			194, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 204, 205, 205, 206,
+			207, 208, 209, 209, 210, 211, 212, 213, 214, 214, 215, 216, 217, 218, 218, 219,
+			220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 228, 229, 229, 230, 231, 232,
+			232, 233, 234, 235, 235, 236, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244,
+			245, 245, 246, 247, 247, 248, 249, 250, 250, 251, 252, 253, 253, 254, 255, 255};
+
+  /* vexp=vector(256,i,floor(2^(8+i/256)-256)-(i>255)) */
+  static const
+  unsigned char vexp[] = {0,   1,   2,   2,   3,   4,   4,   5,   6,   7,   7,   8,   9,   9,  10,  11,
+			 12,  12,  13,  14,  14,  15,  16,  17,  17,  18,  19,  20,  20,  21,  22,  23,
+			 23,  24,  25,  26,  26,  27,  28,  29,  30,  30,  31,  32,  33,  33,  34,  35,
+			 36,  37,  37,  38,  39,  40,  41,  41,  42,  43,  44,  45,  45,  46,  47,  48,
+			 49,  50,  50,  51,  52,  53,  54,  55,  55,  56,  57,  58,  59,  60,  61,  61,
+			 62,  63,  64,  65,  66,  67,  67,  68,  69,  70,  71,  72,  73,  74,  75,  75,
+			 76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  86,  87,  88,  89,  90,
+			 91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
+			107, 108, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122,
+			123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
+			139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156,
+			157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174,
+			175, 176, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 191, 192, 193,
+			194, 196, 197, 198, 199, 200, 202, 203, 204, 205, 207, 208, 209, 210, 212, 213,
+			214, 216, 217, 218, 219, 221, 222, 223, 225, 226, 227, 229, 230, 231, 232, 234,
+			235, 236, 238, 239, 240, 242, 243, 245, 246, 247, 249, 250, 251, 253, 254, 255};
+  mp_bitcnt_t retval;
+
+  if (UNLIKELY (bitn > (~ (mp_bitcnt_t) 0) >> LOGROOT_USED_BITS))
+    {
+      /* In the unlikely case, we use two divisions and a modulo. */
+      retval = bitn / k;
+      bitn %= k;
+      bitn = (bitn << LOGROOT_USED_BITS |
+	      vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
+    }
+  else
+    {
+      bitn = (bitn << LOGROOT_USED_BITS |
+	      vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
+      retval = bitn >> LOGROOT_USED_BITS;
+      bitn &= (CNST_LIMB (1) << LOGROOT_USED_BITS) - 1;
+    }
+  ASSERT(bitn < CNST_LIMB (1) << LOGROOT_USED_BITS);
+  *rootp = CNST_LIMB(1) << (LOGROOT_USED_BITS - ! LOGROOT_NEEDS_TWO_CORRECTIONS)
+    | vexp[bitn] >> ! LOGROOT_NEEDS_TWO_CORRECTIONS;
+  return retval;
+}
+
 /* if approx is non-zero, does not compute the final remainder */
 static mp_size_t
 mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
@@ -132,21 +207,26 @@
 {
   mp_ptr qp, rp, sp, wp, scratch;
   mp_size_t qn, rn, sn, wn, nl, bn;
-  mp_limb_t save, save2, cy;
+  mp_limb_t save, save2, cy, uh;
   mp_bitcnt_t unb; /* number of significant bits of {up,un} */
   mp_bitcnt_t xnb; /* number of significant bits of the result */
   mp_bitcnt_t b, kk;
   mp_bitcnt_t sizes[GMP_NUMB_BITS + 1];
-  int ni, i;
-  int c, perf_pow;
-  int logk;
+  int ni;
+  int perf_pow;
+  unsigned ulz, snb, c, logk;
   TMP_DECL;
 
-  MPN_SIZEINBASE_2EXP(unb, up, un, 1);
-  /* unb is the number of bits of the input U */
+  /* MPN_SIZEINBASE_2EXP(unb, up, un, 1); --unb; */
+  uh = up[un - 1];
+  count_leading_zeros (ulz, uh);
+  ulz = ulz - GMP_NAIL_BITS + 1; /* Ignore the first 1. */
+  unb = (mp_bitcnt_t) un * GMP_NUMB_BITS - ulz;
+  /* unb is the (truncated) logarithm of the input U in base 2*/
 
-  if (unb <= k) /* root is 1 */
+  if (unb < k) /* root is 1 */
     {
+      rootp[0] = 1;
       if (remp == NULL)
 	un -= (*up == CNST_LIMB (1)); /* Non-zero iif {up,un} > 1 */
       else
@@ -155,43 +235,48 @@
 	  un -= (remp [un - 1] == 0);	/* There should be at most one zero limb,
 				   if we demand u to be normalized  */
 	}
-      rootp[0] = 1;
       return un;
     }
-  /* if (unb - k <= k/2) // root is 2 */
+  /* if (unb - k < k/2 + k/16) // root is 2 */
 
-  xnb = (unb - 1) / k + 1;	/* ceil (unb / k) */
-  /* xnb is the number of bits of the root R */
+  if (ulz == GMP_NUMB_BITS)
+    uh = up[un - 2];
+  else
+    uh = (uh << ulz & GMP_NUMB_MASK) | up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz);
+  ASSERT (un != 1 || up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz) == 1);
 
-  /* We initialize the algorithm with a 1-bit approximation to zero: since we
-     know the root has exactly xnb bits, we write r0 = 2^(xnb-1), so that
-     r0^k = 2^(k*(xnb-1)), that we subtract to the input. */
-  kk = k * (xnb - 1);		/* number of truncated bits in the input */
-  --kk;
-  rn = un - kk / GMP_NUMB_BITS; /* number of limbs of the non-truncated part */
+  xnb = logbased_root (rootp, uh, unb, k);
+  snb = LOGROOT_RETURNED_BITS - 1;
+  /* xnb+1 is the number of bits of the root R */
+  /* snb+1 is the number of bits of the current approximation S */
 
-  for (logk = 1; ((k - 1) >> logk) != 0; logk++)
+  kk = k * xnb;		/* number of truncated bits in the input */
+
+  /* FIXME: Should we skipp all loops when xnb > snb ? */
+  for (uh = k - 1, logk = 2; (uh >>= 1) != 0; ++logk )
     ;
-  /* logk = ceil(log(k)/log(2)) */
+  /* logk = ceil(log(k)/log(2)) + 1 */
+  
+  /* xnb is the number of remaining bits to determine in the kth root */
+  for (ni = 0; (sizes[ni] = xnb) > snb; ++ni)
+    {
+      /* invariant: here we want xnb+1 total bits for the kth root */
 
-  b = xnb - 1; /* number of remaining bits to determine in the kth root */
-  ni = 0;
-  do
-    {
-      /* invariant: here we want b+1 total bits for the kth root */
-      sizes[ni] = b;
-      /* if c is the new value of b, this means that we'll go from a root
-	 of c+1 bits (say s') to a root of b+1 bits.
-	 It is proved in the book "Modern Computer Arithmetic" from Brent
+      /* if c is the new value of xnb, this means that we'll go from a
+	 root of c+1 bits (say s') to a root of xnb+1 bits.
+	 It is proved in the book "Modern Computer Arithmetic" by Brent
 	 and Zimmermann, Chapter 1, that
 	 if s' >= k*beta, then at most one correction is necessary.
-	 Here beta = 2^(b-c), and s' >= 2^c, thus it suffices that
-	 c >= ceil((b + log2(k))/2). */
-      b = (b + logk + 1) / 2;
-      if (b >= sizes[ni])
-	b = sizes[ni] - 1;	/* add just one bit at a time */
-      ni++;
-    } while (b != 0);
+	 Here beta = 2^(xnb-c), and s' >= 2^c, thus it suffices that
+	 c >= ceil((xnb + log2(k))/2). */
+      if (xnb > logk)
+	xnb = (xnb + logk) / 2;
+      else
+	--xnb;	/* add just one bit at a time */
+    }
+
+  *rootp >>= snb - xnb;
+  kk -= xnb;
 
   ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1);
   /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with
@@ -215,24 +300,14 @@
 					 and temporary for mpn_pow_1 */
 
   if (remp == NULL)
-    rp = scratch;     /* will contain the remainder */
+    rp = scratch;	/* will contain the remainder */
   else
     rp = remp;
   sp = rootp;
 
-  MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS);
-  MPN_DECR_U (rp, rn, 2);	/* subtract the initial approximation: since
-				   the non-truncated part is less than 2^k, it
-				   is <= k bits: rn <= ceil(k/GMP_NUMB_BITS) */
-  rn -= rp [rn - 1] == 0;
-  sp[0] = save = 2;		/* initial approximation */
-  sn = 1;			/* it has one limb */
+  sn = 1;		/* Initial approximation has one limb */
 
-  wp[0] = k; /* k * {sp,sn}^(k-1) */
-  wn = 1;
-  i = ni;
-  b = bn = 1;
-  do
+  for (b = xnb; ni != 0; --ni)
     {
       /* 1: loop invariant:
 	 {sp, sn} is the current approximation of the root, which has
@@ -242,58 +317,13 @@
 	 kk = number of truncated bits of the input
       */
 
-      /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
-
-      /* now divide {rp, rn} by {wp, wn} to get the low part of the root */
-      if (UNLIKELY (rn < wn))
-	{
-	  MPN_ZERO (sp, bn);
-	}
-      else
-	{
-	  qn = rn - wn; /* expected quotient size */
-	  if (qn <= bn) { /* Divide only if result is not too big. */
-	    mpn_div_q (qp, rp, rn, wp, wn, scratch);
-	    qn += qp[qn] != 0;
-	  }
-
-      /* 5: current buffers: {sp,sn}, {qp,qn}.
-	 Note: {rp,rn} is not needed any more since we'll compute it from
-	 scratch at the end of the loop.
-       */
-
-      /* the quotient should be smaller than 2^b, since the previous
-	 approximation was correctly rounded toward zero */
-	  if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) &&
-			  qp[qn - 1] >= (CNST_LIMB (1) << (b % GMP_NUMB_BITS))))
-	    {
-	      for (qn = 1; qn < bn; ++qn)
-		sp[qn - 1] = GMP_NUMB_MAX;
-	      sp[qn - 1] = GMP_NUMB_MAX >> (GMP_NUMB_BITS-1 - ((b-1) % GMP_NUMB_BITS));
-	    }
-	  else