[Gmp-commit] /var/hg/gmp: mpz/bin_uiui.c: New "basecase" code (by Torbjorn).

Mon Apr 9 17:48:01 CEST 2012

details:   /var/hg/gmp/rev/2c2dd340fb3c
changeset: 14798:2c2dd340fb3c
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Mon Apr 09 17:47:55 2012 +0200
description:
mpz/bin_uiui.c: New "basecase" code (by Torbjorn).

diffstat:

 ChangeLog      |    4 +
 mpz/bin_uiui.c |  504 ++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 414 insertions(+), 94 deletions(-)

diffs (truncated from 535 to 300 lines):

diff -r ebea11b4a7e6 -r 2c2dd340fb3c ChangeLog

--- a/ChangeLog	Thu Apr 05 10:08:42 2012 +0200
+++ b/ChangeLog	Mon Apr 09 17:47:55 2012 +0200
@@ -1,3 +1,7 @@
+2012-04-02 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+	* mpz/bin_uiui.c: New "basecase" code (by Torbjorn).
+
 2012-04-05  Torbjorn Granlund  <tege at gmplib.org>
 
 	* mpn/powerpc64/p7/popcount.asm: Properly extend arg n for mode32.
diff -r ebea11b4a7e6 -r 2c2dd340fb3c mpz/bin_uiui.c
--- a/mpz/bin_uiui.c	Thu Apr 05 10:08:42 2012 +0200
+++ b/mpz/bin_uiui.c	Mon Apr 09 17:47:55 2012 +0200
@@ -1,7 +1,6 @@
 /* mpz_bin_uiui - compute n over k.
 
-Copyright 1998, 1999, 2000, 2001, 2002, 2003, 2006 Free Software Foundation,
-Inc.
+Copyright 2011, 2012 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -22,102 +21,419 @@
 #include "gmp-impl.h"
 #include "longlong.h"
 
+#include "fac_ui.h"
 
-/* Enhancement: It ought to be possible to calculate the size of the final
-   result in advance, to a rough approximation at least, and use it to do
-   just one realloc.  Stirling's approximation n! ~= sqrt(2*pi*n)*(n/e)^n
-   (Knuth section 1.2.5) might be of use.  */
+/* Algorithm:
 
-/* "inc" in the main loop allocates a chunk more space if not already
-   enough, so as to avoid repeated reallocs.  The final step on the other
-   hand requires only one more limb.  */
-#define MULDIV(inc)                                                     \
-  do {                                                                  \
-    ASSERT (rsize <= ralloc);                                           \
-                                                                        \
-    if (rsize == ralloc)                                                \
-      {                                                                 \
-        mp_size_t  new_ralloc = ralloc + (inc);                         \
-        rp = __GMP_REALLOCATE_FUNC_LIMBS (rp, ralloc, new_ralloc);      \
-        ralloc = new_ralloc;                                            \
-      }                                                                 \
-                                                                        \
-    rp[rsize] = mpn_mul_1 (rp, rp, rsize, nacc);                        \
-    MPN_DIVREM_OR_DIVEXACT_1 (rp, rp, rsize+1, kacc);                   \
-    rsize += (rp[rsize] != 0);                                          \
-                                                                        \
-} while (0)
+   Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
+   which are then accumulated into mpn numbers.  The first inner loop
+   accumulates divisor factors, the 2nd inner loop accumulates exactly the same
+   number of dividend factors.  We avoid accumulating more for the divisor,
+   even with its smaller factors, since we else cannot guarantee divisibility.
+
+   Since we know each division will yield an integer, we compute the quotient
+   using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
+   2^t.
+
+   Improvements:
+
+   (1) An obvious improvement to this code would be to compute mod 2^t
+   everywhere.  Unfortunately, we cannot determine t beforehand, unless we
+   invoke some approximation, such as Stirling's formula.  Of course, we don't
+   need t to be tight.  However, it is not clear that this would help much,
+   our numbers are kept reasonably small already.
+
+   (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
+   Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
+   would make it both reasonably accurate and fast.  (We could use a table
+   stored into a limb, perhaps.)  The table should take the removed factors of
+   2 into account (those done on-the-fly in mulN).
+*/
+
+/* This threshold determines how large divisor to accumulate before we call
+   bdiv.  Perhaps we should never call bdiv, and accumulate all we are told,
+   since we are just basecase code anyway?  Presumably, this depends on the
+   relative speed of the asymptotically fast code ad this code.  */
+#define SOME_THRESHOLD 20
+
+/* Multiply-into-limb functions.  These remove factors of 2 on-the-fly.  FIXME:
+   All versions of MAXFACS don't take this 2 removal into account now, meaning
+   that then, shifting just adds some overhead.  (We remove factors from the
+   completed limb anyway.)  */
+
+static mp_limb_t
+mul1 (mp_limb_t m)
+{
+  return m;
+}
+
+static mp_limb_t
+mul2 (mp_limb_t m)
+{
+  mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
+  return m01;
+}
+
+static mp_limb_t
+mul3 (mp_limb_t m)
+{
+  mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
+  mp_limb_t m2 = (m + 2);
+  return m01 * m2;
+}
+
+static mp_limb_t
+mul4 (mp_limb_t m)
+{
+  mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
+  mp_limb_t m23 = (m + 2) * (m + 3) >> 1;
+  return m01 * m23 >> 1;
+}
+
+static mp_limb_t
+mul5 (mp_limb_t m)
+{
+  mp_limb_t m012 = (m + 0) * (m + 1) * (m + 2) >> 1;
+  mp_limb_t m34 = (m + 3) * (m + 4) >> 1;
+  return m012 * m34 >> 1;
+}
+
+static mp_limb_t
+mul6 (mp_limb_t m)
+{
+  mp_limb_t m01 = (m + 0) * (m + 1);
+  mp_limb_t m23 = (m + 2) * (m + 3);
+  mp_limb_t m45 = (m + 4) * (m + 5) >> 1;
+  mp_limb_t m0123 = m01 * m23 >> 3;
+  return m0123 * m45;
+}
+
+static mp_limb_t
+mul7 (mp_limb_t m)
+{
+  mp_limb_t m01 = (m + 0) * (m + 1);
+  mp_limb_t m23 = (m + 2) * (m + 3);
+  mp_limb_t m456 = (m + 4) * (m + 5) * (m + 6) >> 1;
+  mp_limb_t m0123 = m01 * m23 >> 3;
+  return m0123 * m456;
+}
+
+static mp_limb_t
+mul8 (mp_limb_t m)
+{
+  mp_limb_t m01 = (m + 0) * (m + 1);
+  mp_limb_t m23 = (m + 2) * (m + 3);
+  mp_limb_t m45 = (m + 4) * (m + 5);
+  mp_limb_t m67 = (m + 6) * (m + 7);
+  mp_limb_t m0123 = m01 * m23 >> 3;
+  mp_limb_t m4567 = m45 * m67 >> 3;
+  return m0123 * m4567 >> 1;
+}
+
+typedef mp_limb_t (* mulfunc_t) (mp_limb_t);
+
+static const mulfunc_t mulfunc[] = {0,mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8};
+#define M (numberof(mulfunc)-1)
+
+/* Number of factors-of-2 removed by the corresponding mulN functon.  */
+static const unsigned char tcnttab[] = {0,0,1,1,3,3,4,4,7};
+
+/* Rough computation of how many factors we can multiply together without
+   spilling a limb.  */
+#if 0
+/* This variant is inaccurate and relies on an expensive division.  */
+#define MAXFACS(max,l)							\
+  do {									\
+    int __cnt, __logb;							\
+    count_leading_zeros (__cnt, (mp_limb_t) (l));			\
+    __logb = GMP_LIMB_BITS - __cnt;					\
+    (max) = (GMP_NUMB_BITS + 1) / __logb; /* FIXME: remove division */	\
+  } while (0)
+#else
+
+/* This variant is exact but uses a loop.  It takes the 2 removal of mulN into
+ account.  */
+static const unsigned long ftab[] =
+#if GMP_NUMB_BITS == 64
+  /* 1 to 8 factors per iteration */
+  {0xfffffffffffffffful,0x100000000ul,0x32cbfe,0x16a0b,0x24c4,0xa16,0x34b,0x1b2 /*,0xdf,0x8d */};
+#endif
+#if GMP_NUMB_BITS == 32
+  /* 1 to 7 factors per iteration */
+  {0xffffffff,0x10000,0x801,0x16b,0x71,0x42,0x26 /* ,0x1e */};
+#endif
+
+#define MAXFACS(max,l)							\
+  do {									\
+    int __i;								\
+    for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--)		\
+      ;									\
+    (max) = __i + 1;							\
+  } while (0)
+#endif
+
+static void
+mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
+{
+  int nmax, kmax, nmaxnow, numfac;
+  mp_ptr np, kp;
+  mp_size_t nn, kn, alloc;
+  mp_limb_t i, j, t, iii, jjj, cy, dinv;
+  mp_bitcnt_t i2cnt, j2cnt;
+  int cnt;
+  mp_size_t maxn;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  maxn = 1 + n / GMP_NUMB_BITS;    /* absolutely largest result size (limbs) */
+
+  /* FIXME: This allocation might be insufficient, but is usually way too
+     large.  */
+  alloc = SOME_THRESHOLD + MAX (3 * maxn / 2, SOME_THRESHOLD);
+  np = TMP_ALLOC_LIMBS (alloc);
+  kp = TMP_ALLOC_LIMBS (alloc);
+
+  MAXFACS (nmax, n);
+  nmax = MIN (nmax, M);
+  MAXFACS (kmax, k);
+  kmax = MIN (kmax, M);
+
+  j = 1;
+  i = n - k + 1;
+
+  nmax = MIN (nmax, k);
+  kmax = MIN (kmax, k);
+
+  np[0] = 1; nn = 1;
+
+  i2cnt = 0;				/* total low zeros in dividend */
+  j2cnt = 0;				/* total low zeros in divisor */
+
+  while (kmax != 0)
+    {
+      numfac = j;
+
+      jjj = mulfunc[kmax] (j);
+      j += kmax;				/* number of factors used */
+      count_trailing_zeros (cnt, jjj);		/* count low zeros */
+      jjj >>= cnt;				/* remove remaining low zeros */
+      j2cnt += tcnttab[kmax] + cnt;		/* update low zeros count */
+      kp[0] = jjj;				/* store new factors */
+      kn = 1;
+      t = k - j + 1;
+      kmax = MIN (kmax, t);
+
+      while (kmax != 0 && kn <  SOME_THRESHOLD)
+	{
+	  jjj = mulfunc[kmax] (j);
+	  j += kmax;				/* number of factors used */
+	  count_trailing_zeros (cnt, jjj);	/* count low zeros */
+	  jjj >>= cnt;				/* remove remaining low zeros */
+	  j2cnt += tcnttab[kmax] + cnt;		/* update low zeros count */
+	  cy = mpn_mul_1 (kp, kp, kn, jjj);	/* accumulate new factors */
+	  kp[kn] = cy;
+	  kn += cy != 0;
+	  t = k - j + 1;
+	  kmax = MIN (kmax, t);
+	}
+      numfac = j - numfac;
+
+      while (numfac != 0)
+	{
+	  nmaxnow = MIN (nmax, numfac);
+	  iii = mulfunc[nmaxnow] (i);
+	  i += nmaxnow;				/* number of factors used */
+	  count_trailing_zeros (cnt, iii);	/* count low zeros */
+	  iii >>= cnt;				/* remove remaining low zeros */
+	  i2cnt += tcnttab[nmaxnow] + cnt;	/* update low zeros count */
+	  cy = mpn_mul_1 (np, np, nn, iii);	/* accumulate new factors */
+	  np[nn] = cy;
+	  nn += cy != 0;
+	  numfac -= nmaxnow;
+	}
+
+      ASSERT (nn < alloc);
+
+      binvert_limb (dinv, kp[0]);
+      nn += (np[nn - 1] >= kp[kn - 1]);
+      nn -= kn;
+      mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);
+    }
+
+  /* Put back the right number of factors of 2.  */
+  cnt = i2cnt - j2cnt;
+  if (cnt != 0)
+    {
+      ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */
+      cy = mpn_lshift (np, np, nn, cnt);
+      np[nn] = cy;
+      nn += cy != 0;
+    }
+