[Gmp-commit] /var/hg/gmp: mpz/bin_uiui.c: Rewrite.

Fri Apr 13 08:16:07 CEST 2012

details:   /var/hg/gmp/rev/3c0ba709da78
changeset: 14813:3c0ba709da78
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Fri Apr 13 08:16:03 2012 +0200
description:
mpz/bin_uiui.c: Rewrite.

diffstat:

 gen-fac_ui.c   |  149 +++++++++++++++-
 mpz/bin_uiui.c |  525 ++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 576 insertions(+), 98 deletions(-)

diffs (truncated from 740 to 300 lines):

diff -r 81644c27d7c8 -r 3c0ba709da78 gen-fac_ui.c

--- a/gen-fac_ui.c	Thu Apr 12 22:40:26 2012 +0200
+++ b/gen-fac_ui.c	Fri Apr 13 08:16:03 2012 +0200
@@ -22,13 +22,52 @@
 
 #include "bootstrap.c"
 
+void
+mpz_fac_ui (mpz_t x, unsigned long n)
+{
+  if (n < 2) {
+    mpz_set_ui (x, 1);
+    return;
+  }
+  mpz_set_ui (x, n);
+  for (;--n > 1;)
+    mpz_mul_ui (x, x, n);
+}
+
+void
+mpz_bin_uiui (mpz_t r, unsigned long int n, unsigned long int k)
+{
+  mpz_t t;
+
+  if (k > n) {
+    r->_mp_size = 0;
+    return;
+  }
+  mpz_fac_ui (r, n);
+  mpz_init (t);
+  mpz_fac_ui (t, k);
+  mpz_divexact (r, r, t);
+  mpz_fac_ui (t, n - k);
+  mpz_divexact (r, r, t);
+  mpz_clear (t);
+}
+
+int
+mpz_remove_twos (mpz_t x)
+{
+  int r = 0;
+  for (;mpz_even_p (x);r++)
+    mpz_tdiv_q_2exp (x, x, 1);
+  return r;
+}
 
 /* returns 0 on success		*/
 int
 gen_consts (int numb, int nail, int limb)
 {
-  mpz_t x, mask;
+  mpz_t x, mask, y;
   unsigned long a, b;
+  unsigned long ofl, ofe;
 
   printf ("/* This file is automatically generated by gen-fac_ui.c */\n\n");
   printf ("#if GMP_NUMB_BITS != %d\n", numb);
@@ -73,25 +112,40 @@
     }
   printf (")\n");
 
+  ofl = b - 1;
+  printf
+    ("#define ODD_FACTORIAL_TABLE_LIMIT (%lu)\n", ofl);
   mpz_init (mask);
   mpz_setbit (mask, numb);
   mpz_sub_ui (mask, mask, 1);
-#if 0
   printf
     ("\n/* Previous table, continued, values modulo 2^GMP_NUMB_BITS */\n");
   printf
-    ("#define ONE_LIMB_ODD_FACTORIAL_TABLE_CONT CNST_LIMB(0x");
+    ("#define ONE_LIMB_ODD_FACTORIAL_EXTTABLE CNST_LIMB(0x");
+  mpz_and (x, x, mask);
   mpz_out_str (stdout, 16, x);
-  for (;b < numb; b++)
+  mpz_init (y);
+  mpz_bin_uiui (y, b, b/2);
+  b++;
+  for (;; b++)
     {
       for (a = b; (a & 1) == 0; a >>= 1);
+      if (a == b) {
+	mpz_divexact_ui (y, y, a/2+1); 
+	mpz_mul_ui (y, y, a); 
+      } else
+	mpz_mul_2exp (y, y, 1);
+      if (mpz_sizeinbase (y, 2) > numb)
+	break;
       mpz_mul_ui (x, x, a);
       mpz_and (x, x, mask);
       printf ("),CNST_LIMB(0x");
       mpz_out_str (stdout, 16, x);
     }
   printf (")\n");
-#endif
+  ofe = b - 1;
+  printf
+    ("#define ODD_FACTORIAL_EXTTABLE_LIMIT (%lu)\n", ofe);
 
   printf
     ("\n/* This table is 1!!,3!!,...,(2n+1)!! where (2n+1)!! has <= GMP_NUMB_BITS bits */\n");
@@ -122,6 +176,91 @@
     }
   printf (")\n");
 
+  mpz_add_ui (mask, mask, 1);
+  printf
+    ("\n/* This table contains inverses of odd factorials, modulo 2^GMP_NUMB_BITS */\n");
+  printf
+    ("\n/* It begins with (2!/2)^-1=1 */\n");
+  printf
+    ("#define ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE CNST_LIMB(0x1");
+  mpz_set_ui (x, 1);
+  for (b = 3;b <= ofe - 2; b++)
+    {
+      for (a = b; (a & 1) == 0; a >>= 1);
+      mpz_mul_ui (x, x, a);
+      mpz_invert (y, x, mask);
+      printf ("),CNST_LIMB(0x");
+      mpz_out_str (stdout, 16, y);
+    }
+  printf (")\n");
+
+  printf
+    ("\n/* This table contains 1i-popc(2i) for small i */\n");
+  printf
+    ("\n/* It begins with 2-1=1 (N=1) */\n");
+  printf
+    ("#define TABLE_2N_MINUS_POPC_2N 1");
+  for (b = 4;b <= ofe + 1; b+=2)
+    {
+      mpz_set_ui (x, b);
+      printf (",%lu",b - mpz_popcount (x));
+    }
+  printf ("\n");
+
+  ofl = (ofl + 1) / 2;
+  printf
+    ("#define ODD_CENTRAL_BINOMIAL_OFFSET (%lu)\n", ofl);
+  printf
+    ("\n/* This table contains binomial(2k,k)/2^t */\n");
+  printf
+    ("\n/* It begins with ODD_CENTRAL_BINOMIAL_TABLE_MIN */\n");
+  printf
+    ("#define ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE ");
+  for (b = ofl;; b++)
+    {
+      mpz_bin_uiui (x, 2 * b, b);
+      mpz_remove_twos (x);
+      if (mpz_sizeinbase (x, 2) > numb)
+	break;
+      if (b != ofl)
+	printf ("),");
+      printf("CNST_LIMB(0x");
+      mpz_out_str (stdout, 16, x);
+    }
+  printf (")\n");
+
+  ofe = b - 1;
+  printf
+    ("#define ODD_CENTRAL_BINOMIAL_TABLE_LIMIT (%lu)\n", ofe);
+
+  printf
+    ("\n/* This table contains the inverses of elements in the previous table. */\n");
+  printf
+    ("#define ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE CNST_LIMB(0x");
+  for (b = ofl; b <= ofe; b++)
+    {
+      mpz_bin_uiui (x, 2 * b, b);
+      mpz_remove_twos (x);
+      mpz_invert (x, x, mask);
+      mpz_out_str (stdout, 16, x);
+      if (b != ofe)
+	printf ("),CNST_LIMB(0x");
+    }
+  printf (")\n");
+
+  printf
+    ("\n/* This table contains the values t in the formula binomial(2k,k)/2^t */\n");
+  printf
+    ("#define CENTRAL_BINOMIAL_2FAC_TABLE ");
+  for (b = ofl; b <= ofe; b++)
+    {
+      mpz_bin_uiui (x, 2 * b, b);
+      printf ("%lu",mpz_remove_twos (x));
+      if (b != ofe)
+	printf (",");
+    }
+  printf ("\n");
+
 #if 0
   mpz_set_ui (x, 1);
   mpz_mul_2exp (x, x, limb + 1);	/* x=2^(limb+1)        */
diff -r 81644c27d7c8 -r 3c0ba709da78 mpz/bin_uiui.c
--- a/mpz/bin_uiui.c	Thu Apr 12 22:40:26 2012 +0200
+++ b/mpz/bin_uiui.c	Fri Apr 13 08:16:03 2012 +0200
@@ -1,7 +1,8 @@
 /* mpz_bin_uiui - compute n over k.
 
-Copyright 1998, 1999, 2000, 2001, 2002, 2003, 2006 Free Software Foundation,
-Inc.
+Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+Copyright 2011, 2012 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -22,102 +23,440 @@
 #include "gmp-impl.h"
 #include "longlong.h"
 
+#include "fac_ui.h"
 
-/* Enhancement: It ought to be possible to calculate the size of the final
-   result in advance, to a rough approximation at least, and use it to do
-   just one realloc.  Stirling's approximation n! ~= sqrt(2*pi*n)*(n/e)^n
-   (Knuth section 1.2.5) might be of use.  */
+#ifndef BIN_UIUI_ENABLE_SMALLDC
+#define BIN_UIUI_ENABLE_SMALLDC (1)
+#endif
+#ifndef BIN_UIUI_RECURSIVE_SMALLDC
+#define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
+#endif
+/* Algorithm:
 
-/* "inc" in the main loop allocates a chunk more space if not already
-   enough, so as to avoid repeated reallocs.  The final step on the other
-   hand requires only one more limb.  */
-#define MULDIV(inc)                                                     \
-  do {                                                                  \
-    ASSERT (rsize <= ralloc);                                           \
-                                                                        \
-    if (rsize == ralloc)                                                \
-      {                                                                 \
-        mp_size_t  new_ralloc = ralloc + (inc);                         \
-        rp = __GMP_REALLOCATE_FUNC_LIMBS (rp, ralloc, new_ralloc);      \
-        ralloc = new_ralloc;                                            \
-      }                                                                 \
-                                                                        \
-    rp[rsize] = mpn_mul_1 (rp, rp, rsize, nacc);                        \
-    MPN_DIVREM_OR_DIVEXACT_1 (rp, rp, rsize+1, kacc);                   \
-    rsize += (rp[rsize] != 0);                                          \
-                                                                        \
-} while (0)
+   Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
+   which are then accumulated into mpn numbers.  The first inner loop
+   accumulates divisor factors, the 2nd inner loop accumulates exactly the same
+   number of dividend factors.  We avoid accumulating more for the divisor,
+   even with its smaller factors, since we else cannot guarantee divisibility.
+
+   Since we know each division will yield an integer, we compute the quotient
+   using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
+   2^t.
+
+   Improvements:
+
+   (1) An obvious improvement to this code would be to compute mod 2^t
+   everywhere.  Unfortunately, we cannot determine t beforehand, unless we
+   invoke some approximation, such as Stirling's formula.  Of course, we don't
+   need t to be tight.  However, it is not clear that this would help much,
+   our numbers are kept reasonably small already.
+
+   (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
+   Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
+   would make it both reasonably accurate and fast.  (We could use a table
+   stored into a limb, perhaps.)  The table should take the removed factors of
+   2 into account (those done on-the-fly in mulN).
+*/
+
+/* This threshold determines how large divisor to accumulate before we call
+   bdiv.  Perhaps we should never call bdiv, and accumulate all we are told,
+   since we are just basecase code anyway?  Presumably, this depends on the
+   relative speed of the asymptotically fast code and this code.  */
+#define SOME_THRESHOLD 20
+
+/* Multiply-into-limb functions.  These remove factors of 2 on-the-fly.  FIXME:
+   All versions of MAXFACS don't take this 2 removal into account now, meaning
+   that then, shifting just adds some overhead.  (We remove factors from the
+   completed limb anyway.)  */
+
+static mp_limb_t
+mul1 (mp_limb_t m)
+{
+  return m;
+}
+
+static mp_limb_t
+mul2 (mp_limb_t m)
+{
+  mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
+  return m01;
+}
+
+static mp_limb_t
+mul3 (mp_limb_t m)
+{
+  mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
+  mp_limb_t m2 = (m + 2);
+  return m01 * m2;