[Gmp-commit] /var/hg/gmp: mpz/bin_uiui.c: Rewrite.
mercurial at gmplib.org
mercurial at gmplib.org
Fri Apr 13 08:16:07 CEST 2012
details: /var/hg/gmp/rev/3c0ba709da78
changeset: 14813:3c0ba709da78
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Fri Apr 13 08:16:03 2012 +0200
description:
mpz/bin_uiui.c: Rewrite.
diffstat:
gen-fac_ui.c | 149 +++++++++++++++-
mpz/bin_uiui.c | 525 ++++++++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 576 insertions(+), 98 deletions(-)
diffs (truncated from 740 to 300 lines):
diff -r 81644c27d7c8 -r 3c0ba709da78 gen-fac_ui.c
--- a/gen-fac_ui.c Thu Apr 12 22:40:26 2012 +0200
+++ b/gen-fac_ui.c Fri Apr 13 08:16:03 2012 +0200
@@ -22,13 +22,52 @@
#include "bootstrap.c"
+void
+mpz_fac_ui (mpz_t x, unsigned long n)
+{
+ if (n < 2) {
+ mpz_set_ui (x, 1);
+ return;
+ }
+ mpz_set_ui (x, n);
+ for (;--n > 1;)
+ mpz_mul_ui (x, x, n);
+}
+
+void
+mpz_bin_uiui (mpz_t r, unsigned long int n, unsigned long int k)
+{
+ mpz_t t;
+
+ if (k > n) {
+ r->_mp_size = 0;
+ return;
+ }
+ mpz_fac_ui (r, n);
+ mpz_init (t);
+ mpz_fac_ui (t, k);
+ mpz_divexact (r, r, t);
+ mpz_fac_ui (t, n - k);
+ mpz_divexact (r, r, t);
+ mpz_clear (t);
+}
+
+int
+mpz_remove_twos (mpz_t x)
+{
+ int r = 0;
+ for (;mpz_even_p (x);r++)
+ mpz_tdiv_q_2exp (x, x, 1);
+ return r;
+}
/* returns 0 on success */
int
gen_consts (int numb, int nail, int limb)
{
- mpz_t x, mask;
+ mpz_t x, mask, y;
unsigned long a, b;
+ unsigned long ofl, ofe;
printf ("/* This file is automatically generated by gen-fac_ui.c */\n\n");
printf ("#if GMP_NUMB_BITS != %d\n", numb);
@@ -73,25 +112,40 @@
}
printf (")\n");
+ ofl = b - 1;
+ printf
+ ("#define ODD_FACTORIAL_TABLE_LIMIT (%lu)\n", ofl);
mpz_init (mask);
mpz_setbit (mask, numb);
mpz_sub_ui (mask, mask, 1);
-#if 0
printf
("\n/* Previous table, continued, values modulo 2^GMP_NUMB_BITS */\n");
printf
- ("#define ONE_LIMB_ODD_FACTORIAL_TABLE_CONT CNST_LIMB(0x");
+ ("#define ONE_LIMB_ODD_FACTORIAL_EXTTABLE CNST_LIMB(0x");
+ mpz_and (x, x, mask);
mpz_out_str (stdout, 16, x);
- for (;b < numb; b++)
+ mpz_init (y);
+ mpz_bin_uiui (y, b, b/2);
+ b++;
+ for (;; b++)
{
for (a = b; (a & 1) == 0; a >>= 1);
+ if (a == b) {
+ mpz_divexact_ui (y, y, a/2+1);
+ mpz_mul_ui (y, y, a);
+ } else
+ mpz_mul_2exp (y, y, 1);
+ if (mpz_sizeinbase (y, 2) > numb)
+ break;
mpz_mul_ui (x, x, a);
mpz_and (x, x, mask);
printf ("),CNST_LIMB(0x");
mpz_out_str (stdout, 16, x);
}
printf (")\n");
-#endif
+ ofe = b - 1;
+ printf
+ ("#define ODD_FACTORIAL_EXTTABLE_LIMIT (%lu)\n", ofe);
printf
("\n/* This table is 1!!,3!!,...,(2n+1)!! where (2n+1)!! has <= GMP_NUMB_BITS bits */\n");
@@ -122,6 +176,91 @@
}
printf (")\n");
+ mpz_add_ui (mask, mask, 1);
+ printf
+ ("\n/* This table contains inverses of odd factorials, modulo 2^GMP_NUMB_BITS */\n");
+ printf
+ ("\n/* It begins with (2!/2)^-1=1 */\n");
+ printf
+ ("#define ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE CNST_LIMB(0x1");
+ mpz_set_ui (x, 1);
+ for (b = 3;b <= ofe - 2; b++)
+ {
+ for (a = b; (a & 1) == 0; a >>= 1);
+ mpz_mul_ui (x, x, a);
+ mpz_invert (y, x, mask);
+ printf ("),CNST_LIMB(0x");
+ mpz_out_str (stdout, 16, y);
+ }
+ printf (")\n");
+
+ printf
+ ("\n/* This table contains 1i-popc(2i) for small i */\n");
+ printf
+ ("\n/* It begins with 2-1=1 (N=1) */\n");
+ printf
+ ("#define TABLE_2N_MINUS_POPC_2N 1");
+ for (b = 4;b <= ofe + 1; b+=2)
+ {
+ mpz_set_ui (x, b);
+ printf (",%lu",b - mpz_popcount (x));
+ }
+ printf ("\n");
+
+ ofl = (ofl + 1) / 2;
+ printf
+ ("#define ODD_CENTRAL_BINOMIAL_OFFSET (%lu)\n", ofl);
+ printf
+ ("\n/* This table contains binomial(2k,k)/2^t */\n");
+ printf
+ ("\n/* It begins with ODD_CENTRAL_BINOMIAL_TABLE_MIN */\n");
+ printf
+ ("#define ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE ");
+ for (b = ofl;; b++)
+ {
+ mpz_bin_uiui (x, 2 * b, b);
+ mpz_remove_twos (x);
+ if (mpz_sizeinbase (x, 2) > numb)
+ break;
+ if (b != ofl)
+ printf ("),");
+ printf("CNST_LIMB(0x");
+ mpz_out_str (stdout, 16, x);
+ }
+ printf (")\n");
+
+ ofe = b - 1;
+ printf
+ ("#define ODD_CENTRAL_BINOMIAL_TABLE_LIMIT (%lu)\n", ofe);
+
+ printf
+ ("\n/* This table contains the inverses of elements in the previous table. */\n");
+ printf
+ ("#define ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE CNST_LIMB(0x");
+ for (b = ofl; b <= ofe; b++)
+ {
+ mpz_bin_uiui (x, 2 * b, b);
+ mpz_remove_twos (x);
+ mpz_invert (x, x, mask);
+ mpz_out_str (stdout, 16, x);
+ if (b != ofe)
+ printf ("),CNST_LIMB(0x");
+ }
+ printf (")\n");
+
+ printf
+ ("\n/* This table contains the values t in the formula binomial(2k,k)/2^t */\n");
+ printf
+ ("#define CENTRAL_BINOMIAL_2FAC_TABLE ");
+ for (b = ofl; b <= ofe; b++)
+ {
+ mpz_bin_uiui (x, 2 * b, b);
+ printf ("%lu",mpz_remove_twos (x));
+ if (b != ofe)
+ printf (",");
+ }
+ printf ("\n");
+
#if 0
mpz_set_ui (x, 1);
mpz_mul_2exp (x, x, limb + 1); /* x=2^(limb+1) */
diff -r 81644c27d7c8 -r 3c0ba709da78 mpz/bin_uiui.c
--- a/mpz/bin_uiui.c Thu Apr 12 22:40:26 2012 +0200
+++ b/mpz/bin_uiui.c Fri Apr 13 08:16:03 2012 +0200
@@ -1,7 +1,8 @@
/* mpz_bin_uiui - compute n over k.
-Copyright 1998, 1999, 2000, 2001, 2002, 2003, 2006 Free Software Foundation,
-Inc.
+Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+Copyright 2011, 2012 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -22,102 +23,440 @@
#include "gmp-impl.h"
#include "longlong.h"
+#include "fac_ui.h"
-/* Enhancement: It ought to be possible to calculate the size of the final
- result in advance, to a rough approximation at least, and use it to do
- just one realloc. Stirling's approximation n! ~= sqrt(2*pi*n)*(n/e)^n
- (Knuth section 1.2.5) might be of use. */
+#ifndef BIN_UIUI_ENABLE_SMALLDC
+#define BIN_UIUI_ENABLE_SMALLDC (1)
+#endif
+#ifndef BIN_UIUI_RECURSIVE_SMALLDC
+#define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
+#endif
+/* Algorithm:
-/* "inc" in the main loop allocates a chunk more space if not already
- enough, so as to avoid repeated reallocs. The final step on the other
- hand requires only one more limb. */
-#define MULDIV(inc) \
- do { \
- ASSERT (rsize <= ralloc); \
- \
- if (rsize == ralloc) \
- { \
- mp_size_t new_ralloc = ralloc + (inc); \
- rp = __GMP_REALLOCATE_FUNC_LIMBS (rp, ralloc, new_ralloc); \
- ralloc = new_ralloc; \
- } \
- \
- rp[rsize] = mpn_mul_1 (rp, rp, rsize, nacc); \
- MPN_DIVREM_OR_DIVEXACT_1 (rp, rp, rsize+1, kacc); \
- rsize += (rp[rsize] != 0); \
- \
-} while (0)
+ Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
+ which are then accumulated into mpn numbers. The first inner loop
+ accumulates divisor factors, the 2nd inner loop accumulates exactly the same
+ number of dividend factors. We avoid accumulating more for the divisor,
+ even with its smaller factors, since we else cannot guarantee divisibility.
+
+ Since we know each division will yield an integer, we compute the quotient
+ using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
+ 2^t.
+
+ Improvements:
+
+ (1) An obvious improvement to this code would be to compute mod 2^t
+ everywhere. Unfortunately, we cannot determine t beforehand, unless we
+ invoke some approximation, such as Stirling's formula. Of course, we don't
+ need t to be tight. However, it is not clear that this would help much,
+ our numbers are kept reasonably small already.
+
+ (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
+ Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
+ would make it both reasonably accurate and fast. (We could use a table
+ stored into a limb, perhaps.) The table should take the removed factors of
+ 2 into account (those done on-the-fly in mulN).
+*/
+
+/* This threshold determines how large divisor to accumulate before we call
+ bdiv. Perhaps we should never call bdiv, and accumulate all we are told,
+ since we are just basecase code anyway? Presumably, this depends on the
+ relative speed of the asymptotically fast code and this code. */
+#define SOME_THRESHOLD 20
+
+/* Multiply-into-limb functions. These remove factors of 2 on-the-fly. FIXME:
+ All versions of MAXFACS don't take this 2 removal into account now, meaning
+ that then, shifting just adds some overhead. (We remove factors from the
+ completed limb anyway.) */
+
+static mp_limb_t
+mul1 (mp_limb_t m)
+{
+ return m;
+}
+
+static mp_limb_t
+mul2 (mp_limb_t m)
+{
+ mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
+ return m01;
+}
+
+static mp_limb_t
+mul3 (mp_limb_t m)
+{
+ mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
+ mp_limb_t m2 = (m + 2);
+ return m01 * m2;
More information about the gmp-commit
mailing list