[Gmp-commit] /var/hg/gmp: Reverted mpz/bin_uiui.c, pushed by mistake...
mercurial at gmplib.org
mercurial at gmplib.org
Mon Apr 9 20:02:44 CEST 2012
details: /var/hg/gmp/rev/469b4e6699c9
changeset: 14800:469b4e6699c9
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Mon Apr 09 20:02:37 2012 +0200
description:
Reverted mpz/bin_uiui.c, pushed by mistake...
diffstat:
ChangeLog | 4 -
mpz/bin_uiui.c | 504 ++++++++++----------------------------------------------
2 files changed, 94 insertions(+), 414 deletions(-)
diffs (truncated from 535 to 300 lines):
diff -r de4654b90e31 -r 469b4e6699c9 ChangeLog
--- a/ChangeLog Mon Apr 09 18:07:50 2012 +0200
+++ b/ChangeLog Mon Apr 09 20:02:37 2012 +0200
@@ -1,7 +1,3 @@
-2012-04-02 Marco Bodrato <bodrato at mail.dm.unipi.it>
-
- * mpz/bin_uiui.c: New "basecase" code (by Torbjorn).
-
2012-04-05 Torbjorn Granlund <tege at gmplib.org>
* mpn/powerpc64/p7/popcount.asm: Properly extend arg n for mode32.
diff -r de4654b90e31 -r 469b4e6699c9 mpz/bin_uiui.c
--- a/mpz/bin_uiui.c Mon Apr 09 18:07:50 2012 +0200
+++ b/mpz/bin_uiui.c Mon Apr 09 20:02:37 2012 +0200
@@ -1,6 +1,7 @@
/* mpz_bin_uiui - compute n over k.
-Copyright 2011, 2012 Free Software Foundation, Inc.
+Copyright 1998, 1999, 2000, 2001, 2002, 2003, 2006 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
@@ -21,419 +22,102 @@
#include "gmp-impl.h"
#include "longlong.h"
-#include "fac_ui.h"
-/* Algorithm:
+/* Enhancement: It ought to be possible to calculate the size of the final
+ result in advance, to a rough approximation at least, and use it to do
+ just one realloc. Stirling's approximation n! ~= sqrt(2*pi*n)*(n/e)^n
+ (Knuth section 1.2.5) might be of use. */
- Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
- which are then accumulated into mpn numbers. The first inner loop
- accumulates divisor factors, the 2nd inner loop accumulates exactly the same
- number of dividend factors. We avoid accumulating more for the divisor,
- even with its smaller factors, since we else cannot guarantee divisibility.
-
- Since we know each division will yield an integer, we compute the quotient
- using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
- 2^t.
-
- Improvements:
-
- (1) An obvious improvement to this code would be to compute mod 2^t
- everywhere. Unfortunately, we cannot determine t beforehand, unless we
- invoke some approximation, such as Stirling's formula. Of course, we don't
- need t to be tight. However, it is not clear that this would help much,
- our numbers are kept reasonably small already.
-
- (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
- Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
- would make it both reasonably accurate and fast. (We could use a table
- stored into a limb, perhaps.) The table should take the removed factors of
- 2 into account (those done on-the-fly in mulN).
-*/
-
-/* This threshold determines how large divisor to accumulate before we call
- bdiv. Perhaps we should never call bdiv, and accumulate all we are told,
- since we are just basecase code anyway? Presumably, this depends on the
- relative speed of the asymptotically fast code ad this code. */
-#define SOME_THRESHOLD 20
-
-/* Multiply-into-limb functions. These remove factors of 2 on-the-fly. FIXME:
- All versions of MAXFACS don't take this 2 removal into account now, meaning
- that then, shifting just adds some overhead. (We remove factors from the
- completed limb anyway.) */
-
-static mp_limb_t
-mul1 (mp_limb_t m)
-{
- return m;
-}
-
-static mp_limb_t
-mul2 (mp_limb_t m)
-{
- mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
- return m01;
-}
-
-static mp_limb_t
-mul3 (mp_limb_t m)
-{
- mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
- mp_limb_t m2 = (m + 2);
- return m01 * m2;
-}
-
-static mp_limb_t
-mul4 (mp_limb_t m)
-{
- mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
- mp_limb_t m23 = (m + 2) * (m + 3) >> 1;
- return m01 * m23 >> 1;
-}
-
-static mp_limb_t
-mul5 (mp_limb_t m)
-{
- mp_limb_t m012 = (m + 0) * (m + 1) * (m + 2) >> 1;
- mp_limb_t m34 = (m + 3) * (m + 4) >> 1;
- return m012 * m34 >> 1;
-}
-
-static mp_limb_t
-mul6 (mp_limb_t m)
-{
- mp_limb_t m01 = (m + 0) * (m + 1);
- mp_limb_t m23 = (m + 2) * (m + 3);
- mp_limb_t m45 = (m + 4) * (m + 5) >> 1;
- mp_limb_t m0123 = m01 * m23 >> 3;
- return m0123 * m45;
-}
-
-static mp_limb_t
-mul7 (mp_limb_t m)
-{
- mp_limb_t m01 = (m + 0) * (m + 1);
- mp_limb_t m23 = (m + 2) * (m + 3);
- mp_limb_t m456 = (m + 4) * (m + 5) * (m + 6) >> 1;
- mp_limb_t m0123 = m01 * m23 >> 3;
- return m0123 * m456;
-}
-
-static mp_limb_t
-mul8 (mp_limb_t m)
-{
- mp_limb_t m01 = (m + 0) * (m + 1);
- mp_limb_t m23 = (m + 2) * (m + 3);
- mp_limb_t m45 = (m + 4) * (m + 5);
- mp_limb_t m67 = (m + 6) * (m + 7);
- mp_limb_t m0123 = m01 * m23 >> 3;
- mp_limb_t m4567 = m45 * m67 >> 3;
- return m0123 * m4567 >> 1;
-}
-
-typedef mp_limb_t (* mulfunc_t) (mp_limb_t);
-
-static const mulfunc_t mulfunc[] = {0,mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8};
-#define M (numberof(mulfunc)-1)
-
-/* Number of factors-of-2 removed by the corresponding mulN functon. */
-static const unsigned char tcnttab[] = {0,0,1,1,3,3,4,4,7};
-
-/* Rough computation of how many factors we can multiply together without
- spilling a limb. */
-#if 0
-/* This variant is inaccurate and relies on an expensive division. */
-#define MAXFACS(max,l) \
- do { \
- int __cnt, __logb; \
- count_leading_zeros (__cnt, (mp_limb_t) (l)); \
- __logb = GMP_LIMB_BITS - __cnt; \
- (max) = (GMP_NUMB_BITS + 1) / __logb; /* FIXME: remove division */ \
- } while (0)
-#else
-
-/* This variant is exact but uses a loop. It takes the 2 removal of mulN into
- account. */
-static const unsigned long ftab[] =
-#if GMP_NUMB_BITS == 64
- /* 1 to 8 factors per iteration */
- {0xfffffffffffffffful,0x100000000ul,0x32cbfe,0x16a0b,0x24c4,0xa16,0x34b,0x1b2 /*,0xdf,0x8d */};
-#endif
-#if GMP_NUMB_BITS == 32
- /* 1 to 7 factors per iteration */
- {0xffffffff,0x10000,0x801,0x16b,0x71,0x42,0x26 /* ,0x1e */};
-#endif
-
-#define MAXFACS(max,l) \
- do { \
- int __i; \
- for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--) \
- ; \
- (max) = __i + 1; \
- } while (0)
-#endif
-
-static void
-mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
-{
- int nmax, kmax, nmaxnow, numfac;
- mp_ptr np, kp;
- mp_size_t nn, kn, alloc;
- mp_limb_t i, j, t, iii, jjj, cy, dinv;
- mp_bitcnt_t i2cnt, j2cnt;
- int cnt;
- mp_size_t maxn;
- TMP_DECL;
-
- TMP_MARK;
-
- maxn = 1 + n / GMP_NUMB_BITS; /* absolutely largest result size (limbs) */
-
- /* FIXME: This allocation might be insufficient, but is usually way too
- large. */
- alloc = SOME_THRESHOLD + MAX (3 * maxn / 2, SOME_THRESHOLD);
- np = TMP_ALLOC_LIMBS (alloc);
- kp = TMP_ALLOC_LIMBS (alloc);
-
- MAXFACS (nmax, n);
- nmax = MIN (nmax, M);
- MAXFACS (kmax, k);
- kmax = MIN (kmax, M);
-
- j = 1;
- i = n - k + 1;
-
- nmax = MIN (nmax, k);
- kmax = MIN (kmax, k);
-
- np[0] = 1; nn = 1;
-
- i2cnt = 0; /* total low zeros in dividend */
- j2cnt = 0; /* total low zeros in divisor */
-
- while (kmax != 0)
- {
- numfac = j;
-
- jjj = mulfunc[kmax] (j);
- j += kmax; /* number of factors used */
- count_trailing_zeros (cnt, jjj); /* count low zeros */
- jjj >>= cnt; /* remove remaining low zeros */
- j2cnt += tcnttab[kmax] + cnt; /* update low zeros count */
- kp[0] = jjj; /* store new factors */
- kn = 1;
- t = k - j + 1;
- kmax = MIN (kmax, t);
-
- while (kmax != 0 && kn < SOME_THRESHOLD)
- {
- jjj = mulfunc[kmax] (j);
- j += kmax; /* number of factors used */
- count_trailing_zeros (cnt, jjj); /* count low zeros */
- jjj >>= cnt; /* remove remaining low zeros */
- j2cnt += tcnttab[kmax] + cnt; /* update low zeros count */
- cy = mpn_mul_1 (kp, kp, kn, jjj); /* accumulate new factors */
- kp[kn] = cy;
- kn += cy != 0;
- t = k - j + 1;
- kmax = MIN (kmax, t);
- }
- numfac = j - numfac;
-
- while (numfac != 0)
- {
- nmaxnow = MIN (nmax, numfac);
- iii = mulfunc[nmaxnow] (i);
- i += nmaxnow; /* number of factors used */
- count_trailing_zeros (cnt, iii); /* count low zeros */
- iii >>= cnt; /* remove remaining low zeros */
- i2cnt += tcnttab[nmaxnow] + cnt; /* update low zeros count */
- cy = mpn_mul_1 (np, np, nn, iii); /* accumulate new factors */
- np[nn] = cy;
- nn += cy != 0;
- numfac -= nmaxnow;
- }
-
- ASSERT (nn < alloc);
-
- binvert_limb (dinv, kp[0]);
- nn += (np[nn - 1] >= kp[kn - 1]);
- nn -= kn;
- mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);
- }
-
- /* Put back the right number of factors of 2. */
- cnt = i2cnt - j2cnt;
- if (cnt != 0)
- {
- ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */
- cy = mpn_lshift (np, np, nn, cnt);
- np[nn] = cy;
- nn += cy != 0;
- }
-
- nn -= np[nn - 1] == 0; /* normalisation */
-
- MPZ_REALLOC (r, nn);
- SIZ(r) = nn;
- MPN_COPY (PTR(r), np, nn);
- TMP_FREE;
-}
-
-/* Entry i contains (i!/2^t) where t is chosen such that the parenthesis
- is an odd integer. */
-static const mp_limb_t fac[] = { ONE_LIMB_ODD_FACTORIAL_TABLE };
-
-#define MAX_K (numberof(fac)-1)
-
-/* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis
- is an odd integer. */
-static const mp_limb_t facinv[] = {
-#if GMP_NUMB_BITS == 64
- 0x0000000000000001,
More information about the gmp-commit
mailing list