[Gmp-commit] /home/hgfiles/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Mon Jan 18 19:00:56 CET 2010
details: /home/hgfiles/gmp/rev/a8a0dc41d911
changeset: 13382:a8a0dc41d911
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Jan 18 18:58:42 2010 +0100
description:
Rewrite FFT tuning code.
details: /home/hgfiles/gmp/rev/6e47f8f34f33
changeset: 13383:6e47f8f34f33
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Jan 18 19:00:36 2010 +0100
description:
(MUL_FFT_TABLE3, SQR_FFT_TABLE3): Provide dummy versions for tuneup builds.
diffstat:
ChangeLog | 9 +
gmp-impl.h | 8 +-
tune/tuneup.c | 406 +++++++++++++++++++++++++++++++++++++++++++++++++---------
3 files changed, 359 insertions(+), 64 deletions(-)
diffs (truncated from 502 to 300 lines):
diff -r ec4b3cc65104 -r 6e47f8f34f33 ChangeLog
--- a/ChangeLog Mon Jan 18 01:05:11 2010 +0100
+++ b/ChangeLog Mon Jan 18 19:00:36 2010 +0100
@@ -1,6 +1,15 @@
2010-01-18 Torbjorn Granlund <tege at gmplib.org>
+ * tune/tuneup.c (fftmes): New function
+ (fft): Rewrite.
+ (mpn_mul_fft_lcm): New function, copied from mpn/generic/mul_fft.c.
+ (fftfill): New function, code taken from mul_fft.c (mpn_mul_fft).
+ (cached_measure): New function.
+
* gmp-impl.h (struct fft_table_nk): Moved from mul_fft.c.
+ (MUL_FFT_TABLE3, SQR_FFT_TABLE3): Provide dummy versions for tuneup
+ builds.
+ (FFT_TABLE3_SIZE): Increase value for tuneup builds.
* mpn/generic/mul_fft.c: Handle a new FFT threshold table type ("3").
Misc cleanups to old table type code.
diff -r ec4b3cc65104 -r 6e47f8f34f33 gmp-impl.h
--- a/gmp-impl.h Mon Jan 18 01:05:11 2010 +0100
+++ b/gmp-impl.h Mon Jan 18 19:00:36 2010 +0100
@@ -4246,6 +4246,9 @@
#undef MUL_FFT_TABLE
#define MUL_FFT_TABLE { 0 }
+#undef MUL_FFT_TABLE3
+#define MUL_FFT_TABLE3 { {0,0} }
+
/* A native mpn_sqr_basecase is not tuned and SQR_BASECASE_THRESHOLD should
remain as zero (always use it). */
#if ! HAVE_NATIVE_mpn_sqr_basecase
@@ -4290,6 +4293,9 @@
#undef SQR_FFT_TABLE
#define SQR_FFT_TABLE { 0 }
+#undef SQR_FFT_TABLE3
+#define SQR_FFT_TABLE3 { {0,0} }
+
#undef MULLO_BASECASE_THRESHOLD
#define MULLO_BASECASE_THRESHOLD mullo_basecase_threshold
extern mp_size_t mullo_basecase_threshold;
@@ -4451,7 +4457,7 @@
#undef FFT_TABLE_ATTRS
#define FFT_TABLE_ATTRS
extern mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE];
-#define FFT_TABLE3_SIZE 200
+#define FFT_TABLE3_SIZE 2000 /* generous space for tuning */
extern struct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE];
/* Sizes the tune program tests up to, used in a couple of recompilations. */
diff -r ec4b3cc65104 -r 6e47f8f34f33 tune/tuneup.c
--- a/tune/tuneup.c Mon Jan 18 01:05:11 2010 +0100
+++ b/tune/tuneup.c Mon Jan 18 19:00:36 2010 +0100
@@ -720,81 +720,352 @@
return pl;
}
+#define NMAX_DEFAULT 1000000
+#define MAX_REPS 25
+#define MIN_REPS 5
+
+static inline size_t
+mpn_mul_fft_lcm (size_t a, unsigned int k)
+{
+ unsigned int l = k;
+
+ while (a % 2 == 0 && k > 0)
+ {
+ a >>= 1;
+ k--;
+ }
+ return a << l;
+}
+
+mp_size_t
+fftfill (mp_size_t pl, int k, int sqr)
+{
+ mp_size_t maxLK;
+ mp_bitcnt_t N, Nprime, nprime, M;
+
+ N = pl * GMP_NUMB_BITS;
+ M = N >> k;
+
+ maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
+
+ Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
+ nprime = Nprime / GMP_NUMB_BITS;
+ if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+ {
+ size_t K2;
+ for (;;)
+ {
+ K2 = 1L << mpn_fft_best_k (nprime, sqr);
+ if ((nprime & (K2 - 1)) == 0)
+ break;
+ nprime = (nprime + K2 - 1) & -K2;
+ Nprime = nprime * GMP_LIMB_BITS;
+ }
+ }
+ ASSERT_ALWAYS (nprime < pl);
+
+ return Nprime;
+}
+
+static int
+compare_double (const void *ap, const void *bp)
+{
+ double a = * (const double *) ap;
+ double b = * (const double *) bp;
+
+ if (a < b)
+ return -1;
+ else if (a > b)
+ return 1;
+ else
+ return 0;
+}
+
+double
+median (double *times, int n)
+{
+ qsort (times, n, sizeof (double), compare_double);
+ return times[n/2];
+}
+
+#define FFT_CACHE_SIZE 25
+typedef struct fft_cache
+{
+ mp_size_t n;
+ double time;
+} fft_cache_t;
+
+fft_cache_t fft_cache[FFT_CACHE_SIZE];
+
+double
+cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
+ int n_measurements)
+{
+ int i;
+ double t, ttab[MAX_REPS];
+
+ if (fft_cache[k].n == n)
+ return fft_cache[k].time;
+
+ for (i = 0; i < n_measurements; i++)
+ {
+ speed_starttime ();
+ mpn_mul_fft (rp, n, ap, n, bp, n, k);
+ ttab[i] = speed_endtime ();
+ }
+
+ t = median (ttab, n_measurements);
+ fft_cache[k].n = n;
+ fft_cache[k].time = t;
+ return t;
+}
+
+int
+fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
+{
+ mp_size_t n, n1, prev_n1;
+ int k, best_k, last_best_k, start_k, kmax;
+ int eff, prev_eff;
+ double t0, t1;
+ int n_measurements;
+ mp_limb_t *ap, *bp, *rp;
+ mp_size_t alloc;
+ char *linepref;
+
+ for (k = 0; k < FFT_CACHE_SIZE; k++)
+ fft_cache[k].n = 0;
+
+ if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+ {
+ nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
+ }
+
+ if (print)
+ printf ("#define %s%*s", p->table_name, 38, "");
+
+ if (idx == 0)
+ {
+ mpn_fft_table3[p->sqr][0].n = nmin;
+ mpn_fft_table3[p->sqr][0].k = initial_k;
+
+ if (print)
+ {
+ printf ("\\\n { ");
+ printf ("{%7u,%2u}", mpn_fft_table3[p->sqr][0].n, mpn_fft_table3[p->sqr][0].k);
+ linepref = " ";
+ }
+
+ idx = 1;
+ }
+
+ ap = malloc (sizeof (mp_limb_t));
+ if (p->sqr)
+ bp = ap;
+ else
+ bp = malloc (sizeof (mp_limb_t));
+ rp = malloc (sizeof (mp_limb_t));
+ alloc = 1;
+
+ n = nmin;
+
+ n_measurements = (18 - initial_k) | 1;
+ n_measurements = MAX (n_measurements, MIN_REPS);
+ n_measurements = MIN (n_measurements, MAX_REPS);
+
+ last_best_k = initial_k;
+ best_k = initial_k;
+
+ while (n < nmax)
+ {
+ /* Assume the current best k is best until we hit its next FFT step. */
+ t0 = 99999;
+
+ prev_n1 = n + 1;
+
+ start_k = MAX (4, best_k - 4);
+ for (k = start_k; k <= 30; k++)
+ {
+ n1 = mpn_fft_next_size (prev_n1, k);
+
+ eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
+
+ if (eff < 70) /* avoid measuring too slow fft:s */
+ continue;
+
+ if (n1 > alloc)
+ {
+ alloc = n1;
+ if (p->sqr)
+ {
+ ap = realloc (ap, sizeof (mp_limb_t));
+ rp = realloc (rp, sizeof (mp_limb_t));
+ ap = bp = realloc (ap, alloc * sizeof (mp_limb_t));
+ mpn_random (ap, alloc);
+ rp = realloc (rp, alloc * sizeof (mp_limb_t));
+ }
+ else
+ {
+ ap = realloc (ap, sizeof (mp_limb_t));
+ bp = realloc (bp, sizeof (mp_limb_t));
+ rp = realloc (rp, sizeof (mp_limb_t));
+ ap = realloc (ap, alloc * sizeof (mp_limb_t));
+ mpn_random (ap, alloc);
+ bp = realloc (bp, alloc * sizeof (mp_limb_t));
+ mpn_random (bp, alloc);
+ rp = realloc (rp, alloc * sizeof (mp_limb_t));
+ }
+ }
+
+ t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
+
+ if (t1 * n_measurements > 0.3)
+ n_measurements -= 2;
+ n_measurements = MAX (n_measurements, MIN_REPS);
+
+ if (t1 < t0)
+ {
+ best_k = k;
+ t0 = t1;
+ }
+ }
+
+ n1 = mpn_fft_next_size (prev_n1, best_k);
+
+ if (last_best_k != best_k)
+ {
+ if (last_best_k >= 0)
+ ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
+
+ if (idx >= FFT_TABLE3_SIZE)
+ {
+ printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
+ abort ();
+ }
+ mpn_fft_table3[p->sqr][idx].n = prev_n1 >> last_best_k;
+ mpn_fft_table3[p->sqr][idx].k = best_k;
+
+ if (print)
+ {
+ printf (", ");
+ if (idx % 4 == 0)
+ printf ("\\\n ");
+ printf ("{%7u,%2u}", mpn_fft_table3[p->sqr][idx].n, mpn_fft_table3[p->sqr][idx].k);
+ }
+
+ if (option_trace >= 2)
+ printf ("{%ld,%d}\n", prev_n1, best_k);
+
+ if (option_trace >= 2)
+ {
+ printf ("{%lu,%u}\n", prev_n1, best_k);
+ fflush (stdout);
+ }
+
+ last_best_k = best_k;
More information about the gmp-commit
mailing list