[Gmp-commit] /home/hgfiles/gmp: 2 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Mon Jan 18 19:00:56 CET 2010


details:   /home/hgfiles/gmp/rev/a8a0dc41d911
changeset: 13382:a8a0dc41d911
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Jan 18 18:58:42 2010 +0100
description:
Rewrite FFT tuning code.

details:   /home/hgfiles/gmp/rev/6e47f8f34f33
changeset: 13383:6e47f8f34f33
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Jan 18 19:00:36 2010 +0100
description:
(MUL_FFT_TABLE3, SQR_FFT_TABLE3): Provide dummy versions for tuneup builds.

diffstat:

 ChangeLog     |    9 +
 gmp-impl.h    |    8 +-
 tune/tuneup.c |  406 +++++++++++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 359 insertions(+), 64 deletions(-)

diffs (truncated from 502 to 300 lines):

diff -r ec4b3cc65104 -r 6e47f8f34f33 ChangeLog
--- a/ChangeLog	Mon Jan 18 01:05:11 2010 +0100
+++ b/ChangeLog	Mon Jan 18 19:00:36 2010 +0100
@@ -1,6 +1,15 @@
 2010-01-18  Torbjorn Granlund  <tege at gmplib.org>
 
+	* tune/tuneup.c (fftmes): New function
+	(fft): Rewrite.
+	(mpn_mul_fft_lcm): New function, copied from mpn/generic/mul_fft.c.
+	(fftfill): New function, code taken from mul_fft.c (mpn_mul_fft).
+	(cached_measure): New function.
+
 	* gmp-impl.h (struct fft_table_nk): Moved from mul_fft.c.
+	(MUL_FFT_TABLE3, SQR_FFT_TABLE3): Provide dummy versions for tuneup
+	builds.
+	(FFT_TABLE3_SIZE): Increase value for tuneup builds.
 
 	* mpn/generic/mul_fft.c: Handle a new FFT threshold table type ("3").
 	Misc cleanups to old table type code.
diff -r ec4b3cc65104 -r 6e47f8f34f33 gmp-impl.h
--- a/gmp-impl.h	Mon Jan 18 01:05:11 2010 +0100
+++ b/gmp-impl.h	Mon Jan 18 19:00:36 2010 +0100
@@ -4246,6 +4246,9 @@
 #undef	MUL_FFT_TABLE
 #define MUL_FFT_TABLE			{ 0 }
 
+#undef	MUL_FFT_TABLE3
+#define MUL_FFT_TABLE3			{ {0,0} }
+
 /* A native mpn_sqr_basecase is not tuned and SQR_BASECASE_THRESHOLD should
    remain as zero (always use it). */
 #if ! HAVE_NATIVE_mpn_sqr_basecase
@@ -4290,6 +4293,9 @@
 #undef	SQR_FFT_TABLE
 #define SQR_FFT_TABLE			{ 0 }
 
+#undef	SQR_FFT_TABLE3
+#define SQR_FFT_TABLE3			{ {0,0} }
+
 #undef	MULLO_BASECASE_THRESHOLD
 #define MULLO_BASECASE_THRESHOLD	mullo_basecase_threshold
 extern mp_size_t			mullo_basecase_threshold;
@@ -4451,7 +4457,7 @@
 #undef  FFT_TABLE_ATTRS
 #define FFT_TABLE_ATTRS
 extern mp_size_t  mpn_fft_table[2][MPN_FFT_TABLE_SIZE];
-#define FFT_TABLE3_SIZE 200
+#define FFT_TABLE3_SIZE 2000	/* generous space for tuning */
 extern struct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE];
 
 /* Sizes the tune program tests up to, used in a couple of recompilations. */
diff -r ec4b3cc65104 -r 6e47f8f34f33 tune/tuneup.c
--- a/tune/tuneup.c	Mon Jan 18 01:05:11 2010 +0100
+++ b/tune/tuneup.c	Mon Jan 18 19:00:36 2010 +0100
@@ -720,81 +720,352 @@
   return pl;
 }
 
+#define NMAX_DEFAULT 1000000
+#define MAX_REPS 25
+#define MIN_REPS 5
+
+static inline size_t
+mpn_mul_fft_lcm (size_t a, unsigned int k)
+{
+  unsigned int l = k;
+
+  while (a % 2 == 0 && k > 0)
+    {
+      a >>= 1;
+      k--;
+    }
+  return a << l;
+}
+
+mp_size_t
+fftfill (mp_size_t pl, int k, int sqr)
+{
+  mp_size_t maxLK;
+  mp_bitcnt_t N, Nprime, nprime, M;
+
+  N = pl * GMP_NUMB_BITS;
+  M = N >> k;
+
+  maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
+
+  Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
+  nprime = Nprime / GMP_NUMB_BITS;
+  if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+    {
+      size_t K2;
+      for (;;)
+	{
+	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
+	  if ((nprime & (K2 - 1)) == 0)
+	    break;
+	  nprime = (nprime + K2 - 1) & -K2;
+	  Nprime = nprime * GMP_LIMB_BITS;
+	}
+    }
+  ASSERT_ALWAYS (nprime < pl);
+
+  return Nprime;
+}
+
+static int
+compare_double (const void *ap, const void *bp)
+{
+  double a = * (const double *) ap;
+  double b = * (const double *) bp;
+
+  if (a < b)
+    return -1;
+  else if (a > b)
+    return 1;
+  else
+    return 0;
+}
+
+double
+median (double *times, int n)
+{
+  qsort (times, n, sizeof (double), compare_double);
+  return times[n/2];
+}
+
+#define FFT_CACHE_SIZE 25
+typedef struct fft_cache
+{
+  mp_size_t n;
+  double time;
+} fft_cache_t;
+
+fft_cache_t fft_cache[FFT_CACHE_SIZE];
+
+double
+cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
+		int n_measurements)
+{
+  int i;
+  double t, ttab[MAX_REPS];
+
+  if (fft_cache[k].n == n)
+    return fft_cache[k].time;
+
+  for (i = 0; i < n_measurements; i++)
+    {
+      speed_starttime ();
+      mpn_mul_fft (rp, n, ap, n, bp, n, k);
+      ttab[i] = speed_endtime ();
+    }
+
+  t = median (ttab, n_measurements);
+  fft_cache[k].n = n;
+  fft_cache[k].time = t;
+  return t;
+}
+
+int
+fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
+{
+  mp_size_t n, n1, prev_n1;
+  int k, best_k, last_best_k, start_k, kmax;
+  int eff, prev_eff;
+  double t0, t1;
+  int n_measurements;
+  mp_limb_t *ap, *bp, *rp;
+  mp_size_t alloc;
+  char *linepref;
+
+  for (k = 0; k < FFT_CACHE_SIZE; k++)
+    fft_cache[k].n = 0;
+
+  if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+    {
+      nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
+    }
+
+  if (print)
+    printf ("#define %s%*s", p->table_name, 38, "");
+
+  if (idx == 0)
+    {
+      mpn_fft_table3[p->sqr][0].n = nmin;
+      mpn_fft_table3[p->sqr][0].k = initial_k;
+
+      if (print)
+	{
+	  printf ("\\\n  { ");
+	  printf ("{%7u,%2u}", mpn_fft_table3[p->sqr][0].n, mpn_fft_table3[p->sqr][0].k);
+	  linepref = "    ";
+	}
+
+      idx = 1;
+    }
+
+  ap = malloc (sizeof (mp_limb_t));
+  if (p->sqr)
+    bp = ap;
+  else
+    bp = malloc (sizeof (mp_limb_t));
+  rp = malloc (sizeof (mp_limb_t));
+  alloc = 1;
+
+  n = nmin;
+
+  n_measurements = (18 - initial_k) | 1;
+  n_measurements = MAX (n_measurements, MIN_REPS);
+  n_measurements = MIN (n_measurements, MAX_REPS);
+
+  last_best_k = initial_k;
+  best_k = initial_k;
+
+  while (n < nmax)
+    {
+      /* Assume the current best k is best until we hit its next FFT step.  */
+      t0 = 99999;
+
+      prev_n1 = n + 1;
+
+      start_k = MAX (4, best_k - 4);
+      for (k = start_k; k <= 30; k++)
+	{
+          n1 = mpn_fft_next_size (prev_n1, k);
+
+	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
+
+	  if (eff < 70)		/* avoid measuring too slow fft:s */
+	    continue;
+
+	  if (n1 > alloc)
+	    {
+	      alloc = n1;
+	      if (p->sqr)
+		{
+		  ap = realloc (ap, sizeof (mp_limb_t));
+		  rp = realloc (rp, sizeof (mp_limb_t));
+		  ap = bp = realloc (ap, alloc * sizeof (mp_limb_t));
+		  mpn_random (ap, alloc);
+		  rp = realloc (rp, alloc * sizeof (mp_limb_t));
+		}
+	      else
+		{
+		  ap = realloc (ap, sizeof (mp_limb_t));
+		  bp = realloc (bp, sizeof (mp_limb_t));
+		  rp = realloc (rp, sizeof (mp_limb_t));
+		  ap = realloc (ap, alloc * sizeof (mp_limb_t));
+		  mpn_random (ap, alloc);
+		  bp = realloc (bp, alloc * sizeof (mp_limb_t));
+		  mpn_random (bp, alloc);
+		  rp = realloc (rp, alloc * sizeof (mp_limb_t));
+		}
+	    }
+
+	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
+
+	  if (t1 * n_measurements > 0.3)
+	    n_measurements -= 2;
+	  n_measurements = MAX (n_measurements, MIN_REPS);
+
+	  if (t1 < t0)
+	    {
+	      best_k = k;
+	      t0 = t1;
+	    }
+	}
+
+      n1 = mpn_fft_next_size (prev_n1, best_k);
+
+      if (last_best_k != best_k)
+	{
+	  if (last_best_k >= 0)
+	    ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
+
+	  if (idx >= FFT_TABLE3_SIZE)
+	    {
+	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
+	      abort ();
+	    }
+	  mpn_fft_table3[p->sqr][idx].n = prev_n1 >> last_best_k;
+	  mpn_fft_table3[p->sqr][idx].k = best_k;
+
+	  if (print)
+	    {
+	      printf (", ");
+	      if (idx % 4 == 0)
+		printf ("\\\n    ");
+	      printf ("{%7u,%2u}", mpn_fft_table3[p->sqr][idx].n, mpn_fft_table3[p->sqr][idx].k);
+	    }
+
+	  if (option_trace >= 2)
+	    printf ("{%ld,%d}\n", prev_n1, best_k);
+
+	  if (option_trace >= 2)
+	    {
+	      printf ("{%lu,%u}\n", prev_n1, best_k);
+	      fflush (stdout);
+	    }
+
+	  last_best_k = best_k;


More information about the gmp-commit mailing list