What's a reasonable size ratio for toom32?

Niels Möller nisse at lysator.liu.se
Mon Sep 18 18:23:13 CEST 2023


Paul Zimmermann <Paul.Zimmermann at inria.fr> writes:

>        Dear Niels,
>
> ./speed -p 1000000 -c -s 10-200 -f1.1 mpn_mul.0.6 would be more readable,
> although the change in speed.h would be larger.

See the patch below, which adds support for mpn_mul/0.6. Example output:

  $ ./speed -c -r -s 10-50 -t 5 -p 1000000 mpn_mul mpn_mul/0.6 
  overhead 5.00 cycles, precision 1000000 units of 1.25e-09 secs, CPU freq 798.28 MHz
                mpn_mul   mpn_mul/0.6
  10             215.48       #0.6295
  15             443.16       #0.6066
  20             784.50       #0.6138
  25            1209.57       #0.5957
  30            1490.59       #0.6934
  35            1986.51       #0.6918
  40            2547.99       #0.6981
  45            3189.27       #0.6633
  50            3827.87       #0.6734

What do you think? (I also deleted FLAG_RSIZE, which appeared to be unused.)

(One potential bug is the missing initialization of the new .size_ratio in
struct choice_t to 0.0, but I don't see any initialization of .r either, so
I'm probably missing something.)

> Or maybe ./speed -p 1000000 -c -s 10-200 -f1.1 -r 0.6 mpn_mul ?

I think it's nicer to be able to specify it separately for each function
under test.

Regards,
/Niels

diff -r 510152c4ca97 tune/speed.c
--- a/tune/speed.c	Tue Aug 22 10:20:40 2023 +0200
+++ b/tune/speed.c	Mon Sep 18 18:10:04 2023 +0200
@@ -130,7 +130,7 @@
 
 #define FLAG_R            (1<<0)  /* require ".r" */
 #define FLAG_R_OPTIONAL   (1<<1)  /* optional ".r" */
-#define FLAG_RSIZE        (1<<2)
+#define FLAG_SR_OPTIONAL  (1<<2)  /* optional ".r" or "/r" */
 #define FLAG_NODATA       (1<<3)  /* don't alloc xp, yp */
 
 const struct routine_t {
@@ -328,8 +328,8 @@
   { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3    },
   { "mpn_jacobi_base_4", speed_mpn_jacobi_base_4    },
 
-  { "mpn_mul",           speed_mpn_mul,         FLAG_R_OPTIONAL },
-  { "mpn_mul_basecase",  speed_mpn_mul_basecase,FLAG_R_OPTIONAL },
+  { "mpn_mul",           speed_mpn_mul,         FLAG_SR_OPTIONAL },
+  { "mpn_mul_basecase",  speed_mpn_mul_basecase,FLAG_SR_OPTIONAL },
   { "mpn_sqr_basecase",  speed_mpn_sqr_basecase     },
 #if HAVE_NATIVE_mpn_sqr_diagonal
   { "mpn_sqr_diagonal",  speed_mpn_sqr_diagonal     },
@@ -346,22 +346,22 @@
   { "mpn_toom4_sqr",     speed_mpn_toom4_sqr        },
   { "mpn_toom6_sqr",     speed_mpn_toom6_sqr        },
   { "mpn_toom8_sqr",     speed_mpn_toom8_sqr        },
-  { "mpn_toom22_mul",    speed_mpn_toom22_mul       },
-  { "mpn_toom33_mul",    speed_mpn_toom33_mul       },
-  { "mpn_toom44_mul",    speed_mpn_toom44_mul       },
-  { "mpn_toom6h_mul",    speed_mpn_toom6h_mul       },
-  { "mpn_toom8h_mul",    speed_mpn_toom8h_mul       },
-  { "mpn_toom32_mul",    speed_mpn_toom32_mul       },
-  { "mpn_toom42_mul",    speed_mpn_toom42_mul       },
-  { "mpn_toom43_mul",    speed_mpn_toom43_mul       },
-  { "mpn_toom63_mul",    speed_mpn_toom63_mul       },
-  { "mpn_nussbaumer_mul",    speed_mpn_nussbaumer_mul    },
+  { "mpn_toom22_mul",    speed_mpn_toom22_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom33_mul",    speed_mpn_toom33_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom44_mul",    speed_mpn_toom44_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom6h_mul",    speed_mpn_toom6h_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom8h_mul",    speed_mpn_toom8h_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom32_mul",    speed_mpn_toom32_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom42_mul",    speed_mpn_toom42_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom43_mul",    speed_mpn_toom43_mul, FLAG_SR_OPTIONAL },
+  { "mpn_toom63_mul",    speed_mpn_toom63_mul, FLAG_SR_OPTIONAL },
+  { "mpn_nussbaumer_mul",    speed_mpn_nussbaumer_mul, FLAG_SR_OPTIONAL},
   { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr},
 #if WANT_OLD_FFT_FULL
-  { "mpn_mul_fft_full",      speed_mpn_mul_fft_full      },
+  { "mpn_mul_fft_full",      speed_mpn_mul_fft_full, FLAG_SR_OPTIONAL},
   { "mpn_mul_fft_full_sqr",  speed_mpn_mul_fft_full_sqr  },
 #endif
-  { "mpn_mul_fft",       speed_mpn_mul_fft,     FLAG_R_OPTIONAL },
+  { "mpn_mul_fft",       speed_mpn_mul_fft,     FLAG_SR_OPTIONAL },
   { "mpn_mul_fft_sqr",   speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL },
 
   { "mpn_sqrlo",          speed_mpn_sqrlo           },
@@ -576,6 +576,7 @@
 struct choice_t {
   const struct routine_t  *p;
   mp_limb_t               r;
+  double                  size_ratio;
   double                  scale;
   double                  time;
   int                     no_time;
@@ -670,6 +671,7 @@
   for (i = 0; i < num_choices; i++)
     {
       s->r = choice[i].r;
+      s->size_ratio = choice[i].size_ratio;
       choice[i].time = speed_measure (choice[i].p->fun, s);
       choice[i].no_time = (choice[i].time == -1.0);
       if (! choice[i].no_time)
@@ -1011,6 +1013,17 @@
   return n;
 }
 
+double slash_r_string (const char *s)
+{
+  char *end;
+  double r = strtod(s, &end);
+  if (s[0] == '\0' || end[0] != '\0' || r > 1.0 || r < 0.0)
+    {
+      fprintf (stderr, "invalid /r parameter: %s\n", s);
+      exit (1);
+    }
+  return r;
+}
 
 void
 routine_find (struct choice_t *c, const char *s_orig)
@@ -1054,6 +1067,22 @@
           c->r = r_string (s + nlen + 1);
           return;
         }
+      if (s[nlen] == '/')
+        {
+          /* match, with a /r parameter */
+
+          if (! (routine[i].flag & (FLAG_SR_OPTIONAL)))
+            {
+              fprintf (stderr,
+                       "Choice %s bad: doesn't take a \"/<r>\" parameter\n",
+                       s_orig);
+              exit (1);
+            }
+
+          c->p = &routine[i];
+          c->size_ratio = slash_r_string (s + nlen + 1);
+          return;
+        }
 
       if (s[nlen] == '\0')
         {
@@ -1125,6 +1154,8 @@
         printf ("\t%s.r\n", routine[i].name);
       else if (routine[i].flag & FLAG_R_OPTIONAL)
         printf ("\t%s (optional .r)\n", routine[i].name);
+      else if (routine[i].flag & FLAG_SR_OPTIONAL)
+        printf ("\t%s (optional .r or /r)\n", routine[i].name);
       else
         printf ("\t%s\n", routine[i].name);
     }
diff -r 510152c4ca97 tune/speed.h
--- a/tune/speed.h	Tue Aug 22 10:20:40 2023 +0200
+++ b/tune/speed.h	Mon Sep 18 18:10:04 2023 +0200
@@ -113,6 +113,7 @@
   mp_ptr     yp;	/* second argument */
   mp_size_t  size;	/* size of both arguments */
   mp_limb_t  r;		/* user supplied parameter */
+  double     size_ratio; /* ratio for smaller to larger size, e.g., for mpn_mul */
   mp_size_t  align_xp;	/* alignment of xp */
   mp_size_t  align_yp;	/* alignment of yp */
   mp_size_t  align_wp;	/* intended alignment of wp */
@@ -1122,9 +1123,13 @@
     double    t;							\
     TMP_DECL;								\
 									\
-    size1 = (s->r == 0 ? s->size : s->r);				\
-    if (size1 < 0) size1 = -size1 - s->size;				\
-									\
+    if (s->size_ratio > 0.0)						\
+      size1 = s->size_ratio * s->size;					\
+    else								\
+      {									\
+	size1 = (s->r == 0 ? s->size : s->r);				\
+	if (size1 < 0) size1 = -size1 - s->size;			\
+      }									\
     SPEED_RESTRICT_COND (size1 >= 1);					\
     SPEED_RESTRICT_COND (s->size >= size1);				\
 									\

-- 
Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
Internet email is subject to wholesale government surveillance.


More information about the gmp-devel mailing list