[PATCH] mpn/generic/jacbase: Another method

David Sparks sparks05 at proton.me
Mon Feb 16 08:49:08 CET 2026


----8<----
Patch 1/4: mpn/generic/jacbase: Another method
    
This is basically a refinement of JACOBI_BASE_METHOD = 4, so I'm not sure
if this should replace 4 or exist as an alternative.  For now, it's an
alternative created as a way to learn about the tune/ infrastructure.

tuneup -p1000000 -t excerpt:
size=48, mpn_jacobi_base, method 1 0.000000125
size=48, mpn_jacobi_base, method 2 0.000000251
size=48, mpn_jacobi_base, method 3 0.000000208
size=48, mpn_jacobi_base, method 4 0.000000094
size=48, mpn_jacobi_base, method 5 0.000000086

diff --git a/mpn/generic/jacbase.c b/mpn/generic/jacbase.c
index 391ceac3c..503ac18fe 100644
--- a/mpn/generic/jacbase.c
+++ b/mpn/generic/jacbase.c
@@ -240,3 +240,57 @@ mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int bit)
   return 1-2*(bit & 1);
 }
 #endif /* JACOBI_BASE_METHOD == 4 */
+
+#if JACOBI_BASE_METHOD == 5
+/* This is a tuned variant of JACOBI_BASE_METHOD=4.  The values
+ * are right-shifted by 1 (leaving their lsbits implicit),
+ * which leaves room for a sign msbit when subtracting them.
+ */
+int
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int bit)
+{
+  ASSERT (b & 1);
+  b >>= 1;
+
+  if (LIKELY(a))
+    {
+      int c;
+
+      count_trailing_zeros (c, a);
+      a >>= 1;
+
+      for (;;)
+	{
+	  mp_limb_t bgta;	/* all-ones if b > a */
+	  int tbit;
+
+	  a >>= c;
+
+	  /* (2/b) = -1 if b = 3 or 5 mod 8 */
+	  bit ^= 2*c & (int)(b+1);
+
+	  tbit = 2 * (a & b);
+
+	  a -= b;
+	  if (a == 0)
+	    break;
+          bgta = LIMB_HIGHBIT_TO_MASK (a);
+	  count_trailing_zeros (c, a);
+	  c += 1;
+
+	  /* b <-- min (a, b) */
+	  b += a & bgta;
+
+	  /* a <-- |a - b| */
+	  a = (a + bgta) ^ bgta;
+
+	  /* If b > a, invoke reciprocity */
+	  bit ^= bgta & tbit;
+	}
+    }
+  if (b != 0)
+    return 0;
+
+  return 1-(bit & 2);
+}
+#endif /* JACOBI_BASE_METHOD == 5 */
diff --git a/tune/Makefile.am b/tune/Makefile.am
index 0f564ed49..5113c0d2a 100644
--- a/tune/Makefile.am
+++ b/tune/Makefile.am
@@ -58,7 +58,7 @@ libspeed_la_SOURCES =							\
   freq.c								\
   gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c			\
   hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c	\
-  jacbase1.c jacbase2.c jacbase3.c jacbase4.c				\
+  jacbase1.c jacbase2.c jacbase3.c jacbase4.c jacbase5.c		\
   hgcd2-1.c hgcd2-2.c hgcd2-3.c hgcd2-4.c hgcd2-5.c			\
   mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c		\
   noop.c powm_mod.c powm_redc.c pre_divrem_1.c				\
diff --git a/tune/common.c b/tune/common.c
index b888780ff..a53e2cb35 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -1862,6 +1862,11 @@ speed_mpn_jacobi_base_4 (struct speed_params *s)
 {
   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_4);
 }
+double
+speed_mpn_jacobi_base_5 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_5);
+}
 
 
 double
diff --git a/tune/speed.c b/tune/speed.c
index 6ae76c739..33eea87cd 100644
--- a/tune/speed.c
+++ b/tune/speed.c
@@ -327,6 +327,7 @@ const struct routine_t {
   { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2    },
   { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3    },
   { "mpn_jacobi_base_4", speed_mpn_jacobi_base_4    },
+  { "mpn_jacobi_base_5", speed_mpn_jacobi_base_5    },
 
   { "mpn_mul",           speed_mpn_mul,         FLAG_SR_OPTIONAL },
   { "mpn_mul_basecase",  speed_mpn_mul_basecase,FLAG_SR_OPTIONAL },
diff --git a/tune/speed.h b/tune/speed.h
index 27b26091f..3624ca21f 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -251,6 +251,7 @@ double speed_mpn_jacobi_base_1 (struct speed_params *);
 double speed_mpn_jacobi_base_2 (struct speed_params *);
 double speed_mpn_jacobi_base_3 (struct speed_params *);
 double speed_mpn_jacobi_base_4 (struct speed_params *);
+double speed_mpn_jacobi_base_5 (struct speed_params *);
 double speed_mpn_lshift (struct speed_params *);
 double speed_mpn_lshiftc (struct speed_params *);
 double speed_mpn_mod_1 (struct speed_params *);
@@ -497,6 +498,7 @@ int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
 int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
 int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
 int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
+int mpn_jacobi_base_5 (mp_limb_t, mp_limb_t, int);
 
 int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
 int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
diff --git a/tune/tuneup.c b/tune/tuneup.c
index 5675b8478..dc35bc33a 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -2705,17 +2705,19 @@ void
 tune_jacobi_base (void)
 {
   static struct param_t  param;
-  speed_function_t f[4] =
+  speed_function_t f[] =
     {
      speed_mpn_jacobi_base_1,
      speed_mpn_jacobi_base_2,
      speed_mpn_jacobi_base_3,
      speed_mpn_jacobi_base_4,
+     speed_mpn_jacobi_base_5,
     };
 
   s.size = GMP_LIMB_BITS * 3 / 4;
 
-  one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", &param);
+  one_method (numberof (f), f, "mpn_jacobi_base", "JACOBI_BASE_METHOD",
+	      &param);
 }
 
 
----8<----
Patch 2/4: tune/common: Do double_cmp_ptr in a more standards-compliant way.

It's hardly a big deal but casting function pointers is always
a bit sketchy.

diff --git a/tune/common.c b/tune/common.c
index a53e2cb35..1fa500f97 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -106,11 +106,10 @@ pentium_wbinvd(void)
 
 
 int
-double_cmp_ptr (const double *p, const double *q)
+double_cmp_ptr (const void *pp, const void *qq)
 {
-  if (*p > *q)  return 1;
-  if (*p < *q)  return -1;
-  return 0;
+  const double *p = pp, *q = qq;
+  return (*p > *q) - (*p < *q);
 }
 
 
@@ -215,7 +214,7 @@ speed_measure (double (*fun) (struct speed_params *s), struct speed_params *s)
 	 valid measurement.  Return smallest among them.  */
       if (i >= e)
 	{
-	  qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+	  qsort (t, i+1, sizeof(t[0]), double_cmp_ptr);
 	  for (j = e-1; j < i; j++)
 	    if (t[j] <= t[j-e+1] * TOLERANCE)
 	      return t[j-e+1] / s->time_divisor;
diff --git a/tune/freq.c b/tune/freq.c
index 9ba1655c3..574d21487 100644
--- a/tune/freq.c
+++ b/tune/freq.c
@@ -727,7 +727,7 @@ freq_measure (const char *name, double (*one) (void))
     {
       t[i] = (*one) ();
 
-      qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+      qsort (t, i+1, sizeof(t[0]), double_cmp_ptr);
       if (speed_option_verbose >= 3)
         for (j = 0; j <= i; j++)
           printf ("   t[%d] is %.6g\n", j, t[j]);
diff --git a/tune/speed.h b/tune/speed.h
index 3624ca21f..f96d4b6e7 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -465,9 +465,8 @@ int cycles_works_p (void);
 long clk_tck (void);
 double freq_measure (const char *, double (*)(void));
 
-int double_cmp_ptr (const double *, const double *);
+int double_cmp_ptr (const void *, const void *);
 void pentium_wbinvd (void);
-typedef int (*qsort_function_t) (const void *, const void *);
 
 void noop (void);
 void noop_1 (mp_limb_t);

----8<----
Patch 3/4: tune/tuneup: Find best method & runner-up in a single loop.

It ends up being simpler.

diff --git a/tune/tuneup.c b/tune/tuneup.c
index ed76f2cd9..8d822efc6 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -726,11 +726,10 @@ one_method (int n, speed_function_t *functions,
 	    const struct param_t *param)
 {
   double *t;
-  int i;
-  int method;
-  int method_runner_up;
+  int i, method, method_runner_up;
 
   TMP_DECL;
+  ASSERT_ALWAYS (n >= 2);
   TMP_MARK;
   t = (double*) TMP_ALLOC (n * sizeof (*t));
 
@@ -746,15 +745,14 @@ one_method (int n, speed_function_t *functions,
 	  abort ();
 	}
     }
-  method = 0;
-  for (i = 1; i < n; i++)
-    if (t[i] < t[method])
-      method = i;
-
-  method_runner_up = (method == 0);
-  for (i = 0; i < n; i++)
-    if (i != method && t[i] < t[method_runner_up])
-      method_runner_up = i;
+  method = (t[1] < t[0]);
+  method_runner_up = 1 - method;
+  for (i = 2; i < n; i++)
+    if (t[i] < t[method_runner_up])
+      if (t[i] < t[method])
+        method_runner_up = method, method = i;
+      else
+        method_runner_up = i;
 
   print_define_with_speedup (define, method + 1, method_runner_up + 1,
 			     t[method_runner_up] / t[method]);

----8<----
Patch 3/4: tune/common: Do double_cmp_ptr in a more standards-compliant way.
    
It's hardly a big deal but casting function pointers is always
a bit sketchy.

diff --git a/tune/common.c b/tune/common.c
index a53e2cb35..1fa500f97 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -106,11 +106,10 @@ pentium_wbinvd(void)
 
 
 int
-double_cmp_ptr (const double *p, const double *q)
+double_cmp_ptr (const void *pp, const void *qq)
 {
-  if (*p > *q)  return 1;
-  if (*p < *q)  return -1;
-  return 0;
+  const double *p = pp, *q = qq;
+  return (*p > *q) - (*p < *q);
 }
 
 
@@ -215,7 +214,7 @@ speed_measure (double (*fun) (struct speed_params *s), struct speed_params *s)
 	 valid measurement.  Return smallest among them.  */
       if (i >= e)
 	{
-	  qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+	  qsort (t, i+1, sizeof(t[0]), double_cmp_ptr);
 	  for (j = e-1; j < i; j++)
 	    if (t[j] <= t[j-e+1] * TOLERANCE)
 	      return t[j-e+1] / s->time_divisor;
diff --git a/tune/freq.c b/tune/freq.c
index 9ba1655c3..574d21487 100644
--- a/tune/freq.c
+++ b/tune/freq.c
@@ -727,7 +727,7 @@ freq_measure (const char *name, double (*one) (void))
     {
       t[i] = (*one) ();
 
-      qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+      qsort (t, i+1, sizeof(t[0]), double_cmp_ptr);
       if (speed_option_verbose >= 3)
         for (j = 0; j <= i; j++)
           printf ("   t[%d] is %.6g\n", j, t[j]);
diff --git a/tune/speed.h b/tune/speed.h
index 3624ca21f..f96d4b6e7 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -465,9 +465,8 @@ int cycles_works_p (void);
 long clk_tck (void);
 double freq_measure (const char *, double (*)(void));
 
-int double_cmp_ptr (const double *, const double *);
+int double_cmp_ptr (const void *, const void *);
 void pentium_wbinvd (void);
-typedef int (*qsort_function_t) (const void *, const void *);
 
 void noop (void);
 void noop_1 (mp_limb_t);

----8<----
Patch 4/4: tune/speed.h: Don't require that x be odd.

I'm changing the code to match the comment because I don't know
how to write a comment explaining why x should be odd.
In practice, x is the remainder of an mpz modulo y, so
it should be pretty uniform.

This code raises a few questions:
1) Why is y > 1 so essential?  I agree it's not very interesting
   to benchmark, but it doesn't seem worth the explicit test.
2) What about sorting x and y so x <= y?  Obviously,
   this changes the distribution significantly.  Rather than y
   being uniform, this makes large y much more likely.

diff --git a/tune/speed.h b/tune/speed.h
index f96d4b6e7..4c20a5687 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -2975,7 +2975,6 @@ int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
     ({									\
        /* require x<y, y odd, y!=1 */					\
        px[i] %= py[i];							\
-       px[i] |= 1;							\
        py[i] |= 1;							\
        if (py[i]==1) py[i]=3;						\
      },									\




More information about the gmp-devel mailing list