[PATCH] mpn/generic/jacbase: Another method
David Sparks
sparks05 at proton.me
Mon Feb 16 08:49:08 CET 2026
----8<----
Patch 1/4: mpn/generic/jacbase: Another method
This is basically a refinement of JACOBI_BASE_METHOD = 4, so I'm not sure
if this should replace 4 or exist as an alternative. For now, it's an
alternative created as a way to learn about the tune/ infrastructure.
tuneup -p1000000 -t excerpt:
size=48, mpn_jacobi_base, method 1 0.000000125
size=48, mpn_jacobi_base, method 2 0.000000251
size=48, mpn_jacobi_base, method 3 0.000000208
size=48, mpn_jacobi_base, method 4 0.000000094
size=48, mpn_jacobi_base, method 5 0.000000086
diff --git a/mpn/generic/jacbase.c b/mpn/generic/jacbase.c
index 391ceac3c..503ac18fe 100644
--- a/mpn/generic/jacbase.c
+++ b/mpn/generic/jacbase.c
@@ -240,3 +240,57 @@ mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int bit)
return 1-2*(bit & 1);
}
#endif /* JACOBI_BASE_METHOD == 4 */
+
+#if JACOBI_BASE_METHOD == 5
+/* This is a tuned variant of JACOBI_BASE_METHOD=4. The values
+ * are right-shifted by 1 (leaving their lsbits implicit),
+ * which leaves room for a sign msbit when subtracting them.
+ */
+int
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int bit)
+{
+ ASSERT (b & 1);
+ b >>= 1;
+
+ if (LIKELY(a))
+ {
+ int c;
+
+ count_trailing_zeros (c, a);
+ a >>= 1;
+
+ for (;;)
+ {
+ mp_limb_t bgta; /* all-ones if b > a */
+ int tbit;
+
+ a >>= c;
+
+ /* (2/b) = -1 if b = 3 or 5 mod 8 */
+ bit ^= 2*c & (int)(b+1);
+
+ tbit = 2 * (a & b);
+
+ a -= b;
+ if (a == 0)
+ break;
+ bgta = LIMB_HIGHBIT_TO_MASK (a);
+ count_trailing_zeros (c, a);
+ c += 1;
+
+ /* b <-- min (a, b) */
+ b += a & bgta;
+
+ /* a <-- |a - b| */
+ a = (a + bgta) ^ bgta;
+
+ /* If b > a, invoke reciprocity */
+ bit ^= bgta & tbit;
+ }
+ }
+ if (b != 0)
+ return 0;
+
+ return 1-(bit & 2);
+}
+#endif /* JACOBI_BASE_METHOD == 5 */
diff --git a/tune/Makefile.am b/tune/Makefile.am
index 0f564ed49..5113c0d2a 100644
--- a/tune/Makefile.am
+++ b/tune/Makefile.am
@@ -58,7 +58,7 @@ libspeed_la_SOURCES = \
freq.c \
gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \
hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \
- jacbase1.c jacbase2.c jacbase3.c jacbase4.c \
+ jacbase1.c jacbase2.c jacbase3.c jacbase4.c jacbase5.c \
hgcd2-1.c hgcd2-2.c hgcd2-3.c hgcd2-4.c hgcd2-5.c \
mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c \
noop.c powm_mod.c powm_redc.c pre_divrem_1.c \
diff --git a/tune/common.c b/tune/common.c
index b888780ff..a53e2cb35 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -1862,6 +1862,11 @@ speed_mpn_jacobi_base_4 (struct speed_params *s)
{
SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_4);
}
+double
+speed_mpn_jacobi_base_5 (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_5);
+}
double
diff --git a/tune/speed.c b/tune/speed.c
index 6ae76c739..33eea87cd 100644
--- a/tune/speed.c
+++ b/tune/speed.c
@@ -327,6 +327,7 @@ const struct routine_t {
{ "mpn_jacobi_base_2", speed_mpn_jacobi_base_2 },
{ "mpn_jacobi_base_3", speed_mpn_jacobi_base_3 },
{ "mpn_jacobi_base_4", speed_mpn_jacobi_base_4 },
+ { "mpn_jacobi_base_5", speed_mpn_jacobi_base_5 },
{ "mpn_mul", speed_mpn_mul, FLAG_SR_OPTIONAL },
{ "mpn_mul_basecase", speed_mpn_mul_basecase,FLAG_SR_OPTIONAL },
diff --git a/tune/speed.h b/tune/speed.h
index 27b26091f..3624ca21f 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -251,6 +251,7 @@ double speed_mpn_jacobi_base_1 (struct speed_params *);
double speed_mpn_jacobi_base_2 (struct speed_params *);
double speed_mpn_jacobi_base_3 (struct speed_params *);
double speed_mpn_jacobi_base_4 (struct speed_params *);
+double speed_mpn_jacobi_base_5 (struct speed_params *);
double speed_mpn_lshift (struct speed_params *);
double speed_mpn_lshiftc (struct speed_params *);
double speed_mpn_mod_1 (struct speed_params *);
@@ -497,6 +498,7 @@ int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
+int mpn_jacobi_base_5 (mp_limb_t, mp_limb_t, int);
int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
diff --git a/tune/tuneup.c b/tune/tuneup.c
index 5675b8478..dc35bc33a 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -2705,17 +2705,19 @@ void
tune_jacobi_base (void)
{
static struct param_t param;
- speed_function_t f[4] =
+ speed_function_t f[] =
{
speed_mpn_jacobi_base_1,
speed_mpn_jacobi_base_2,
speed_mpn_jacobi_base_3,
speed_mpn_jacobi_base_4,
+ speed_mpn_jacobi_base_5,
};
s.size = GMP_LIMB_BITS * 3 / 4;
- one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", ¶m);
+ one_method (numberof (f), f, "mpn_jacobi_base", "JACOBI_BASE_METHOD",
+ ¶m);
}
----8<----
Patch 2/4: tune/common: Do double_cmp_ptr in a more standards-compliant way.
It's hardly a big deal but casting function pointers is always
a bit sketchy.
diff --git a/tune/common.c b/tune/common.c
index a53e2cb35..1fa500f97 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -106,11 +106,10 @@ pentium_wbinvd(void)
int
-double_cmp_ptr (const double *p, const double *q)
+double_cmp_ptr (const void *pp, const void *qq)
{
- if (*p > *q) return 1;
- if (*p < *q) return -1;
- return 0;
+ const double *p = pp, *q = qq;
+ return (*p > *q) - (*p < *q);
}
@@ -215,7 +214,7 @@ speed_measure (double (*fun) (struct speed_params *s), struct speed_params *s)
valid measurement. Return smallest among them. */
if (i >= e)
{
- qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+ qsort (t, i+1, sizeof(t[0]), double_cmp_ptr);
for (j = e-1; j < i; j++)
if (t[j] <= t[j-e+1] * TOLERANCE)
return t[j-e+1] / s->time_divisor;
diff --git a/tune/freq.c b/tune/freq.c
index 9ba1655c3..574d21487 100644
--- a/tune/freq.c
+++ b/tune/freq.c
@@ -727,7 +727,7 @@ freq_measure (const char *name, double (*one) (void))
{
t[i] = (*one) ();
- qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+ qsort (t, i+1, sizeof(t[0]), double_cmp_ptr);
if (speed_option_verbose >= 3)
for (j = 0; j <= i; j++)
printf (" t[%d] is %.6g\n", j, t[j]);
diff --git a/tune/speed.h b/tune/speed.h
index 3624ca21f..f96d4b6e7 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -465,9 +465,8 @@ int cycles_works_p (void);
long clk_tck (void);
double freq_measure (const char *, double (*)(void));
-int double_cmp_ptr (const double *, const double *);
+int double_cmp_ptr (const void *, const void *);
void pentium_wbinvd (void);
-typedef int (*qsort_function_t) (const void *, const void *);
void noop (void);
void noop_1 (mp_limb_t);
----8<----
Patch 3/4: tune/tuneup: Find best method & runner-up in a single loop.
It ends up being simpler.
diff --git a/tune/tuneup.c b/tune/tuneup.c
index ed76f2cd9..8d822efc6 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -726,11 +726,10 @@ one_method (int n, speed_function_t *functions,
const struct param_t *param)
{
double *t;
- int i;
- int method;
- int method_runner_up;
+ int i, method, method_runner_up;
TMP_DECL;
+ ASSERT_ALWAYS (n >= 2);
TMP_MARK;
t = (double*) TMP_ALLOC (n * sizeof (*t));
@@ -746,15 +745,14 @@ one_method (int n, speed_function_t *functions,
abort ();
}
}
- method = 0;
- for (i = 1; i < n; i++)
- if (t[i] < t[method])
- method = i;
-
- method_runner_up = (method == 0);
- for (i = 0; i < n; i++)
- if (i != method && t[i] < t[method_runner_up])
- method_runner_up = i;
+ method = (t[1] < t[0]);
+ method_runner_up = 1 - method;
+ for (i = 2; i < n; i++)
+ if (t[i] < t[method_runner_up])
+ if (t[i] < t[method])
+ method_runner_up = method, method = i;
+ else
+ method_runner_up = i;
print_define_with_speedup (define, method + 1, method_runner_up + 1,
t[method_runner_up] / t[method]);
----8<----
Patch 3/4: tune/common: Do double_cmp_ptr in a more standards-compliant way.
It's hardly a big deal but casting function pointers is always
a bit sketchy.
diff --git a/tune/common.c b/tune/common.c
index a53e2cb35..1fa500f97 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -106,11 +106,10 @@ pentium_wbinvd(void)
int
-double_cmp_ptr (const double *p, const double *q)
+double_cmp_ptr (const void *pp, const void *qq)
{
- if (*p > *q) return 1;
- if (*p < *q) return -1;
- return 0;
+ const double *p = pp, *q = qq;
+ return (*p > *q) - (*p < *q);
}
@@ -215,7 +214,7 @@ speed_measure (double (*fun) (struct speed_params *s), struct speed_params *s)
valid measurement. Return smallest among them. */
if (i >= e)
{
- qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+ qsort (t, i+1, sizeof(t[0]), double_cmp_ptr);
for (j = e-1; j < i; j++)
if (t[j] <= t[j-e+1] * TOLERANCE)
return t[j-e+1] / s->time_divisor;
diff --git a/tune/freq.c b/tune/freq.c
index 9ba1655c3..574d21487 100644
--- a/tune/freq.c
+++ b/tune/freq.c
@@ -727,7 +727,7 @@ freq_measure (const char *name, double (*one) (void))
{
t[i] = (*one) ();
- qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+ qsort (t, i+1, sizeof(t[0]), double_cmp_ptr);
if (speed_option_verbose >= 3)
for (j = 0; j <= i; j++)
printf (" t[%d] is %.6g\n", j, t[j]);
diff --git a/tune/speed.h b/tune/speed.h
index 3624ca21f..f96d4b6e7 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -465,9 +465,8 @@ int cycles_works_p (void);
long clk_tck (void);
double freq_measure (const char *, double (*)(void));
-int double_cmp_ptr (const double *, const double *);
+int double_cmp_ptr (const void *, const void *);
void pentium_wbinvd (void);
-typedef int (*qsort_function_t) (const void *, const void *);
void noop (void);
void noop_1 (mp_limb_t);
----8<----
Patch 4/4: tune/speed.h: Don't require that x be odd.
I'm changing the code to match the comment because I don't know
how to write a comment explaining why x should be odd.
In practice, x is the remainder of an mpz modulo y, so
it should be pretty uniform.
This code raises a few questions:
1) Why is y > 1 so essential? I agree it's not very interesting
to benchmark, but it doesn't seem worth the explicit test.
2) What about sorting x and y so x <= y? Obviously,
this changes the distribution significantly. Rather than y
being uniform, this makes large y much more likely.
diff --git a/tune/speed.h b/tune/speed.h
index f96d4b6e7..4c20a5687 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -2975,7 +2975,6 @@ int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
({ \
/* require x<y, y odd, y!=1 */ \
px[i] %= py[i]; \
- px[i] |= 1; \
py[i] |= 1; \
if (py[i]==1) py[i]=3; \
}, \
More information about the gmp-devel
mailing list