[PATCH] mpn/generic/perfsqr: Improve alternate (currently disabled) test.
David Sparks
sparks05 at proton.me
Sat Feb 14 19:10:24 CET 2026
If count_trailing_zeros is slow (i.e. not implemented in hardware),
the mod^2^k test can skip shifting down the input and instead
shift up the bits to be tested.
This takes a few more instructions, but a lot less than emulating
a CTZ instruction.
diff --git a/longlong.h b/longlong.h
index 031a092ec..5fd143db1 100644
--- a/longlong.h
+++ b/longlong.h
@@ -2254,6 +2254,7 @@ extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
/* Define count_trailing_zeros in plain C, assuming small counts are common.
We use clz_tab without ado, since the C count_leading_zeros above will have
pulled it in. */
+#define COUNT_TRAILING_ZEROS_SLOW
#define count_trailing_zeros(count, x) \
do { \
UWtype __ctz_x = (x); \
diff --git a/mpn/generic/perfsqr.c b/mpn/generic/perfsqr.c
index 1ea5c84b4..638f3af44 100644
--- a/mpn/generic/perfsqr.c
+++ b/mpn/generic/perfsqr.c
@@ -182,6 +182,7 @@ mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
+#if 1
/* The first test excludes 212/256 (82.8%) of the perfect square candidates
in O(1) time. */
{
@@ -191,24 +192,34 @@ mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
return 0;
}
-#if 0
- /* Check that we have even multiplicity of 2, and then check that the rest is
- a possible perfect square. Leave disabled until we can determine this
- really is an improvement. If it is, it could completely replace the
- simple probe above, since this should throw out more non-squares, but at
- the expense of somewhat more cycles. */
+#else
+ /* Check that we have even multiplicity of 2, and then check that the rest
+ is a possible perfect square mod 8. Leave disabled until we can
+ determine this really is an improvement. This supersedes the mod-256
+ test, throwing out more non-squares (5/6 = 83.3%), but at the expense
+ of somewhat more cycles.
+
+ Note that this test is not *quite* perfect. Because it only examines
+ one word, it will allow inputs of the form (8*k+5)<<62; it can't see
+ bit 64 which distinguishes them from (8*k+1)<<62. */
{
mp_limb_t lo;
- int cnt;
- lo = up[0];
- while (lo == 0)
- up++, lo = up[0], usize--;
- count_trailing_zeros (cnt, lo);
- if ((cnt & 1) != 0)
- return 0; /* return of not even multiplicity of 2 */
- lo >>= cnt; /* shift down to align lowest non-zero bit */
- if ((lo & 6) != 0)
- return 0;
+ while ((lo = up[0]) == 0)
+ up++, usize--;
+ {
+#ifndef COUNT_TRAILING_ZEROS_SLOW
+ int cnt;
+ count_trailing_zeros (cnt, lo);
+ lo >>= cnt & -2; /* Shift down to eliminate powers of 4 */
+ if (lo & 6 != 0)
+ return 0;
+#else
+ mp_limb_t lsbit = (lo | lo<<1) & MP_LIMB_T_MAX/3*2;
+ lsbit &= -lsbit; /* 2 * (greatest power of 4 dividing lo) */
+ if (lo & lsbit*3 != 0)
+ return 0;
+#endif
+ }
}
#endif
----8<----
Patchg 2/2: mpn/generic/perfsqr: Clean up parens
This is mostly getting rid of some extransous parens to reduce
visual clutter, but there's one missing pair in PERFSQR_MOD_IDX.
(Harmless, the callers don't pass in an expression of low enough
precedence to cause a problem.)
diff --git a/mpn/generic/perfsqr.c b/mpn/generic/perfsqr.c
index 638f3af44..a7ede9630 100644
--- a/mpn/generic/perfsqr.c
+++ b/mpn/generic/perfsqr.c
@@ -132,11 +132,11 @@ see https://www.gnu.org/licenses/. */
do { \
mp_limb_t q; \
ASSERT ((r) <= PERFSQR_MOD_MASK); \
- ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \
+ ASSERT (((inv) * (d) & PERFSQR_MOD_MASK) == 1); \
ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \
\
- q = ((r) * (inv)) & PERFSQR_MOD_MASK; \
- ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \
+ q = (r) * (inv) & PERFSQR_MOD_MASK; \
+ ASSERT ((r) == (q * (d) & PERFSQR_MOD_MASK)); \
(idx) = (q * (d)) >> PERFSQR_MOD_BITS; \
} while (0)
@@ -147,7 +147,7 @@ see https://www.gnu.org/licenses/. */
PERFSQR_MOD_IDX(idx, r, d, inv); \
TRACE (printf (" PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \
d, r%d, idx)); \
- if ((((mask) >> idx) & 1) == 0) \
+ if (((mask) >> idx & 1) == 0) \
{ \
TRACE (printf (" non-square\n")); \
return 0; \
@@ -167,7 +167,7 @@ see https://www.gnu.org/licenses/. */
d, r%d, idx)); \
m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \
idx %= GMP_LIMB_BITS; \
- if (((m >> idx) & 1) == 0) \
+ if ((m >> idx & 1) == 0) \
{ \
TRACE (printf (" non-square\n")); \
return 0; \
@@ -187,8 +187,7 @@ mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
in O(1) time. */
{
unsigned idx = up[0] % 0x100;
- if (((sq_res_0x100[idx / GMP_LIMB_BITS]
- >> (idx % GMP_LIMB_BITS)) & 1) == 0)
+ if ((sq_res_0x100[idx/GMP_LIMB_BITS] >> idx%GMP_LIMB_BITS & 1) == 0)
return 0;
}
More information about the gmp-devel
mailing list