[PATCH] mpn/generic/perfsqr: Improve alternate (currently disabled) test.

David Sparks sparks05 at proton.me
Sat Feb 14 19:10:24 CET 2026


If count_trailing_zeros is slow (i.e. not implemented in hardware),
the mod^2^k test can skip shifting down the input and instead
shift up the bits to be tested.

This takes a few more instructions, but a lot less than emulating
a CTZ instruction.

diff --git a/longlong.h b/longlong.h
index 031a092ec..5fd143db1 100644
--- a/longlong.h
+++ b/longlong.h
@@ -2254,6 +2254,7 @@ extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
 /* Define count_trailing_zeros in plain C, assuming small counts are common.
    We use clz_tab without ado, since the C count_leading_zeros above will have
    pulled it in.  */
+#define COUNT_TRAILING_ZEROS_SLOW
 #define count_trailing_zeros(count, x)					\
   do {									\
     UWtype __ctz_x = (x);						\
diff --git a/mpn/generic/perfsqr.c b/mpn/generic/perfsqr.c
index 1ea5c84b4..638f3af44 100644
--- a/mpn/generic/perfsqr.c
+++ b/mpn/generic/perfsqr.c
@@ -182,6 +182,7 @@ mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
 
   TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
 
+#if 1
   /* The first test excludes 212/256 (82.8%) of the perfect square candidates
      in O(1) time.  */
   {
@@ -191,24 +192,34 @@ mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
       return 0;
   }
 
-#if 0
-  /* Check that we have even multiplicity of 2, and then check that the rest is
-     a possible perfect square.  Leave disabled until we can determine this
-     really is an improvement.  If it is, it could completely replace the
-     simple probe above, since this should throw out more non-squares, but at
-     the expense of somewhat more cycles.  */
+#else
+  /* Check that we have even multiplicity of 2, and then check that the rest
+     is a possible perfect square mod 8.  Leave disabled until we can
+     determine this really is an improvement.  This supersedes the mod-256
+     test, throwing out more non-squares (5/6 = 83.3%), but at the expense
+     of somewhat more cycles.
+
+     Note that this test is not *quite* perfect.  Because it only examines
+     one word, it will allow inputs of the form (8*k+5)<<62; it can't see
+     bit 64 which distinguishes them from (8*k+1)<<62.  */
   {
     mp_limb_t lo;
-    int cnt;
-    lo = up[0];
-    while (lo == 0)
-      up++, lo = up[0], usize--;
-    count_trailing_zeros (cnt, lo);
-    if ((cnt & 1) != 0)
-      return 0;			/* return of not even multiplicity of 2 */
-    lo >>= cnt;			/* shift down to align lowest non-zero bit */
-    if ((lo & 6) != 0)
-      return 0;
+    while ((lo = up[0]) == 0)
+      up++, usize--;
+    {
+#ifndef COUNT_TRAILING_ZEROS_SLOW
+      int cnt;
+      count_trailing_zeros (cnt, lo);
+      lo >>= cnt & -2;		/* Shift down to eliminate powers of 4 */
+      if (lo & 6 != 0)
+        return 0;
+#else
+      mp_limb_t lsbit = (lo | lo<<1) & MP_LIMB_T_MAX/3*2;
+      lsbit &= -lsbit;		/* 2 * (greatest power of 4 dividing lo) */
+      if (lo & lsbit*3 != 0)
+        return 0;
+#endif
+    }
   }
 #endif
 
----8<----
Patchg 2/2: mpn/generic/perfsqr: Clean up parens
    
This is mostly getting rid of some extransous parens to reduce
visual clutter, but there's one missing pair in PERFSQR_MOD_IDX.
(Harmless, the callers don't pass in an expression of low enough
precedence to cause a problem.)

diff --git a/mpn/generic/perfsqr.c b/mpn/generic/perfsqr.c
index 638f3af44..a7ede9630 100644
--- a/mpn/generic/perfsqr.c
+++ b/mpn/generic/perfsqr.c
@@ -132,11 +132,11 @@ see https://www.gnu.org/licenses/.  */
   do {								\
     mp_limb_t  q;						\
     ASSERT ((r) <= PERFSQR_MOD_MASK);				\
-    ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);		\
+    ASSERT (((inv) * (d) & PERFSQR_MOD_MASK) == 1);		\
     ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);		\
 								\
-    q = ((r) * (inv)) & PERFSQR_MOD_MASK;			\
-    ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK));		\
+    q = (r) * (inv) & PERFSQR_MOD_MASK;				\
+    ASSERT ((r) == (q * (d) & PERFSQR_MOD_MASK));		\
     (idx) = (q * (d)) >> PERFSQR_MOD_BITS;			\
   } while (0)
 
@@ -147,7 +147,7 @@ see https://www.gnu.org/licenses/.  */
     PERFSQR_MOD_IDX(idx, r, d, inv);				\
     TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n",	\
 		   d, r%d, idx));				\
-    if ((((mask) >> idx) & 1) == 0)				\
+    if (((mask) >> idx & 1) == 0)				\
       {								\
 	TRACE (printf ("  non-square\n"));			\
 	return 0;						\
@@ -167,7 +167,7 @@ see https://www.gnu.org/licenses/.  */
 		   d, r%d, idx));				\
     m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));	\
     idx %= GMP_LIMB_BITS;					\
-    if (((m >> idx) & 1) == 0)					\
+    if ((m >> idx & 1) == 0)					\
       {								\
 	TRACE (printf ("  non-square\n"));			\
 	return 0;						\
@@ -187,8 +187,7 @@ mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
      in O(1) time.  */
   {
     unsigned  idx = up[0] % 0x100;
-    if (((sq_res_0x100[idx / GMP_LIMB_BITS]
-	  >> (idx % GMP_LIMB_BITS)) & 1) == 0)
+    if ((sq_res_0x100[idx/GMP_LIMB_BITS] >> idx%GMP_LIMB_BITS & 1) == 0)
       return 0;
   }
 




More information about the gmp-devel mailing list