[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Thu Jun 18 01:58:15 CEST 2026
details: /var/hg/gmp/rev/988855dbba55
changeset: 18508:988855dbba55
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jun 17 20:10:11 2026 +0200
description:
mpn/generic/bsqrtinv.c: Unroll initial loop
details: /var/hg/gmp/rev/a8abe1de2503
changeset: 18509:a8abe1de2503
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Wed Jun 17 20:46:21 2026 +0200
description:
mpn/generic/bsqrtinv.c: Use a table to initialize.
details: /var/hg/gmp/rev/d746975c7596
changeset: 18510:d746975c7596
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Thu Jun 18 00:50:15 2026 +0200
description:
mpn/generic/bsqrtinv.c: Add a specialized step from 1 to 2 limbs
details: /var/hg/gmp/rev/1f2075599e49
changeset: 18511:1f2075599e49
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Thu Jun 18 01:12:30 2026 +0200
description:
mpn/generic/bsqrtinv.c: Rewrite iteration as r' <- r-r(r^2 y - 1)/2
details: /var/hg/gmp/rev/331e66376226
changeset: 18512:331e66376226
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Thu Jun 18 01:23:30 2026 +0200
description:
mpn/generic/bsqrtinv.c: Avoid some superflous computations.
diffstat:
mpn/generic/bsqrtinv.c | 158 ++++++++++++++++++++++++++++++++++++++++--------
1 files changed, 129 insertions(+), 29 deletions(-)
diffs (207 lines):
diff -r 367030886996 -r 331e66376226 mpn/generic/bsqrtinv.c
--- a/mpn/generic/bsqrtinv.c Tue Jun 16 18:15:41 2026 +0200
+++ b/mpn/generic/bsqrtinv.c Thu Jun 18 01:23:30 2026 +0200
@@ -31,6 +31,7 @@
see https://www.gnu.org/licenses/. */
#include "gmp-impl.h"
+#include "longlong.h"
/* Compute r such that r^2 * y = 1 (mod 2^{b+1}).
Return non-zero if such an integer r exists.
@@ -44,26 +45,47 @@
(1) Simplify to do precision book-keeping in limbs rather than bits.
(2) Rewrite iteration as
- r' <-- r - r (r^2 y - 1) / 2
- and take advantage of zero low part of r^2 y - 1.
+ r' <-- r - r (r^2 y - 1) / 2 [Done]
+ and take advantage of zero low part of r^2 y - 1. [Done]
(3) Use wrap-around trick.
- (4) Use a small table to get starting value.
+ (4) Use a small table to get starting value. [Done]
*/
+
+/* Generated with GP-Pari:
+ b=8;v=Vecsmall(-binary(2^2^b-1));
+ forstep(i=1,2^(b+1),2,v[lift(Mod(i,2^(b+3))^-2)>>3+1]=i\2);
+ for(i=1,2^b,print1(v[i],",");if(i%16==0,print(),print1(" ")))
+ */
+
+#ifndef BSQRTINV_DONT_USE_TABLE
+static const unsigned char binvsqrttab[256] = /* The least significant 1 was removed */
+ {
+ 0, 170, 172, 102, 184, 253, 219, 129, 240, 218, 227, 22, 168, 50, 148, 46,
+ 31, 245, 115, 57, 152, 157, 4, 222, 208, 197, 3, 137, 136, 146, 139, 113,
+ 63, 149, 108, 217, 120, 61, 228, 62, 176, 101, 220, 214, 104, 242, 84, 238,
+ 95, 53, 179, 134, 88, 34, 59, 97, 144, 5, 67, 54, 72, 173, 203, 78,
+ 127, 42, 44, 25, 56, 130, 164, 254, 112, 90, 156, 105, 40, 77, 20, 81,
+ 159, 138, 243, 185, 24, 226, 123, 94, 80, 186, 131, 246, 8, 18, 244, 241,
+ 191, 234, 19, 166, 7, 189, 100, 65, 48, 229, 92, 86, 23, 114, 43, 110,
+ 223, 181, 204, 6, 39, 93, 187, 225, 16, 133, 195, 73, 55, 210, 180, 49,
+ 255, 85, 83, 153, 71, 2, 36, 126, 15, 37, 28, 233, 87, 205, 107, 209,
+ 224, 10, 140, 198, 103, 98, 251, 33, 47, 58, 252, 118, 119, 109, 116, 142,
+ 192, 106, 147, 38, 135, 194, 27, 193, 79, 154, 35, 41, 151, 13, 171, 17,
+ 160, 202, 76, 121, 167, 221, 196, 158, 111, 250, 188, 201, 183, 82, 52, 177,
+ 128, 213, 211, 230, 199, 125, 91, 1, 143, 165, 99, 150, 215, 178, 235, 174,
+ 96, 117, 12, 70, 231, 29, 132, 161, 175, 69, 124, 9, 247, 237, 11, 14,
+ 64, 21, 236, 89, 248, 66, 155, 190, 207, 26, 163, 169, 232, 141, 212, 145,
+ 32, 74, 51, 249, 216, 162, 68, 30, 239, 122, 60, 182, 200, 45, 75, 206,
+ };
+#endif
+
int
mpn_bsqrtinv (mp_ptr rp, mp_srcptr yp, mp_bitcnt_t bnb, mp_ptr tp)
{
- mp_ptr tp2;
- mp_size_t bn, order[GMP_LIMB_BITS + 1];
- int i, d;
-
ASSERT (bnb > 0);
- bn = 1 + bnb / GMP_LIMB_BITS;
-
- tp2 = tp + bn;
-
rp[0] = 1;
if (bnb == 1)
{
@@ -72,40 +94,118 @@
}
else
{
+ mp_ptr tp2 = tp + 1 + bnb / GMP_NUMB_BITS;
+ mp_size_t bn, order[GMP_LIMB_BITS + 1];
mp_limb_t t0, r0, y0 = *yp;
+ int i;
if ((y0 & 7) != 1)
return 0;
+#ifdef BSQRTINV_DONT_USE_TABLE
r0 = 33 + ((y0 & 8) * 5 >> 2) - ((y0 & 16) >> 1);
- do {
+
+ t0 = r0 * r0 * y0 >> 1;
+ r0 -= r0 * t0;
+ ASSERT ((t0 & (GMP_NUMB_MAX >> (GMP_NUMB_BITS - 4))) == 0);
+#if GMP_NUMB_BITS >= 7 * 2 - 1
+ t0 = r0 * r0 * y0 >> 1;
+ r0 -= r0 * t0;
+ ASSERT ((t0 & (GMP_NUMB_MAX >> (GMP_NUMB_BITS - 7))) == 0);
+#if GMP_NUMB_BITS >= 13 * 2 - 1
+ t0 = r0 * r0 * y0 >> 1;
+ r0 -= r0 * t0;
+ ASSERT ((t0 & GMP_NUMB_MAX >> (GMP_NUMB_BITS - 13)) == 0);
+#if GMP_NUMB_BITS >= 25 * 2 - 1
+ t0 = r0 * r0 * y0 >> 1;
+ r0 -= r0 * t0;
+ ASSERT ((t0 & GMP_NUMB_MAX >> (GMP_NUMB_BITS - 25)) == 0);
+
+ const mp_bitcnt_t precomputed_bits = 49;
+#else /* GMP_NUMB_BITS < 25 * 2 - 1 */
+ const mp_bitcnt_t precomputed_bits = 25;
+#endif
+#else /* GMP_NUMB_BITS < 13 * 2 - 1 */
+ const mp_bitcnt_t precomputed_bits = 13;
+#endif
+#else /* GMP_NUMB_BITS < 7 * 2 - 1 */
+ const mp_bitcnt_t precomputed_bits = 7;
+#endif
+#else
+ r0 = binvsqrttab[(y0 >> 3) & 0xff];
+ r0 = (r0 << 1) + 1;
+
+#if GMP_NUMB_BITS >= 10 * 2 - 1
+ t0 = r0 * r0 * y0 >> 1;
+ r0 -= r0 * t0;
+ ASSERT ((t0 & (GMP_NUMB_MAX >> (GMP_NUMB_BITS - 10))) == 0);
+#if GMP_NUMB_BITS >= 19 * 2 - 1
+ t0 = r0 * r0 * y0 >> 1;
+ r0 -= r0 * t0;
+ ASSERT ((t0 & (GMP_NUMB_MAX >> (GMP_NUMB_BITS - 19))) == 0);
+
+ const mp_bitcnt_t precomputed_bits = 19 * 2 - 1;
+#else /* GMP_NUMB_BITS < 19 * 2 - 1 */
+ const mp_bitcnt_t precomputed_bits = 19;
+#endif
+#else /* GMP_NUMB_BITS < 10 * 2 - 1 */
+ const mp_bitcnt_t precomputed_bits = 10;
+#endif
+#endif
+
+ mpn_zero (rp + 1, bnb / GMP_NUMB_BITS);
+ i = 0;
+ for (; bnb >= GMP_NUMB_BITS; bnb = (bnb + 2) >> 1)
+ order[i++] = bnb;
+ if (bnb > precomputed_bits) {
t0 = r0 * r0 * y0 >> 1;
r0 -= r0 * t0;
- } while ((t0 & (GMP_NUMB_MAX >> (GMP_NUMB_BITS >> 1))) != 0);
- *rp = r0 & GMP_NUMB_MAX;
+ ASSERT ((t0 & (GMP_NUMB_MAX >> (GMP_NUMB_BITS - precomputed_bits))) == 0);
+ }
+ if (i) {
+ mp_limb_t t5, t4, t3, t2, t1, r1;
+ --i;
+
+ umul_ppmm (t1, t0, r0, r0);
+ umul_ppmm (t3, t2, y0, t0);
+ t3 += y0 * t1 + yp[1] * t0;
+ t2 = ((t2 >> 1) | (t3 << (GMP_NUMB_BITS - 1))) & GMP_NUMB_MAX;
+ t3 = t3 >> 1; /* [t3,t2] <- (rp^2 y - 1) / 2 */
- d = 0;
- for (; bnb >= GMP_NUMB_BITS; bnb = (bnb + 2) >> 1)
- order[d++] = bnb;
+ umul_ppmm (t5, t4, r0, t2);
+ t5 += r0 * t3;
- for (i = d - 1; i >= 0; i--)
+ sub_ddmmss(rp[1], rp[0], 0, r0, t5, t4);
+ } else {
+ *rp = r0 & GMP_NUMB_MAX;
+ return 1;
+ }
+
+ for (bn = 2; --i >= 0;)
{
+ mp_size_t pbn = bn;
+ mpn_sqr (tp, rp, bn); /* tp <- r^2 */
+
bnb = order[i];
bn = 1 + bnb / GMP_LIMB_BITS;
- mpn_sqrlo (tp, rp, bn);
- mpn_mullo_n (tp2, rp, tp, bn); /* tp2 <- rp ^ 3 */
+ mpn_mullo_n (tp2, yp, tp, bn); /* tp2 <- rp^2 y */
+ mpn_rshift (tp2+pbn-1, tp2+pbn-1, bn-pbn+1, 1); /* tp2 <- (rp^2 y - 1) / 2 */
- mpn_mul_1 (tp, rp, bn, 3);
-
- mpn_mullo_n (rp, yp, tp2, bn);
+ if (pbn <= bn - pbn)
+ mpn_mul (tp, tp2 + pbn - 1, bn - pbn + 1, rp, pbn);
+ else
+ mpn_mul (tp, rp, pbn, tp2 + pbn - 1, bn - pbn + 1);
+ /* tp <- r (r^2 y - 1) / 2 */
-#if HAVE_NATIVE_mpn_rsh1sub_n
- mpn_rsh1sub_n (rp, tp, rp, bn);
-#else
- mpn_sub_n (tp2, tp, rp, bn);
- mpn_rshift (rp, tp2, bn, 1);
-#endif
+ if (bn > pbn) {
+ if (rp[pbn - 1] < tp[0])
+ mpn_com (rp + pbn, tp + 1, bn - pbn);
+ else
+ mpn_neg (rp + pbn, tp + 1, bn - pbn);
+ }
+ rp[pbn - 1] -= tp[0];
+ /* rp <- r - r (r^2 y - 1) / 2 */
}
}
return 1;
More information about the gmp-commit
mailing list