mpn_sqrtrem{1,2}
Adrien Prost-Boucle
adrien.prost-boucle at laposte.net
Tue Mar 14 21:17:50 UTC 2017
Hi,
On Mon, 2017-03-13 at 22:11 +0100, Marco Bodrato wrote:
> Not even my change really gave 2x... but, before it, you could obtain
> 20-30%, now you measure 50-70%... that's the right direction :-)
I just reached 2.4x :-)
For that, I added macros to indicate whether sqrtrem{1,2} need normalized input.
This shortens the track to sqrtrem{1,2} even more, and removes checks.
Than I recompiled GMP with this new config, with sqrtrem{1,2} extern,
and re-launched my benchmarks by preloading my floating-point based sqrtrem{1,2}.
I added the new results to the previous ones:
Laptop Core 2 Duo 2GHz
======================
1) Time when using only rev 17327
2) Time when using rev 17327 + FP version
3) Time when using rev 17327 and SQRTREM{1,2}_NEEDNORM=0 macros + FP version
1024 bits time 74.93 -> 75.05 -> 75.11 speedup 0.998 -> 0.997
512 bits time 41.63 -> 37.33 -> 37.57 speedup 1.12 -> 1.11
256 bits time 19.93 -> 16.66 -> 16.81 speedup 1.20 -> 1.19
192 bits time 22.20 -> 19.36 -> 19.22 speedup 1.15 -> 1.16
128 bits time 8.43 -> 5.65 -> 5.53 speedup 1.49 -> 1.52
96 bits time 9.29 -> 6.27 -> 5.10 speedup 1.48 -> 1.82
64 bits time 4.65 -> 3.66 -> 2.73 speedup 1.27 -> 1.70
PC i7-4790 3.60GHz
==================
1) Time when using only rev 17327
2) Time when using rev 17327 + FP version
3) Time when using rev 17327 and SQRTREM{1,2}_NEEDNORM=0 macros + FP version
1024 bits time 25.55 -> 24.52 -> 24.26 speedup 1.04 -> 1.05
512 bits time 13.40 -> 12.64 -> 12.88 speedup 1.06 -> 1.04
256 bits time 6.09 -> 5.16 -> 5.13 speedup 1.18 -> 1.19
192 bits time 6.96 -> 6.21 -> 6.23 speedup 1.12 -> 1.12
128 bits time 2.42 -> 1.41 -> 1.41 speedup 1.72 -> 1.72
96 bits time 3.16 -> 1.89 -> 1.32 speedup 1.67 -> 2.39
64 bits time 1.06 -> 0.93 -> 0.85 speedup 1.14 -> 1.25
Here is the patch:
diff -r 6393a907d1b5 mpn/generic/sqrtrem.c
--- a/mpn/generic/sqrtrem.c Wed Mar 08 19:57:44 2017 +0100
+++ b/mpn/generic/sqrtrem.c Tue Mar 14 22:06:05 2017 +0100
@@ -427,6 +427,15 @@
}
+/* By default, do as if sqrtrem{1,2} need normalized input
+ * When using version that support non-normalized input, set to 0 */
+#if ! defined(SQRTREM1_NEEDNORM)
+#define SQRTREM1_NEEDNORM 1
+#endif
+#if ! defined(SQRTREM2_NEEDNORM)
+#define SQRTREM2_NEEDNORM 1
+#endif
+
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
@@ -443,6 +452,30 @@
ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
+#if !SQRTREM1_NEEDNORM
+ if (nn == 1) {
+ sp[0] = mpn_sqrtrem1 (&rl, high);
+ if (rp != NULL)
+ rp[0] = rl;
+ return rl != 0;
+ }
+#endif
+#if !SQRTREM2_NEEDNORM
+ if (nn == 2) {
+ mp_limb_t tp [2];
+ if (rp == NULL) rp = tp;
+#if SQRTREM2_INPLACE
+ rp[1] = np[1];
+ rp[0] = np[0];
+ cc = CALL_SQRTREM2_INPLACE (sp, rp);
+#else
+ cc = mpn_sqrtrem2 (sp, rp, np);
+#endif
+ rp[1] = cc;
+ return ((rp[0] | cc) != 0) + cc;
+ }
+#endif
+
high = np[nn - 1];
if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
c = 0;
@@ -453,6 +486,8 @@
c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
}
+
+#if SQRTREM1_NEEDNORM
if (nn == 1) {
if (c == 0)
{
@@ -469,6 +504,8 @@
}
return rl != 0;
}
+#endif
+#if SQRTREM2_NEEDNORM
if (nn == 2) {
mp_limb_t tp [2];
if (rp == NULL) rp = tp;
@@ -490,11 +527,13 @@
rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
rp[0] = rl << (2*c);
CALL_SQRTREM2_INPLACE (sp, rp);
- cc = sp[0] >>= c; /* c != 0, the higest bit of the root cc is 0. */
+ cc = sp[0] >>= c; /* c != 0, the highest bit of the root cc is 0. */
rp[0] = rl -= cc*cc; /* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
return rl != 0;
}
}
+#endif
+
tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
if ((rp == NULL) && (nn > 8))
More information about the gmp-devel
mailing list