mpn_sqrtrem{1,2}

Adrien Prost-Boucle adrien.prost-boucle at laposte.net
Tue Mar 14 21:17:50 UTC 2017


Hi,

On Mon, 2017-03-13 at 22:11 +0100, Marco Bodrato wrote:
> Not even my change really gave 2x... but, before it, you could obtain
> 20-30%, now you measure 50-70%... that's the right direction :-)

I just reached 2.4x :-)
For that, I added macros to indicate whether sqrtrem{1,2} need normalized input.
This shortens the track to sqrtrem{1,2} even more, and removes checks.
Than I recompiled GMP with this new config, with sqrtrem{1,2} extern,
and re-launched my benchmarks by preloading my floating-point based sqrtrem{1,2}.

I added the new results to the previous ones:

Laptop Core 2 Duo 2GHz
======================

1) Time when using only rev 17327
2) Time when using rev 17327 + FP version
3) Time when using rev 17327 and SQRTREM{1,2}_NEEDNORM=0 macros + FP version

1024 bits   time 74.93 -> 75.05 -> 75.11   speedup 0.998 -> 0.997
 512 bits   time 41.63 -> 37.33 -> 37.57   speedup 1.12  -> 1.11
 256 bits   time 19.93 -> 16.66 -> 16.81   speedup 1.20  -> 1.19
 192 bits   time 22.20 -> 19.36 -> 19.22   speedup 1.15  -> 1.16
 128 bits   time  8.43 ->  5.65 ->  5.53   speedup 1.49  -> 1.52
  96 bits   time  9.29 ->  6.27 ->  5.10   speedup 1.48  -> 1.82
  64 bits   time  4.65 ->  3.66 ->  2.73   speedup 1.27  -> 1.70


PC i7-4790 3.60GHz
==================

1) Time when using only rev 17327
2) Time when using rev 17327 + FP version
3) Time when using rev 17327 and SQRTREM{1,2}_NEEDNORM=0 macros + FP version

1024 bits   time 25.55 -> 24.52 -> 24.26   speedup 1.04 -> 1.05
 512 bits   time 13.40 -> 12.64 -> 12.88   speedup 1.06 -> 1.04
 256 bits   time  6.09 ->  5.16 ->  5.13   speedup 1.18 -> 1.19
 192 bits   time  6.96 ->  6.21 ->  6.23   speedup 1.12 -> 1.12
 128 bits   time  2.42 ->  1.41 ->  1.41   speedup 1.72 -> 1.72
  96 bits   time  3.16 ->  1.89 ->  1.32   speedup 1.67 -> 2.39
  64 bits   time  1.06 ->  0.93 ->  0.85   speedup 1.14 -> 1.25



Here is the patch:



diff -r 6393a907d1b5 mpn/generic/sqrtrem.c
--- a/mpn/generic/sqrtrem.c	Wed Mar 08 19:57:44 2017 +0100
+++ b/mpn/generic/sqrtrem.c	Tue Mar 14 22:06:05 2017 +0100
@@ -427,6 +427,15 @@
 }
 
 
+/* By default, do as if sqrtrem{1,2} need normalized input
+ * When using version that support non-normalized input, set to 0 */
+#if ! defined(SQRTREM1_NEEDNORM)
+#define SQRTREM1_NEEDNORM 1
+#endif
+#if ! defined(SQRTREM2_NEEDNORM)
+#define SQRTREM2_NEEDNORM 1
+#endif
+
 mp_size_t
 mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
 {
@@ -443,6 +452,30 @@
   ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
   ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
 
+#if !SQRTREM1_NEEDNORM
+  if (nn == 1) {
+    sp[0] = mpn_sqrtrem1 (&rl, high);
+    if (rp != NULL)
+      rp[0] = rl;
+    return rl != 0;
+  }
+#endif
+#if !SQRTREM2_NEEDNORM
+  if (nn == 2) {
+    mp_limb_t tp [2];
+    if (rp == NULL) rp = tp;
+#if SQRTREM2_INPLACE
+    rp[1] = np[1];
+    rp[0] = np[0];
+    cc = CALL_SQRTREM2_INPLACE (sp, rp);
+#else
+    cc = mpn_sqrtrem2 (sp, rp, np);
+#endif
+    rp[1] = cc;
+    return ((rp[0] | cc) != 0) + cc;
+  }
+#endif
+
   high = np[nn - 1];
   if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
     c = 0;
@@ -453,6 +486,8 @@
 
       c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
     }
+
+#if SQRTREM1_NEEDNORM
   if (nn == 1) {
     if (c == 0)
       {
@@ -469,6 +504,8 @@
       }
     return rl != 0;
   }
+#endif
+#if SQRTREM2_NEEDNORM
   if (nn == 2) {
     mp_limb_t tp [2];
     if (rp == NULL) rp = tp;
@@ -490,11 +527,13 @@
 	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
 	rp[0] = rl << (2*c);
 	CALL_SQRTREM2_INPLACE (sp, rp);
-	cc = sp[0] >>= c;	/* c != 0, the higest bit of the root cc is 0. */
+	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
 	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
 	return rl != 0;
       }
   }
+#endif
+
   tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
 
   if ((rp == NULL) && (nn > 8))



More information about the gmp-devel mailing list