mpn_sqrtrem{1,2}

Wed Mar 22 22:45:00 UTC 2017

Hi,

I now have a working version of sqrtrem1 that uses floating-point sqrt instruction on x86-64.
For a quick glance, here is the speedup on my two machines:

Laptop Core 2 Duo 2GHz
======================

1) Time when using only rev 17327
2) Time when using rev 17327 + FP version for sqrtrem1 only, with NEEDNORM=0

1024 bits   time 74.72 -> 74.99   speedup 1.00
 512 bits   time 39.69 -> 39.03   speedup 1.02
 256 bits   time 20.37 -> 17.96   speedup 1.13
 192 bits   time 22.70 -> 20.83   speedup 1.08
 128 bits   time  8.42 ->  7.07   speedup 1.19
  96 bits   time  9.43 ->  8.40   speedup 1.12
  64 bits   time  4.55 ->  2.29   speedup 1.99
  48 bits   time  5.12 ->  2.30   speedup 2.23
  32 bits   time  5.12 ->  2.29   speedup 2.24
  16 bits   time  5.12 ->  2.30   speedup 2.23


PC i7-4790 3.60GHz
==================

1) Time when using only rev 17327
2) Time when using rev 17327 + FP version for sqrtrem1 only, with NEEDNORM=0

1024 bits   time 25.55 -> 25.45   speedup 1.00
 512 bits   time 13.40 -> 13.08   speedup 1.02
 256 bits   time  6.09 ->  5.69   speedup 1.07
 192 bits   time  6.96 ->  6.73   speedup 1.03
 128 bits   time  2.42 ->  2.22   speedup 1.09
  96 bits   time  3.16 ->  3.14   speedup 1.01
  64 bits   time  1.06 ->  0.79   speedup 1.34
  48 bits   time  1.27 ->  0.79   speedup 1.61
  32 bits   time  1.27 ->  0.79   speedup 1.61
  16 bits   time  1.27 ->  0.79   speedup 1.61



The patch is at the end of this message.
It tests whether we're using GNU C, that we have ASM, and that the machine is amd64.
If so, it defines a special sqrtrem1 implementation.
It also declares it as "no need for normalization", which contributes to speedup.

What's missing is:
- feature tests for presence of SSE / SSE2 and whether the compiler is able to use these?
- decide whether it's appropriate to put inline asm in this source file instead of in longlong.h or in mpn/
- benchmark with 32-bit ABI
- exhaustive test for good measure
- a similar version for sqrtrem2

About the pure C code, integer version that was working on,
I now have an exhaustively validated version, with only one table of invsqrt shared between the 2 versions (32b, 64b, 2x64b).
Previously I observed a moderate but interesting speedup compared to GMP.
But... when I put that code in GMP code, that resulted in a noticeable slowdown /o\
So, not yet ready...

Below is my current patch to have sqrtrem1 based on FP sqrt instructions.
FP instructions are from SSE or SSE2 depending on GMP_NUMB_BITS.

Adrien



diff -r 6393a907d1b5 mpn/generic/sqrtrem.c

--- a/mpn/generic/sqrtrem.c	Wed Mar 08 19:57:44 2017 +0100
+++ b/mpn/generic/sqrtrem.c	Wed Mar 22 23:39:31 2017 +0100
@@ -49,6 +49,50 @@
 #define USE_DIVAPPR_Q 1
 #define TRACE(x)
 
+
+#if ! defined (NO_ASM) && defined (__GNUC__) && defined (__amd64__)
+
+static mp_limb_t
+mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
+{
+  mp_limb_t root, rem, corr;
+
+  #if GMP_NUMB_BITS == 32
+  /* Use SSE floating-point instruction SQRTSS */
+  float fp0, fp1;
+  fp0 = a0;
+  __asm__ ("sqrtss %1, %0" : "=x" (fp1) : "xm" (fp0));
+  root = fp1;
+  #endif
+
+  #if GMP_NUMB_BITS == 64
+  /* Use SSE2 floating-point instruction SQRTSD */
+  double fp0, fp1;
+  fp0 = a0;
+  __asm__ ("sqrtsd %1, %0" : "=x" (fp1) : "xm" (fp0));
+  root = fp1;
+  #endif
+
+  /* Get correction if root is too high: a - r^2 < 0 */
+  rem = a0 - root * root;
+  corr = rem >> 63;
+  /* Get correction if root is too low: r^2 + 2r - a < 0 */
+  rem = 2*root - rem;
+  root = root + (rem >> 63) - corr;
+
+  *rp = a0 - root * root;
+  return root;
+}
+
+#define SQRTREM1 1
+#define SQRTREM1_NEEDNORM 0
+
+#endif  /* amd64 with SSE / SSE2 */
+
+
+/* Fallback code for sqrtrem1: pure C code */
+#ifndef SQRTREM1
+
 static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
 {
   0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
@@ -164,6 +208,7 @@
   return x0;
 }
 
+#endif  /* #ifndef SQRTREM1 */
 
 #define Prec (GMP_NUMB_BITS >> 1)
 #if ! defined(SQRTREM2_INPLACE)
@@ -427,6 +472,15 @@
 }
 
 
+/* By default, do as if sqrtrem{1,2} need normalized input
+ * When using version that support non-normalized input, set to 0 */
+#if ! defined(SQRTREM1_NEEDNORM)
+#define SQRTREM1_NEEDNORM 1
+#endif
+#if ! defined(SQRTREM2_NEEDNORM)
+#define SQRTREM2_NEEDNORM 1
+#endif
+
 mp_size_t
 mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
 {
@@ -443,6 +497,30 @@
   ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
   ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
 
+#if !SQRTREM1_NEEDNORM
+  if (nn == 1) {
+    sp[0] = mpn_sqrtrem1 (&rl, high);
+    if (rp != NULL)
+      rp[0] = rl;
+    return rl != 0;
+  }
+#endif
+#if !SQRTREM2_NEEDNORM
+  if (nn == 2) {
+    mp_limb_t tp [2];
+    if (rp == NULL) rp = tp;
+#if SQRTREM2_INPLACE
+    rp[1] = np[1];
+    rp[0] = np[0];
+    cc = CALL_SQRTREM2_INPLACE (sp, rp);
+#else
+    cc = mpn_sqrtrem2 (sp, rp, np);
+#endif
+    rp[1] = cc;
+    return ((rp[0] | cc) != 0) + cc;
+  }
+#endif
+
   high = np[nn - 1];
   if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
     c = 0;
@@ -453,6 +531,8 @@
 
       c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
     }
+
+#if SQRTREM1_NEEDNORM
   if (nn == 1) {
     if (c == 0)
       {
@@ -469,6 +549,8 @@
       }
     return rl != 0;
   }
+#endif
+#if SQRTREM2_NEEDNORM
   if (nn == 2) {
     mp_limb_t tp [2];
     if (rp == NULL) rp = tp;
@@ -490,11 +572,13 @@
 	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
 	rp[0] = rl << (2*c);
 	CALL_SQRTREM2_INPLACE (sp, rp);
-	cc = sp[0] >>= c;	/* c != 0, the higest bit of the root cc is 0. */
+	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
 	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
 	return rl != 0;
       }
   }
+#endif
+
   tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
 
   if ((rp == NULL) && (nn > 8))