mpn_sqrtrem1

Tue Dec 20 21:57:36 UTC 2016

Ciao,

Il Mar, 20 Dicembre 2016 7:29 pm, Adrien Prost-Boucle ha scritto:
>> Reducing the number of multiplications is possible... but I bet a
>> Karatsuba umul_ppmm() is not faster than the plain version (at least not
>> on current 64-bits CPUs ;-)
>
> Thank you for the analysis, I was curious :-)

Me too.

I decided to try an as-branch-free-as-I-can Karatsuba implementation for
umul_ppmm in the simpler environment: mini-gmp.

Apply the following patch, define UMUL_PPMM_USE_KARATSUBA, and experience
a slowed-down, larger-code version of mini-gmp ;-D


diff -r 1cdb2f2d864b mini-gmp/mini-gmp.c

--- a/mini-gmp/mini-gmp.c	Sun Dec 18 09:19:44 2016 +0100
+++ b/mini-gmp/mini-gmp.c	Tue Dec 20 23:40:55 2016 +0100
@@ -111,6 +111,50 @@
     (sl) = __x;								\
   } while (0)

+#ifdef UMUL_PPMM_USE_KARATSUBA
+#define gmp_umul_ppmm(w1, w0, u, v)				\
+  do {								\
+    mp_limb_t __x1, __x2, __x3;					\
+    unsigned __x0, __ul, __vl, __uh, __vh;			\
+    mp_limb_t __u = (u), __v = (v);				\
+								\
+    __ul = __u & GMP_LLIMB_MASK;				\
+    __uh = __u >> (GMP_LIMB_BITS / 2);				\
+    __vl = __v & GMP_LLIMB_MASK;				\
+    __vh = __v >> (GMP_LIMB_BITS / 2);				\
+								\
+    __x2 = (mp_limb_t) __ul * __vl;				\
+    __x3 = (mp_limb_t) __uh * __vh;				\
+    __u  = __uh;						\
+    __u -= __ul;						\
+    __uh = __u >> (GMP_LIMB_BITS / 2);				\
+    __ul = (((unsigned)__u ^ __uh) - __uh) & GMP_LLIMB_MASK;	\
+    __v  = __vh;						\
+    __v -= __vl;						\
+    __vh = __v >> (GMP_LIMB_BITS / 2);				\
+    __vl = (((unsigned)__v ^ __vh) - __vh) & GMP_LLIMB_MASK;	\
+    __vh ^= __uh;						\
+								\
+    __x1 = (mp_limb_t) __ul * __vl;				\
+    __x0 = __x2 & GMP_LLIMB_MASK;				\
+    __x2 = __x2 + (__x2 >> (GMP_LIMB_BITS / 2)) + __x3;		\
+    __ul = __x2 < __x3;						\
+    if (__vh)							\
+      {								\
+	__x2 += __x1;						\
+	__ul += __x2 < __x1;					\
+      }								\
+    else							\
+      {								\
+	__ul -= __x1 > __x2;					\
+	__x2 -= __x1;						\
+      }								\
+								\
+    (w0) = (__x2 << (GMP_LIMB_BITS / 2)) + __x0;		\
+    (w1) = __x3 + (__x2 >> (GMP_LIMB_BITS / 2)) +		\
+      ((mp_limb_t)__ul << (GMP_LIMB_BITS / 2));			\
+  } while (0)
+#else
 #define gmp_umul_ppmm(w1, w0, u, v)					\
   do {									\
     mp_limb_t __x0, __x1, __x2, __x3;					\
@@ -135,6 +179,7 @@
     (w1) = __x3 + (__x1 >> (GMP_LIMB_BITS / 2));			\
     (w0) = (__x1 << (GMP_LIMB_BITS / 2)) + (__x0 & GMP_LLIMB_MASK);	\
   } while (0)
+#endif

 #define gmp_udiv_qrnnd_preinv(q, r, nh, nl, d, di)			\
   do {									\

Regards,
m

-- 
http://bodrato.it/toom/