[Gmp-commit] /var/hg/gmp: Replace div2 implementation.

mercurial at gmplib.org mercurial at gmplib.org
Tue Sep 17 04:20:03 UTC 2019


details:   /var/hg/gmp/rev/f044264e2fe9
changeset: 17914:f044264e2fe9
user:      Niels M?ller <nisse at lysator.liu.se>
date:      Mon Sep 16 22:18:22 2019 +0200
description:
Replace div2 implementation.

* mpn/generic/hgcd2.c (HGCD2_DIV2_METHOD): New define.
(div2): Replaced, since the old implementation had lots of poorly
predicted and expensive branches. Two new implementaions, selected
by HGCD2_DIV2_METHOD.
(div2) [HGCD2_DIV2_METHOD == 1]: Calls div1 on the high limbs,
with unlikely case handling large quotients.
(div2) [HGCD2_DIV2_METHOD == 2]: The previously #if:ed out
version. A bitwise division, relying on fast count_leading_zeros,
and with fewer branches than the previous code.

diffstat:

 ChangeLog           |   12 +++++
 mpn/generic/hgcd2.c |  106 ++++++++++++++++++++++++++++-----------------------
 2 files changed, 70 insertions(+), 48 deletions(-)

diffs (170 lines):

diff -r 370ef90c6a9a -r f044264e2fe9 ChangeLog
--- a/ChangeLog	Mon Sep 16 21:57:37 2019 +0200
+++ b/ChangeLog	Mon Sep 16 22:18:22 2019 +0200
@@ -1,3 +1,15 @@
+2019-09-16  Niels Möller  <nisse at lysator.liu.se>
+
+	* mpn/generic/hgcd2.c (HGCD2_DIV2_METHOD): New define.
+	(div2): Replaced, since the old implementation had lots of poorly
+	predicted and expensive branches. Two new implementaions, selected
+	by HGCD2_DIV2_METHOD.
+	(div2) [HGCD2_DIV2_METHOD == 1]: Calls div1 on the high limbs,
+	with unlikely case handling large quotients.
+	(div2) [HGCD2_DIV2_METHOD == 2]: The previously #if:ed out
+	version. A bitwise division, relying on fast count_leading_zeros,
+	and with fewer branches than the previous code.
+
 2019-09-14  Niels Möller  <nisse at lysator.liu.se>
 
 	* mpn/generic/hgcd2.c (HGCD2_DIV1_METHOD): Rename, and change
diff -r 370ef90c6a9a -r f044264e2fe9 mpn/generic/hgcd2.c
--- a/mpn/generic/hgcd2.c	Mon Sep 16 21:57:37 2019 +0200
+++ b/mpn/generic/hgcd2.c	Mon Sep 16 22:18:22 2019 +0200
@@ -40,6 +40,10 @@
 #define HGCD2_DIV1_METHOD 3
 #endif
 
+#ifndef HGCD2_DIV2_METHOD
+#define HGCD2_DIV2_METHOD 2
+#endif
+
 #if GMP_NAIL_BITS != 0
 #error Nails not implemented
 #endif
@@ -49,6 +53,12 @@
 static inline mp_double_limb_t
 div1 (mp_limb_t n0, mp_limb_t d0);
 
+/* Two-limb division optimized for small quotients.  */
+static mp_limb_t
+div2 (mp_ptr rp,
+      mp_limb_t n1, mp_limb_t n0,
+      mp_limb_t d1, mp_limb_t d0);
+
 #if HGCD2_DIV1_METHOD == 1
 
 static inline mp_double_limb_t
@@ -138,68 +148,66 @@
 #error Unknown HGCD2_DIV1_METHOD
 #endif
 
-/* Two-limb division optimized for small quotients.  */
-static inline mp_limb_t
+#if HGCD2_DIV2_METHOD == 1
+
+static mp_limb_t
 div2 (mp_ptr rp,
-      mp_limb_t nh, mp_limb_t nl,
-      mp_limb_t dh, mp_limb_t dl)
+      mp_limb_t n1, mp_limb_t n0,
+      mp_limb_t d1, mp_limb_t d0)
 {
-  mp_limb_t q = 0;
+  mp_double_limb_t rq = div1 (n1, d1);
+  if (UNLIKELY (rq.d1 > d1))
+    {
+      mp_limb_t n2, q, t1, t0;
+      int c;
+
+      /* Normalize */
+      count_leading_zeros (c, d1);
+      ASSERT (c > 0);
 
-  if ((mp_limb_signed_t) nh < 0)
-    {
-      int cnt;
-      for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
+      n2 = n1 >> (GMP_LIMB_BITS - c);
+      n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
+      n0 <<= c;
+      d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
+      d0 <<= c;
+
+      udiv_qrnnd (q, n1, n2, n1, d1);
+      umul_ppmm (t1, t0, q, d0);
+      if (t1 > n1 || (t1 == n1 && t0 > n0))
 	{
-	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
-	  dl = dl << 1;
+	  ASSERT (q > 0);
+	  q--;
+	  sub_ddmmss (t1, t0, t1, t0, d1, d0);
 	}
+      sub_ddmmss (n1, n0, n1, n0, t1, t0);
 
-      while (cnt)
-	{
-	  q <<= 1;
-	  if (nh > dh || (nh == dh && nl >= dl))
-	    {
-	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
-	      q |= 1;
-	    }
-	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
-	  dh = dh >> 1;
-	  cnt--;
-	}
+      /* Undo normalization */
+      rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
+      rp[1] = n1 >> c;
+
+      return q;
     }
   else
     {
-      int cnt;
-      for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
-	{
-	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
-	  dl = dl << 1;
-	}
-
-      while (cnt)
+      mp_limb_t q, t1, t0;
+      n1 = rq.d0;
+      q = rq.d1;
+      umul_ppmm (t1, t0, q, d0);
+      if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
 	{
-	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
-	  dh = dh >> 1;
-	  q <<= 1;
-	  if (nh > dh || (nh == dh && nl >= dl))
-	    {
-	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
-	      q |= 1;
-	    }
-	  cnt--;
+	  ASSERT (q > 0);
+	  q--;
+	  sub_ddmmss (t1, t0, t1, t0, d1, d0);
 	}
+      sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
+      return q;
     }
-
-  rp[0] = nl;
-  rp[1] = nh;
-
-  return q;
 }
 
-#if 0
-/* This div2 uses less branches, but relies on fast count_leading_zeros. */
-static inline mp_limb_t
+#elif HGCD2_DIV2_METHOD == 2
+
+/* Bit-wise div2. Relies on fast count_leading_zeros. */
+static mp_limb_t
 div2 (mp_ptr rp,
       mp_limb_t nh, mp_limb_t nl,
       mp_limb_t dh, mp_limb_t dl)
@@ -238,6 +246,8 @@
 
   return q;
 }
+#else
+#error Unknown HGCD2_DIV2_METHOD
 #endif
 
 /* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs


More information about the gmp-commit mailing list