[Gmp-commit] /var/hg/gmp: New mpn_div_qr_1n_pi1 variants, DIV_QR_1N_METHOD 3 ...

mercurial at gmplib.org mercurial at gmplib.org
Thu Jul 1 18:38:18 UTC 2021


details:   /var/hg/gmp/rev/a0346cf658ea
changeset: 18227:a0346cf658ea
user:      Niels Möller <nisse at lysator.liu.se>
date:      Thu Jul 01 20:34:15 2021 +0200
description:
New mpn_div_qr_1n_pi1 variants, DIV_QR_1N_METHOD 3 and 4.
Not enabled, but hooked into the speed and tuneup programs.

diffstat:

 ChangeLog                   |   18 +++
 mpn/generic/div_qr_1n_pi1.c |  202 ++++++++++++++++++++++++++++++++++++++++++++
 tune/Makefile.am            |    3 +-
 tune/common.c               |   10 ++
 tune/div_qr_1_tune.c        |    7 +-
 tune/div_qr_1n_pi1_3.c      |   38 ++++++++
 tune/div_qr_1n_pi1_4.c      |   38 ++++++++
 tune/speed.c                |    2 +
 tune/speed.h                |    4 +
 tune/tuneup.c               |    6 +-
 10 files changed, 323 insertions(+), 5 deletions(-)

diffs (truncated from 430 to 300 lines):

diff -r 1694ca7a775e -r a0346cf658ea ChangeLog
--- a/ChangeLog	Sun Jun 20 15:21:23 2021 +0200
+++ b/ChangeLog	Thu Jul 01 20:34:15 2021 +0200
@@ -1,3 +1,21 @@
+2021-07-01  Niels Möller  <nisse at lysator.liu.se>
+
+	* mpn/generic/div_qr_1n_pi1.c (mpn_div_qr_1n_pi1): New variants,
+	DIV_QR_1N_METHOD 3 and 4.
+
+	* tune/div_qr_1n_pi1_3.c: New file.
+	* tune/div_qr_1n_pi1_4.c: New file.
+	* tune/Makefile.am (libspeed_la_SOURCES): Add new files.
+	* tune/div_qr_1_tune.c (__gmpn_div_qr_1n_pi1): Handle new values
+	of div_qr_1n_pi1_method.
+	* tune/common.c (speed_mpn_div_qr_1n_pi1_3)
+	(speed_mpn_div_qr_1n_pi1_4): New functions.
+	* tune/speed.c (routine): Add mpn_div_qr_1n_pi1_3 and
+	mpn_div_qr_1n_pi1_3 to list.
+	* tune/speed.h: Declare new functions.
+	* tune/tuneup.c (tune_div_qr_1): Extend tuning of
+	DIV_QR_1N_PI1_METHOD.
+
 2021-06-20  Marc Glisse  <marc.glisse at inria.fr>
 
 	* gmpxx.h (mpq_class(mpz_class&&)): New constructor.
diff -r 1694ca7a775e -r a0346cf658ea mpn/generic/div_qr_1n_pi1.c
--- a/mpn/generic/div_qr_1n_pi1.c	Sun Jun 20 15:21:23 2021 +0200
+++ b/mpn/generic/div_qr_1n_pi1.c	Thu Jul 01 20:34:15 2021 +0200
@@ -298,6 +298,208 @@
   return u0;
 }
 
+#elif DIV_QR_1N_METHOD == 3
+
+/* This variant handles carry from the u update earlier. This gives a
+   longer critical path, but reduces the work needed for the
+   quotients. */
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+		   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t cy, u0;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+  u0 = up[n-1];	/* Early read, to allow qp == up. */
+
+  add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0);
+  u1 -= cy & d;
+  q1 -= cy;
+  qp[n-1] = q1;
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+      mp_limb_t t1, t0;
+
+      /* Additions for the q update:
+       *	+-------+
+       *        |u1 * v |
+       *        +---+---+
+       *        | u1|
+       *        +---+
+       *        | 1 |  (conditional on {u1, u0} carry)
+       *        +---+
+       * +      | q0|
+       *   -+---+---+---+
+       *    | q2| q1| q0|
+       *    +---+---+---+
+       *
+       * Additions for the u update:
+       *        +-------+
+       *        |u1 * B2|
+       *        +---+---+
+       *      + |u0 |u-1|
+       *        +---+---+
+       *      - | d |     (conditional on carry)
+       *     ---+---+---+
+       *        |u1 | u0|
+       *        +---+---+
+       *
+      */
+      umul_ppmm (p1, p0, u1, B2);
+      ADDC_LIMB (q2, q1, u1, q0);
+      umul_ppmm (t1, t0, u1, dinv);
+      add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0);
+      u1 -= cy & d;
+
+      /* t1 <= B-2, so cy can be added in without overflow. */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
+
+#elif DIV_QR_1N_METHOD == 4
+
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+		   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t u2, u0;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t B2d0, B2d1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
+  /* B2 * (B-d) */
+  umul_ppmm (B2d1, B2d0, B2, -d);
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+
+  add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0);
+
+  /* After read of up[n-1], to allow qp == up. */
+  qp[n-1] = q1 - u2;
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+      mp_limb_t t1, t0;
+
+      /* Additions for the q update. *After* u1 -= u2 & d adjustment.
+       *	+-------+
+       *        |u1 * v |
+       *        +---+---+
+       *        | u1|
+       *        +---+
+       *        | 1 |  (conditional on {u1, u0} carry)
+       *        +---+
+       * +      | q0|
+       *   -+---+---+---+
+       *    | q2| q1| q0|
+       *    +---+---+---+
+       *
+       * Additions for the u update. *Before* u1 -= u2 & d adjstment.
+       *        +-------+
+       *        |u1 * B2|
+       *        +---+---+
+       *        |u0 |u-1|
+       *        +---+---+
+       + +      |B2(B-d)| (conditional on u2)
+       *   -+---+---+---+
+       *    |u2 |u1 | u0|
+       *    +---+---+---+
+       *
+      */
+       /* Multiply with unadjusted u1, to shorten critical path. */
+      umul_ppmm (p1, p0, u1, B2);
+      u1 -= (d & u2);
+      ADDC_LIMB (q2, q1, u1, q0);
+      umul_ppmm (t1, t0, u1, dinv);
+
+      add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0);
+      add_mssaaaa (u2, u1, u0, u1, u0, p1, p0);
+      u2 += cy;
+      ASSERT(-u2 <= 1);
+
+      /* t1 <= B-2, so u2 can be added in without overflow. */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+  u1 -= u2 & d;
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
 #else
 #error Unknown DIV_QR_1N_METHOD
 #endif
diff -r 1694ca7a775e -r a0346cf658ea tune/Makefile.am
--- a/tune/Makefile.am	Sun Jun 20 15:21:23 2021 +0200
+++ b/tune/Makefile.am	Thu Jul 01 20:34:15 2021 +0200
@@ -53,7 +53,8 @@
 
 libspeed_la_SOURCES =							\
   common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c		\
-  div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1_tune.c			\
+  div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1n_pi1_3.c 		\
+  div_qr_1n_pi1_4.c div_qr_1_tune.c					\
   freq.c								\
   gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c			\
   hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c	\
diff -r 1694ca7a775e -r a0346cf658ea tune/common.c
--- a/tune/common.c	Sun Jun 20 15:21:23 2021 +0200
+++ b/tune/common.c	Thu Jul 01 20:34:15 2021 +0200
@@ -718,6 +718,16 @@
 {
   SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_2);
 }
+double
+speed_mpn_div_qr_1n_pi1_3 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_3);
+}
+double
+speed_mpn_div_qr_1n_pi1_4 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_4);
+}
 
 double
 speed_mpn_div_qr_1 (struct speed_params *s)
diff -r 1694ca7a775e -r a0346cf658ea tune/div_qr_1_tune.c
--- a/tune/div_qr_1_tune.c	Sun Jun 20 15:21:23 2021 +0200
+++ b/tune/div_qr_1_tune.c	Thu Jul 01 20:34:15 2021 +0200
@@ -34,10 +34,13 @@
 
 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_3 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 
 #if !HAVE_NATIVE_mpn_div_qr_1n_pi1
-#define __gmpn_div_qr_1n_pi1 \
-  (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2)
+#define __gmpn_div_qr_1n_pi1						\
+  (div_qr_1n_pi1_method <= 2						\
+   ? (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2) \
+   : (div_qr_1n_pi1_method == 3 ? mpn_div_qr_1n_pi1_3 : mpn_div_qr_1n_pi1_4))
 #endif
 
 #undef mpn_div_qr_1
diff -r 1694ca7a775e -r a0346cf658ea tune/div_qr_1n_pi1_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tune/div_qr_1n_pi1_3.c	Thu Jul 01 20:34:15 2021 +0200
@@ -0,0 +1,38 @@
+/* mpn/generic/div_qr_1n_pi1.c method 3.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify


More information about the gmp-commit mailing list