[Gmp-commit] /var/hg/gmp: New mpn_div_qr_1n_pi1 variants, DIV_QR_1N_METHOD 3 ...
mercurial at gmplib.org
mercurial at gmplib.org
Thu Jul 1 18:38:18 UTC 2021
details: /var/hg/gmp/rev/a0346cf658ea
changeset: 18227:a0346cf658ea
user: Niels Möller <nisse at lysator.liu.se>
date: Thu Jul 01 20:34:15 2021 +0200
description:
New mpn_div_qr_1n_pi1 variants, DIV_QR_1N_METHOD 3 and 4.
Not enabled, but hooked into the speed and tuneup programs.
diffstat:
ChangeLog | 18 +++
mpn/generic/div_qr_1n_pi1.c | 202 ++++++++++++++++++++++++++++++++++++++++++++
tune/Makefile.am | 3 +-
tune/common.c | 10 ++
tune/div_qr_1_tune.c | 7 +-
tune/div_qr_1n_pi1_3.c | 38 ++++++++
tune/div_qr_1n_pi1_4.c | 38 ++++++++
tune/speed.c | 2 +
tune/speed.h | 4 +
tune/tuneup.c | 6 +-
10 files changed, 323 insertions(+), 5 deletions(-)
diffs (truncated from 430 to 300 lines):
diff -r 1694ca7a775e -r a0346cf658ea ChangeLog
--- a/ChangeLog Sun Jun 20 15:21:23 2021 +0200
+++ b/ChangeLog Thu Jul 01 20:34:15 2021 +0200
@@ -1,3 +1,21 @@
+2021-07-01 Niels Möller <nisse at lysator.liu.se>
+
+ * mpn/generic/div_qr_1n_pi1.c (mpn_div_qr_1n_pi1): New variants,
+ DIV_QR_1N_METHOD 3 and 4.
+
+ * tune/div_qr_1n_pi1_3.c: New file.
+ * tune/div_qr_1n_pi1_4.c: New file.
+ * tune/Makefile.am (libspeed_la_SOURCES): Add new files.
+ * tune/div_qr_1_tune.c (__gmpn_div_qr_1n_pi1): Handle new values
+ of div_qr_1n_pi1_method.
+ * tune/common.c (speed_mpn_div_qr_1n_pi1_3)
+ (speed_mpn_div_qr_1n_pi1_4): New functions.
+ * tune/speed.c (routine): Add mpn_div_qr_1n_pi1_3 and
+ mpn_div_qr_1n_pi1_3 to list.
+ * tune/speed.h: Declare new functions.
+ * tune/tuneup.c (tune_div_qr_1): Extend tuning of
+ DIV_QR_1N_PI1_METHOD.
+
2021-06-20 Marc Glisse <marc.glisse at inria.fr>
* gmpxx.h (mpq_class(mpz_class&&)): New constructor.
diff -r 1694ca7a775e -r a0346cf658ea mpn/generic/div_qr_1n_pi1.c
--- a/mpn/generic/div_qr_1n_pi1.c Sun Jun 20 15:21:23 2021 +0200
+++ b/mpn/generic/div_qr_1n_pi1.c Thu Jul 01 20:34:15 2021 +0200
@@ -298,6 +298,208 @@
return u0;
}
+#elif DIV_QR_1N_METHOD == 3
+
+/* This variant handles carry from the u update earlier. This gives a
+ longer critical path, but reduces the work needed for the
+ quotients. */
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+ mp_limb_t d, mp_limb_t dinv)
+{
+ mp_limb_t B2;
+ mp_limb_t cy, u0;
+ mp_limb_t q0, q1;
+ mp_limb_t p0, p1;
+ mp_limb_t t;
+ mp_size_t j;
+
+ ASSERT (d & GMP_LIMB_HIGHBIT);
+ ASSERT (n > 0);
+ ASSERT (u1 < d);
+
+ if (n == 1)
+ {
+ udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+ return u1;
+ }
+
+ /* FIXME: Could be precomputed */
+ B2 = -d*dinv;
+
+ umul_ppmm (q1, q0, dinv, u1);
+ umul_ppmm (p1, p0, B2, u1);
+ q1 += u1;
+ ASSERT (q1 >= u1);
+ u0 = up[n-1]; /* Early read, to allow qp == up. */
+
+ add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0);
+ u1 -= cy & d;
+ q1 -= cy;
+ qp[n-1] = q1;
+
+ /* FIXME: Keep q1 in a variable between iterations, to reduce number
+ of memory accesses. */
+ for (j = n-2; j-- > 0; )
+ {
+ mp_limb_t q2, cy;
+ mp_limb_t t1, t0;
+
+ /* Additions for the q update:
+ * +-------+
+ * |u1 * v |
+ * +---+---+
+ * | u1|
+ * +---+
+ * | 1 | (conditional on {u1, u0} carry)
+ * +---+
+ * + | q0|
+ * -+---+---+---+
+ * | q2| q1| q0|
+ * +---+---+---+
+ *
+ * Additions for the u update:
+ * +-------+
+ * |u1 * B2|
+ * +---+---+
+ * + |u0 |u-1|
+ * +---+---+
+ * - | d | (conditional on carry)
+ * ---+---+---+
+ * |u1 | u0|
+ * +---+---+
+ *
+ */
+ umul_ppmm (p1, p0, u1, B2);
+ ADDC_LIMB (q2, q1, u1, q0);
+ umul_ppmm (t1, t0, u1, dinv);
+ add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0);
+ u1 -= cy & d;
+
+ /* t1 <= B-2, so cy can be added in without overflow. */
+ add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy);
+ q0 = t0;
+
+ /* Final q update */
+ qp[j+1] = q1;
+ MPN_INCR_U (qp+j+2, n-j-2, q2);
+ }
+
+ q1 = (u1 >= d);
+ u1 -= (-q1) & d;
+
+ udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+ add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+ MPN_INCR_U (qp+1, n-1, q1);
+
+ qp[0] = q0;
+ return u0;
+}
+
+#elif DIV_QR_1N_METHOD == 4
+
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+ mp_limb_t d, mp_limb_t dinv)
+{
+ mp_limb_t B2;
+ mp_limb_t u2, u0;
+ mp_limb_t q0, q1;
+ mp_limb_t p0, p1;
+ mp_limb_t B2d0, B2d1;
+ mp_limb_t t;
+ mp_size_t j;
+
+ ASSERT (d & GMP_LIMB_HIGHBIT);
+ ASSERT (n > 0);
+ ASSERT (u1 < d);
+
+ if (n == 1)
+ {
+ udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+ return u1;
+ }
+
+ /* FIXME: Could be precomputed */
+ B2 = -d*dinv;
+ /* B2 * (B-d) */
+ umul_ppmm (B2d1, B2d0, B2, -d);
+
+ umul_ppmm (q1, q0, dinv, u1);
+ umul_ppmm (p1, p0, B2, u1);
+ q1 += u1;
+ ASSERT (q1 >= u1);
+
+ add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0);
+
+ /* After read of up[n-1], to allow qp == up. */
+ qp[n-1] = q1 - u2;
+
+ /* FIXME: Keep q1 in a variable between iterations, to reduce number
+ of memory accesses. */
+ for (j = n-2; j-- > 0; )
+ {
+ mp_limb_t q2, cy;
+ mp_limb_t t1, t0;
+
+ /* Additions for the q update. *After* u1 -= u2 & d adjustment.
+ * +-------+
+ * |u1 * v |
+ * +---+---+
+ * | u1|
+ * +---+
+ * | 1 | (conditional on {u1, u0} carry)
+ * +---+
+ * + | q0|
+ * -+---+---+---+
+ * | q2| q1| q0|
+ * +---+---+---+
+ *
+ * Additions for the u update. *Before* u1 -= u2 & d adjstment.
+ * +-------+
+ * |u1 * B2|
+ * +---+---+
+ * |u0 |u-1|
+ * +---+---+
+ + + |B2(B-d)| (conditional on u2)
+ * -+---+---+---+
+ * |u2 |u1 | u0|
+ * +---+---+---+
+ *
+ */
+ /* Multiply with unadjusted u1, to shorten critical path. */
+ umul_ppmm (p1, p0, u1, B2);
+ u1 -= (d & u2);
+ ADDC_LIMB (q2, q1, u1, q0);
+ umul_ppmm (t1, t0, u1, dinv);
+
+ add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0);
+ add_mssaaaa (u2, u1, u0, u1, u0, p1, p0);
+ u2 += cy;
+ ASSERT(-u2 <= 1);
+
+ /* t1 <= B-2, so u2 can be added in without overflow. */
+ add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2);
+ q0 = t0;
+
+ /* Final q update */
+ qp[j+1] = q1;
+ MPN_INCR_U (qp+j+2, n-j-2, q2);
+ }
+ u1 -= u2 & d;
+
+ q1 = (u1 >= d);
+ u1 -= (-q1) & d;
+
+ udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+ add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+ MPN_INCR_U (qp+1, n-1, q1);
+
+ qp[0] = q0;
+ return u0;
+}
#else
#error Unknown DIV_QR_1N_METHOD
#endif
diff -r 1694ca7a775e -r a0346cf658ea tune/Makefile.am
--- a/tune/Makefile.am Sun Jun 20 15:21:23 2021 +0200
+++ b/tune/Makefile.am Thu Jul 01 20:34:15 2021 +0200
@@ -53,7 +53,8 @@
libspeed_la_SOURCES = \
common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \
- div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1_tune.c \
+ div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1n_pi1_3.c \
+ div_qr_1n_pi1_4.c div_qr_1_tune.c \
freq.c \
gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \
hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \
diff -r 1694ca7a775e -r a0346cf658ea tune/common.c
--- a/tune/common.c Sun Jun 20 15:21:23 2021 +0200
+++ b/tune/common.c Thu Jul 01 20:34:15 2021 +0200
@@ -718,6 +718,16 @@
{
SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_2);
}
+double
+speed_mpn_div_qr_1n_pi1_3 (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_3);
+}
+double
+speed_mpn_div_qr_1n_pi1_4 (struct speed_params *s)
+{
+ SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_4);
+}
double
speed_mpn_div_qr_1 (struct speed_params *s)
diff -r 1694ca7a775e -r a0346cf658ea tune/div_qr_1_tune.c
--- a/tune/div_qr_1_tune.c Sun Jun 20 15:21:23 2021 +0200
+++ b/tune/div_qr_1_tune.c Thu Jul 01 20:34:15 2021 +0200
@@ -34,10 +34,13 @@
mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_3 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
#if !HAVE_NATIVE_mpn_div_qr_1n_pi1
-#define __gmpn_div_qr_1n_pi1 \
- (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2)
+#define __gmpn_div_qr_1n_pi1 \
+ (div_qr_1n_pi1_method <= 2 \
+ ? (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2) \
+ : (div_qr_1n_pi1_method == 3 ? mpn_div_qr_1n_pi1_3 : mpn_div_qr_1n_pi1_4))
#endif
#undef mpn_div_qr_1
diff -r 1694ca7a775e -r a0346cf658ea tune/div_qr_1n_pi1_3.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tune/div_qr_1n_pi1_3.c Thu Jul 01 20:34:15 2021 +0200
@@ -0,0 +1,38 @@
+/* mpn/generic/div_qr_1n_pi1.c method 3.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
More information about the gmp-commit
mailing list