[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Mon Mar 21 22:57:09 CET 2011
details:   /var/hg/gmp/rev/f524b532036b
changeset: 14090:f524b532036b
user:      Niels Möller <nisse at lysator.liu.se>
date:      Mon Mar 21 22:43:20 2011 +0100
description:
Avoid a copy.

details:   /var/hg/gmp/rev/1b37551f5a16
changeset: 14091:1b37551f5a16
user:      Niels Möller <nisse at lysator.liu.se>
date:      Mon Mar 21 22:56:29 2011 +0100
description:
New 4/2 division loop.
diffstat:

 ChangeLog              |   25 ++++
 gmp-impl.h             |    7 +-
 mpn/generic/div_qr_2.c |  258 +++++++++++++++++++++++++++++++++++++++++++++++-
 tune/Makefile.am       |    2 +-
 tune/tuneup.c          |   12 ++
 5 files changed, 293 insertions(+), 11 deletions(-)
diffs (truncated from 394 to 300 lines):
diff -r 67a956b0504a -r 1b37551f5a16 ChangeLog
--- a/ChangeLog Mon Mar 21 21:43:22 2011 +0100
+++ b/ChangeLog Mon Mar 21 22:56:29 2011 +0100
@@ -1,3 +1,28 @@
+2011-03-21 Niels Möller <nisse at lysator.liu.se>
+
+ * tune/tuneup.c (div_qr_2_pi2_threshold): New global variable.
+ (tune_div_qr_2): New function.
+ (all): Call tune_div_qr_2.
+
+ * tune/Makefile.am (TUNE_MPN_SRCS_BASIC): Added div_qr_2.c.
+
+ * gmp-impl.h (DIV_QR_2_PI2_THRESHOLD): Setup for tuning.
+
+ New 4/2 division loop, based on Torbjörn's work:
+ * mpn/generic/div_qr_2.c (add_sssaaaa, add_csaac): New macros.
+ (udiv_qr_4by2): New macro.
+ (invert_4by2): New function.
+ (mpn_div_qr_2_pi2_norm): New function.
+ (DIV_QR_2_PI2_THRESHOLD): New threshold.
+ (mpn_div_qr_2_pi1_norm): Renamed, from...
+ (mpn_div_qr_2_norm): ... old name.
+ (mpn_div_qr_2_pi1_unnorm): Renamed, from...
+ (mpn_div_qr_2_unnorm): ... old name.
+ (mpn_div_qr_2): Use mpn_div_qr_2_pi2_norm for large enough
+ normalized divisors.
+
+ * gmp-impl.h (udiv_qr_3by2): Avoid a copy.
+
2011-03-21 Torbjorn Granlund <tege at gmplib.org>
* configure.in (hppa): Under linux, treat 64-bit processors as if they
diff -r 67a956b0504a -r 1b37551f5a16 gmp-impl.h
--- a/gmp-impl.h Mon Mar 21 21:43:22 2011 +0100
+++ b/gmp-impl.h Mon Mar 21 22:56:29 2011 +0100
@@ -2860,8 +2860,7 @@
\
/* Compute the two most significant limbs of n - q'd */ \
(r1) = (n1) - (d1) * (q); \
- (r0) = (n0); \
- sub_ddmmss ((r1), (r0), (r1), (r0), (d1), (d0)); \
+ sub_ddmmss ((r1), (r0), (r1), (n0), (d1), (d0)); \
umul_ppmm (_t1, _t0, (d0), (q)); \
sub_ddmmss ((r1), (r0), (r1), (r0), _t1, _t0); \
(q)++; \
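(This hunk is the "Avoid a copy" change from changeset 14090: the old code copied n0 into r0 and then did the double-limb subtraction, while the new code passes n0 directly as the low minuend. A minimal standalone sketch of the equivalence, using a generic sub_ddmmss in the style of longlong.h's C fallback; the macro body below is an illustration, not the exact longlong.h text:)

#include <assert.h>
#include <stdint.h>

typedef uint64_t UWtype;

/* Generic double-limb subtract, in the style of longlong.h's C fallback. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)	\
  do {						\
    UWtype __x = (al) - (bl);			\
    (sh) = (ah) - (bh) - ((al) < (bl));		\
    (sl) = __x;					\
  } while (0)

int main (void)
{
  uint64_t n1 = 7, n0 = 3, d1 = 2, d0 = 5;

  /* Old sequence: copy n0 into r0 first, then subtract. */
  uint64_t r1a = n1, r0a = n0;
  sub_ddmmss (r1a, r0a, r1a, r0a, d1, d0);

  /* New sequence: feed n0 straight into the subtraction. */
  uint64_t r1b = n1, r0b;
  sub_ddmmss (r1b, r0b, r1b, n0, d1, d0);

  /* 7*B+3 minus 2*B+5 borrows: both forms give 4*B + (B-2). */
  assert (r1a == r1b && r0a == r0b);
  return 0;
}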
@@ -4473,6 +4472,10 @@
#define MULLO_MUL_N_THRESHOLD mullo_mul_n_threshold
extern mp_size_t mullo_mul_n_threshold;
+#undef DIV_QR_2_PI2_THRESHOLD
+#define DIV_QR_2_PI2_THRESHOLD div_qr_2_pi2_threshold
+extern mp_size_t div_qr_2_pi2_threshold;
+
#undef DC_DIV_QR_THRESHOLD
#define DC_DIV_QR_THRESHOLD dc_div_qr_threshold
extern mp_size_t dc_div_qr_threshold;
diff -r 67a956b0504a -r 1b37551f5a16 mpn/generic/div_qr_2.c
--- a/mpn/generic/div_qr_2.c Mon Mar 21 21:43:22 2011 +0100
+++ b/mpn/generic/div_qr_2.c Mon Mar 21 22:56:29 2011 +0100
@@ -31,10 +31,194 @@
#include "gmp-impl.h"
#include "longlong.h"
+#ifndef DIV_QR_2_PI2_THRESHOLD
+/* Disabled unless explicitly tuned. */
+#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
+#endif
+
+#ifndef SANITY_CHECK
+#define SANITY_CHECK 0
+#endif
+
+/* Define some longlong.h-style macros, but for wider operations.
+ * add_sssaaaa is like longlong.h's add_ssaaaa, but propagating
+   carry-out into an additional sum operand.
+ * add_csaac accepts two addends and a carry in, and generates a sum
+ and a carry out. A little like a "full adder".
+*/
+#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((USItype)(s2)), \
+ "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
+ "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#define add_csaac(co, s, a, b, ci) \
+ __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0" \
+ : "=r" (co), "=r" (s) \
+ : "rm" ((USItype)(ci)), "0" (CNST_LIMB(0)), \
+ "%1" ((USItype)(a)), "g" ((USItype)(b)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((UDItype)(s2)), \
+ "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
+ "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#define add_csaac(co, s, a, b, ci) \
+ __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0" \
+ : "=r" (co), "=r" (s) \
+ : "rm" ((UDItype)(ci)), "0" (CNST_LIMB(0)), \
+ "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
+#endif
+
+/* NOTE: Slightly different from Torbjörn's version. */
+#if HAVE_HOST_CPU_FAMILY_powerpc
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((UDItype)(s2)), \
+ "r" ((UDItype)(a1)), "r" ((UDItype)(b1)), \
+ "%r" ((UDItype)(a0)), "rI" ((UDItype)(b0)))
+#endif
+
+#ifndef add_sssaaaa
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ do { \
+ UWtype __s0, __s1, __c0, __c1; \
+ __s0 = (a0) + (b0); \
+ __s1 = (a1) + (b1); \
+ __c0 = __s0 < (a0); \
+ __c1 = __s1 < (a1); \
+ (s0) = __s0; \
+ __s1 = __s1 + __c0; \
+ (s1) = __s1; \
+ (s2) += __c1 + (__s1 < __c0); \
+ } while (0)
+#endif
+
+#ifndef add_csaac
+#define add_csaac(co, s, a, b, ci) \
+ do { \
+ UWtype __s, __c; \
+ __s = (a) + (b); \
+ __c = __s < (a); \
+ __s = __s + (ci); \
+ (s) = __s; \
+ (co) = __c + (__s < (ci)); \
+ } while (0)
+
+#endif
+
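As a quick illustration of the semantics described in the comment above, here is a self-contained test exercising both generic C fallbacks with UWtype defined as uint64_t. The macro bodies are copied from the patch; the operands are chosen so that every carry path fires:

#include <assert.h>
#include <stdint.h>

typedef uint64_t UWtype;

/* Fallback bodies as in the patch above. */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)	\
  do {						\
    UWtype __s0, __s1, __c0, __c1;		\
    __s0 = (a0) + (b0);				\
    __s1 = (a1) + (b1);				\
    __c0 = __s0 < (a0);				\
    __c1 = __s1 < (a1);				\
    (s0) = __s0;				\
    __s1 = __s1 + __c0;				\
    (s1) = __s1;				\
    (s2) += __c1 + (__s1 < __c0);		\
  } while (0)

#define add_csaac(co, s, a, b, ci)		\
  do {						\
    UWtype __s, __c;				\
    __s = (a) + (b);				\
    __c = __s < (a);				\
    __s = __s + (ci);				\
    (s) = __s;					\
    (co) = __c + (__s < (ci));			\
  } while (0)

int main (void)
{
  /* add_sssaaaa: double-limb add whose carry-out bumps a third limb. */
  uint64_t s2 = 1, s1, s0;
  add_sssaaaa (s2, s1, s0, UINT64_MAX, UINT64_MAX, 0, 1);
  assert (s2 == 2 && s1 == 0 && s0 == 0);   /* carry rippled all the way up */

  /* add_csaac: a "full adder" on limbs, carry in and carry out. */
  uint64_t co, s;
  add_csaac (co, s, UINT64_MAX, 0, 1);
  assert (co == 1 && s == 0);               /* MAX + 0 + 1 wraps, carry out 1 */
  return 0;
}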
+/* Typically used with r1, r0 same as n3, n2. Other types of overlap
+ between inputs and outputs not supported. */
+#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
+ do { \
+ mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
+ mp_limb_t _t1, _t0; \
+ mp_limb_t _c, _mask; \
+ \
+ umul_ppmm (_q3,_q2a, n3, di1); \
+ umul_ppmm (_q2,_q1, n2, di1); \
+ umul_ppmm (_q2c,_q1c, n3, di0); \
+ add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c); \
+ umul_ppmm (_q1d,_q0, n2, di0); \
+ add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d); \
+ \
+ add_ssaaaa (r1, r0, n3, n2, 0, 1); /* FIXME: combine as in x86_64 asm */ \
+ \
+  /* [q3,q2,q1,q0] += [r1,r0,n1,n0], where [r1,r0] = [n3,n2] + 1 */ \
+ add_csaac (_c, _q0, _q0, n0, 0); \
+ add_csaac (_c, _q1, _q1, n1, _c); \
+ add_csaac (_c, _q2, _q2, r0, _c); \
+ _q3 = _q3 + r1 + _c; \
+ \
+ umul_ppmm (_t1,_t0, _q2, d0); \
+ _t1 += _q2 * d1 + _q3 * d0; \
+ \
+ sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
+ \
+ _mask = -(mp_limb_t) (r1 >= _q1 & (r1 > _q1 | r0 >= _q0)); /* (r1,r0) >= (q1,q0) */ \
+ add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask); \
+ sub_ddmmss (_q3, _q2, _q3, _q2, 0, -_mask); \
+ \
+ if (UNLIKELY (r1 >= d1)) \
+ { \
+ if (r1 > d1 || r0 >= d0) \
+ { \
+ sub_ddmmss (r1, r0, r1, r0, d1, d0); \
+ add_ssaaaa (_q3, _q2, _q3, _q2, 0, 1); \
+ } \
+ } \
+ (q1) = _q3; \
+ (q0) = _q2; \
+ } while (0)
+
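Stated as a contract: given a four-limb numerator <n3,n2,n1,n0> whose top two limbs are already below the normalized divisor <d1,d0>, the macro returns the two quotient limbs <q1,q0> and the two remainder limbs <r1,r0>, using only single-limb multiplies against the precomputed inverse <di1,di0>. A toy oracle for that contract, with 16-bit limbs so that unsigned __int128 (a GCC/Clang extension) can do the wide division directly; it checks the pre- and postconditions rather than reimplementing the macro:

#include <assert.h>
#include <stdint.h>

int main (void)
{
  const unsigned __int128 B = 1 << 16;   /* toy limb base, 16-bit limbs */
  uint16_t d1 = 0x9abc, d0 = 0xdef0;     /* normalized: high bit of d1 set */
  uint16_t n3 = 0x1234, n2 = 0x5678, n1 = 0x9abc, n0 = 0xdef0;

  unsigned __int128 d = (unsigned __int128) d1 * B + d0;
  unsigned __int128 n = (((unsigned __int128) n3 * B + n2) * B + n1) * B + n0;

  /* Precondition: <n3,n2> < <d1,d0>, so the quotient fits in two limbs. */
  assert ((unsigned __int128) n3 * B + n2 < d);

  unsigned __int128 q = n / d, r = n % d;
  assert (q < B * B);       /* two quotient limbs <q1,q0> */
  assert (r < d);           /* two remainder limbs <r1,r0> */
  assert (q * d + r == n);
  return 0;
}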
+static void
+invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
+{
+ mp_limb_t v1, v0, p1, t1, t0, p0, mask;
+ invert_limb (v1, d1);
+ p1 = d1 * v1;
+ /* <1, v1> * d1 = <B-1, p1> */
+ p1 += d0;
+ if (p1 < d0)
+ {
+ v1--;
+ mask = -(mp_limb_t) (p1 >= d1);
+ p1 -= d1;
+ v1 += mask;
+ p1 -= mask & d1;
+ }
+ /* <1, v1> * d1 + d0 = <B-1, p1> */
+ umul_ppmm (t1, p0, d0, v1);
+ p1 += t1;
+ if (p1 < t1)
+ {
+ if (UNLIKELY (p1 >= d1))
+ {
+ if (p1 > d1 || p0 >= d0)
+ {
+ sub_ddmmss (p1, p0, p1, p0, d1, d0);
+ v1--;
+ }
+ }
+ sub_ddmmss (p1, p0, p1, p0, d1, d0);
+ v1--;
+ }
+ /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
+ * with <p1, p0> + <d1, d0> >= B^2.
+ *
+ * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
+ * partial remainder after <1, v1> is
+ *
+ * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
+ * = <~p1, ~p0, B-1>
+ */
+ udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
+ di[0] = v0;
+ di[1] = v1;
+
+#if SANITY_CHECK
+ {
+ mp_limb_t tp[4];
+ mp_limb_t dp[2];
+ dp[0] = d0;
+ dp[1] = d1;
+ mpn_mul_n (tp, dp, di, 2);
+ ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
+ ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
+ ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
+ ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
+ }
+#endif
+}
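The comment just before the udiv_qr_3by2 call pins down the defining property of the result: <1, v1, v0> = floor((B^4 - 1) / <d1, d0>), which is essentially what the SANITY_CHECK block verifies with mpn arithmetic. The same identity can be checked directly at a toy word size, again assuming 16-bit limbs and the unsigned __int128 extension:

#include <assert.h>
#include <stdint.h>

int main (void)
{
  const unsigned __int128 B = 1 << 16;   /* toy limb base */
  uint16_t d1 = 0x8003, d0 = 0x0007;     /* normalized two-limb divisor */
  unsigned __int128 d = (unsigned __int128) d1 * B + d0;

  /* The 4/2 inverse: floor((B^4 - 1) / d).  For a normalized d it
     always has the shape <1, v1, v0>. */
  unsigned __int128 inv = (B * B * B * B - 1) / d;
  assert (inv >> 32 == 1);               /* leading limb is exactly 1 */
  uint16_t v1 = (uint16_t) (inv >> 16);
  uint16_t v0 = (uint16_t) inv;

  /* The floor property, i.e. what SANITY_CHECK asserts via mpn_mul_n:
     inv * d <= B^4 - 1 < (inv + 1) * d. */
  assert (inv * d <= B * B * B * B - 1);
  assert ((inv + 1) * d > B * B * B * B - 1);

  (void) v1; (void) v0;
  return 0;
}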
static mp_limb_t
-mpn_div_qr_2_norm (mp_ptr qp, mp_ptr np, mp_size_t nn,
- mp_limb_t d1, mp_limb_t d0, mp_limb_t di)
+mpn_div_qr_2_pi1_norm (mp_ptr qp, mp_ptr np, mp_size_t nn,
+ mp_limb_t d1, mp_limb_t d0, mp_limb_t di)
{
mp_limb_t qh;
mp_size_t i;
@@ -76,8 +260,57 @@
}
static mp_limb_t
-mpn_div_qr_2_unnorm (mp_ptr qp, mp_ptr np, mp_size_t nn,
- mp_limb_t d1, mp_limb_t d0, int shift, mp_limb_t di)
+mpn_div_qr_2_pi2_norm (mp_ptr qp, mp_ptr np, mp_size_t nn,
+ mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
+{
+ mp_limb_t qh;
+ mp_size_t i;
+ mp_limb_t r1, r0;
+
+ ASSERT (nn >= 2);
+ ASSERT (d1 & GMP_NUMB_HIGHBIT);
+
+ r1 = np[nn-1];
+ r0 = np[nn-2];
+
+ qh = 0;
+ if (r1 >= d1 && (r1 > d1 || r0 >= d0))
+ {
+#if GMP_NAIL_BITS == 0
+ sub_ddmmss (r1, r0, r1, r0, d1, d0);
+#else
+ r0 = r0 - d0;
+ r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
+ r0 &= GMP_NUMB_MASK;
+#endif
+ qh = 1;
+ }
+
+ for (i = nn - 2; i >= 2; i -= 2)
+ {
+ mp_limb_t n1, n0, q1, q0;
+ n1 = np[i-1];
+ n0 = np[i-2];
+ udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
+ qp[i-1] = q1;
+ qp[i-2] = q0;
+ }
+
+ if (i > 0)
+ {