[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
Mon Feb 28 22:58:40 CET 2011
details: /var/hg/gmp/rev/ff573b48f6e9
changeset: 13947:ff573b48f6e9
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Feb 28 20:22:26 2011 +0100
description:
(mpn_rootrem): Combine two similar scalar divisions. Misc minor cleanup.
details: /var/hg/gmp/rev/74a7752cfd14
changeset: 13948:74a7752cfd14
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Feb 28 20:23:17 2011 +0100
description:
*** empty log message ***
details: /var/hg/gmp/rev/e42801a3626b
changeset: 13949:e42801a3626b
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Feb 28 22:58:27 2011 +0100
description:
Trivial merge.
diffstat:
ChangeLog | 30 +++++++++
mpn/generic/mod_1_1.c | 156 +++++++++++++++++++++++++++++++++++++++++++++++++
mpn/generic/rootrem.c | 23 +++---
mpn/x86_64/mod_1_1.asm | 31 +++------
tune/Makefile.am | 2 +-
tune/common.c | 10 +++
tune/mod_1_1-1.c | 28 ++++++++
tune/mod_1_1-2.c | 28 ++++++++
tune/speed.c | 2 +
tune/speed.h | 8 ++
tune/tuneup.c | 24 +++++++
11 files changed, 309 insertions(+), 33 deletions(-)
diffs (truncated from 519 to 300 lines):
diff -r 4828d99fcfb3 -r e42801a3626b ChangeLog
--- a/ChangeLog Mon Feb 28 16:54:52 2011 +0100
+++ b/ChangeLog Mon Feb 28 22:58:27 2011 +0100
@@ -1,5 +1,35 @@
+2011-02-28 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/generic/rootrem.c (mpn_rootrem): Combine two similar scalar
+ divisions. Misc minor cleanup.
+
2011-02-28 Niels Möller <nisse at lysator.liu.se>
+ * tune/tuneup.c (tune_mod_1): Measure mpn_mod_1_1_1 and
+ mpn_mod_1_1_2, to set MOD_1_1P_METHOD.
+
+ * tune/speed.c (routine): Added mpn_mod_1_1_1 and mpn_mod_1_1_2.
+
+ * tune/speed.h: Declare speed_mpn_mod_1_1_1, speed_mpn_mod_1_1_2,
+ mpn_mod_1_1p_1, mpn_mod_1_1p_2, mpn_mod_1_1p_cps_1, and
+ mpn_mod_1_1p_cps_2.
+
+ * tune/common.c (speed_mpn_mod_1_1_1): New function.
+ (speed_mpn_mod_1_1_2): New function.
+
+ * tune/Makefile.am (libspeed_la_SOURCES): Added mod_1_1-1.c
+ mod_1_1-2.c.
+
+ * tune/mod_1_1-1.c: New file.
+ * tune/mod_1_1-2.c: New file.
+
+ * mpn/generic/mod_1_1.c: Implemented an algorithm with fewer
+ multiplications, configured via MOD_1_1P_METHOD.
+
+ * mpn/x86_64/mod_1_1.asm (mpn_mod_1_1p_cps): Simplified
+ computation of B2modb, use B^2 mod (normalized b).
+ (mpn_mod_1_1p): Corresponding changes. Don't shift b.
+
* mpn/generic/pre_mod_1.c (mpn_preinv_mod_1): Use udiv_rnnd_preinv
rather than udiv_qrnnd_preinv.
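The tune/mod_1_1-1.c and tune/mod_1_1-2.c files added above are not shown in the truncated diff. As a minimal sketch only, assuming GMP's usual tune-wrapper pattern of recompiling a generic source with the method macro pinned and the entry points renamed (the include path and exact layout are guesses, not the committed contents), mod_1_1-1.c would look roughly like:

/* Hypothetical sketch of tune/mod_1_1-1.c, not the committed file:
   build generic mod_1_1.c as method 1 under the _1-suffixed names
   that tune/speed.h declares.  */
#define MOD_1_1P_METHOD 1

#undef  mpn_mod_1_1p
#undef  mpn_mod_1_1p_cps
#define mpn_mod_1_1p      mpn_mod_1_1p_1
#define mpn_mod_1_1p_cps  mpn_mod_1_1p_cps_1

#include "mpn/generic/mod_1_1.c"

A mod_1_1-2.c wrapper would presumably do the same with MOD_1_1P_METHOD set to 2 and the _2 names, so that tune_mod_1 can time both variants and decide which MOD_1_1P_METHOD value to output.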
diff -r 4828d99fcfb3 -r e42801a3626b mpn/generic/mod_1_1.c
--- a/mpn/generic/mod_1_1.c Mon Feb 28 16:54:52 2011 +0100
+++ b/mpn/generic/mod_1_1.c Mon Feb 28 22:58:27 2011 +0100
@@ -29,6 +29,61 @@
#include "gmp-impl.h"
#include "longlong.h"
+#ifndef MOD_1_1P_METHOD
+# define MOD_1_1P_METHOD 1
+#endif
+
+/* Define some longlong.h-style macros, but for wider operations.
+ * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
+ * carry out, in the form of a mask. */
+
+#if defined (__GNUC__)
+
+#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
+#define add_mssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tsbb\t%k0, %k0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((USItype)(s2)), \
+ "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
+ "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_mssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tsbb\t%q0, %q0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((UDItype)(s2)), \
+ "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
+ "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+/* FIXME: How to do carry -> mask on powerpc? */
+#if 0 && HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((UDItype)(s2)), \
+ "r" ((UDItype)(a1)), "r" ((UDItype)(b1)), \
+ "%2" ((UDItype)(a0)), "rI" ((UDItype)(b0)))
+#endif
+#endif /* defined (__GNUC__) */
+
+#ifndef add_mssaaaa
+#define add_mssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ do { \
+ UWtype __s0, __s1, __c0, __c1; \
+ __s0 = (a0) + (b0); \
+ __s1 = (a1) + (b1); \
+ __c0 = __s0 < (a0); \
+ __c1 = __s1 < (a1); \
+ (s0) = __s0; \
+ __s1 = __s1 + __c0; \
+ (s1) = __s1; \
+ (s2) = - (__c1 + (__s1 < __c0)); \
+ } while (0)
+#endif
+
+#if MOD_1_1P_METHOD == 1
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
@@ -103,3 +158,104 @@
return r >> cnt;
}
+#endif /* MOD_1_1P_METHOD == 1 */
+
+#if MOD_1_1P_METHOD == 2
+void
+mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
+{
+ mp_limb_t bi;
+ mp_limb_t B2modb;
+ int cnt;
+
+ count_leading_zeros (cnt, b);
+
+ b <<= cnt;
+ invert_limb (bi, b);
+
+ cps[0] = bi;
+ cps[1] = cnt;
+
+ if (LIKELY (cnt != 0))
+ {
+ mp_limb_t B1modb = -b;
+ B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
+ ASSERT (B1modb <= b); /* NB: not fully reduced mod b */
+ cps[2] = B1modb >> cnt;
+ }
+ B2modb = - b * bi;
+ ASSERT (B2modb <= b); /* NB: equality iff b = B/2 */
+ cps[3] = B2modb;
+}
+
+mp_limb_t
+mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
+{
+ int cnt;
+ mp_limb_t bi, B1modb;
+ mp_limb_t r0, r1;
+ mp_limb_t r;
+
+ ASSERT (n >= 2); /* fix tuneup.c if this is changed */
+
+ r0 = ap[n-2];
+ r1 = ap[n-1];
+
+ if (n > 2)
+ {
+ mp_limb_t B2modb, B2mb;
+ mp_limb_t p0, p1;
+ mp_limb_t r2;
+ mp_size_t j;
+
+ B2modb = bmodb[3];
+ B2mb = B2modb - b;
+
+ umul_ppmm (p1, p0, r1, B2modb);
+ add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);
+
+ for (j = n-4; j >= 0; j--)
+ {
+ mp_limb_t cy;
+ /* mp_limb_t t = r0 + B2mb; */
+ umul_ppmm (p1, p0, r1, B2modb);
+
+ ADDC_LIMB (cy, r0, r0, r2 & B2modb);
+ /* Alternative, for cmov: if (cy) r0 = t; */
+ r0 -= (-cy) & b;
+ add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
+ }
+
+ r1 -= (r2 & b);
+ }
+
+ cnt = bmodb[1];
+
+ if (LIKELY (cnt != 0))
+ {
+ mp_limb_t t;
+ mp_limb_t B1modb = bmodb[2];
+
+ umul_ppmm (r1, t, r1, B1modb);
+ r0 += t;
+ r1 += (r0 < t);
+
+ /* Normalize */
+ r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
+ r0 <<= cnt;
+
+ /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows
+ that. */
+ }
+ else
+ {
+ mp_limb_t mask = - (r1 >= b);
+ r1 -= mask & b;
+ }
+
+ bi = bmodb[0];
+
+ udiv_rnnd_preinv (r, r1, r0, b, bi);
+ return r >> cnt;
+}
+#endif /* MOD_1_1P_METHOD == 2 */
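To make the invariant of the method-2 loop above concrete: with B2modb = B^2 mod b precomputed, the top limb of a three-limb value can be folded into the lower limbs without changing the residue mod b, which is what each umul_ppmm/add_mssaaaa pair does per iteration. The following standalone toy program, an illustration only with a 16-bit limb base and arbitrary values (not GMP code or GMP types), checks the underlying congruence:

/* Standalone toy check, not GMP code: with a toy analogue of the
   precomputed B2modb = B^2 mod b, folding the top limb of
   r1*B^2 + r0*B + u down to r1*B2modb leaves the residue mod b
   unchanged.  B = 2^16 so all products fit in uint64_t.  */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  const uint64_t B = UINT64_C(1) << 16;    /* toy limb base */
  const uint64_t b = 0xb365;               /* arbitrary toy divisor */
  const uint64_t B2modb = (B * B) % b;     /* toy B^2 mod b */
  uint64_t r1 = 0xbeef, r0 = 0x1234, u = 0xcafe;   /* arbitrary limbs */

  uint64_t full   = r1 * B * B + r0 * B + u;    /* r1 B^2 + r0 B + u */
  uint64_t folded = r1 * B2modb + r0 * B + u;   /* after folding r1  */

  assert (full % b == folded % b);
  printf ("residue mod b: 0x%04lx\n", (unsigned long) (full % b));
  return 0;
}

In the committed code the fold is done on full limbs, and the third result word r2 comes out of add_mssaaaa as a 0/-1 mask, so the corrections (r2 & B2modb, r2 & b) can be applied without branches.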
diff -r 4828d99fcfb3 -r e42801a3626b mpn/generic/rootrem.c
--- a/mpn/generic/rootrem.c Mon Feb 28 16:54:52 2011 +0100
+++ b/mpn/generic/rootrem.c Mon Feb 28 22:58:27 2011 +0100
@@ -79,14 +79,15 @@
mpn_rootrem (mp_ptr rootp, mp_ptr remp,
mp_srcptr up, mp_size_t un, mp_limb_t k)
{
+ mp_size_t m;
ASSERT (un > 0);
ASSERT (up[un - 1] != 0);
ASSERT (k > 1);
- if ((remp == NULL) && (un / k > 2))
- /* call mpn_rootrem recursively, padding {up,un} with k zero limbs,
- which will produce an approximate root with one more limb,
- so that in most cases we can conclude. */
+ m = (un - 1) / k; /* ceil(un/k) - 1 */
+ if (remp == NULL && m > 2)
+ /* Pad {up,un} with k zero limbs. This will produce an approximate root
+ with one more limb, allowing us to compute the exact integral result. */
{
mp_ptr sp, wp;
mp_size_t rn, sn, wn;
@@ -94,21 +95,21 @@
TMP_MARK;
wn = un + k;
wp = TMP_ALLOC_LIMBS (wn); /* will contain the padded input */
- sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */
+ sn = m + 2; /* ceil(un/k) + 1 */
sp = TMP_ALLOC_LIMBS (sn); /* approximate root of padded input */
MPN_COPY (wp + k, up, un);
MPN_ZERO (wp, k);
rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1);
- /* the approximate root S = {sp,sn} is either the correct root of
- {sp,sn}, or one too large. Thus unless the least significant limb
- of S is 0 or 1, we can deduce the root of {up,un} is S truncated by
- one limb. (In case sp[0]=1, we can deduce the root, but not decide
+ /* The approximate root S = {sp,sn} is either the correct root of
+ {sp,sn}, or 1 too large. Thus unless the least significant limb of
+ S is 0 or 1, we can deduce the root of {up,un} is S truncated by one
+ limb. (In case sp[0]=1, we can deduce the root, but not decide
whether it is exact or not.) */
MPN_COPY (rootp, sp + 1, sn - 1);
TMP_FREE;
return rn;
}
- else /* remp <> NULL */
+ else
{
return mpn_rootrem_internal (rootp, remp, up, un, k, 0);
}
@@ -395,7 +396,7 @@
ASSERT_ALWAYS (rn >= qn);
/* R = R - Q = floor(U/2^kk) - S^k */
- if ((i > 1) || (approx == 0))
+ if (i > 1 || approx == 0)
{
mpn_sub (rp, rp, rn, qp, qn);
MPN_NORMALIZE (rp, rn);
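One small arithmetic point in the rootrem.c hunk above: the new m = (un - 1) / k is annotated as ceil(un/k) - 1, which holds for all un >= 1 because ceil(un/k) = floor((un - 1 + k)/k) = floor((un - 1)/k) + 1. A throwaway check, purely illustrative and not part of the patch:

/* Illustration only, not part of the patch: exhaustively check
   floor((un-1)/k) == ceil(un/k) - 1 over a small range.  */
#include <assert.h>

int
main (void)
{
  unsigned long un, k;
  for (un = 1; un <= 2000; un++)
    for (k = 2; k <= 64; k++)
      assert ((un - 1) / k == (un + k - 1) / k - 1);
  return 0;
}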
diff -r 4828d99fcfb3 -r e42801a3626b mpn/x86_64/mod_1_1.asm
--- a/mpn/x86_64/mod_1_1.asm Mon Feb 28 16:54:52 2011 +0100
+++ b/mpn/x86_64/mod_1_1.asm Mon Feb 28 22:58:27 2011 +0100
@@ -48,10 +48,6 @@
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0
-C Currently needs b to not be preshifted, we actually have to undo shift done
-C by caller. Perhaps b shouldn't be passed at all, it should be in the pre
-C block where the cps function is free to store whatever is needed.
-
C The iteration is almost as follows,
C
C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
@@ -80,9 +76,6 @@
mov %rdx, b
mov %rcx, pre
- mov 8(pre), R32(%rcx)
- shr R8(%rcx), b
-
mov -8(ap, n, 8), %rax
cmp $3, n
jnc L(first)
@@ -128,7 +121,7 @@
test R32(%rcx), R32(%rcx)
jz L(normalized)
- C Unnormalized, use B1modb to reduce to size < B b
+ C Unnormalized, use B1modb to reduce to size < B (b+1)
mulq 16(pre)