[Gmp-commit] /home/hgfiles/gmp: Slightly faster CRT for {mul,sqr}mod_bnm1.
mercurial at gmplib.org
Sun Dec 20 23:59:45 CET 2009
details: /home/hgfiles/gmp/rev/f378a4363215
changeset: 13149:f378a4363215
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Sun Dec 20 23:59:38 2009 +0100
description:
Slightly faster CRT for {mul,sqr}mod_bnm1.
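
For reference, the new recombination can be verified directly.  With
xm == a*b mod (B^n - 1) and xp == a*b mod (B^n + 1), the value

  x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1) ]

satisfies both congruences: mod (B^n + 1) we have B^n == -1, so the
first term reduces to xp and the second vanishes; mod (B^n - 1) we have
B^n == 1, so x == -xp + (xp + xm) == xm.  Since B is even,
gcd(B^n - 1, B^n + 1) = 1, and the CRT gives x == a*b mod (B^2n - 1).
The halving is exact in the ring because 2 * (B^n/2) == B^n == 1
mod (B^n - 1); B^n/2 is a power of 2, so dividing by 2 amounts to a
one-bit right rotation.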
diffstat:
ChangeLog | 5 +++
mpn/generic/mulmod_bnm1.c | 77 ++++++++++++++++++++++++++++++++--------------
mpn/generic/sqrmod_bnm1.c | 77 ++++++++++++++++++++++++++++++++--------------
3 files changed, 111 insertions(+), 48 deletions(-)
diffs (222 lines):
diff -r 8933a257b0c7 -r f378a4363215 ChangeLog
--- a/ChangeLog Sun Dec 20 23:54:57 2009 +0100
+++ b/ChangeLog Sun Dec 20 23:59:38 2009 +0100
@@ -1,3 +1,8 @@
+2009-12-20 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * mpn/generic/mulmod_bnm1.c (mpn_mulmod_bnm1): New CRT.
+ * mpn/generic/sqrmod_bnm1.c (mpn_sqrmod_bnm1): Likewise.
+
2009-12-20 Torbjorn Granlund <tege at gmplib.org>
* Change all bit counts for bignums to use mp_bitcnt_t.
diff -r 8933a257b0c7 -r f378a4363215 mpn/generic/mulmod_bnm1.c
--- a/mpn/generic/mulmod_bnm1.c Sun Dec 20 23:54:57 2009 +0100
+++ b/mpn/generic/mulmod_bnm1.c Sun Dec 20 23:59:38 2009 +0100
@@ -27,6 +27,7 @@
#include "gmp.h"
#include "gmp-impl.h"
+#include "longlong.h"
/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
mod B^rn - 1, and values are semi-normalised; zero is represented
@@ -132,7 +133,7 @@
/* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
and crt together as
- x = xm + (B^n - 1) * [B^n/2 (xp - xm) mod (B^n+1)]
+ x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
*/
#define a0 ap
@@ -229,33 +230,61 @@
}
}
- /* xp = xm - xp mod (B^n + 1). Assumes normalised
- representation. Puts high bit in hi. */
- if (UNLIKELY (xp[n]))
- hi = mpn_add_1 (xp, rp, n, 1);
- else
- {
- cy = mpn_sub_n (xp, rp, xp, n);
- MPN_INCR_U (xp, n + 1 , cy);
- hi = xp[n];
- }
- /* Multiply by -B^n/2, using
+ /* Here the CRT recomposition begins.
- -B^n/2 * (2 x1 + x0) = x1 - B^n/2 x0 (mod (B^n + 1))
+ xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
+ Division by 2 is a bitwise rotation.
+
+ Assumes xp normalised mod (B^n+1).
+
+ The residue class [0] is represented by [B^n-1]; except when
+ both inputs are ZERO.
*/
- cy = mpn_rshift (rp+n, xp, n, 1);
- if (hi != cy)
- rp[2*n-1] |= GMP_NUMB_HIGHBIT;
- if (hi < cy)
- /* Underflow */
- hi = mpn_add_1 (rp+n, rp+n, n, 1);
- else
- hi = 0;
+#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
+#if HAVE_NATIVE_mpn_rsh1add_nc
+ cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
+ hi = cy << (GMP_NUMB_BITS - 1);
+ cy = 0;
+ /* the next add_ssaaaa will set cy = 1 only if rp[n-1] += hi overflows,
+ so a further increment will not overflow again. */
+#else /* ! _nc */
+ cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
+ hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
+ cy >>= 1;
+ /* cy = 1 only if xp[n] = 1, i.e. {xp,n} = ZERO; this implies that
+ the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
+#endif
+ add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
+#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
+#if HAVE_NATIVE_mpn_add_nc
+ cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
+#else /* ! _nc */
+ cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
+#endif
+ cy += (rp[0]&1);
+ mpn_rshift(rp, rp, n, 1);
+ ASSERT (cy <= 2);
+ hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
+ cy >>= 1;
+ /* We can have cy != 0 only if hi = 0... */
+ ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
+ rp[n-1] |= hi;
+ /* ... rp[n-1] + cy cannot overflow, so the following INCR is correct. */
+#endif
+ ASSERT (cy <= 1);
+ /* The next increment cannot overflow; see the previous comments about cy. */
+ ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
+ MPN_INCR_U(rp, n, cy);
- cy = mpn_sub_n (rp, rp, rp+n, n);
- cy = mpn_sub_1 (rp+n, rp+n, n, cy + hi);
- ASSERT (cy == hi);
+ /* Compute the highest half:
+ ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
+ */
+ cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
+ /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
+ DECR will affect _at most_ the lowest n limbs. */
+ MPN_DECR_U (rp, 2*n, cy);
+
#undef a0
#undef a1
#undef b0
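
To see the recombination in isolation, here is a toy C sketch
(illustrative only, not GMP code; a single power of two N = 2^16 stands
in for B^n):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  const uint64_t N = (uint64_t) 1 << 16;
  const uint64_t M = N * N - 1;          /* recombine mod N^2 - 1 */
  uint64_t t = 0x12345678abcdULL % M;    /* arbitrary test value */
  uint64_t xm = t % (N - 1);             /* plays the role of {rp,n} */
  uint64_t xp = t % (N + 1);             /* plays the role of {xp,n+1} */

  /* s = (xp + xm)/2 mod (N-1); exact, since 2*(N/2) = N == 1 mod (N-1).
     N-1 is odd, so an odd sum becomes even after adding the modulus.  */
  uint64_t s = (xp + xm) % (N - 1);
  s = (s & 1) ? (s + N - 1) >> 1 : s >> 1;

  /* x = -xp*N + (N+1)*s mod M; add M so the subtraction cannot wrap
     in unsigned arithmetic.  */
  uint64_t x = ((N + 1) * s % M + M - xp * N % M) % M;

  assert (x == t);
  return 0;
}

The odd branch of the halving adds the (odd) modulus before shifting;
the patch gets the same effect limb-wise as a one-bit right rotation,
via mpn_rsh1add_n where available and add + rshift otherwise.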
diff -r 8933a257b0c7 -r f378a4363215 mpn/generic/sqrmod_bnm1.c
--- a/mpn/generic/sqrmod_bnm1.c Sun Dec 20 23:54:57 2009 +0100
+++ b/mpn/generic/sqrmod_bnm1.c Sun Dec 20 23:59:38 2009 +0100
@@ -27,6 +27,7 @@
#include "gmp.h"
#include "gmp-impl.h"
+#include "longlong.h"
/* Input is {ap,rn}; output is {rp,rn}, computation is
mod B^rn - 1, and values are semi-normalised; zero is represented
@@ -118,7 +119,7 @@
/* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
and crt together as
- x = xm + (B^n - 1) * [B^n/2 (xp - xm) mod (B^n+1)]
+ x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
*/
#define a0 ap
@@ -184,33 +185,61 @@
}
}
- /* xp = xm - xp mod (B^n + 1). Assumes normalised
- representation. Puts high bit in hi. */
- if (UNLIKELY (xp[n]))
- hi = mpn_add_1 (xp, rp, n, 1);
- else
- {
- cy = mpn_sub_n (xp, rp, xp, n);
- MPN_INCR_U (xp, n + 1 , cy);
- hi = xp[n];
- }
- /* Multiply by -B^n/2, using
+ /* Here the CRT recomposition begins.
- -B^n/2 * (2 x1 + x0) = x1 - B^n/2 x0 (mod (B^n + 1))
+ xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
+ Division by 2 is a bitwise rotation.
+
+ Assumes xp normalised mod (B^n+1).
+
+ The residue class [0] is represented by [B^n-1]; except when
+ both inputs are ZERO.
*/
- cy = mpn_rshift (rp+n, xp, n, 1);
- if (hi != cy)
- rp[2*n-1] |= GMP_NUMB_HIGHBIT;
- if (hi < cy)
- /* Underflow */
- hi = mpn_add_1 (rp+n, rp+n, n, 1);
- else
- hi = 0;
+#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
+#if HAVE_NATIVE_mpn_rsh1add_nc
+ cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
+ hi = cy << (GMP_NUMB_BITS - 1);
+ cy = 0;
+ /* the next add_ssaaaa will set cy = 1 only if rp[n-1] += hi overflows,
+ so a further increment will not overflow again. */
+#else /* ! _nc */
+ cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
+ hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
+ cy >>= 1;
+ /* cy = 1 only if xp[n] = 1, i.e. {xp,n} = ZERO; this implies that
+ the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
+#endif
+ add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
+#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
+#if HAVE_NATIVE_mpn_add_nc
+ cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
+#else /* ! _nc */
+ cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
+#endif
+ cy += (rp[0]&1);
+ mpn_rshift(rp, rp, n, 1);
+ ASSERT (cy <= 2);
+ hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
+ cy >>= 1;
+ /* We can have cy != 0 only if hi = 0... */
+ ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
+ rp[n-1] |= hi;
+ /* ... rp[n-1] + cy cannot overflow, so the following INCR is correct. */
+#endif
+ ASSERT (cy <= 1);
+ /* The next increment cannot overflow; see the previous comments about cy. */
+ ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
+ MPN_INCR_U(rp, n, cy);
- cy = mpn_sub_n (rp, rp, rp+n, n);
- cy = mpn_sub_1 (rp+n, rp+n, n, cy + hi);
- ASSERT (cy == hi);
+ /* Compute the highest half:
+ ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
+ */
+ cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
+ /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
+ DECR will affect _at most_ the lowest n limbs. */
+ MPN_DECR_U (rp, 2*n, cy);
+
#undef a0
#undef a1
#undef xp
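
The rotation claim itself can be checked exhaustively on a single
small word (again illustrative only, not GMP code):

#include <assert.h>
#include <stdint.h>

/* (v/2) mod (2^k - 1) as a one-bit right rotation: the shifted-out low
   bit re-enters at the top.  The patch does this limb-wise, OR-ing the
   bit into the high end of rp[n-1] as `hi'.  */
static uint32_t half_mod (uint32_t v, unsigned k)
{
  return (v >> 1) | ((v & 1) << (k - 1));
}

int main (void)
{
  const unsigned k = 16;
  const uint32_t m = ((uint32_t) 1 << k) - 1;
  for (uint32_t v = 0; v < m; v++)        /* all residues mod 2^k - 1 */
    assert (2 * half_mod (v, k) % m == v);
  return 0;
}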