[Gmp-commit] /home/hgfiles/gmp: mpn_mulmod_bnm1 changes, to reduce allocation...
mercurial at gmplib.org
mercurial at gmplib.org
Thu Jan 21 11:26:04 CET 2010
details: /home/hgfiles/gmp/rev/85730915e18f
changeset: 13387:85730915e18f
user: Niels M?ller <nisse at lysator.liu.se>
date: Thu Jan 21 11:19:52 2010 +0100
description:
mpn_mulmod_bnm1 changes, to reduce allocation in mpn_nussbaumer_mul.
diffstat:
ChangeLog | 14 +++++++++++++
mpn/generic/mulmod_bnm1.c | 46 +++++++++++++++++++++++++++++++++++++------
mpn/generic/nussbaumer_mul.c | 9 +++----
mpn/generic/sqrmod_bnm1.c | 34 +++++++++++++++++++++++++------
tests/mpn/t-mulmod_bnm1.c | 35 +++++++++++++++++----------------
5 files changed, 102 insertions(+), 36 deletions(-)
diffs (truncated from 311 to 300 lines):
diff -r 630c72a64734 -r 85730915e18f ChangeLog
--- a/ChangeLog Wed Jan 20 09:21:25 2010 +0100
+++ b/ChangeLog Thu Jan 21 11:19:52 2010 +0100
@@ -1,3 +1,17 @@
+2010-01-21 Niels Möller <nisse at lysator.liu.se>
+
+ * mpn/generic/nussbaumer_mul.c (mpn_nussbaumer_mul): Take
+ advantage of new mpn_mulmod_bnm1 interface, to reduce allocation.
+
+ * tests/mpn/t-mulmod_bnm1.c (ref_mulmod_bnm1, main): Adapted to
+ mpn_mulmod_bnm1 interface change.
+
+ * mpn/generic/mulmod_bnm1.c (mpn_mulmod_bnm1): Interface change,
+ in case an + bn < rn, only write an + bn output limbs. New input
+ requirement, an + bn > rn/2.
+ * mpn/generic/sqrmod_bnm1.c (mpn_sqrmod_bnm1): Corresponding
+ changes.
+
2010-01-19 Torbjorn Granlund <tege at gmplib.org>
* tune/tuneup.c (fftmes): Round up initial n according to initial k.
diff -r 630c72a64734 -r 85730915e18f mpn/generic/mulmod_bnm1.c
--- a/mpn/generic/mulmod_bnm1.c Wed Jan 20 09:21:25 2010 +0100
+++ b/mpn/generic/mulmod_bnm1.c Thu Jan 21 11:19:52 2010 +0100
@@ -70,7 +70,7 @@
}
-/* Computes {rp,rn} <- {ap,an}*{bp,bn} Mod(B^rn-1)
+/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
*
* The result is expected to be ZERO if and only if one of the operand
* already is. Otherwise the class [0] Mod(B^rn-1) is represented by
@@ -81,7 +81,7 @@
* compute the full product with an+bn <= rn, because this condition
* implies (B^an-1)(B^bn-1) < (B^rn-1) .
*
- * Requires both 0 < bn <= an <= rn
+ * Requires 0 < bn <= an <= rn and an + bn > rn/2
* Scratch need: rn + 2 + (need for recursive call OR rn + 2). This gives
*
* S(n) <= rn + 2 + MAX (rn + 2, S(n/2)) <= 2rn + 2 log2 rn + 2
@@ -100,7 +100,6 @@
if (UNLIKELY (an + bn <= rn))
{
mpn_mul (rp, ap, an, bp, bn);
- MPN_ZERO (rp + an + bn, rn - (an + bn));
}
else
{
@@ -121,6 +120,14 @@
n = rn >> 1;
+ /* We need at least an + bn >= n, to be able to fit one of the
+ recursive products at rp. Requiring strict inequality makes
+ the coded slightly simpler. If desired, we could avoid this
+ restriction by initially halving rn as long as rn is even and
+ an + bn <= rn/2. */
+
+ ASSERT (an + bn > n);
+
/* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
and crt together as
@@ -132,6 +139,10 @@
#define b0 bp
#define b1 (bp + n)
+ /* FIXME: See if the need for temporary storage can be reduced
+ * in case an <= n or bn <= n. This matters for calls from
+ * mpn_nussbaumer_mul, since then bn <= n always and an <= n for
+ * balanced products. */
#define xp tp /* 2n + 2 */
/* am1 maybe in {xp, n} */
/* bm1 maybe in {xp + n, n} */
@@ -275,11 +286,32 @@
/* Compute the highest half:
([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
*/
- cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
- /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
- DECR will affect _at most_ the lowest n limbs. */
- MPN_DECR_U (rp, 2*n, cy);
+ if (UNLIKELY (an + bn < rn))
+ {
+ /* Note that in this case, the only way the result can equal
+ zero mod B^{rn} - 1 is if one of the inputs is zero, and
+ then the output of both the recursive calls and this CRT
+ reconstruction is zero, not B^{rn} - 1. Which is good,
+ since the latter representation doesn't fit in the output
+ area.*/
+ cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);
+ /* FIXME: This subtraction of the high parts is not really
+ necessary, we do it to get the carry out, and for sanity
+ checking. */
+ cy = xp[n] + mpn_sub_nc (so, rp + an + bn - n, xp + an + bn - n,
+ rn - (an + bn), cy);
+ ASSERT (an + bn == rn - 1 || mpn_zero_p (so+1, rn - 1 - (an + bn)));
+ cy = mpn_sub_1 (rp, rp, an + bn, cy);
+ ASSERT (cy == so[0]);
+ }
+ else
+ {
+ cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
+ /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
+ DECR will affect _at most_ the lowest n limbs. */
+ MPN_DECR_U (rp, 2*n, cy);
+ }
#undef a0
#undef a1
#undef b0
diff -r 630c72a64734 -r 85730915e18f mpn/generic/nussbaumer_mul.c
--- a/mpn/generic/nussbaumer_mul.c Wed Jan 20 09:21:25 2010 +0100
+++ b/mpn/generic/nussbaumer_mul.c Thu Jan 21 11:19:52 2010 +0100
@@ -35,7 +35,7 @@
mp_srcptr bp, mp_size_t bn)
{
mp_size_t rn;
- mp_ptr rp, tp;
+ mp_ptr tp;
TMP_DECL;
ASSERT (an >= bn);
@@ -44,13 +44,12 @@
rn = mpn_mulmod_bnm1_next_size (an + bn);
TMP_MARK;
- TMP_ALLOC_LIMBS_2(rp, rn, tp, mpn_mulmod_bnm1_itch (rn));
+ tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (rn));
if ((ap == bp) && (an == bn))
- mpn_sqrmod_bnm1 (rp, rn, ap, an, tp);
+ mpn_sqrmod_bnm1 (pp, rn, ap, an, tp);
else
- mpn_mulmod_bnm1 (rp, rn, ap, an, bp, bn, tp);
+ mpn_mulmod_bnm1 (pp, rn, ap, an, bp, bn, tp);
- MPN_COPY (pp, rp, an + bn);
TMP_FREE;
}
diff -r 630c72a64734 -r 85730915e18f mpn/generic/sqrmod_bnm1.c
--- a/mpn/generic/sqrmod_bnm1.c Wed Jan 20 09:21:25 2010 +0100
+++ b/mpn/generic/sqrmod_bnm1.c Thu Jan 21 11:19:52 2010 +0100
@@ -68,7 +68,7 @@
}
-/* Computes {rp,rn} <- {ap,an}^2 Mod(B^rn-1)
+/* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1)
*
* The result is expected to be ZERO if and only if the operand
* already is. Otherwise the class [0] Mod(B^rn-1) is represented by
@@ -77,7 +77,7 @@
* compute the full square with an <= 2*rn, because this condition
* implies (B^an-1)^2 < (B^rn-1) .
*
- * Requires 0 < an <= rn
+ * Requires rn/4 < an <= rn
* Scratch need: rn + 2 + (need for recursive call OR rn + 2). This gives
*
* S(n) <= rn + 2 + MAX (rn + 2, S(n/2)) <= 2rn + 2 log2 rn + 2
@@ -95,7 +95,6 @@
if (UNLIKELY (2*an <= rn))
{
mpn_sqr (rp, ap, an);
- MPN_ZERO (rp + 2*an, rn - 2*an);
}
else
{
@@ -116,6 +115,8 @@
n = rn >> 1;
+ ASSERT (2*an > n);
+
/* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
and crt together as
@@ -240,11 +241,30 @@
/* Compute the highest half:
([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
*/
- cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
- /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
- DECR will affect _at most_ the lowest n limbs. */
- MPN_DECR_U (rp, 2*n, cy);
+ if (UNLIKELY (2*an < rn))
+ {
+ /* Note that in this case, the only way the result can equal
+ zero mod B^{rn} - 1 is if the input is zero, and
+ then the output of both the recursive calls and this CRT
+ reconstruction is zero, not B^{rn} - 1. */
+ cy = mpn_sub_n (rp + n, rp, xp, 2*an - n);
+ /* FIXME: This subtraction of the high parts is not really
+ necessary, we do it to get the carry out, and for sanity
+ checking. */
+ cy = xp[n] + mpn_sub_nc (so, rp + 2*an - n, xp + 2*an - n,
+ rn - 2*an, cy);
+ ASSERT (mpn_zero_p (so+1, rn - 1 - 2*an));
+ cy = mpn_sub_1 (rp, rp, 2*an, cy);
+ ASSERT (cy == so[0]);
+ }
+ else
+ {
+ cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
+ /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
+ DECR will affect _at most_ the lowest n limbs. */
+ MPN_DECR_U (rp, 2*n, cy);
+ }
#undef a0
#undef a1
#undef xp
diff -r 630c72a64734 -r 85730915e18f tests/mpn/t-mulmod_bnm1.c
--- a/tests/mpn/t-mulmod_bnm1.c Wed Jan 20 09:21:25 2010 +0100
+++ b/tests/mpn/t-mulmod_bnm1.c Thu Jan 21 11:19:52 2010 +0100
@@ -62,9 +62,7 @@
else
refmpn_mul (rp, bp, bn, ap, an);
an += bn;
- if( UNLIKELY(an <= rn) )
- MPN_ZERO (rp + an, rn - an);
- else {
+ if (an > rn) {
cy = mpn_add (rp, rp, rn, rp + rn, an - rn);
/* If cy == 1, then the value of rp is at most B^rn - 2, so there can
* be no overflow when adding in the carry. */
@@ -114,7 +112,7 @@
{
unsigned size_min;
unsigned size_range;
- mp_size_t an,bn,n;
+ mp_size_t an,bn,rn,n;
mp_size_t itch;
mp_limb_t p_before, p_after, s_before, s_after;
@@ -129,17 +127,19 @@
+ gmp_urandomm_ui (rands, (1L << size_range) + 1 - MIN_N);
n = mpn_mulmod_bnm1_next_size (n);
- if (test & 1) {
+ if ( (test & 1) || n == 1) {
/* Half of the tests are done with the main scenario in mind:
both an and bn >= rn/2 */
an = ((n+1) >> 1) + gmp_urandomm_ui (rands, (n+1) >> 1);
bn = ((n+1) >> 1) + gmp_urandomm_ui (rands, (n+1) >> 1);
} else {
/* Second half of the tests are done using mulmod to compute a
- full product and an >= bn > 0; recursion make it eventually
- fall in the case above. */
- an = ((n+3) >> 2) + gmp_urandomm_ui (rands, n - (n >> 2));
- bn = 1 + ((an == 1) ? 0 : gmp_urandomm_ui (rands, an - 1));
+ full product with n/2 < an+bn <= n. */
+ an = 1 + gmp_urandomm_ui (rands, n - 1);
+ if (an >= n/2)
+ bn = 1 + gmp_urandomm_ui (rands, n - an);
+ else
+ bn = n/2 + 1 - an + gmp_urandomm_ui (rands, (n+1)/2);
}
/* Make sure an >= bn */
@@ -165,9 +165,10 @@
/* We don't propagate carry, this means that the desired condition
is not triggered all the times. A few times are enough anyway. */
}
- mpn_random2 (pp-1, n + 2);
+ rn = MIN(n, an + bn);
+ mpn_random2 (pp-1, rn + 2);
p_before = pp[-1];
- p_after = pp[n];
+ p_after = pp[rn];
itch = mpn_mulmod_bnm1_itch (n);
ASSERT_ALWAYS (itch <= mpn_mulmod_bnm1_itch (MAX_N));
@@ -177,9 +178,9 @@
mpn_mulmod_bnm1 ( pp, n, ap, an, bp, bn, scratch);
ref_mulmod_bnm1 (refp, n, ap, an, bp, bn);
- if (pp[-1] != p_before || pp[n] != p_after
+ if (pp[-1] != p_before || pp[rn] != p_after
|| scratch[-1] != s_before || scratch[itch] != s_after
- || mpn_cmp (refp, pp, n) != 0)
+ || mpn_cmp (refp, pp, rn) != 0)
{
printf ("ERROR in test %d, an = %d, bn = %d, n = %d\n",
test, (int) an, (int) bn, (int) n);
@@ -188,9 +189,9 @@
printf ("before pp:"); mpn_dump (pp -1, 1);
printf ("keep: "); mpn_dump (&p_before, 1);
}
- if (pp[n] != p_after)
+ if (pp[rn] != p_after)
{
- printf ("after pp:"); mpn_dump (pp + n, 1);
+ printf ("after pp:"); mpn_dump (pp + rn, 1);
printf ("keep: "); mpn_dump (&p_after, 1);
}
if (scratch[-1] != s_before)
More information about the gmp-commit
mailing list