[Gmp-commit] /var/hg/gmp: 3 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Fri Aug 16 12:38:24 UTC 2019


details:   /var/hg/gmp/rev/042ea5bf2ed1
changeset: 17818:042ea5bf2ed1
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Fri Aug 16 14:28:42 2019 +0200
description:
mpn/generic/brootinv.c: Shorten computations, using even exponent.

details:   /var/hg/gmp/rev/d863c8b31626
changeset: 17819:d863c8b31626
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Fri Aug 16 14:29:24 2019 +0200
description:
mpn/generic/powlo.c: Avoid copies with a flipflop.

details:   /var/hg/gmp/rev/a905169c7cdb
changeset: 17820:a905169c7cdb
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Fri Aug 16 14:38:08 2019 +0200
description:
ChangeLog

diffstat:

 ChangeLog              |   5 +++
 mpn/generic/brootinv.c |  76 ++++++++++++++++++++++++++++----------------
 mpn/generic/powlo.c    |  83 +++++++++++++++++++++++++++++--------------------
 3 files changed, 102 insertions(+), 62 deletions(-)

diffs (truncated from 306 to 300 lines):

diff -r 31d5fb37995b -r a905169c7cdb ChangeLog
--- a/ChangeLog	Fri Aug 16 08:21:42 2019 +0200
+++ b/ChangeLog	Fri Aug 16 14:38:08 2019 +0200
@@ -1,3 +1,8 @@
+2019-08-16 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+	* mpn/generic/brootinv.c: Shorten computations, using even exponent.
+	* mpn/generic/powlo.c: Avoid copies with a flipflop.
+
 2019-08-16  Niels Möller  <nisse at lysator.liu.se>
 
 	Speed support for gcd_22. Calls mpn_gcd_22(al, al, bl, bl), so
diff -r 31d5fb37995b -r a905169c7cdb mpn/generic/brootinv.c
--- a/mpn/generic/brootinv.c	Fri Aug 16 08:21:42 2019 +0200
+++ b/mpn/generic/brootinv.c	Fri Aug 16 14:38:08 2019 +0200
@@ -2,7 +2,7 @@
 
    Contributed to the GNU project by Martin Boij (as part of perfpow.c).
 
-Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc.
+Copyright 2009, 2010, 2012, 2013, 2018 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -32,16 +32,21 @@
 
 #include "gmp-impl.h"
 
-/* Computes a^e (mod B). Uses right-to-left binary algorithm, since
+/* Computes a^2e (mod B). Uses right-to-left binary algorithm, since
    typical use will have e small. */
 static mp_limb_t
-powlimb (mp_limb_t a, mp_limb_t e)
+powsquaredlimb (mp_limb_t a, mp_limb_t e)
 {
   mp_limb_t r;
 
-  for (r = 1; e > 0; e >>= 1, a *= a)
+  r = 1;
+  /* if (LIKELY (e != 0)) */
+  do {
+    a *= a;
     if (e & 1)
       r *= a;
+    e >>= 1;
+  } while (e != 0);
 
   return r;
 }
@@ -66,7 +71,9 @@
 
      (4) Use a small table to get starting value.
 
-   Scratch need: 5*bn, where bn = ceil (bnb / GMP_NUMB_BITS).
+   Scratch need: bn + (((bn + 1) >> 1) + 1) + scratch for mpn_powlo
+   Currently mpn_powlo requires 3*bn
+   so that 5*bn is surely enough, where bn = ceil (bnb / GMP_NUMB_BITS).
 */
 
 void
@@ -75,36 +82,36 @@
   mp_ptr tp2, tp3;
   mp_limb_t kinv, k2, r0, y0;
   mp_size_t order[GMP_LIMB_BITS + 1];
-  int i, d;
+  int d;
 
   ASSERT (bn > 0);
   ASSERT ((k & 1) != 0);
 
   tp2 = tp + bn;
-  tp3 = tp + 2 * bn;
-  k2 = k + 1;
+  tp3 = tp + bn + ((bn + 3) >> 1);
+  k2 = (k >> 1) + 1; /* (k + 1) / 2 , but avoid k+1 overflow */
 
   binvert_limb (kinv, k);
 
   /* 4-bit initial approximation:
 
    y%16 | 1  3  5  7  9 11 13 15,
-    k%4 +-------------------------+k2%4
-     1  | 1 11 13  7  9  3  5 15  |  2
+    k%4 +-------------------------+k2%2
+     1  | 1 11 13  7  9  3  5 15  |  1
      3  | 1  3  5  7  9 11 13 15  |  0
 
   */
   y0 = yp[0];
 
-  r0 = y0 ^ (((y0 << 1) ^ (y0 << 2)) & (k2 << 2) & 8);		/* 4 bits */
-  r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2 & 0x7f));		/* 8 bits */
-  r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2 & 0x7fff));	/* 16 bits */
+  r0 = y0 ^ (((y0 << 1) ^ (y0 << 2)) & (k2 << 3) & 8);			/* 4 bits */
+  r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2 & 0x3f));	/* 8 bits */
+  r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2 & 0x3fff));	/* 16 bits */
 #if GMP_NUMB_BITS > 16
   {
     unsigned prec = 16;
     do
       {
-	r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2));
+	r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2));
 	prec *= 2;
       }
     while (prec < GMP_NUMB_BITS);
@@ -115,25 +122,38 @@
   if (bn == 1)
     return;
 
-  /* This initialization doesn't matter for the result (any garbage is
-     cancelled in the iteration), but proper initialization makes
-     valgrind happier. */
-  MPN_ZERO (rp+1, bn-1);
-
   d = 0;
-  for (; bn > 1; bn = (bn + 1) >> 1)
+  for (; bn != 2; bn = (bn + 1) >> 1)
     order[d++] = bn;
 
-  for (i = d - 1; i >= 0; i--)
+  order[d] = 2;
+  bn = 1;
+
+  do
     {
-      bn = order[i];
+      mpn_sqr (tp, rp, bn); /* Result may overlap tp2 */
+      tp2[bn] = mpn_mul_1 (tp2, rp, bn, k2 << 1);
 
-      mpn_mul_1 (tp, rp, bn, k2);
+      bn = order[d];
+
+      mpn_powlo (rp, tp, &k2, 1, bn, tp3);
+      mpn_mullo_n (tp, yp, rp, bn);
 
-      mpn_powlo (tp2, rp, &k2, 1, bn, tp3);
-      mpn_mullo_n (rp, yp, tp2, bn);
-
-      mpn_sub_n (tp2, tp, rp, bn);
-      mpn_pi1_bdiv_q_1 (rp, tp2, bn, k, kinv, 0);
+      /* mpn_sub (tp, tp2, ((bn + 1) >> 1) + 1, tp, bn); */
+      /* The function above is not handled, ((bn + 1) >> 1) + 1 <= bn*/
+      {
+	mp_size_t pbn = (bn + 3) >> 1; /* Size of tp2 */
+	int borrow;
+	borrow = mpn_sub_n (tp, tp2, tp, pbn) != 0;
+	if (bn > pbn) /* 3 < bn */
+	  {
+	    if (borrow)
+	      mpn_com (tp + pbn, tp + pbn, bn - pbn);
+	    else
+	      mpn_neg (tp + pbn, tp + pbn, bn - pbn);
+	  }
+      }
+      mpn_pi1_bdiv_q_1 (rp, tp, bn, k, kinv, 0);
     }
+  while (--d >= 0);
 }
diff -r 31d5fb37995b -r a905169c7cdb mpn/generic/powlo.c
--- a/mpn/generic/powlo.c	Fri Aug 16 08:21:42 2019 +0200
+++ b/mpn/generic/powlo.c	Fri Aug 16 14:38:08 2019 +0200
@@ -1,6 +1,6 @@
 /* mpn_powlo -- Compute R = U^E mod B^n, where B is the limb base.
 
-Copyright 2007-2009, 2012, 2015, 2016 Free Software Foundation, Inc.
+Copyright 2007-2009, 2012, 2015, 2016, 2018 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -66,9 +66,9 @@
   unsigned k;
   static mp_bitcnt_t x[] = {7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0};
   ASSERT (eb > 1);
-  for (k = 0; eb > x[k]; ++k)
+  for (k = 0; eb > x[k++];)
     ;
-  return k + 1;
+  return k;
 }
 
 /* rp[n-1..0] = bp[n-1..0] ^ ep[en-1..0] mod B^n, B is the limb base.
@@ -85,8 +85,9 @@
   mp_bitcnt_t ebi;
   unsigned windowsize, this_windowsize;
   mp_limb_t expbits;
-  mp_limb_t *pp, *this_pp, *last_pp;
+  mp_limb_t *pp;
   long i;
+  int flipflop;
   TMP_DECL;
 
   ASSERT (en > 1 || (en == 1 && ep[0] > 1));
@@ -96,41 +97,56 @@
   MPN_SIZEINBASE_2EXP(ebi, ep, en, 1);
 
   windowsize = win_size (ebi);
-  ASSERT (windowsize < ebi);
+  if (windowsize > 1)
+    {
+      mp_limb_t *this_pp, *last_pp;
+      ASSERT (windowsize < ebi);
 
-  pp = TMP_ALLOC_LIMBS ((n << (windowsize - 1)));
+      pp = TMP_ALLOC_LIMBS ((n << (windowsize - 1)));
 
-  this_pp = pp;
+      this_pp = pp;
 
-  MPN_COPY (this_pp, bp, n);
+      MPN_COPY (this_pp, bp, n);
+
+      /* Store b^2 in tp.  */
+      mpn_sqrlo (tp, bp, n);
 
-  /* Store b^2 in tp.  */
-  mpn_sqrlo (tp, bp, n);
+      /* Precompute odd powers of b and put them in the temporary area at pp.  */
+      i = (1 << (windowsize - 1)) - 1;
+      do
+	{
+	  last_pp = this_pp;
+	  this_pp += n;
+	  mpn_mullo_n (this_pp, last_pp, tp, n);
+	} while (--i != 0);
+
+      expbits = getbits (ep, ebi, windowsize);
 
-  /* Precompute odd powers of b and put them in the temporary area at pp.  */
-  for (i = (1 << (windowsize - 1)) - 1; i > 0; i--)
+      /* THINK: Should we initialise the case expbits % 4 == 0 with a mullo? */
+      count_trailing_zeros (cnt, expbits);
+      ebi -= windowsize;
+      ebi += cnt;
+      expbits >>= cnt;
+
+      MPN_COPY (rp, pp + n * (expbits >> 1), n);
+    }
+  else
     {
-      last_pp = this_pp;
-      this_pp += n;
-      mpn_mullo_n (this_pp, last_pp, tp, n);
+      pp = tp + n;
+      MPN_COPY (pp, bp, n);
+      MPN_COPY (rp, bp, n);
+      --ebi;
     }
 
-  expbits = getbits (ep, ebi, windowsize);
-
-  /* FIXME: for even expbits, we can init with a mullo. */
-  count_trailing_zeros (cnt, expbits);
-  ebi -= windowsize;
-  ebi += cnt;
-  expbits >>= cnt;
-
-  MPN_COPY (rp, pp + n * (expbits >> 1), n);
+  flipflop = 0;
 
   do
     {
       while (getbit (ep, ebi) == 0)
 	{
 	  mpn_sqrlo (tp, rp, n);
-	  MPN_COPY (rp, tp, n);
+	  MP_PTR_SWAP (rp, tp);
+	  flipflop = ! flipflop;
 	  if (--ebi == 0)
 	    goto done;
 	}
@@ -139,14 +155,8 @@
 	 bits <= windowsize, and such that the least significant bit is 1.  */
 
       expbits = getbits (ep, ebi, windowsize);
-      this_windowsize = windowsize;
-      if (ebi < windowsize)
-	{
-	  this_windowsize -= windowsize - ebi;
-	  ebi = 0;
-	}
-      else
-	ebi -= windowsize;
+      this_windowsize = MIN (windowsize, ebi);
+      ebi -= this_windowsize;
 
       count_trailing_zeros (cnt, expbits);
       this_windowsize -= cnt;
@@ -163,11 +173,16 @@
       if (this_windowsize != 0)
 	mpn_sqrlo (tp, rp, n);
       else
-	MPN_COPY (tp, rp, n);
+	{
+	  MP_PTR_SWAP (rp, tp);
+	  flipflop = ! flipflop;
+	}
 
       mpn_mullo_n (rp, tp, pp + n * (expbits >> 1), n);
     } while (ebi != 0);


More information about the gmp-commit mailing list