[Gmp-commit] /var/hg/gmp: Add support for POWM_SEC_TABLE table.

Sun Nov 13 21:32:04 CET 2011

details:   /var/hg/gmp/rev/279f952ba4ae
changeset: 14438:279f952ba4ae
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Nov 13 21:31:57 2011 +0100
description:
Add support for POWM_SEC_TABLE table.

diffstat:

 mpn/generic/powm_sec.c |   14 ++++-
 tune/Makefile.am       |    2 +-
 tune/tuneup.c          |  135 ++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 147 insertions(+), 4 deletions(-)

diffs (210 lines):

diff -r 9069d170e53b -r 279f952ba4ae mpn/generic/powm_sec.c

--- a/mpn/generic/powm_sec.c	Sun Nov 13 20:24:47 2011 +0100
+++ b/mpn/generic/powm_sec.c	Sun Nov 13 21:31:57 2011 +0100
@@ -189,15 +189,27 @@
     }
 }
 
+#ifndef POWM_SEC_TABLE
+#if GMP_NUMB_BITS < 50
+#define POWM_SEC_TABLE  2,33,96,780,2741
+#else
+#define POWM_SEC_TABLE  2,130,524,2578
+#endif
+#endif
+
+#if TUNE_PROGRAM_BUILD
+extern int win_size (mp_bitcnt_t);
+#else
 static inline int
 win_size (mp_bitcnt_t eb)
 {
   int k;
-  static mp_bitcnt_t x[] = {0,4,27,100,325,1026,2905,7848,20457,51670,~(mp_bitcnt_t)0};
+  static mp_bitcnt_t x[] = {0,POWM_SEC_TABLE,~(mp_bitcnt_t)0};
   for (k = 1; eb > x[k]; k++)
     ;
   return k;
 }
+#endif
 
 /* Convert U to REDC form, U_r = B^n * U mod M */
 static void
diff -r 9069d170e53b -r 279f952ba4ae tune/Makefile.am
--- a/tune/Makefile.am	Sun Nov 13 20:24:47 2011 +0100
+++ b/tune/Makefile.am	Sun Nov 13 21:31:57 2011 +0100
@@ -132,7 +132,7 @@
   invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c		\
   get_str.c set_str.c matrix22_mul.c					\
   hgcd.c hgcd_appr.c hgcd_reduce.c					\
-  mul_n.c sqr.c								\
+  mul_n.c sqr.c powm_sec.c						\
   mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c	\
   mulmid.c mulmid_n.c toom42_mulmid.c					\
   nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c	\
diff -r 9069d170e53b -r 279f952ba4ae tune/tuneup.c
--- a/tune/tuneup.c	Sun Nov 13 20:24:47 2011 +0100
+++ b/tune/tuneup.c	Sun Nov 13 21:31:57 2011 +0100
@@ -192,7 +192,6 @@
 mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
 mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
 mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
-mp_size_t  powm_threshold               = MP_SIZE_T_MAX;
 mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
 mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
 mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
@@ -1801,6 +1800,134 @@
   one (&gcdext_dc_threshold, &param);
 }
 
+/* In tune_powm_sec we compute the table used by the win_size function.  The
+   cutoff points are in exponent bits, disregarding other operand sizes.  It is
+   not possible to use the one framework since it currently uses a granilarity
+   of full limbs.
+*/
+
+/* This win_size replaces the variant in the powm code, allowing us to
+   control k in the k-ary algorithms.  */
+int winsize;
+int
+win_size (mp_bitcnt_t eb)
+{
+  return winsize;
+}
+
+void
+tune_powm_sec (void)
+{
+  mp_size_t n;
+  int k, i;
+  mp_size_t itch;
+  mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
+  const int n_max = 3000 / GMP_NUMB_BITS;
+  const int n_measurements = 5;
+  mp_ptr rp, bp, ep, mp, tp;
+  double ttab[n_measurements], tk, tkp1;
+  TMP_DECL;
+  TMP_MARK;
+
+  possible_nbits_cutoff = 0;
+
+  k = 1;
+
+  winsize = 10;			/* the itch function needs this */
+  itch = mpn_powm_sec_itch (n_max, n_max, n_max);
+
+  rp = TMP_ALLOC_LIMBS (n_max);
+  bp = TMP_ALLOC_LIMBS (n_max);
+  ep = TMP_ALLOC_LIMBS (n_max);
+  mp = TMP_ALLOC_LIMBS (n_max);
+  tp = TMP_ALLOC_LIMBS (itch);
+
+  mpn_random (bp, n_max);
+  mpn_random (mp, n_max);
+  mp[0] |= 1;
+
+/* How about taking the M operand size into account?
+
+   An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
+   B = O(M)).
+
+   Using k-ary and no sliding window, the precomputation will need time
+   O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
+   O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
+
+   An operation R=powm_sec(B,E,N) will take time like powm.
+
+   Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
+   main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
+   O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
+   table reads, respectively.  */
+
+  printf ("#define POWM_SEC_TABLE  ");
+
+  for (nbits = 1; nbits <= n_max * GMP_NUMB_BITS; )
+    {
+      n = (nbits - 1) / GMP_NUMB_BITS + 1;
+
+      /* Generate E such that sliding-window for k and k+1 works equally
+	 well/poorly (but sliding is not used in powm_sec, of course). */
+      for (i = 0; i < n; i++)
+	ep[i] = ~CNST_LIMB(0);
+
+      /* Truncate E to be exactly nbits large.  */
+      if (nbits % GMP_NUMB_BITS != 0)
+	mpn_rshift (ep, ep, n, GMP_NUMB_BITS - nbits % GMP_NUMB_BITS);
+      ep[n - 1] |= CNST_LIMB(1) << (nbits - 1) % GMP_NUMB_BITS;
+
+      winsize = k;
+      for (i = 0; i < n_measurements; i++)
+	{
+	  speed_starttime ();
+	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
+	  ttab[i] = speed_endtime ();
+	}
+      tk = median (ttab, n_measurements);
+
+      winsize = k + 1;
+      speed_starttime ();
+      for (i = 0; i < n_measurements; i++)
+	{
+	  speed_starttime ();
+	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
+	  ttab[i] = speed_endtime ();
+	}
+      tkp1 = median (ttab, n_measurements);
+/*
+      printf ("testing: %ld, %d", nbits, k, ep[n-1]);
+      printf ("   %10.5f  %10.5f\n", tk, tkp1);
+*/
+      if (tkp1 < tk)
+	{
+	  if (possible_nbits_cutoff)
+	    {
+	      /* Two consecutive sizes indicate k increase, obey.  */
+	      if (k > 1)
+		printf (",");
+	      printf ("%ld", (long) possible_nbits_cutoff);
+	      k++;
+	      possible_nbits_cutoff = 0;
+	    }
+	  else
+	    {
+	      /* One measurement indicate k increase, save nbits for further
+		 consideration.  */
+	      possible_nbits_cutoff = nbits;
+	    }
+	}
+      else
+	possible_nbits_cutoff = 0;
+
+      nbits_next = nbits * 65 / 64;
+      nbits = nbits_next + (nbits_next == nbits);
+    }
+  printf ("\n");
+  TMP_FREE;
+}
+
 
 /* size_extra==1 reflects the fact that with high<divisor one division is
    always skipped.  Forcing high<divisor while testing ensures consistency
@@ -1896,7 +2023,6 @@
     {
       static struct param_t  param;
       double   t1, t2;
-      int      method;
 
       s.size = 10;
       s.r = randlimb_half ();
@@ -2575,6 +2701,11 @@
   tune_sqrmod_bnm1 ();
   printf("\n");
 
+#if 1
+  tune_powm_sec ();
+  printf("\n");
+#endif
+
   tune_fft_mul ();
   printf("\n");