[Gmp-commit] /home/hgfiles/gmp: New Jacobi algorithm, based on Lehmer's algor...

Mon Apr 19 21:42:07 CEST 2010

details:   /home/hgfiles/gmp/rev/efcac1ec4a81
changeset: 13537:efcac1ec4a81
user:      Niels M?ller <nisse at lysator.liu.se>
date:      Mon Apr 19 21:16:09 2010 +0200
description:
New Jacobi algorithm, based on Lehmer's algorithm.

diffstat:

 configure.in                |    2 +-
 gmp-impl.h                  |   68 +++
 mpn/Makefile.am             |    2 +-
 mpn/generic/jacbase.c       |   12 +-
 mpn/generic/jacobi_lehmer.c |  927 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 1003 insertions(+), 8 deletions(-)

diffs (truncated from 1094 to 300 lines):

diff -r 08ef31c0390d -r efcac1ec4a81 configure.in

--- a/configure.in	Wed Apr 14 21:49:00 2010 +0200
+++ b/configure.in	Mon Apr 19 21:16:09 2010 +0200
@@ -2515,7 +2515,7 @@
   perfsqr perfpow							   \
   gcd_1 gcd gcdext_1 gcdext gcd_lehmer gcd_subdiv_step			   \
   gcdext_lehmer gcdext_subdiv_step					   \
-  div_q tdiv_qr jacbase get_d						   \
+  div_q tdiv_qr jacbase jacobi_lehmer get_d				   \
   matrix22_mul matrix22_mul1_inverse_vector				   \
   hgcd2 hgcd mullo_n mullo_basecase					   \
   toom22_mul toom32_mul toom42_mul toom52_mul toom62_mul		   \
diff -r 08ef31c0390d -r efcac1ec4a81 gmp-impl.h
--- a/gmp-impl.h	Wed Apr 14 21:49:00 2010 +0200
+++ b/gmp-impl.h	Mon Apr 19 21:16:09 2010 +0200
@@ -876,6 +876,9 @@
 #define mpn_jacobi_base __MPN(jacobi_base)
 __GMP_DECLSPEC int mpn_jacobi_base __GMP_PROTO ((mp_limb_t, mp_limb_t, int)) ATTRIBUTE_CONST;
 
+#define mpn_jacobi_lehmer __MPN(jacobi_lehmer)
+__GMP_DECLSPEC int mpn_jacobi_lehmer __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, unsigned, mp_ptr));
+
 #define mpn_mod_1c __MPN(mod_1c)
 __GMP_DECLSPEC mp_limb_t mpn_mod_1c __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t)) __GMP_ATTRIBUTE_PURE;
 
@@ -3570,6 +3573,7 @@
 #endif
 
 
+/* FIXME: Macros for old jacobi code, review what's still needed. */
 
 /* BIT1 means a result value in bit 1 (second least significant bit), with a
    zero bit representing +1 and a one bit representing -1.  Bits other than
@@ -3715,6 +3719,70 @@
       }                                                                    \
   } while (0)
 
+/* For new jacobi code. */
+#define jacobi_table __gmp_jacobi_table
+__GMP_DECLSPEC extern const unsigned char jacobi_table[208];
+
+/* Bit layout for the initial state. b must be odd.
+
+      3  2  1 0
+   +--+--+--+--+
+   |a1|a0|b1| s|
+   +--+--+--+--+
+
+ */
+static inline unsigned
+mpn_jacobi_init (unsigned a, unsigned b, unsigned s)
+{
+  ASSERT (b & 1);
+  ASSERT (s <= 1);
+  return ((a & 3) << 2) + (b & 2) + s;
+}
+
+static inline int
+mpn_jacobi_finish (unsigned bits)
+{
+  /* (a, b) = (1,0) or (0,1) */
+  ASSERT ( (bits & 14) == 0);
+
+  return 1-2*(bits & 1);
+}
+
+static inline unsigned
+mpn_jacobi_update (unsigned bits, unsigned denominator, unsigned q)
+{
+  /* FIXME: Could halve table size by not including the e bit in the
+   * index, and instead xor when updating. Then the lookup would be
+   * like
+   *
+   *   bits ^= table[((bits & 30) << 2) + (denominator << 2) + q];
+   */
+
+  ASSERT (bits < 26);
+  ASSERT (denominator < 2);
+  ASSERT (q < 4);
+
+  /* For almost all calls, denominator is constant and quite often q
+     is constant too. So use addition rather than or, so the compiler
+     can put the constant part can into the offset of an indexed
+     addressing instruction.
+
+     With constant denominator, the below table lookup is compiled to
+
+       C Constant q = 1, constant denominator = 1
+       movzbl table+5(%eax,8), %eax       
+
+     or
+
+       C q in %edx, constant denominator = 1
+       movzbl table+4(%edx,%eax,8), %eax
+
+     One could maintain the state preshifted 3 bits, to save a shift
+     here, but at least on x86, that's no real saving.
+  */
+  return bits = jacobi_table[(bits << 3) + (denominator << 2) + q];  
+}
+
 /* Matrix multiplication */
 #define   mpn_matrix22_mul __MPN(matrix22_mul)
 __GMP_DECLSPEC void      mpn_matrix22_mul __GMP_PROTO ((mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_srcptr, mp_srcptr, mp_srcptr, mp_size_t, mp_ptr));
diff -r 08ef31c0390d -r efcac1ec4a81 mpn/Makefile.am
--- a/mpn/Makefile.am	Wed Apr 14 21:49:00 2010 +0200
+++ b/mpn/Makefile.am	Mon Apr 19 21:16:09 2010 +0200
@@ -44,7 +44,7 @@
   dump.c fib2_ui.c gcd.c						    \
   gcd_1.c gcdext.c get_d.c get_str.c					    \
   hamdist.c hgcd2.c hgcd.c invert_limb.c				    \
-  ior_n.c iorn_n.c jacbase.c lshift.c					    \
+  ior_n.c iorn_n.c jacbase.c jacobi_lehmer.c lshift.c			    \
   matrix22_mul.c matrix22_mul1_inverse_vector.c mod_1.c mod_34lsub1.c mode1o.c  \
   mod_1_1.c mod_1_2.c mod_1_3.c mod_1_4.c				    \
   mul.c mul_1.c mul_2.c mul_3.c mul_4.c mul_fft.c mul_n.c mul_basecase.c    \
diff -r 08ef31c0390d -r efcac1ec4a81 mpn/generic/jacbase.c
--- a/mpn/generic/jacbase.c	Wed Apr 14 21:49:00 2010 +0200
+++ b/mpn/generic/jacbase.c	Mon Apr 19 21:16:09 2010 +0200
@@ -110,7 +110,7 @@
 
 #if JACOBI_BASE_METHOD < 4
 /* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but
-   with a restricted range of inputs accepted, namely b>1, b odd, and a<=b.
+   with a restricted range of inputs accepted, namely b>1, b odd.
 
    The initial result_bit1 is taken as a parameter for the convenience of
    mpz_kronecker_ui() et al.  The sign changes both here and in those
@@ -122,17 +122,13 @@
 
    Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be
    possible, but a couple of tests suggest it's not a significant speedup,
-   and may even be a slowdown, so what's here is good enough for now.
-
-   Future: The code doesn't demand a<=b actually, so maybe this could be
-   relaxed.  All the places this is used currently call with a<=b though.  */
+   and may even be a slowdown, so what's here is good enough for now. */
 
 int
 mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
 {
   ASSERT (b & 1);  /* b odd */
   ASSERT (b != 1);
-  ASSERT (a <= b);
 
   if (a == 0)
     return 0;
@@ -141,11 +137,15 @@
   if (a == 1)
     goto done;
 
+  if (a >= b)
+    goto a_gt_b;
+      
   for (;;)
     {
       result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b);
       MP_LIMB_T_SWAP (a, b);
 
+    a_gt_b:
       do
 	{
 	  /* working on (a/b), a,b odd, a>=b */
diff -r 08ef31c0390d -r efcac1ec4a81 mpn/generic/jacobi_lehmer.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/generic/jacobi_lehmer.c	Mon Apr 19 21:16:09 2010 +0200
@@ -0,0 +1,927 @@
+/* jacobi_lehmer.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2008, 2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* FIXME: Bad name for this symbol. */
+#ifndef JACOBI_2_METHOD
+#define JACOBI_2_METHOD 2
+#endif
+
+#if GMP_NAIL_BITS > 0
+#error Nails not supported.
+#endif
+
+/* FIXME: Duplicated in hgcd2.c. Should move to gmp-impl.h, and
+   possibly be renamed. */
+static inline mp_limb_t
+div1 (mp_ptr rp,
+      mp_limb_t n0,
+      mp_limb_t d0)
+{
+  mp_limb_t q = 0;
+
+  if ((mp_limb_signed_t) n0 < 0)
+    {
+      int cnt;
+      for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++)
+	{
+	  d0 = d0 << 1;
+	}
+
+      q = 0;
+      while (cnt)
+	{
+	  q <<= 1;
+	  if (n0 >= d0)
+	    {
+	      n0 = n0 - d0;
+	      q |= 1;
+	    }
+	  d0 = d0 >> 1;
+	  cnt--;
+	}
+    }
+  else
+    {
+      int cnt;
+      for (cnt = 0; n0 >= d0; cnt++)
+	{
+	  d0 = d0 << 1;
+	}
+
+      q = 0;
+      while (cnt)
+	{
+	  d0 = d0 >> 1;
+	  q <<= 1;
+	  if (n0 >= d0)
+	    {
+	      n0 = n0 - d0;
+	      q |= 1;
+	    }
+	  cnt--;
+	}
+    }
+  *rp = n0;
+  return q;
+}
+
+/* Two-limb division optimized for small quotients.  */
+static inline mp_limb_t
+div2 (mp_ptr rp,
+      mp_limb_t nh, mp_limb_t nl,
+      mp_limb_t dh, mp_limb_t dl)
+{
+  mp_limb_t q = 0;
+
+  if ((mp_limb_signed_t) nh < 0)
+    {
+      int cnt;
+      for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
+	{
+	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
+	  dl = dl << 1;
+	}
+
+      while (cnt)
+	{
+	  q <<= 1;
+	  if (nh > dh || (nh == dh && nl >= dl))
+	    {
+	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
+	      q |= 1;
+	    }
+	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
+	  dh = dh >> 1;
+	  cnt--;
+	}
+    }
+  else
+    {
+      int cnt;
+      for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
+	{
+	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
+	  dl = dl << 1;
+	}
+
+      while (cnt)
+	{