Fwd: Factorial improvements

Marsac Laurent laurent.marsac at esil.univ-mrs.fr
Sun Sep 18 01:45:54 CEST 2005


Paul Zimmermann wrote:

>   From: Marsac Laurent <laurent.marsac at esil.univ-mrs.fr>
>   Date: Sep 14, 2005 10:19 PM
>   Subject: Factorial improvements
>
>   Hello,
>
>   I'm currently trying to improve n! calculation (mpz/fac_ui.c) and i just
>   want to know if anyone else is already working on it, or who is
>   responsible of that file ?
>
>   As described in the TODO list, i'm doing the prime factorization of n!,
>   and shiftings for factors of 2. The first results seems to show my
>   algorithm improves performances and i hope i could send more details
>   here ASAP.
>
>   --
>   Laurent Marsac
>
>the best known algorithm is the one from Scho"nhage, Grotefeld and Vetter.
>See the implementation below. Timings on a 1.7Ghz Athlon with gmp-4.1.4:
>
>  
>
Great, your algo is better than mine, but my way to get the sieve and 
the exponents (p and e) seems to improve your code (juste a little of 
course for big n, but much for small n) :
(on amd64 3200+ with gmp 4.1.4)

$ ./fac_ui 100000
mpz_fac_ui took 169ms
mpz_fac_ui2 took 115ms
mpz_fac_ui2_lm took 88ms

$ ./fac_ui 1000000
mpz_fac_ui took 4931ms
mpz_fac_ui2 took 2531ms
mpz_fac_ui2_lm took 2221ms

$ ./fac_ui 2000000
mpz_fac_ui took 13079ms
mpz_fac_ui2 took 6411ms
mpz_fac_ui2_lm took 5769ms

$ ./fac_ui 5000000
mpz_fac_ui took 42674ms
mpz_fac_ui2 took 20211ms
mpz_fac_ui2_lm took 18544ms

See my small changes below

Other questions in my mind are :
should n! be assembly improved (with mmx for paralel small ints muls for 
example) ?
should we use precomputed tables ?
we should fix the use of one or another algorithm depending on the size of n

--
Laurent Marsac

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/resource.h>
#include "gmp.h"
#include <math.h>

/* implements algorithm page 226 of the book
   "Fast Algorithms, A Multitape Turing Machine Implementation",
   by A. Scho"nhage, A. F. W. Grotefeld and E. Vetter,
   BI-Wissenschaftsverlag, 1994.
*/

int cputime (void);
void mpz_fac_ui4 (mpz_ptr, unsigned long int *, unsigned long int);
void mpz_fac_ui3 (mpz_ptr, unsigned long int *, unsigned long int *,
                  unsigned long int *, unsigned long int);
void mpz_fac_ui2 (mpz_ptr, unsigned long int);

int
cputime ()
{
  struct rusage rus;

  getrusage (0, &rus);
  return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000;
}

/* result <- q[0]*q[1]*...*q[n-1] by binary splitting */
void
mpz_fac_ui4 (mpz_ptr result, unsigned long int *q, unsigned long int n)
{
  unsigned long int i, n0, n1;
  mpz_t tmp;

  if (n == 0)
    {
      mpz_set_ui (result, 1);
      return;
    }
 
  if (n <= 3)
    {
      mpz_set_ui (result, q[0]);
      for (i = 1; i < n; i++)
        mpz_mul_ui (result, result, q[i]);
      return;
    }

  n0 = n / 2;
  n1 = n - n0;
  mpz_fac_ui4 (result, q, n1);
  mpz_init (tmp);
  mpz_fac_ui4 (tmp, q + n1, n0);
  mpz_mul (result, result, tmp);
  mpz_clear (tmp);
}

/* put p[0]^e[0]*...*p[n-1]^e[n-1] in result,
   assuming e[0] >= e[1] >= ... >= e[n-1] > 0.

   Uses auxiliary table q[0...n-1].
*/
void
mpz_fac_ui3 (mpz_ptr result, unsigned long int *p, unsigned long int *e,
             unsigned long int *q, unsigned long int n)
{
  unsigned long int i, j, mask;
  mpz_t tmp;

  /* find largest power of two <= the exponent of two */
  for (mask = 1; e[0] >= mask << 1; mask <<= 1);
  mpz_init (tmp);
  mpz_set_ui (result, 1);
  while (mask)
    {
      /* put part corresponding to bit mask set in q[] */
      for (i = 0, j = 0; i < n; i++)
        if (e[i] & mask)
          q[j++] = p[i];
      mpz_fac_ui4 (tmp, q, j);
      mpz_mul (result, result, result);
      mpz_mul (result, result, tmp);
      mask >>= 1;
    }
  mpz_clear (tmp);
}

void
mpz_fac_ui2 (mpz_ptr result, unsigned long int n)
{
  unsigned long int *p;
  unsigned long int *e;
  unsigned long int *q;
  unsigned long int i, j, k;

  if (n <= 1)
    {
      mpz_set_ui (result, 1);
      return;
    }

  p = (unsigned long int*) malloc ((n + 1) * sizeof(unsigned long int));
  e = (unsigned long int*) malloc ((n + 1) * sizeof(unsigned long int));
  for (i = 2; i <= n; i++)
    {
      p[i] = i;
      e[i] = 1;
    }
  for (i = 2; i <= n / 2; i++)
    {
      if (p[i] != 1) /* i is prime */
        {
          /* remove factor i in 2i, 3i, ... */
          for (j = 2 * i; j <= n; j += i)
            {
              p[j] /= i;
              e[i] ++;
            }
          /* remove factor i in i2, 2i2, ..., i3, 2i3, ... */
          for (k = i; k <= n;)
            {
              /* check that k * i does not overflow */
              if (k > (~(0UL)) / i)
                break;
              k *= i;
              for (j = k; j <= n; j += k)
                {
                  p[j] /= i;
                  e[i] ++;
                }
            }
        }
    }
  /* shrink table */
  for (i = 2, j = 0; i <= n; i++)
    {
      if (p[i] != 1)
        {
          p[j] = p[i];
          e[j] = e[i];
          j++;
        }
    }

  q = (unsigned long int*) malloc (j * sizeof(unsigned long int));
  mpz_fac_ui3 (result, p + 1, e + 1, q, j - 1);
  /* treat apart exponent of two */
  mpz_mul_2exp (result, result, e[0]);
  free (q);
  free (p);
  free (e);
}

void
mpz_fac_ui2_lm (mpz_ptr result, unsigned long int n)
{
  unsigned long int *p;
  unsigned long int *e;
  unsigned long int *q;
  unsigned long int i, j, k;
 
  unsigned long int cr, ci, ct, cj, cd, cm, b, Sj, R, B, ak;

  if (n <= 1)
    {
      mpz_set_ui (result, 1);
      return;
    }

  ct = n/2;

  p = (unsigned long int*) malloc ((ct + 1) * sizeof(unsigned long int));
  e = (unsigned long int*) malloc ((ct + 1) * sizeof(unsigned long int));
 
  for (i = 0; i <= ct; i++)
      p[i] = 1;

  cm = sqrt(n)/2-1;

  /* calcul crible */
  for(ci=0; ci<=cm; ci++)
    {
      if( p[ci] )
        {
          cr = 2*ci+3;
          cd = cr*(ci+1)+ci;

          for(cj=cd; cj<=ct; cj+=cr)
            p[cj] = 0;
        }
    }

  /* calc power of 2 */
  b=2; Sj=1; R=n/b; B=1; e[0]=0;

  while(R)
    {
      ak = R%b;
      e[0] += ak*Sj;

      B *= b;
      Sj += B;

      R = R/b;
    }

  for(i=0,j=0; i<=ct; i++)
    {
      if( !p[i] )
          continue;

      p[j] = b; b = 2*i+3; j++;

      /* decompose n in base b (=primes[i])*/
      Sj = 1; R = n/b; B = 1; e[j] = 0;

      while(R)
        {
          ak = R%b;
          e[j] += ak*Sj;

          B *= b;
          Sj += B;

          R = R/b;
        }
    }
  p[j] = b;

  q = (unsigned long int*) malloc (j * sizeof(unsigned long int));
  mpz_fac_ui3 (result, p + 1, e + 1, q, j);
  /* treat apart exponent of two */
  mpz_mul_2exp (result, result, e[0]);
  free (q);
  free (p);
  free (e);
}


int
main (int argc, char *argv[])
{
  int n = atoi (argv[1]);
  mpz_t f, f2, f3;
  int st;

  mpz_init (f);
  mpz_init (f2);
  mpz_init (f3);
 
  st = cputime ();
  mpz_fac_ui (f, n);
  printf ("mpz_fac_ui took %dms\n", cputime () - st);

  st = cputime ();
  mpz_fac_ui2 (f2, n);
  printf ("mpz_fac_ui2 took %dms\n", cputime () - st);
 
  st = cputime ();
  mpz_fac_ui2_lm (f3, n);
  printf ("mpz_fac_ui2_lm took %dms\n", cputime () - st);

  if (mpz_cmp (f2, f) )
    {
      fprintf (stderr, "f and f2 differ\n");
      exit (1);
    }
  if (mpz_cmp (f3, f) )
    {
      fprintf (stderr, "f and f3 differ\n");
      exit (1);
    }
   

  mpz_clear (f);
  mpz_clear (f2);
  mpz_clear (f3);
 
  return 0;
}



-- 
Marsac Laurent
gsm : +33 6 80 63 21 55

Elève Ingénieur en Génie BioMédical à l'ESIL (Ecole Supérieure d'Ingénieurs de Luminy - www.esil.univ-mrs.fr)



More information about the gmp-discuss mailing list