mod_2

Niels Möller nisse at lysator.liu.se
Thu May 6 11:50:00 CEST 2010


And here's a simple test program for mod_2 using a loop around
udiv_qr_3by2. Beats current code (mpn_tdiv_qr, calling divrem_2) on both
x86 (3% for large sizes, 15% sizes around 20) and x86_64 (10% for most
sizes), even though divrem_2 seems to be implemented in assembler there.

Before rewriting divrem_2, I guess we should consider the interface. To
me, it would make sense to have two separate functions, mpn_div_qr_2,
mpn_div_r_2 (mpn_div_q_2 is less important, since the remainder is small
and almost free). What are the current users of the fraction feature?
Would it be ok to put the loop to generate fractional words in a
separate function? I also think it's highly desirable to not clobber the
large input, and instead extract and update the high limbs (possibly
normalizing them) on the fly.

Given the speedup, I think it would be a good idea to implement the
basic udiv_qr_3by2 loop in C (it's most likely a good method for small
sizes, and on architecture like sparc where umul is a lot slower than
umullo), before figuring out if more sophisticated mod_2 variants using
Montgomery's trick should also be used and before writing the assembly
code.

Regards,
/Niels

/* mod_2

   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.

Copyright (C) 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

static void
mod_2_ref (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_ptr dp, mp_ptr scratch)
{
  ASSERT (un >= 2);

  mpn_tdiv_qr (scratch, rp, 0, up, un, dp, 2);
}

static void
mod_2_udiv_qr_3by2 (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_ptr dp)
{
  mp_limb_t d0, d1;
  mp_limb_t u0, u1;

  ASSERT (un >= 2);
  d0 = dp[0]; d1 = dp[1];
  ASSERT (d1 & GMP_NUMB_HIGHBIT);
  u0 = up[un-2]; u1 = up[un-1];

  if (u1 > d1 || (u1 == d1 && u0 >= d0))
    sub_ddmmss (u1, u0, u1, u0, d1, d0);

  if (un > 2)
    {
      gmp_pi1_t dinv;
      invert_pi1 (dinv, d1, d0);

      do
	{
	  mp_limb_t q;
	  udiv_qr_3by2 (q, u1, u0, u1, u0, up[un-3], d1, d0, dinv.inv32);
	}
      while (--un > 2);
    }
  rp[0] = u0;
  rp[1] = u1;
}

#include "speed.h"

static int
compare_double(const void *ap, const void *bp)
{
  double a = * (const double *) ap;
  double b = * (const double *) bp;

  if (a < b)
    return -1;
  else if (a > b)
    return 1;
  else
    return 0;
}

static double
median (double *v, size_t n)
{
  qsort(v, n, sizeof(*v), compare_double);

  return v[n/2];
}

#define MEDIAN_N 5
#define TIME(res, code) do {				\
  double time_measurement[MEDIAN_N];			\
  unsigned time_i;					\
							\
  for (time_i = 0; time_i < MEDIAN_N; time_i++)		\
    {							\
      speed_starttime();				\
      code;						\
      time_measurement[time_i] = speed_endtime();	\
    }							\
  res = median(time_measurement, MEDIAN_N);		\
} while (0)

static void
time_one (mp_size_t un)
{
  mp_limb_t d[2];
  mp_limb_t ref[2];
  mp_limb_t r[2];
  mp_ptr up;
  mp_ptr scratch;

  unsigned best;

  double t_ref, t_udiv_qr_3by2;

  TMP_DECL;

  TMP_MARK;
  up = TMP_ALLOC_LIMBS (un);
  
  mpn_random (d, 2);
  d[1] |= GMP_NUMB_HIGHBIT;

  mpn_random (up, un);

  scratch = TMP_ALLOC_LIMBS (un-1);
  
  TIME (t_ref, mod_2_ref (ref, up, un, d, scratch));
  TIME (t_udiv_qr_3by2, mod_2_udiv_qr_3by2 (r, up, un, d));

  if (mpn_cmp (r, ref, 2) != 0)
    gmp_fprintf (stderr, "error: d = %Nx, r = %Nx, ref = %Nx\n"
		 "  u = %Nx\n",
		 d, 2, r, 2, ref, 2, up, un);

  best = t_ref > t_udiv_qr_3by2;
  
  printf("%3d %5.3f%c %5.3f%c %5.3f\n", (int) un,
	 1e6*t_ref, best == 0 ? '#' : ' ',
	 1e6*t_udiv_qr_3by2, best == 1 ? '#' : ' ',
	 t_udiv_qr_3by2 / t_ref);
}

#define TEST_SIZE_MAX 100

int
main (int argc, char ** argv)
{
  mp_size_t un;

  if (argc > 1)
    time_one (atoi (argv[1]));
  else
    for (un = 2; un < TEST_SIZE_MAX; un++)
      {
	time_one (un);
      }
  return 0;
}

  


-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.



More information about the gmp-devel mailing list