Alternative div_qr_1

Fri May 7 22:51:36 CEST 2010

Here's an alternative implementation of div_qr_1, with nicer recurrency.
It's not fast, and I think this C implementation suffers badly from
having to check for carry from add_ssaaaa. (Is there any better way to
do that?)

Inner loop as follows (complete file at the end):

  invert_limb (dinv, d);
  B = - dinv * d;
  [...]
  for (j = un-1; j > 2; j--)
    {
      t = u1 + q0;
      q2 = t < q0;
      umul_ppmm (q1, q0, u1, dinv);
      q1 += t;
      q2 += q1 < t;

      umul_ppmm (r1, r0, u1, B);
      add_ssaaaa (u1, u0, u0, up[j-3], r1, r0);

      /* Clumsy way to detect carry */
      if (u1 < r1 || (u1 == r1 && u0 < r0))
	{
	  ASSERT (u1 < d);
	  u1 -= d;

	  add_ssaaaa (q2, q1, q2, q1, 0, 1);
	}
      qp[j-2] = q1;
      MPN_INCR_U (qp+j-1, un-j, q2);
    }

The recurrency is via u1 and u0. In x86 assembler, it would be

	mul
        add
  	adc
        lea
        cmov

and some moves.

The code above adds all quotient limbs together on the fly. I think that
should be reasonable, if the MPN_INCR_U results in an unlikely jump to a
carry propagation loop. But there are sure other ways to organize it.

(E.g, store q0, q1 for even iterations in the array at qp, and use a
temporary storage area for the q0, q1 from odd iterations. And then q2
in each iteration can be added into the (q1, q0) two iterations earlier,
and there can be no carry except in the case that d divides B^2-1, or
equivalently, B^2 = 1 (mod d), which one could handle specially.

On my x86_32 laptop it runs at 28 cycles per limb compared to 24 for
divrem_1. And on shell, 28 vs 19.

BTW, as far as I see there's no mod_1 variant implementing the above
method? That's is, the loop

      umul_ppmm (r1, r0, u1, B);
      add_ssaaaa (u1, u0, u0, up[j-3], r1, r0);

      if (carry from above)
	u1 -= d;

The nice thing is that it allows arbitrary d, and then the trick is to
handle carry from the recurrency variables by a conditional single-limb
subtraction of d. But then a conditional subtract on the recurrency
chain is somewhat expensive. I'm sure there are some other ways to
arrange that, but for quotient generation it is nice if we can avoid
involving B^3 mod d and the corresponding quotient (which is larger, two
limbs and a bit).

Regards,
/Niels

/* div_qr_1

   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

static __attribute__((noinline)) mp_limb_t
div_qr_1 (mp_ptr qp, mp_limb_t *qh, mp_srcptr up, mp_size_t un,
	  mp_limb_t d)
{
  mp_limb_t dinv;
  mp_limb_t B;
  mp_limb_t u1, u0;
  mp_limb_t q0, q1, q2;
  mp_limb_t r0, r1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (un >= 3);

  u1 = up[un-1]; u0 = up[un-2];

  invert_limb (dinv, d);
  B = - dinv * d;

  umul_ppmm (q1, q0, u1, dinv);
  q1 += u1;
  q2 = q1 < u1;

  umul_ppmm (r1, r0, u1, B);
  add_ssaaaa (u1, u0, u0, up[un-3], r1, r0);

  /* Clumsy way to detect carry */
  if (u1 < r1 || (u1 == r1 && u0 < r0))
    {
      ASSERT (u1 < d);
      u1 -= d;

      add_ssaaaa (q2, q1, q2, q1, 0, 1);
    }

  *qh = q2;
  qp[un-2] = q1;

  for (j = un-1; j > 2; j--)
    {
      t = u1 + q0;
      q2 = t < q0;
      umul_ppmm (q1, q0, u1, dinv);
      q1 += t;
      q2 += q1 < t; /* Should be equivalent to u1 >= d */

      umul_ppmm (r1, r0, u1, B);
      add_ssaaaa (u1, u0, u0, up[j-3], r1, r0);

      /* Clumsy way to detect carry */
      if (u1 < r1 || (u1 == r1 && u0 < r0))
	{
	  ASSERT (u1 < d);
	  u1 -= d;

	  add_ssaaaa (q2, q1, q2, q1, 0, 1);
	}
      qp[j-2] = q1;
      MPN_INCR_U (qp+j-1, un-j, q2);
    }

  if ( (q2 = (u1 >= d)))
    u1 -= d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  q0 += t;
  q2 += q0 < t;
  qp[0] = q0;

  MPN_INCR_U (qp+1, un-2, q2);

  return u0;
}

#include "speed.h"

static int
compare_double(const void *ap, const void *bp)
{
  double a = * (const double *) ap;
  double b = * (const double *) bp;

  if (a < b)
    return -1;
  else if (a > b)
    return 1;
  else
    return 0;
}

static double
median (double *v, size_t n)
{
  qsort(v, n, sizeof(*v), compare_double);

  return v[n/2];
}

#define MEDIAN_N 5
#define TIME(res, code) do {				\
  double time_measurement[MEDIAN_N];			\
  unsigned time_i;					\
							\
  for (time_i = 0; time_i < MEDIAN_N; time_i++)		\
    {							\
      speed_starttime();				\
      code;						\
      time_measurement[time_i] = speed_endtime();	\
    }							\
  res = median(time_measurement, MEDIAN_N);		\
} while (0)

static int use_cycle_time = 0;
static int use_per_limb = 0;

static void
time_one (mp_size_t un)
{
  mp_limb_t d;
  mp_limb_t r, r_ref;
  mp_ptr qp, qp_ref;
  mp_ptr up;

  unsigned best;

  double t_ref, t_div_qr_1;
  double scale;

  TMP_DECL;

  TMP_MARK;
  up = TMP_ALLOC_LIMBS (un);
  qp = TMP_ALLOC_LIMBS (un);
  qp_ref = TMP_ALLOC_LIMBS (un);

  mpn_random (&d, 1);
  d |= GMP_LIMB_HIGHBIT;

  mpn_random (up, un);

  TIME (t_ref, r_ref = mpn_divrem_1 (qp_ref, 0, up, un, d));
  TIME (t_div_qr_1, r = div_qr_1 (qp, qp + un-1, up, un, d));

  if (r != r_ref || mpn_cmp (qp, qp_ref, un) != 0)
    gmp_fprintf (stderr, "error: d = %Mx, r = %Mx, r_ref = %Mx\n"
		 "    u = %Nx\n"
		 "    q = %Nx\n"
		 "q_ref = %Nx\n",
		 d, r, r_ref, up, un, qp, un, qp_ref, un);

  best = t_ref > t_div_qr_1;

  scale = use_cycle_time ? 1.0 / speed_cycletime : 1e6;
  if (use_per_limb) scale /= un;

  printf("%3d %5.3f%c %5.3f%c %5.3f\n", (int) un,
	 scale*t_ref, best == 0 ? '#' : ' ',
	 scale*t_div_qr_1, best == 1 ? '#' : ' ',
	 t_div_qr_1 / t_ref);
}

#define TEST_SIZE_MAX 100

int
main (int argc, char ** argv)
{
  mp_size_t un = 0;
  int c;

  while ( (c = getopt(argc, argv, "ucC")) != -1)
    switch (c)
      {
      case 'c':
	use_cycle_time = 1;
	break;
      case 'C':
	use_cycle_time = 1;
	use_per_limb = 1;
	break;
      default:
	fprintf(stderr, "Bad argument.\n");
	return 1;
      }
  argc -= optind;
  argv += optind;

  speed_time_init();
  speed_cycletime_need_cycles();

  if (argc > 0)
    time_one (atoi (argv[0]));
  else
    for (un = 3; un < TEST_SIZE_MAX; un++)
      {
	time_one (un);
      }
  return 0;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.