Alternative div_qr_1

Niels Möller nisse at lysator.liu.se
Mon May 10 17:23:59 CEST 2010


nisse at lysator.liu.se (Niels Möller) writes:

> That will be tough to beat (when we stay with _1-type methods). If one
> is really clever, maybe one can get the u1 recurrency down to mul + adc
> (6 cycles on amd). One then has to handle carry out from that addition
> using adjustment steps running in parallel with the mul of the next
> iteration. Here's one (untested) variant:

That shortcut, adding in B2 to the new high part, doesn't quite work: if
B2 is large we may get an overflow, giving u2 = 2. What can be done
instead is to add in a correction term corresponding precisely to
subtracting d from u1; concretely, we conditionally add in the two-limb
constant <c1, c0> = (B-d) * B2, in parallel with the multiply.
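
To spell this out: when u2 = 1, the top two limbs of the partial
remainder represent B + u1, and subtracting d there before the next
multiply would give

  (u1 + B - d) * B2 = u1 * B2 + (B-d) * B2.

So instead of adjusting u1 on the critical path, we can multiply the
unadjusted u1 and add the precomputed <c1, c0> = (B-d) * B2 afterwards;
the constant fits in two limbs since B - d <= d and 0 < B2 <= d.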

Here's an implementation which seems to work (complete file including
both div_qr_1 and div_r_1 at the end).

  for (j = un-1; j > 2; j--)
    {
      mp_limb_t mask;
      mask = -u2;

      umul_ppmm (r1, r0, u1, B2);
      u2 = 0;
      add_sssaaaa (u2, u1, u0, u0, up[j-3], mask & c1, mask & c0);
      add_sssaaaa (u2, u1, u0, u1, u0, r1, r0);
    }

Not sure how to best do that conditional subtraction in assembler, but
it could be something like

loop:
	mul	B2		C 0  6 12
	add	(%esi, j), a2	C 0  8 14
	adc	a0, t		C 1  9 15
	mov	a2, a0		C 1  9 15
	sbb	a2, a2		C 2 10 16
	add	%eax, a0	C 4 10 16
	mov	t, %eax		C 2 10 16
	adc	%edx, %eax	C 5 11 17
	sbb	$0, a2		C 6 12 18
	mov	c1, t
	and	a2, t		C 7 13 19
	and	c0, a2		C 7 13 19
	dec	j
	jnz	loop

Do you think we can get this loop actually running in 6 cycles? On
x86_32, we don't have enough registers for the constants, so they'll
have to live on the stack.

In the corresponding div_qr_1 function, there are a bunch of additional
operations to keep track of the quotient.

  for (j = un-1; j > 2; j--)
    {
      mp_limb_t mask;
      mask = -u2;

      t = (mask & dinv) + q0;
      q2 = t < q0;
      umul_ppmm (q1, q0, u1, dinv);
      t += u1;
      q2 += t < u1;
      q2 += u2;

      add_sssaaaa (q2, q1, q0, q1, q0, t, mask & B2);

      umul_ppmm (r1, r0, u1, B2);
      u2 = 0;
      add_sssaaaa (u2, u1, u0, u0, up[j-3], mask & c1, mask & c0);
      add_sssaaaa (u2, u1, u0, u1, u0, r1, r0);

      qp[j-2] = q1;
      MPN_INCR_U (qp+j-1, un-j, q2);
    }
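
Spelled out, each pass adds

  u1 * (B + dinv) + u2 * (B^2 + dinv*B + B2)

to the (shifted) quotient accumulator <q2, q1, q0>, which is exactly
(u2*B^3 + u1*B^2 - u2*<c1, c0> - u1*B2) / d, i.e. the quotient
counterpart of the remainder folding.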

I haven't tried too hard to optimize that. It's possible to replace the
conditional addition of the magic constant <dinv, B2> with a conditional
adjustment of u1, if that's more efficient (there's no tight recurrence
in the computation of the quotient limbs).

Regards,
/Niels

/* div_qr_1

   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.

Copyright (C) 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa, but propagates the
     carry-out into an additional sum operand.
*/

#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
	   : "0"  ((USItype)(s2)),                                      \
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
	   : "0"  ((UDItype)(s2)),                                      \
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"        \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
	   : "0"  ((UDItype)(s2)),                                      \
	     "r"  ((UDItype)(a1)), "r" ((UDItype)(b1)),                 \
	     "%2" ((UDItype)(a0)), "rI" ((UDItype)(b0)))
#endif

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) += __c1 + (__s1 < __c0);                                       \
  } while (0)
#endif
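
/* Usage note: add_sssaaaa accumulates the carry-out into the existing
   value of s2 rather than overwriting it, so a two-limb quantity
   <p1, p0> can be added into a running three-limb sum <s2, s1, s0> as

     add_sssaaaa (s2, s1, s0, s1, s0, p1, p0);
*/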

static __attribute__((noinline)) mp_limb_t
div_qr_1 (mp_ptr qp, mp_limb_t *qh, mp_srcptr up, mp_size_t un,
	  mp_limb_t d)
{
  mp_limb_t dinv;
  mp_limb_t B2;
  mp_limb_t u0, u1, u2;
  mp_limb_t q0, q1, q2;
  mp_limb_t r0, r1;
  mp_limb_t c1, c0;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (un >= 3);

  u1 = up[un-1]; u0 = up[un-2];

  invert_limb (dinv, d);
  B2 = - dinv * d;		/* B2 = B^2 mod d (0 < B2 <= d) */

  umul_ppmm (c1, c0, B2, -d);	/* <c1, c0> = (B - d) * B2 */

  umul_ppmm (q1, q0, u1, dinv);
  q1 += u1;
  q2 = q1 < u1;

  umul_ppmm (r1, r0, u1, B2);
  u2 = 0;
  add_sssaaaa (u2, u1, u0, u0, up[un-3], r1, r0);

  /* To ensure MPN_INCR_U never spills over the qp area, we must store
     the correct *qh up front, which requires reducing so that u2 = 0
     and u1 < d. */
  add_ssaaaa (q2, q1, q2, q1, 0, u2);
  u1 -= (-u2) & d;

  if (u1 >= d)
    {
      u1 -= d;
      add_ssaaaa (q2, q1, q2, q1, 0, 1);
    }

  *qh = q2;
  qp[un-2] = q1;

  u2 = 0;

  /* Fold one limb per iteration: <u2, u1, u0> <- <u0, up[j-3]> + u1 * B2
     + u2_old * <c1, c0>, accumulating the matching quotient limbs in
     <q2, q1, q0>.  */
  for (j = un-1; j > 2; j--)
    {
      mp_limb_t mask;
      mask = -u2;

      t = (mask & dinv) + q0;
      q2 = t < q0;
      umul_ppmm (q1, q0, u1, dinv);
      t += u1;
      q2 += t < u1;
      q2 += u2;

      add_sssaaaa (q2, q1, q0, q1, q0, t, mask & B2);

      umul_ppmm (r1, r0, u1, B2);
      u2 = 0;
      add_sssaaaa (u2, u1, u0, u0, up[j-3], mask & c1, mask & c0);
      add_sssaaaa (u2, u1, u0, u1, u0, r1, r0);

      qp[j-2] = q1;
      MPN_INCR_U (qp+j-1, un-j, q2);
    }

  if (u2)
    u1 -= d;

  if ( (q1 = (u1 >= d)) )
    u1 -= d;

  q1 += u2;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  q0 += t;
  q1 += q0 < t;
  qp[0] = q0;

  MPN_INCR_U (qp+1, un-2, q1);

  return u0;
}

static __attribute__((noinline)) mp_limb_t
div_r_1 (mp_srcptr up, mp_size_t un,
	 mp_limb_t d)
{
  mp_limb_t dinv;
  mp_limb_t B2;
  mp_limb_t u0, u1, u2;
  mp_limb_t r0, r1;
  mp_limb_t c1, c0;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (un >= 3);

  u1 = up[un-1]; u0 = up[un-2];

  invert_limb (dinv, d);
  B2 = - dinv * d;		/* B2 = B^2 mod d (0 < B2 <= d) */

  umul_ppmm (c1, c0, B2, -d);	/* <c1, c0> = (B - d) * B2 */

  umul_ppmm (r1, r0, u1, B2);
  u2 = 0;
  add_sssaaaa (u2, u1, u0, u0, up[un-3], r1, r0);

  /* Same remainder folding as in div_qr_1, but with no quotient
     bookkeeping.  */
  for (j = un-1; j > 2; j--)
    {
      mp_limb_t mask;
      mask = -u2;

      umul_ppmm (r1, r0, u1, B2);
      u2 = 0;
      add_sssaaaa (u2, u1, u0, u0, up[j-3], mask & c1, mask & c0);
      add_sssaaaa (u2, u1, u0, u1, u0, r1, r0);
    }

  if (u2)
    u1 -= d;

  if (u1 >= d)
    u1 -= d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  return u0;
}

#include "speed.h"

static int
compare_double(const void *ap, const void *bp)
{
  double a = * (const double *) ap;
  double b = * (const double *) bp;

  if (a < b)
    return -1;
  else if (a > b)
    return 1;
  else
    return 0;
}

static double
median (double *v, size_t n)
{
  qsort(v, n, sizeof(*v), compare_double);

  return v[n/2];
}

#define MEDIAN_N 5
#define TIME(res, code) do {				\
  double time_measurement[MEDIAN_N];			\
  unsigned time_i;					\
							\
  for (time_i = 0; time_i < MEDIAN_N; time_i++)		\
    {							\
      speed_starttime();				\
      code;						\
      time_measurement[time_i] = speed_endtime();	\
    }							\
  res = median(time_measurement, MEDIAN_N);		\
} while (0)

static int use_cycle_time = 0;
static int use_per_limb = 0;
static int more_tests = 0;

static void
time_one (mp_size_t un)
{
  mp_limb_t d;
  mp_limb_t r, r_mod, r_ref;
  mp_ptr qp, qp_ref;
  mp_ptr up;

  unsigned best;

  double t_ref, t_div_qr_1, t_div_r_1;
  double scale;

  TMP_DECL;

  TMP_MARK;
  up = TMP_ALLOC_LIMBS (un);
  qp = TMP_ALLOC_LIMBS (un);
  qp_ref = TMP_ALLOC_LIMBS (un);

  mpn_random (&d, 1);
  d |= GMP_LIMB_HIGHBIT;

  mpn_random (up, un);

  if (more_tests)
    {
      unsigned j;
      for (j = 0; j < 100; j++)
	{
	  r_ref = mpn_divrem_1 (qp_ref, 0, up, un, d);
	  r = div_qr_1 (qp, qp + un-1, up, un, d);
	  r_mod = div_r_1 (up, un, d);
	  if (r != r_ref || r_mod != r_ref || mpn_cmp (qp, qp_ref, un) != 0)
	    break;

	  mpn_random (&d, 1);
	  d |= GMP_LIMB_HIGHBIT;
	  mpn_random (up, un);
	}
    }


  TIME (t_ref, r_ref = mpn_divrem_1 (qp_ref, 0, up, un, d));
  TIME (t_div_qr_1, r = div_qr_1 (qp, qp + un-1, up, un, d));
  TIME (t_div_r_1, r_mod = div_r_1 (up, un, d));

  if (r != r_ref || r != r_mod || mpn_cmp (qp, qp_ref, un) != 0)
    gmp_fprintf (stderr, "error: d = %Mx, r = %Mx, r_mod = %Mx, r_ref = %Mx\n"
		 "    u = %Nx\n"
		 "    q = %Nx\n"
		 "q_ref = %Nx\n",
		 d, r, r_mod, r_ref, up, un, qp, un, qp_ref, un);

  best = t_ref > t_div_qr_1;

  scale = use_cycle_time ? 1.0 / speed_cycletime : 1e6;
  if (use_per_limb) scale /= un;

  printf("%3d %5.3f%c %5.3f%c %5.3f, %5.3f\n", (int) un,
	 scale*t_ref, best == 0 ? '#' : ' ',
	 scale*t_div_qr_1, best == 1 ? '#' : ' ',
	 scale*t_div_r_1,
	 t_div_qr_1 / t_ref);
}

#define TEST_SIZE_MAX 100

int
main (int argc, char ** argv)
{
  mp_size_t un = 0;
  int c;

  while ( (c = getopt(argc, argv, "ucCt")) != -1)
    switch (c)
      {
      case 'c':
	use_cycle_time = 1;
	break;
      case 'C':
	use_cycle_time = 1;
	use_per_limb = 1;
	break;
      case 't':
	more_tests = 1;
	break;
      default:
	fprintf(stderr, "Bad argument.\n");
	return 1;
      }
  argc -= optind;
  argv += optind;

  speed_time_init();
  speed_cycletime_need_cycles();

  if (argc > 0)
    time_one (atoi (argv[0]));
  else
    for (un = 3; un < TEST_SIZE_MAX; un++)
      {
	time_one (un);
      }
  return 0;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.

