Alternative div_qr_1

Niels Möller nisse at lysator.liu.se
Mon May 10 17:23:59 CEST 2010


nisse at lysator.liu.se (Niels Möller) writes:

> That will be tough to beat (when we stay with _1-type methods). If one
> is really clever, maybe one can get the u1 recurrency down to mul + adc
> (6 cycles on amd). One then has to handle carry out from that addition
> using adjustment steps running in parallel with the mul of the next
> iteration. Here's one (untested) variant:

That shortcut, adding in B2 to the new high part, doesn't quite work: if
B2 is large we may get an overflow, giving u2 = 2. What can be done
instead is to add in a correction term corresponding precisely to
subtracting d from u1; concretely, we conditionally add in the two-limb
constant <c1, c0> = (B-d) * B2, in parallel with the multiply.
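
To spell this out: when u2 = 1, the top two limbs of the partial
remainder represent B + u1, and subtracting d there before the next
multiply would give

  (u1 + B - d) * B2 = u1 * B2 + (B-d) * B2.

So instead of adjusting u1 on the critical path, we can multiply the
unadjusted u1 and add the precomputed <c1, c0> = (B-d) * B2 afterwards;
the constant fits in two limbs since B - d <= d and 0 < B2 <= d.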

Here's an implementation which seems to work (complete file including
both div_qr_1 and div_r_1 at the end).

  for (j = un-1; j > 2; j--)
    {
      mp_limb_t mask;
      mask = -u2;

      umul_ppmm (r1, r0, u1, B2);
      u2 = 0;
      add_sssaaaa (u2, u1, u0, u0, up[j-3], mask & c1, mask & c0);
      add_sssaaaa (u2, u1, u0, u1, u0, r1, r0);
    }

Not sure how to best do that conditional subtraction in assembler, but
it could be something like

loop:
	mul	B2		C 0  6 12
	add	(%esi, j), a2	C 0  8 14
	adc	a0, t		C 1  9 15
	mov	a2, a0		C 1  9 15
	sbb	a2, a2		C 2 10 16
	add	%eax, a0	C 4 10 16
	mov	t, %eax		C 2 10 16
	adc	%edx, %eax	C 5 11 17
	sbb	$0, a2		C 6 12 18
	mov	c1, t
	and	a2, t		C 7 13 19
	and	c0, a2		C 7 13 19
	dec	j
	jnz	loop

Do you think we can get this loop actually running in 6 cycles? On
x86_32, we don't have enough registers for the constants, so they'll
have to live on the stack.

In the corresponding div_qr_1 function, there are a bunch of additional
operations to keep track of the quotient.

  for (j = un-1; j > 2; j--)
    {
      mp_limb_t mask;
      mask = -u2;

      t = (mask & dinv) + q0;
      q2 = t < q0;
      umul_ppmm (q1, q0, u1, dinv);
      t += u1;
      q2 += t < u1;
      q2 += u2;

      add_sssaaaa (q2, q1, q0, q1, q0, t, mask & B2);

      umul_ppmm (r1, r0, u1, B2);
      u2 = 0;
      add_sssaaaa (u2, u1, u0, u0, up[j-3], mask & c1, mask & c0);
      add_sssaaaa (u2, u1, u0, u1, u0, r1, r0);

      qp[j-2] = q1;
      MPN_INCR_U (qp+j-1, un-j, q2);
    }
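
Spelled out, each pass adds

  u1 * (B + dinv) + u2 * (B^2 + dinv*B + B2)

to the (shifted) quotient accumulator <q2, q1, q0>, which is exactly
(u2*B^3 + u1*B^2 - u2*<c1, c0> - u1*B2) / d, i.e. the quotient
counterpart of the remainder folding.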

I haven't tried too hard to optimize that. It's possible to replace the
conditional addition of the magic constant <dinv, B2> with a conditional
adjustment of u1, if that's more efficient (there's no tight recurrence
in the computation of the quotient limbs).

Regards,
/Niels

/* div_qr_1

   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.

Copyright (C) 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa, but propagates the
     carry-out into an additional sum operand.
*/

#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
	   : "0"  ((USItype)(s2)),                                      \
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
	   : "0"  ((UDItype)(s2)),                                      \
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"        \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
	   : "0"  ((UDItype)(s2)),                                      \
	     "r"  ((UDItype)(a1)), "r" ((UDItype)(b1)),                 \
	     "%2" ((UDItype)(a0)), "rI" ((UDItype)(b0)))
#endif

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) += __c1 + (__s1 < __c0);                                       \
  } while (0)
#endif
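
/* Usage note: add_sssaaaa accumulates the carry-out into the existing
   value of s2 rather than overwriting it, so a two-limb quantity
   <p1, p0> can be added into a running three-limb sum <s2, s1, s0> as

     add_sssaaaa (s2, s1, s0, s1, s0, p1, p0);
*/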

static __attribute__((noinline)) mp_limb_t
div_qr_1 (mp_ptr qp, mp_limb_t *qh, mp_srcptr up, mp_size_t un,
	  mp_limb_t d)
{
  mp_limb_t dinv;
  mp_limb_t B2;
  mp_limb_t u0, u1, u2;
  mp_limb_t q0, q1, q2;
  mp_limb_t r0, r1;
  mp_limb_t c1, c0;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (un >= 3);

  u1 = up[un-1]; u0 = up[un-2];

  invert_limb (dinv, d);
  B2 = - dinv * d;		/* B2 = B^2 mod d (0 < B2 <= d) */

  umul_ppmm (c1, c0, B2, -d);	/* <c1, c0> = (B - d) * B2 */

  umul_ppmm (q1, q0, u1, dinv);
  q1 += u1;
  q2 = q1 < u1;

  umul_ppmm (r1, r0, u1, B2);
  u2 = 0;
  add_sssaaaa (u2, u1, u0, u0, up[un-3], r1, r0);

  /* To ensure MPN_INCR_U never spills over the qp area, we must store
     the correct *qh up front, which requires reducing so that u2 = 0
     and u1 < d. */
  add_ssaaaa (q2, q1, q2, q1, 0, u2);
  u1 -= (-u2) & d;

  if (u1 >= d)
    {
      u1 -= d;
      add_ssaaaa (q2, q1, q2, q1, 0, 1);
    }

  *qh = q2;
  qp[un-2] = q1;

  u2 = 0;

  /* Fold one limb per iteration: <u2, u1, u0> <- <u0, up[j-3]> + u1 * B2
     + u2_old * <c1, c0>, accumulating the matching quotient limbs in
     <q2, q1, q0>.  */
  for (j = un-1; j > 2; j--)
    {
      mp_limb_t mask;
      mask = -u2;

      t = (mask & dinv) + q0;
      q2 = t < q0;
      umul_ppmm (q1, q0, u1, dinv);
      t += u1;
      q2 += t < u1;
      q2 += u2;

      add_sssaaaa (q2, q1, q0, q1, q0, t, mask & B2);

      umul_ppmm (r1, r0, u1, B2);
      u2 = 0;
      add_sssaaaa (u2, u1, u0, u0, up[j-3], mask & c1, mask & c0);
      add_sssaaaa (u2, u1, u0, u1, u0, r1, r0);

      qp[j-2] = q1;
      MPN_INCR_U (qp+j-1, un-j, q2);
    }

  if (u2)
    u1 -= d;

  if ( (q1 = (u1 >= d)) )
    u1 -= d;

  q1 += u2;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  q0 += t;
  q1 += q0 < t;
  qp[0] = q0;

  MPN_INCR_U (qp+1, un-2, q1);

  return u0;
}

static __attribute__((noinline)) mp_limb_t
div_r_1 (mp_srcptr up, mp_size_t un,
	 mp_limb_t d)
{
  mp_limb_t dinv;
  mp_limb_t B2;
  mp_limb_t u0, u1, u2;
  mp_limb_t r0, r1;
  mp_limb_t c1, c0;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (un >= 3);

  u1 = up[un-1]; u0 = up[un-2];

  invert_limb (dinv, d);
  B2 = - dinv * d;		/* B2 = B^2 mod d (0 < B2 <= d) */

  umul_ppmm (c1, c0, B2, -d);	/* <c1, c0> = (B - d) * B2 */

  umul_ppmm (r1, r0, u1, B2);
  u2 = 0;
  add_sssaaaa (u2, u1, u0, u0, up[un-3], r1, r0);

  /* Same remainder folding as in div_qr_1, but with no quotient
     bookkeeping.  */
  for (j = un-1; j > 2; j--)
    {
      mp_limb_t mask;
      mask = -u2;

      umul_ppmm (r1, r0, u1, B2);
      u2 = 0;
      add_sssaaaa (u2, u1, u0, u0, up[j-3], mask & c1, mask & c0);
      add_sssaaaa (u2, u1, u0, u1, u0, r1, r0);
    }

  if (u2)
    u1 -= d;

  if (u1 >= d)
    u1 -= d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  return u0;
}

#include "speed.h"

static int
compare_double(const void *ap, const void *bp)
{
  double a = * (const double *) ap;
  double b = * (const double *) bp;

  if (a < b)
    return -1;
  else if (a > b)
    return 1;
  else
    return 0;
}

static double
median (double *v, size_t n)
{
  qsort(v, n, sizeof(*v), compare_double);

  return v[n/2];
}

#define MEDIAN_N 5
#define TIME(res, code) do {				\
  double time_measurement[MEDIAN_N];			\
  unsigned time_i;					\
							\
  for (time_i = 0; time_i < MEDIAN_N; time_i++)		\
    {							\
      speed_starttime();				\
      code;						\
      time_measurement[time_i] = speed_endtime();	\
    }							\
  res = median(time_measurement, MEDIAN_N);		\
} while (0)

static int use_cycle_time = 0;
static int use_per_limb = 0;
static int more_tests = 0;

static void
time_one (mp_size_t un)
{
  mp_limb_t d;
  mp_limb_t r, r_mod, r_ref;
  mp_ptr qp, qp_ref;
  mp_ptr up;

  unsigned best;

  double t_ref, t_div_qr_1, t_div_r_1;
  double scale;

  TMP_DECL;

  TMP_MARK;
  up = TMP_ALLOC_LIMBS (un);
  qp = TMP_ALLOC_LIMBS (un);
  qp_ref = TMP_ALLOC_LIMBS (un);

  mpn_random (&d, 1);
  d |= GMP_LIMB_HIGHBIT;

  mpn_random (up, un);

  if (more_tests)
    {
      unsigned j;
      for (j = 0; j < 100; j++)
	{
	  r_ref = mpn_divrem_1 (qp_ref, 0, up, un, d);
	  r = div_qr_1 (qp, qp + un-1, up, un, d);
	  r_mod = div_r_1 (up, un, d);
	  if (r != r_ref || r_mod != r_ref || mpn_cmp (qp, qp_ref, un) != 0)
	    break;

	  mpn_random (&d, 1);
	  d |= GMP_LIMB_HIGHBIT;
	  mpn_random (up, un);
	}
    }


  TIME (t_ref, r_ref = mpn_divrem_1 (qp_ref, 0, up, un, d));
  TIME (t_div_qr_1, r = div_qr_1 (qp, qp + un-1, up, un, d));
  TIME (t_div_r_1, r_mod = div_r_1 (up, un, d));

  if (r != r_ref || r != r_mod || mpn_cmp (qp, qp_ref, un) != 0)
    gmp_fprintf (stderr, "error: d = %Mx, r = %Mx, r_mod = %Mx, r_ref = %Mx\n"
		 "    u = %Nx\n"
		 "    q = %Nx\n"
		 "q_ref = %Nx\n",
		 d, r, r_mod, r_ref, up, un, qp, un, qp_ref, un);

  best = t_ref > t_div_qr_1;

  scale = use_cycle_time ? 1.0 / speed_cycletime : 1e6;
  if (use_per_limb) scale /= un;

  printf("%3d %5.3f%c %5.3f%c %5.3f, %5.3f\n", (int) un,
	 scale*t_ref, best == 0 ? '#' : ' ',
	 scale*t_div_qr_1, best == 1 ? '#' : ' ',
	 scale*t_div_r_1,
	 t_div_qr_1 / t_ref);
}

#define TEST_SIZE_MAX 100

int
main (int argc, char ** argv)
{
  mp_size_t un = 0;
  int c;

  while ( (c = getopt(argc, argv, "ucCt")) != -1)
    switch (c)
      {
      case 'c':
	use_cycle_time = 1;
	break;
      case 'C':
	use_cycle_time = 1;
	use_per_limb = 1;
	break;
      case 't':
	more_tests = 1;
	break;
      default:
	fprintf(stderr, "Bad argument.\n");
	return 1;
      }
  argc -= optind;
  argv += optind;

  speed_time_init();
  speed_cycletime_need_cycles();

  if (argc > 0)
    time_one (atoi (argv[0]));
  else
    for (un = 3; un < TEST_SIZE_MAX; un++)
      {
	time_one (un);
      }
  return 0;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.

