Division interfaces

Tue Mar 15 08:35:36 CET 2011

nisse at lysator.liu.se (Niels Möller) writes:

> And then we have the issue of unnormalized divisors (it looks like
> mpn_sbpi1_div_qr still requires that dp[dn-1] has the high bit set.

I'm attaching a first crude version with on-the-fly normalization of the
high limbs.

If one reads the top 3 limbs of n and the top 2 limbs of d, shifts left to
normalize d (shifting in zeros), and uses 3/2 division, one will in
effect do a (2 limbs + k bits)/(1 limb + k bits) division, where
k = GMP_LIMB_BITS - shift_count > 0. Then the probability of the "unlikely"
update in schoolbook division is on the order of 2^{-k}, which for small
k (large shift count) actually isn't very unlikely.

For this reason, the code uses 4 limbs of n and 3 limbs of d (and a 4/3
division built from 3/2 division and an extra update step). This is
overkill when the shift count is small, so maybe one should have three
different variants,

1. Normalized d.

2. Shift count < (GMP_LIMB_BITS - 10). Use 3/2 division, the "unlikely"
   update happens with probability < 2^{-10}.

3. Larger shift counts. Use 4/3 division (like the attached code), then
   the unlikely update probability is < 2^{-GMP_LIMB_BITS}.

/Niels

-------------- next part --------------
/* mpn_sbpi1_div_qr -- Schoolbook division using the Möller-Granlund 3/2
   division algorithm.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2007, 2009, 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include <stdlib.h>
#include <stdio.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* 4/3 division step built from a 3/2 division plus one correction.

   Inputs:  numerator limbs <n3, n2, n1, n0>, divisor limbs <d2, d1, d0>,
	    and dinv, which is passed through unchanged to udiv_qr_3by2.
   Outputs: q receives the quotient limb, <r2, r1, r0> the remainder.

   Steps, in order:
     1. udiv_qr_3by2 divides <n3, n2, n1> by <d2, d1>, giving a candidate
	quotient _q and a two-limb partial remainder in (r2), (r1).
     2. _q * d0 is formed as <_p1, _p0> and subtracted from <r2, r1, n0>;
	the borrow out of the low limb is folded into _p1 before _p1 is
	subtracted from r1.
     3. If that subtraction borrows out of r2 (the UNLIKELY branch), the
	divisor <d2, d1, d0> is added back once and _q is decremented.

   The r2, r1, r0 arguments are assigned to and must be lvalues.  Arguments
   are evaluated more than once, so they should be free of side effects.  */
#define udiv_qr_4by3(q, r2, r1, r0, n3, n2, n1, n0, d2, d1, d0, dinv)	\
  do {									\
    mp_limb_t _q, _p1, _p0, _cy;					\
    udiv_qr_3by2 (_q, (r2), (r1), (n3), (n2), (n1), (d2), (d1), (dinv)); \
    umul_ppmm (_p1, _p0, _q, (d0));					\
    _p1 += _p0 > (n0);							\
    (r0) = (n0) - _p0;							\
    _cy = _p1 > (r1);							\
    (r1) -= _p1;							\
    if (UNLIKELY (_cy > (r2)))						\
      {	/* _cy == 1, r2 == 0 */						\
	(r0) += (d0);							\
	_cy = (r0) < (d0);						\
	(r1) += _cy;							\
	_cy = (r1) < _cy;						\
	add_ssaaaa ((r2), (r1), _cy - 1, (r1), (d2), (d1));		\
	_q--;								\
      }									\
    else								\
      (r2) -= _cy;							\
    (q) = _q;								\
  } while (0)

/* Schoolbook division of {np, nn} by {dp, dn}, with on-the-fly
   normalization of the high divisor limbs when SHIFT is non-zero.

   qp     receives the nn - dn low quotient limbs.
   np,nn  numerator; overwritten in place, the remainder is left in the
	  low dn limbs.
   dp,dn  divisor, dn > 2 and nn >= dn.
   d1,d0  the two most significant divisor limbs after shifting the
	  divisor left by SHIFT bits (as computed by the caller in this
	  file).
   dinv   inverse passed to udiv_qr_3by2 together with <d1, d0>.
   shift  0 when dp[dn-1] has its high bit set; otherwise the left shift
	  that normalizes it.  Must be less than GMP_LIMB_BITS.

   Returns the high quotient limb.

   With SHIFT == 0 the classic 3/2 loop is used.  Otherwise four shifted
   numerator limbs <n3, n2, n1, n0> are kept in registers and divided by
   the three shifted divisor limbs <d1, d0, dl> with udiv_qr_4by3; the
   product of the quotient limb with the remaining dn - 3 low divisor
   limbs is subtracted from np unshifted, and its carry-out is shifted
   before being subtracted from <n2, n1>.

   NOTE(review): with shift != 0 and dn == 3 the mpn_submul_1 and
   mpn_add_n calls get a zero limb count; confirm that callers never do
   this or that the mpn functions in use accept it.  */
mp_limb_t
sb_div_qr (mp_ptr qp,
	   mp_ptr np, mp_size_t nn,
	   mp_srcptr dp, mp_size_t dn,
	   mp_limb_t d1, mp_limb_t d0,
	   mp_limb_t dinv, int shift)
{
  mp_limb_t qh;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);

  /* Quotient limbs are stored from the most significant end downwards.  */
  qp += nn - dn;

  if (shift == 0)
    {
      mp_size_t i;
      mp_limb_t n1, n0;
      mp_limb_t cy, cy1;
      mp_limb_t q;

      np += nn;

      ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

      /* High quotient limb: 1 iff the top dn numerator limbs are >= d.  */
      qh = mpn_cmp (np - dn, dp, dn) >= 0;
      if (qh != 0)
	mpn_sub_n (np - dn, np - dn, dp, dn);

      dn -= 2;			/* offset dn by 2 for main division loops,
				   saving two iterations in mpn_submul_1.  */
      np -= 2;

      n1 = np[1];

      for (i = nn - (dn + 2); i > 0; i--)
	{
	  np--;
	  /* Top two numerator limbs equal <d1, d0>: the 3/2 quotient would
	     not fit in a limb, so use the largest limb value directly.  */
	  if (UNLIKELY (n1 == d1) && np[1] == d0)
	    {
	      q = GMP_NUMB_MASK;
	      mpn_submul_1 (np - dn, dp, dn + 2, q);
	      n1 = np[1];		/* update n1, last loop's value will now be invalid */
	    }
	  else
	    {
	      udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);

	      cy = mpn_submul_1 (np - dn, dp, dn, q);

	      /* Propagate the borrow from the low limbs into <n1, n0>.  */
	      cy1 = n0 < cy;
	      n0 = (n0 - cy) & GMP_NUMB_MASK;
	      cy = n1 < cy1;
	      n1 = (n1 - cy1) & GMP_NUMB_MASK;
	      np[0] = n0;

	      if (UNLIKELY (cy != 0))
		{
		  /* q was one too large: add d back.  */
		  n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
		  q--;
		}
	    }

	  *--qp = q;
	}
      np[1] = n1;
    }
  else
    {
      mp_limb_t n3, n2, n1, n0;
      mp_limb_t cy;
      mp_limb_t dl;
      mp_limb_t c1, c0;

      /*   +----+----+----+----+----
	d: |    |    |    |    | ...
	   +----+----+----+----+----
	     `----'----'----'
	       d1   d0   dl
      */

      /* Third shifted divisor limb; zeros are shifted in at the bottom.  */
      dl = dp[dn-3] << shift;

      /* Top three numerator limbs, shifted left, spread over four limbs.  */
      n3 = (np[nn-1] >> (GMP_LIMB_BITS - shift));
      n2 = (np[nn-1] << shift) | (np[nn-2] >> (GMP_LIMB_BITS - shift));
      n1 = (np[nn-2] << shift) | (np[nn-3] >> (GMP_LIMB_BITS - shift));
      n0 = (np[nn-3] << shift);

      udiv_qr_4by3 (qh, n3, n2, n1, n3, n2, n1, n0, d1, d0, dl, dinv);

      /* Subtract qh times the low dn - 3 divisor limbs (unshifted), then
	 shift the carry-out to align it with <n2, n1>.  */
      cy = mpn_submul_1 (np + nn - dn, dp, dn - 3, qh);
      c0 = cy << shift;
      c1 = cy >> (GMP_LIMB_BITS - shift);

      /* cy = 1 iff <n2, n1> < <c1, c0>, i.e. the subtraction borrows.  */
      cy = (n2 <= c1) && (n2 < c1 || n1 < c0);

      sub_ddmmss (n2, n1, n2, n1, c1, c0);

      if (UNLIKELY (n3 < cy))
	{
	  /* Add d back */
	  n1 += dl;
	  cy = n1 < dl;
	  n2 += cy;
	  cy = (n2 < cy);
	  /* Here n3 == 0 and a borrow of 1 is pending, hence d1 - 1.  */
	  add_ssaaaa (n3, n2, cy, n2, d1 - 1, d0);

	  cy = mpn_add_n (np + nn - dn, np + nn - dn, dp, dn - 3);

	  cy <<= shift;
	  n1 += cy;
	  cy = n1 < cy;
	  n2 += cy;
	  n3 += (n2 < cy);

	  qh--;
	}
      else
	n3 -= cy;

      for (; nn > dn; nn--)
	{
	  mp_limb_t q;
	  /*   +----+----+----+----+----+----
            n: |    |    |    |    |    | ...
	       +----+----+----+----+----+----
	          `----'----'----'----'
		    n3   n2   n1   n0
	  */
	  /* Bring in the next numerator limb, shifted.  */
	  n1 |= np[nn-4] >> (GMP_LIMB_BITS - shift);
	  n0 = np[nn-4] << shift;

	  /* Quotient overflow case: the two top limbs equal <d1, d0>.
	     The second comparison must be against d0; the arithmetic
	     below assumes the numerator is exactly <d1, d0, n1, n0>.  */
	  if (UNLIKELY (n3 == d1) && n2 == d0)
	    {
	      q = MP_LIMB_T_MAX;

	      /* NOTE: Not always true that n1 <= dl; then n3 may
		 overflow, and this overflow is cancelled when
		 subtracting the low part. */
	      /* <d1, d0, n1, n0> - (B-1) <d1, d0, dl>
		 = <d1, d0, dl> + <n1, n0> - B dl */

	      n3 = d1;
	      n2 = d0 + n1;
	      n3 += (n2 < d0);
	      n1 = dl + n0;
	      cy = n1 < dl;
	      n2 += cy;
	      n3 += (n2 < cy);

	      n3 -= (n2 < dl);
	      n2 -= dl;

	      /* Can't share code with the common case, because n3 may
		 have overflowed. */
	      cy = mpn_submul_1 (np + nn - dn - 1, dp, dn - 3, q);
	      c0 = cy << shift;
	      c1 = cy >> (GMP_LIMB_BITS - shift);
	      cy = (n2 <= c1) && (n2 < c1 || n1 < c0);

	      sub_ddmmss (n2, n1, n2, n1, c1, c0);

	      n3 -= cy;
	    }
	  else
	    {
	      udiv_qr_4by3 (q, n3, n2, n1, n3, n2, n1, n0, d1, d0, dl, dinv);

	      cy = mpn_submul_1 (np + nn - dn - 1, dp, dn - 3, q);
	      c0 = cy << shift;
	      c1 = cy >> (GMP_LIMB_BITS - shift);
	      cy = (n2 <= c1) && (n2 < c1 || n1 < c0);

	      sub_ddmmss (n2, n1, n2, n1, c1, c0);

	      if (UNLIKELY (n3 < cy))
		{
		  /* Add d back */
		  n1 += dl;
		  cy = n1 < dl;
		  n2 += cy;
		  cy = (n2 < cy);
		  add_ssaaaa (n3, n2, cy, n2, d1 - 1, d0);

		  cy = mpn_add_n (np + nn - dn - 1, np + nn - dn - 1, dp, dn - 3);

		  cy <<= shift;
		  n1 += cy;
		  cy = n1 < cy;
		  n2 += cy;
		  n3 += (n2 < cy);

		  q--;
		}
	      else
		n3 -= cy;
	    }

	  *--qp = q;
	}

      /* nn == dn here.  Shift the register remainder back down and store
	 it as the top three remainder limbs.  */
      np[nn-1] = (n3 >> shift);
      np[nn-2] = (n2 >> shift) | (n3 << (GMP_LIMB_BITS - shift));
      np[nn-3] = (n1 >> shift) | (n2 << (GMP_LIMB_BITS - shift));

    }
  return qh;
}

/* Draw 32 random bits from RS and return them as a limb.  */
static mp_limb_t
random_word (gmp_randstate_ptr rs)
{
  mp_limb_t word;
  mpz_t bits;
  TMP_DECL;

  TMP_MARK;

  /* Scratch integer with room requested for 2 limbs; it is released by
     TMP_FREE below rather than by mpz_clear.  */
  MPZ_TMP_INIT (bits, 2);
  mpz_urandomb (bits, rs, 32);
  word = mpz_get_ui (bits);

  TMP_FREE;
  return word;
}

/* Verify that (Q, R) is the quotient and remainder of N divided by D,
   i.e. that Q*D + R == N and 0 <= R < D.  On failure, dump all operands
   to stderr and abort.

   The range check on R matters: without it any pair satisfying the
   identity (for instance Q - 1 and R + D) would be accepted.  */
static void
check (const mpz_t n, const mpz_t d, const mpz_t q, const mpz_t r)
{
  mpz_t t;
  int bad;

  mpz_init (t);
  mpz_mul (t, q, d);
  mpz_add (t, t, r);

  bad = mpz_cmp (t, n) != 0
    || mpz_sgn (r) < 0
    || mpz_cmp (r, d) >= 0;

  if (bad)
    {
      gmp_fprintf (stderr, "n = %Zx\n"
		   "d = %Zx\n"
		   "q = %Zx\n"
		   "r = %Zx\n"
		   "q d + r = %Zx\n",
		   n, d, q, r, t);
      abort();
    }
  mpz_clear (t);
}

/* Maximum d and q */
#define MAXBITS 1000
#define MAXLIMBS (MAXBITS / GMP_NUMB_BITS + 10)

/* Test driver: runs 500 random divisions through sb_div_qr and verifies
   each result with check ().

   Each iteration picks a divisor of at least 3 * GMP_NUMB_BITS bits and a
   numerator that is 1..200 bits longer, derives the shifted high divisor
   limbs <d1, d0>, the shift count and the inverse, and divides a copy of
   n in place.  */
int
main (int argc, char **argv)
{
  gmp_randstate_ptr rands;
  unsigned nbits;
  unsigned qbits;
  unsigned dbits;
  unsigned i;
  mpz_t n, d, q, r;
  mp_size_t qn;
  mp_size_t rn;

  rands = RANDS;

  mpz_init (n);
  mpz_init (d);
  mpz_init (q);
  mpz_init (r);

  /* The quotient limbs are written directly through PTR(q).  */
  _mpz_realloc (q, MAXLIMBS);

  for (i = 0; i < 500; i++)
    {
      mp_limb_t d1;
      mp_limb_t d0;
      gmp_pi1_t dinv;
      mp_limb_t qh;
      int shift;
      mp_ptr dp;
      mp_size_t dn;

      dbits = 3 * GMP_NUMB_BITS + random_word (rands) % 200;
      qbits = 1 + random_word (rands) % 200;
      nbits = dbits + qbits;

      mpz_rrandomb (d, rands, dbits);
      mpz_rrandomb (n, rands, nbits);

      dp = PTR(d);
      dn = SIZ(d);

      d1 = dp[dn-1];
      if (! (GMP_NUMB_HIGHBIT & d1))
	{
	  /* Unnormalized divisor: build the two high limbs of d << shift.  */
	  count_leading_zeros (shift, d1);
	  d1 = (d1 << shift) | (dp[dn-2] >> (GMP_LIMB_BITS - shift));
	  d0 = (dp[dn-2] << shift) | (dp[dn-3] >> (GMP_LIMB_BITS - shift));
	}
      else
	{
	  shift = 0;
	  d0 = dp[dn-2];
	}
      invert_pi1(dinv, d1, d0);

      qn = SIZ(n) - SIZ(d);
      /* rn must be set before it is asserted on; it was previously read
	 uninitialized here.  */
      rn = SIZ(d);
      ASSERT (rn <= MAXLIMBS);
      ASSERT (qn < MAXLIMBS);

      /* sb_div_qr overwrites its numerator, so divide a copy of n.  */
      mpz_set (r, n);
      qh = sb_div_qr (PTR(q), PTR(r), SIZ(r), PTR(d), SIZ(d),
		      d1, d0, dinv.inv32, shift);
      if (qh > 0)
	PTR(q)[qn++] = qh;
      SIZ(q) = qn;

      /* The remainder occupies the low SIZ(d) limbs of r; strip high
	 zero limbs before treating it as an mpz_t.  */
      MPN_NORMALIZE (PTR(r), rn);
      SIZ(r) = rn;

      check (n, d, q, r);
    }

  mpz_clear (n);
  mpz_clear (d);
  mpz_clear (q);
  mpz_clear (r);

  return EXIT_SUCCESS;
}
-------------- next part --------------

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.