div_qr_1n_pi1

Wed Jun 2 18:20:39 UTC 2021

Hi, I had a look at the mpn_qr_1n_pi1 function, where the METHOD == 2
version was added in 2010. I had to dig quite deep in my mailbox to
remember how it works.

The main idea was to write B^2 = v d + B', where d is normalized, v is
the usual reciprocal, one limb + an implicit one bit, and 1 <= B' < B.

And use this for a loop where each iteration replaces

  {u_2, u_1, u_0} = u_2 B' + { u_1, u_0 } + u_2 v d

The product u_2 B' let's us reduce the partial remander by one limb, and
the independent product u_2 v goes into the collection of quotient
limbs.

I've noticed one microoptimization for the METHOD == 2 version, see
patch below. This gives a very modest speedup when I benchmark on my
laptop (AMD Ryzen 5), 0.3 cycles per limb out of 10. 

I've also tried a variant that eliminates the carry from u_2 B' + { u_1,
u_0 } earlier (since B' <= d, if carry occurs, it can be cancelled by
subtracting B d, with a corresponding update of the quotient
accumulation). On my machine, that gives a more significant speedup,
saving almost 2 cycles per limb, and it's only about 0.5 cycles/limb
slower than the assembly routine.

The critical path, via the u1 variable, is 

      umul_ppmm (p1, p0, u1, B2);
      add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0);
      u1 -= cy & d;

Not sure if any of the current mod_1_1 variants do it this way. It looks
like the closest variant postpons the handling of this carry until the
next iteration, which shortens the critical path. But when also
producing the quotient, early carry handling seems to save quite a lot
of work.

I haven't look at corresponding changes to the assembly loops.

Regards,
/Niels

*** /tmp/extdiff.av7tqrid/gmp.1b45731027e8/mpn/generic/div_qr_1n_pi1.c	2021-05-25 21:11:14.000000000 +0200
--- /home/nisse/hack/gmp/mpn/generic/div_qr_1n_pi1.c	2021-06-02 06:26:47.500602956 +0200
***************
*** 249,263 ****
        */
        umul_ppmm (p1, t, u1, dinv);
        add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
!       add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
        add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
        q0 = t;

        umul_ppmm (p1, p0, u1, B2);
-       ADDC_LIMB (cy, u0, u0, u2 & B2);
-       u0 -= (-cy) & d;

        /* Final q update */
-       add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
        qp[j+1] = q1;
        MPN_INCR_U (qp+j+2, n-j-2, q2);
--- 249,263 ----
        */
        umul_ppmm (p1, t, u1, dinv);
+       ADDC_LIMB (cy, u0, u0, u2 & B2);
+       u0 -= (-cy) & d;
        add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
!       /* p1 <= B-2, so cy can be added in without overflow. */
!       add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy);
        add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
        q0 = t;

        umul_ppmm (p1, p0, u1, B2);

        /* Final q update */
        qp[j+1] = q1;
        MPN_INCR_U (qp+j+2, n-j-2, q2);

mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
		   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t cy, u0;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
  B2 = -d*dinv;

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];	/* Early read, to allow qp == up. */

  add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0);
  u1 -= cy & d;
  q1 -= cy;
  qp[n-1] = q1;

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;
      mp_limb_t t1, t0;

      /* Additions for the q update:
       *	+-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *        +---+
       *        | 1 |  (conditional on {u1, u0} carry)
       *        +---+
       * +      | q0|
       *   -+---+---+---+
       *    | q2| q1| q0|
       *    +---+---+---+
       *
       * Additions for the u update:
       *        +-------+
       *        |u1 * B2|
       *        +---+---+
       *      + |u0 |u-1|
       *        +---+---+
       *      - | d |     (conditional on carry)
       *     ---+---+---+
       *        |u1 | u0|
       *        +---+---+
       *
      */
      umul_ppmm (p1, p0, u1, B2);
      ADDC_LIMB (q2, q1, u1, q0);
      umul_ppmm (t1, t0, u1, dinv);
      add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0);
      u1 -= cy & d;

      /* t1 <= B-2, so cy can be added in without overflow. */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy);
      q0 = t0;

      /* Final q update */
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);
    }

  q1 = (u1 >= d);
  u1 -= (-q1) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.