gcd_22
Niels Möller
nisse at lysator.liu.se
Tue Aug 20 18:51:03 UTC 2019
tg at gmplib.org (Torbjörn Granlund) writes:
> Why would it be faster? Because while cnd1 needs to wait for the
> previous iteration's double-limb shifting, cnd2 depends on just the
> single-limb shifting of the high U word.
The below implementation appears to pass tests, and give a modest
speedup of 0.2 cycles per input bit, or 2.5%. Benchmark, comparing C
implementations of gcd_11 and gcd_22:
$ ./tune/speed -p 100000 -s 1-64 -t 3 -C mpn_gcd_11 mpn_gcd_22
overhead 4.01 cycles, precision 100000 units of 1.03e-09 secs, CPU freq 974.93 MHz
      mpn_gcd_11   mpn_gcd_22
1        #7.0542      12.8931
4        #2.8895       6.7366
7        #3.2135       7.0463
10       #3.8417       7.2487
13       #4.4648       7.4802
16       #5.0947       7.4609
19       #5.2140       7.6955
22       #5.3684       7.5914
25       #5.2278       7.7002
28       #5.4690       7.6998
31       #5.3778       7.7374
34       #5.4362       7.7808
37       #5.5201       7.7397
40       #5.3979       7.7096
43       #5.4339       7.7887
46       #5.4229       7.7318
49       #5.4557       7.7160
52       #5.4047       7.6908
55       #5.4043       7.6791
58       #5.3859       7.6680
61       #5.4084       7.7529
64       #5.4277       7.6582
Regards,
/Niels
mp_double_limb_t
mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0)
{
  mp_double_limb_t g;
  ASSERT_ALWAYS (u0 & v0 & 1);

  /* Implicit least significant bit */
  u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1));
  u1 >>= 1;

  v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1));
  v1 >>= 1;

  while (u1 || v1) /* u1 == 0 can happen at most twice per call */
    {
      mp_limb_t vgtu, t1, t0;
      sub_ddmmss (t1, t0, u1, u0, v1, v0);
      vgtu = LIMB_HIGHBIT_TO_MASK (u1 - v1);

      if (UNLIKELY (t0 == 0))
        {
          if (t1 == 0)
            {
              g.d1 = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1));
              g.d0 = (u0 << 1) | 1;
              return g;
            }
          int c;
          count_trailing_zeros (c, t1);

          /* v1 = min (u1, v1) */
          v1 += (vgtu & t1);
          /* u0 = |u1 - v1| */
          u0 = (t1 ^ vgtu) - vgtu;

          ASSERT (c < GMP_LIMB_BITS - 1);
          u0 >>= c + 1;
          u1 = 0;
        }
      else
        {
          int c;
          count_trailing_zeros (c, t0);
          c++;

          /* V <-- min (U, V).
             Assembly version should use cmov. Another alternative,
             avoiding carry propagation, would be

             v0 += vgtu & t0; v1 += vgtu & (u1 - v1);
          */
          add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0);

          /* U <-- |U - V|
             No carry handling needed in this conditional negation,
             since t0 != 0. */
          u0 = (t0 ^ vgtu) - vgtu;
          u1 = t1 ^ vgtu;
          if (UNLIKELY ((mp_limb_signed_t) u1 < 0))
            {
              u0 = -u0;
              u1 = ~u1;
            }

          if (UNLIKELY (c == GMP_LIMB_BITS))
            {
              u0 = u1;
              u1 = 0;
            }
          else
            {
              u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c));
              u1 >>= c;
            }
        }
    }

  while ((v0 | u0) & GMP_LIMB_HIGHBIT)
    { /* At most two iterations */
      mp_limb_t vgtu, t0;
      int c;
      sub_ddmmss (vgtu, t0, 0, u0, 0, v0);
      if (UNLIKELY (t0 == 0))
        {
          g.d1 = u0 >> (GMP_LIMB_BITS - 1);
          g.d0 = (u0 << 1) | 1;
          return g;
        }

      /* v <-- min (u, v) */
      v0 += (vgtu & t0);
      /* u <-- |u - v| */
      u0 = (t0 ^ vgtu) - vgtu;
      count_trailing_zeros (c, t0);
      u0 = (u0 >> 1) >> c;
    }

  g.d0 = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1);
  g.d1 = 0;
  return g;
}
--
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.