Small operands gcd improvements
Niels Möller
nisse at lysator.liu.se
Wed Aug 14 19:56:04 UTC 2019
tg at gmplib.org (Torbjörn Granlund) writes:
> I believe an amalgamation of Niels' code which uses an implicit lsb and
> his present _22 code would be best. The present requirement for
> sub_ddmmss is a portability inconvenience, and for subtract-with-carry
> challenged processors a major hassle.
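For background, the generic C fallback for sub_ddmmss in longlong.h
amounts to something like the sketch below (illustrative name and
spelling, not the actual macro); the explicit borrow computation is
what subtract-with-carry challenged processors have to pay for.

/* Two-limb subtract in portable C: compute the low-limb difference,
   then subtract the borrow (al < bl) from the high-limb difference. */
#define sub_ddmmss_sketch(sh, sl, ah, al, bh, bl)	\
  do {							\
    mp_limb_t __x = (al) - (bl);			\
    (sh) = (ah) - (bh) - ((al) < (bl));			\
    (sl) = __x;						\
  } while (0)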
Below is another version, which passes some tests. It has an implicit
one bit, which causes some impedance mismatch. Maybe it would be better
to have some special iterations at the top to eliminate the top bits
before falling into the main loop.
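To make the mismatch concrete, here is the bookkeeping the code below
relies on (my summary of the invariant, not part of the patch):

/* With the implicit bit, the stored u represents the odd value
   U = 2*u + 1, and likewise V = 2*v + 1.  A subtraction step gives
       U - V = 2*(u - v),
   which has c + 1 trailing zero bits when u - v has c.  Stripping
   them and re-encoding yields the new stored value
       u' = |u - v| >> (c + 1),
   hence the "c++" and the "u0 >>= c + 1" in gcd_22 below. */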
Not sure I've identified the main loop correctly in the generated code,
but I think it's this (x86_64, gcc-8.3 -O3):
4a: mov %rsi,%r9
4d: mov %rbx,%rax
50: sub %r11,%rax
53: sbb %r8,%r9
56: mov %r9,%rdi
59: sar $0x3f,%rdi
5d: test %rax,%rax
60: je 130 <gcd_22+0x130>
66: bsf %rax,%r10
6a: mov %r9,%rdx
6d: mov %rax,%rsi
70: add $0x1,%r10d
74: xor %rdi,%rax
77: mov %r11,%rcx
7a: and %rdi,%rdx
7d: and %rdi,%rsi
80: sub %rdi,%rax
83: add %rsi,%rcx
86: adc %rdx,%r8
89: xor %rdi,%r9
8c: mov %rcx,%r11
8f: cmp $0x40,%r10d
93: je 168 <gcd_22+0x168>
99: mov %r10d,%ecx
9c: mov %r9,%rbx
9f: mov %r9,%rsi
a2: shr %cl,%rax
a5: mov %ebp,%ecx
a7: sub %r10d,%ecx
aa: shl %cl,%rbx
ad: mov %r10d,%ecx
b0: shr %cl,%rsi
b3: or %rax,%rbx
b6: mov %rsi,%rax
b9: or %r8,%rax
bc: jne 4a <gcd_22+0x4a>
with the branches out of the loop going to unlikely code paths.
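The sar $0x3f at offset 59 is the LIMB_HIGHBIT_TO_MASK (t1) step (%r9
holds the high limb of the difference). In portable terms that macro
boils down to something like this sketch (the name is mine, not the
actual definition in gmp-impl.h):

/* Expand the high bit of x into a full-limb mask: 0 if the bit is
   clear, ~0 if it is set.  gcc turns this into a single arithmetic
   right shift. */
#define HIGHBIT_TO_MASK_SKETCH(x) \
  (- (mp_limb_t) ((x) >> (GMP_LIMB_BITS - 1)))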
Regards,
/Niels
----8<------
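/* Standalone test of gcd_22 against mpz_gcd.  Since it includes
   gmp-impl.h and longlong.h, it has to be compiled against a GMP
   source/build tree; hypothetically something like

     gcc -O3 -I$GMP_BUILD -I$GMP_SRC test-gcd_22.c $GMP_BUILD/.libs/libgmp.a

   with the paths adjusted to the local tree. */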
#include <stdio.h>
#include <stdlib.h>
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
typedef struct {
  mp_limb_t lo;
  mp_limb_t hi;
} double_limb_t;
static double_limb_t
ref_gcd_22 (mp_limb_t uh, mp_limb_t ul, mp_limb_t vh, mp_limb_t vl)
{
  double_limb_t res;
  mpz_t u;
  mpz_t v;
  mpz_t g;
  mp_limb_t ua[2] = { ul, uh };
  mp_limb_t va[2] = { vl, vh };

  mpz_init (g);
  mpz_gcd (g, mpz_roinit_n (u, ua, 2), mpz_roinit_n (v, va, 2));
  res.lo = mpz_getlimbn (g, 0);
  res.hi = mpz_getlimbn (g, 1);
  mpz_clear (g);
  return res;
}
static double_limb_t
gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0)
{
  double_limb_t r;
  ASSERT (u0 & v0 & 1);

  /* Drop the known low one bit of each operand; from here on, u and v
     represent the odd values 2u+1 and 2v+1. */
  u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1));
  u1 >>= 1;
  v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1));
  v1 >>= 1;

  while (u1 || v1) /* u1 == 0 can happen at most twice per call */
    {
      mp_limb_t vgtu, t1, t0;
      sub_ddmmss (t1, t0, u1, u0, v1, v0);
      vgtu = LIMB_HIGHBIT_TO_MASK (t1);
      if (UNLIKELY (t0 == 0))
        {
          if (t1 == 0)
            {
              r.hi = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1));
              r.lo = (u0 << 1) | 1;
              return r;
            }
          int c;
          count_trailing_zeros (c, t1);
          /* v1 = min (u1, v1) */
          v1 += (vgtu & t1);
          /* u0 = |u1 - v1| */
          u0 = (t1 ^ vgtu) - vgtu;
          ASSERT (c < GMP_LIMB_BITS - 1);
          u0 >>= c + 1;
          u1 = 0;
        }
      else
        {
          int c;
          count_trailing_zeros (c, t0);
          c++;
          /* V <-- min (U, V).
             Assembly version should use cmov. Another alternative,
             avoiding carry propagation, would be
             v0 += vgtu & t0; v1 += vgtu & (u1 - v1); */
          add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0);
          /* U <-- |U - V|
             No carry handling needed in this conditional negation,
             since t0 != 0. */
          u0 = (t0 ^ vgtu) - vgtu;
          u1 = t1 ^ vgtu;
          if (UNLIKELY (c == GMP_LIMB_BITS))
            {
              u0 = u1;
              u1 = 0;
            }
          else
            {
              u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c));
              u1 >>= c;
            }
        }
    }

  while ((v0 | u0) & GMP_LIMB_HIGHBIT)
    { /* At most two iterations */
      mp_limb_t vgtu, t0;
      int c;
      sub_ddmmss (vgtu, t0, 0, u0, 0, v0);
      if (UNLIKELY (t0 == 0))
        {
          r.hi = u0 >> (GMP_LIMB_BITS - 1);
          r.lo = (u0 << 1) | 1;
          return r;
        }
      /* v <-- min (u, v) */
      v0 += (vgtu & t0);
      /* u <-- |u - v| */
      u0 = (t0 ^ vgtu) - vgtu;
      count_trailing_zeros (c, t0);
      u0 = (u0 >> 1) >> c;
    }

  r.lo = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1);
  r.hi = 0;
  return r;
}
static void
one_test (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, double_limb_t ref)
{
  double_limb_t r = gcd_22 (ah, al, bh, bl);
  if (r.lo != ref.lo || r.hi != ref.hi)
    {
      gmp_fprintf (stderr,
                   "gcd_22 (0x%Mx %08Mx, 0x%Mx %08Mx) failed, got: 0x%Mx %08Mx, ref: 0x%Mx %08Mx\n",
                   ah, al, bh, bl, r.hi, r.lo, ref.hi, ref.lo);
      abort ();
    }
}
int
main (int argc, char **argv)
{
  mpz_t a, b;
  int count = 100000;
  int test;
  gmp_randstate_t rands;

  gmp_randinit_default (rands);
  mpz_init (a);
  mpz_init (b);

  for (test = 0; test < count; test++)
    {
      mp_limb_t al, ah, bl, bh;
      mp_bitcnt_t size = 1 + gmp_urandomm_ui (rands, 2*GMP_NUMB_BITS);
      if (test & 1)
        {
          mpz_urandomb (a, rands, size);
          mpz_urandomb (b, rands, size);
        }
      else
        {
          mpz_rrandomb (a, rands, size);
          mpz_rrandomb (b, rands, size);
        }
      mpz_setbit (a, 0);
      mpz_setbit (b, 0);

      al = mpz_getlimbn (a, 0);
      ah = mpz_getlimbn (a, 1);
      bl = mpz_getlimbn (b, 0);
      bh = mpz_getlimbn (b, 1);
      one_test (ah, al, bh, bl, ref_gcd_22 (ah, al, bh, bl));
    }

  mpz_clear (a);
  mpz_clear (b);
  gmp_randclear (rands);
  return EXIT_SUCCESS;
}
--
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.