Small operands gcd improvements
Niels Möller
nisse at lysator.liu.se
Wed Aug 14 19:56:04 UTC 2019
tg at gmplib.org (Torbjörn Granlund) writes:
> I believe an amalgamation of Niels' code which uses an implicit lsb and
> his present _22 code would be best. The present requirement for
> sub_ddmmss is a portability inconvenience, and for subtract-with-carry
> challenged processors a major hassle.
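For background, the generic C fallback for sub_ddmmss in longlong.h
amounts to something like the sketch below (illustrative name and
spelling, not the actual macro); the explicit borrow computation is
what subtract-with-carry challenged processors have to pay for.

/* Two-limb subtract in portable C: compute the low-limb difference,
   then subtract the borrow (al < bl) from the high-limb difference. */
#define sub_ddmmss_sketch(sh, sl, ah, al, bh, bl)	\
  do {							\
    mp_limb_t __x = (al) - (bl);			\
    (sh) = (ah) - (bh) - ((al) < (bl));			\
    (sl) = __x;						\
  } while (0)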
Below is another version, which passes some tests. It has an implicit
one bit, which causes some impedance mismatch. Maybe it would be better
to have some special iterations at the top to eliminate the top bits
before falling into the main loop.
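To make the mismatch concrete, here is the bookkeeping the code below
relies on (my summary of the invariant, not part of the patch):

/* With the implicit bit, the stored u represents the odd value
   U = 2*u + 1, and likewise V = 2*v + 1.  A subtraction step gives
       U - V = 2*(u - v),
   which has c + 1 trailing zero bits when u - v has c.  Stripping
   them and re-encoding yields the new stored value
       u' = |u - v| >> (c + 1),
   hence the "c++" and the "u0 >>= c + 1" in gcd_22 below. */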
Not sure I've identified the main loop correctly in the generated code,
but I think it's this (x86_64, gcc-8.3 -O3):
4a: mov %rsi,%r9
4d: mov %rbx,%rax
50: sub %r11,%rax
53: sbb %r8,%r9
56: mov %r9,%rdi
59: sar $0x3f,%rdi
5d: test %rax,%rax
60: je 130 <gcd_22+0x130>
66: bsf %rax,%r10
6a: mov %r9,%rdx
6d: mov %rax,%rsi
70: add $0x1,%r10d
74: xor %rdi,%rax
77: mov %r11,%rcx
7a: and %rdi,%rdx
7d: and %rdi,%rsi
80: sub %rdi,%rax
83: add %rsi,%rcx
86: adc %rdx,%r8
89: xor %rdi,%r9
8c: mov %rcx,%r11
8f: cmp $0x40,%r10d
93: je 168 <gcd_22+0x168>
99: mov %r10d,%ecx
9c: mov %r9,%rbx
9f: mov %r9,%rsi
a2: shr %cl,%rax
a5: mov %ebp,%ecx
a7: sub %r10d,%ecx
aa: shl %cl,%rbx
ad: mov %r10d,%ecx
b0: shr %cl,%rsi
b3: or %rax,%rbx
b6: mov %rsi,%rax
b9: or %r8,%rax
bc: jne 4a <gcd_22+0x4a>
with the branches out of the loop going to unlikely code paths.
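The sar $0x3f at offset 59 is the LIMB_HIGHBIT_TO_MASK (t1) step (%r9
holds the high limb of the difference). In portable terms that macro
boils down to something like this sketch (the name is mine, not the
actual definition in gmp-impl.h):

/* Expand the high bit of x into a full-limb mask: 0 if the bit is
   clear, ~0 if it is set.  gcc turns this into a single arithmetic
   right shift. */
#define HIGHBIT_TO_MASK_SKETCH(x) \
  (- (mp_limb_t) ((x) >> (GMP_LIMB_BITS - 1)))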
Regards,
/Niels
----8<------
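/* Standalone test of gcd_22 against mpz_gcd.  Since it includes
   gmp-impl.h and longlong.h, it has to be compiled against a GMP
   source/build tree; hypothetically something like

     gcc -O3 -I$GMP_BUILD -I$GMP_SRC test-gcd_22.c $GMP_BUILD/.libs/libgmp.a

   with the paths adjusted to the local tree. */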
#include <stdio.h>
#include <stdlib.h>
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
typedef struct {
  mp_limb_t lo;
  mp_limb_t hi;
} double_limb_t;
static double_limb_t
ref_gcd_22 (mp_limb_t uh, mp_limb_t ul, mp_limb_t vh, mp_limb_t vl)
{
  double_limb_t res;
  mpz_t u;
  mpz_t v;
  mpz_t g;
  mp_limb_t ua[2] = { ul, uh };
  mp_limb_t va[2] = { vl, vh };

  mpz_init (g);
  mpz_gcd (g, mpz_roinit_n (u, ua, 2), mpz_roinit_n (v, va, 2));
  res.lo = mpz_getlimbn (g, 0);
  res.hi = mpz_getlimbn (g, 1);
  mpz_clear (g);
  return res;
}
static double_limb_t
gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0)
{
  double_limb_t r;
  ASSERT (u0 & v0 & 1);

  /* Drop the known low one bit of each operand; from here on, u and v
     represent the odd values 2u+1 and 2v+1. */
  u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1));
  u1 >>= 1;
  v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1));
  v1 >>= 1;

  while (u1 || v1) /* u1 == 0 can happen at most twice per call */
    {
      mp_limb_t vgtu, t1, t0;
      sub_ddmmss (t1, t0, u1, u0, v1, v0);
      vgtu = LIMB_HIGHBIT_TO_MASK (t1);
      if (UNLIKELY (t0 == 0))
        {
          if (t1 == 0)
            {
              r.hi = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1));
              r.lo = (u0 << 1) | 1;
              return r;
            }
          int c;
          count_trailing_zeros (c, t1);
          /* v1 = min (u1, v1) */
          v1 += (vgtu & t1);
          /* u0 = |u1 - v1| */
          u0 = (t1 ^ vgtu) - vgtu;
          ASSERT (c < GMP_LIMB_BITS - 1);
          u0 >>= c + 1;
          u1 = 0;
        }
      else
        {
          int c;
          count_trailing_zeros (c, t0);
          c++;
          /* V <-- min (U, V).
             Assembly version should use cmov. Another alternative,
             avoiding carry propagation, would be
             v0 += vgtu & t0; v1 += vgtu & (u1 - v1); */
          add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0);
          /* U <-- |U - V|
             No carry handling needed in this conditional negation,
             since t0 != 0. */
          u0 = (t0 ^ vgtu) - vgtu;
          u1 = t1 ^ vgtu;
          if (UNLIKELY (c == GMP_LIMB_BITS))
            {
              u0 = u1;
              u1 = 0;
            }
          else
            {
              u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c));
              u1 >>= c;
            }
        }
    }

  while ((v0 | u0) & GMP_LIMB_HIGHBIT)
    { /* At most two iterations */
      mp_limb_t vgtu, t0;
      int c;
      sub_ddmmss (vgtu, t0, 0, u0, 0, v0);
      if (UNLIKELY (t0 == 0))
        {
          r.hi = u0 >> (GMP_LIMB_BITS - 1);
          r.lo = (u0 << 1) | 1;
          return r;
        }
      /* v <-- min (u, v) */
      v0 += (vgtu & t0);
      /* u <-- |u - v| */
      u0 = (t0 ^ vgtu) - vgtu;
      count_trailing_zeros (c, t0);
      u0 = (u0 >> 1) >> c;
    }

  r.lo = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1);
  r.hi = 0;
  return r;
}
static void
one_test (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, double_limb_t ref)
{
  double_limb_t r = gcd_22 (ah, al, bh, bl);
  if (r.lo != ref.lo || r.hi != ref.hi)
    {
      gmp_fprintf (stderr,
                   "gcd_22 (0x%Mx %08Mx, 0x%Mx %08Mx) failed, got: 0x%Mx %08Mx, ref: 0x%Mx %08Mx\n",
                   ah, al, bh, bl, r.hi, r.lo, ref.hi, ref.lo);
      abort ();
    }
}
int
main (int argc, char **argv)
{
  mpz_t a, b;
  int count = 100000;
  int test;
  gmp_randstate_t rands;

  gmp_randinit_default (rands);
  mpz_init (a);
  mpz_init (b);

  for (test = 0; test < count; test++)
    {
      mp_limb_t al, ah, bl, bh;
      mp_bitcnt_t size = 1 + gmp_urandomm_ui (rands, 2*GMP_NUMB_BITS);
      if (test & 1)
        {
          mpz_urandomb (a, rands, size);
          mpz_urandomb (b, rands, size);
        }
      else
        {
          mpz_rrandomb (a, rands, size);
          mpz_rrandomb (b, rands, size);
        }
      mpz_setbit (a, 0);
      mpz_setbit (b, 0);

      al = mpz_getlimbn (a, 0);
      ah = mpz_getlimbn (a, 1);
      bl = mpz_getlimbn (b, 0);
      bh = mpz_getlimbn (b, 1);
      one_test (ah, al, bh, bl, ref_gcd_22 (ah, al, bh, bl));
    }

  mpz_clear (a);
  mpz_clear (b);
  gmp_randclear (rands);
  return EXIT_SUCCESS;
}
--
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.