gcd_22
Marco Bodrato
bodrato at mail.dm.unipi.it
Tue Aug 27 06:01:00 UTC 2019
Ciao,
Il Dom, 25 Agosto 2019 2:28 am, Torbjörn Granlund ha scritto:
> Now we have a nice set of x86_64 gcd_22. The code is not as well tuned
> as the gcd_11 code, but it runs somewhat fast.
So if I suggest to reorder some instructions in the loop, you will not
upset :-)
If we can change cmovc-s then there are a couple of instructions that can
be removed from the "unlikely" branch in gcd_22:
***
/tmp/extdiff.1GZGxB/gmp.746b5528f6a5/mpn/x86_64/core2/gcd_22.asm 2019-08-25
10:32:39.000000000 +0200
--- /home/bodrato/gmp/mpn/x86_64/core2/gcd_22.asm 2019-08-26
23:23:30.321470451 +0200
***************
*** 94,108 ****
bsf t0, cnt
sub v0, u0
sbb v1, u1
- L(bck): cmovc t0, u0 C u = |u - v|
cmovc t1, u1 C u = |u - v|
cmovc s0, v0 C v = min(u,v)
cmovc s1, v1 C v = min(u,v)
shrd R8(cnt), u1, u0
shr R8(cnt), u1
mov v1, t1
--- 94,108 ----
bsf t0, cnt
sub v0, u0
sbb v1, u1
cmovc t1, u1 C u = |u - v|
cmovc s0, v0 C v = min(u,v)
+ L(bck): cmovc t0, u0 C u = |u - v|
cmovc s1, v1 C v = min(u,v)
shrd R8(cnt), u1, u0
shr R8(cnt), u1
mov v1, t1
***************
*** 118,131 ****
C 1. If v1 - u1 = 0, then gcd is u = v.
C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
mov v1, t0
sub u1, t0
je L(end)
! xor t1, t1
! mov u0, s0
mov u1, s1
bsf t0, cnt
mov u1, u0
xor u1, u1
sub v1, u0
jmp L(bck)
--- 118,131 ----
C 1. If v1 - u1 = 0, then gcd is u = v.
C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
mov v1, t0
sub u1, t0
je L(end)
! C xor t1, t1
! C mov u0, s0
mov u1, s1
bsf t0, cnt
mov u1, u0
xor u1, u1
sub v1, u0
jmp L(bck)
The same applies to other x86_64 gcd_22...
> I haven't explored the table based variant which gives 3 bits of
> progress per iteration. It might make the new code obsolete for
> machines with fast multiply.
I'm not sure it's easy to handle the case with u+v that overflows...
Ĝis,
m
--
http://bodrato.it/papers/
More information about the gmp-devel
mailing list