[Gmp-commit] /var/hg/gmp: mpn/x86_64/mod_1_1.asm: Simpler and faster mpn_mod_...
mercurial at gmplib.org
mercurial at gmplib.org
Mon Feb 28 21:13:38 CET 2011
details: /var/hg/gmp/rev/7af6550aaca0
changeset: 13942:7af6550aaca0
user: Niels Möller <nisse at lysator.liu.se>
date: Mon Feb 28 21:13:25 2011 +0100
description:
mpn/x86_64/mod_1_1.asm: Simpler and faster mpn_mod_1_1p_cps.
diffstat:
ChangeLog | 4 ++++
mpn/x86_64/mod_1_1.asm | 31 ++++++++++---------------------
2 files changed, 14 insertions(+), 21 deletions(-)
diffs (96 lines):
diff -r 4828d99fcfb3 -r 7af6550aaca0 ChangeLog
--- a/ChangeLog Mon Feb 28 16:54:52 2011 +0100
+++ b/ChangeLog Mon Feb 28 21:13:25 2011 +0100
@@ -1,5 +1,9 @@
2011-02-28 Niels Möller <nisse at lysator.liu.se>
+ * mpn/x86_64/mod_1_1.asm (mpn_mod_1_1p_cps): Simplified
+ computation of B2modb, use B^2 mod (normalized b).
+ (mpn_mod_1_1p): Corresponding changes. Don't shift b.
+
* mpn/generic/pre_mod_1.c (mpn_preinv_mod_1): Use udiv_rnnd_preinv
rather than udiv_qrnnd_preinv.
diff -r 4828d99fcfb3 -r 7af6550aaca0 mpn/x86_64/mod_1_1.asm
--- a/mpn/x86_64/mod_1_1.asm Mon Feb 28 16:54:52 2011 +0100
+++ b/mpn/x86_64/mod_1_1.asm Mon Feb 28 21:13:25 2011 +0100
@@ -48,10 +48,6 @@
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementaion needs B1modb only when cnt > 0
-C Currently needs b to not be preshifted, we actually have to undo shift done
-C by caller. Perhaps b shouldn't be passed at all, it should be in the pre
-C block where the cps function is free to store whatever is needed.
-
C The iteration is almost as follows,
C
C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u
@@ -80,9 +76,6 @@
mov %rdx, b
mov %rcx, pre
- mov 8(pre), R32(%rcx)
- shr R8(%rcx), b
-
mov -8(ap, n, 8), %rax
cmp $3, n
jnc L(first)
@@ -128,7 +121,7 @@
test R32(%rcx), R32(%rcx)
jz L(normalized)
- C Unnormalized, use B1modb to reduce to size < B b
+ C Unnormalized, use B1modb to reduce to size < B (b+1)
mulq 16(pre)
xor t0, t0
add %rax, r0
@@ -136,7 +129,6 @@
mov t0, %rax
C Left-shift to normalize
- shl R8(%rcx), b
ifdef(`SHLD_SLOW',`
shl R8(%rcx), %rax
mov r0, t0
@@ -192,13 +184,18 @@
mov %r12, %r8
mov %rax, (%rbx) C store bi
mov %rbp, 8(%rbx) C store cnt
+ imul %rax, %r12
+ neg %r12
+ mov %r12, 24(%rbx) C store B2modb
+ mov R32(%rbp), R32(%rcx)
+ test R32(%rcx), R32(%rcx)
+ jz L(z)
neg %r8
- mov R32(%rbp), R32(%rcx)
+
mov $1, R32(%rdx)
ifdef(`SHLD_SLOW',`
shl R8(%rcx), %rdx
neg R32(%rcx)
- je L(z)
mov %rax, %rbp
shr R8(%rcx), %rax
or %rax, %rdx
@@ -208,18 +205,10 @@
shld R8(%rcx), %rax, %rdx
')
imul %rdx, %r8
-L(z): mul %r8
- add %r8, %rdx
- not %rdx
- imul %r12, %rdx
- add %rdx, %r12
- cmp %rdx, %rax
- cmovc %r12, %rdx
shr R8(%rcx), %r8
- shr R8(%rcx), %rdx
- mov %r8, 16(%rbx) C store B1modb
+ mov %r8, 16(%rbx) C store B1modb
+L(z):
pop %r12
- mov %rdx, 24(%rbx) C store B2modb
pop %rbx
pop %rbp
ret
More information about the gmp-commit
mailing list