[Gmp-commit] /var/hg/gmp: mpn/x86_64/mod_1_1.asm: Simpler and faster mpn_mod_...
mercurial at gmplib.org
mercurial at gmplib.org
Mon Feb 28 21:13:38 CET 2011
details: /var/hg/gmp/rev/7af6550aaca0
changeset: 13942:7af6550aaca0
user: Niels Möller <nisse at lysator.liu.se>
date: Mon Feb 28 21:13:25 2011 +0100
description:
mpn/x86_64/mod_1_1.asm: Simpler and faster mpn_mod_1_1p_cps.
diffstat:
ChangeLog | 4 ++++
mpn/x86_64/mod_1_1.asm | 31 ++++++++++---------------------
2 files changed, 14 insertions(+), 21 deletions(-)
diffs (96 lines):
diff -r 4828d99fcfb3 -r 7af6550aaca0 ChangeLog
--- a/ChangeLog Mon Feb 28 16:54:52 2011 +0100
+++ b/ChangeLog Mon Feb 28 21:13:25 2011 +0100
@@ -1,5 +1,9 @@
2011-02-28 Niels Möller <nisse at lysator.liu.se>
+ * mpn/x86_64/mod_1_1.asm (mpn_mod_1_1p_cps): Simplified
+ computation of B2modb, use B^2 mod (normalized b).
+ (mpn_mod_1_1p): Corresponding changes. Don't shift b.
+
* mpn/generic/pre_mod_1.c (mpn_preinv_mod_1): Use udiv_rnnd_preinv
rather than udiv_qrnnd_preinv.
diff -r 4828d99fcfb3 -r 7af6550aaca0 mpn/x86_64/mod_1_1.asm
--- a/mpn/x86_64/mod_1_1.asm Mon Feb 28 16:54:52 2011 +0100
+++ b/mpn/x86_64/mod_1_1.asm Mon Feb 28 21:13:25 2011 +0100
@@ -48,10 +48,6 @@
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementaion needs B1modb only when cnt > 0
-C Currently needs b to not be preshifted, we actually have to undo shift done
-C by caller. Perhaps b shouldn't be passed at all, it should be in the pre
-C block where the cps function is free to store whatever is needed.
-
C The iteration is almost as follows,
C
C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u
@@ -80,9 +76,6 @@
mov %rdx, b
mov %rcx, pre
- mov 8(pre), R32(%rcx)
- shr R8(%rcx), b
-
mov -8(ap, n, 8), %rax
cmp $3, n
jnc L(first)
@@ -128,7 +121,7 @@
test R32(%rcx), R32(%rcx)
jz L(normalized)
- C Unnormalized, use B1modb to reduce to size < B b
+ C Unnormalized, use B1modb to reduce to size < B (b+1)
mulq 16(pre)
xor t0, t0
add %rax, r0
@@ -136,7 +129,6 @@
mov t0, %rax
C Left-shift to normalize
- shl R8(%rcx), b
ifdef(`SHLD_SLOW',`
shl R8(%rcx), %rax
mov r0, t0
@@ -192,13 +184,18 @@
mov %r12, %r8
mov %rax, (%rbx) C store bi
mov %rbp, 8(%rbx) C store cnt
+ imul %rax, %r12
+ neg %r12
+ mov %r12, 24(%rbx) C store B2modb
+ mov R32(%rbp), R32(%rcx)
+ test R32(%rcx), R32(%rcx)
+ jz L(z)
neg %r8
- mov R32(%rbp), R32(%rcx)
+
mov $1, R32(%rdx)
ifdef(`SHLD_SLOW',`
shl R8(%rcx), %rdx
neg R32(%rcx)
- je L(z)
mov %rax, %rbp
shr R8(%rcx), %rax
or %rax, %rdx
@@ -208,18 +205,10 @@
shld R8(%rcx), %rax, %rdx
')
imul %rdx, %r8
-L(z): mul %r8
- add %r8, %rdx
- not %rdx
- imul %r12, %rdx
- add %rdx, %r12
- cmp %rdx, %rax
- cmovc %r12, %rdx
shr R8(%rcx), %r8
- shr R8(%rcx), %rdx
- mov %r8, 16(%rbx) C store B1modb
+ mov %r8, 16(%rbx) C store B1modb
+L(z):
pop %r12
- mov %rdx, 24(%rbx) C store B2modb
pop %rbx
pop %rbp
ret
More information about the gmp-commit
mailing list