[Gmp-commit] /var/hg/gmp: (mpn_mod_1_1p): Rewrite using the same algorithm as...

Thu Feb 24 21:08:52 CET 2011

details:   /var/hg/gmp/rev/e2be96b70ba6
changeset: 13892:e2be96b70ba6
user:      Niels Möller <nisse at lysator.liu.se>
date:      Thu Feb 24 21:08:30 2011 +0100
description:
(mpn_mod_1_1p): Rewrite using the same algorithm as the x86_64 version.

diffstat:

 ChangeLog              |    5 +
 mpn/x86/k7/mod_1_1.asm |  197 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 139 insertions(+), 63 deletions(-)

diffs (250 lines):

diff -r 6c79ae2572a4 -r e2be96b70ba6 ChangeLog

--- a/ChangeLog	Wed Feb 23 22:12:01 2011 +0100
+++ b/ChangeLog	Thu Feb 24 21:08:30 2011 +0100
@@ -1,3 +1,8 @@
+2011-02-24  Niels Möller  <nisse at lysator.liu.se>
+
+	* mpn/x86/k7/mod_1_1.asm (mpn_mod_1_1p): Rewrite using the same
+	algorithm as the x86_64 version.
+
 2011-02-23 Marco Bodrato <bodrato at mail.dm.unipi.it>
 
 	* mpn/x86/atom/logops_n.asm: New file (same loop as aors_n).
diff -r 6c79ae2572a4 -r e2be96b70ba6 mpn/x86/k7/mod_1_1.asm
--- a/mpn/x86/k7/mod_1_1.asm	Wed Feb 23 22:12:01 2011 +0100
+++ b/mpn/x86/k7/mod_1_1.asm	Thu Feb 24 21:08:30 2011 +0100
@@ -1,8 +1,8 @@
 dnl  x86-32 mpn_mod_1_1p, requiring cmov.
 
-dnl  Contributed to the GNU project by Torbjorn Granlund.
+dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
 dnl
-dnl  Copyright 2010 Free Software Foundation, Inc.
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -25,16 +25,45 @@
 C P5				 ?
 C P6 model 0-8,10-12		 ?
 C P6 model 9  (Banias)		 ?
-C P6 model 13 (Dothan)		11.75
+C P6 model 13 (Dothan)		 ?
 C P4 model 0  (Willamette)	 ?
 C P4 model 1  (?)		 ?
 C P4 model 2  (Northwood)	 ?
 C P4 model 3  (Prescott)	 ?
 C P4 model 4  (Nocona)		 ?
 C AMD K6			 ?
-C AMD K7			 8
+C AMD K7			 7
 C AMD K8			 ?
 
+define(`B2mb', `%ebx')
+define(`r0', `%esi')
+define(`r2', `%ebp')
+define(`t0', `%edi')
+define(`ap', `%ecx')  C Also shift count
+
+C Stack frame
+C	pre	36(%esp)
+C	b	32(%esp)
+C	n	28(%esp)
+C	ap	24(%esp)
+C	return	20(%esp)
+C	%ebp	16(%esp)
+C	%edi	12(%esp)
+C	%esi	8(%esp)
+C	%ebx	4(%esp)
+C	B2modb	(%esp)
+
+define(`B2modb', `(%esp)')
+define(`n', `28(%esp)')
+define(`b', `32(%esp)')
+define(`pre', `36(%esp)')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
+C
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
+
 ASM_START()
 	TEXT
 	ALIGN(8)
@@ -43,74 +72,116 @@
 	push	%edi
 	push	%esi
 	push	%ebx
-	mov	24(%esp), %ebx
-	mov	20(%esp), %esi
-	mov	32(%esp), %ebp		C cps[]
-	lea	(%esi,%ebx,4), %esi
+	mov	32(%esp), %ebp		C pre[]
 
-	mov	8(%ebp), %edi		C B1modb
-	mov	12(%ebp), %ebp		C B2modb
-	mov	-4(%esi), %eax
-	mul	%edi
-	xor	%ecx, %ecx
-	add	-8(%esi), %eax
-	adc	%edx, %ecx
-	sub	$2, 24(%esp)
-	jle	L(end)
+	mov	12(%ebp), %eax		C B2modb
+	push	%eax			C Put it on stack
+
+	mov	4(%ebp), %cl
+	shrl	%cl, b
+
+	mov	n, %edx
+	mov	24(%esp), ap
+
+	lea	(ap, %edx, 4), ap
+	mov	-4(ap), %eax
+	cmp	$3, %edx
+	jnc	L(first)
+	mov	-8(ap), r0
+	jmp	L(reduce_two)
+
+L(first):
+	C First iteration, no r2
+	mull	B2modb
+	mov	-12(ap), r0
+	add	%eax, r0
+	mov	-8(ap), %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+	sub	$3, n
+	lea	-16(ap), ap
+	jz	L(reduce_three)
+
+	mov	B2modb, B2mb
+	sub	b, B2mb
+	lea	(B2mb, r0), t0
+	jmp	L(mid)
 
 	ALIGN(16)
-L(top):	mul	%edi			C 0
-	mov	-12(%esi), %ebx		C
-	add	%eax, %ebx		C 4
-	mov	%ecx, %eax		C 2
-	mov	$0, %ecx		C
-	adc	%edx, %ecx		C 6
-	mul	%ebp			C 3
-	add	%ebx, %eax		C 7
-	adc	%edx, %ecx		C 9
-	decl	24(%esp)
-	lea	-4(%esi), %esi
-	jg	L(top)
+L(top): C Loopmixed to 7 c/l on k7
+	add	%eax, r0
+	lea	(B2mb, r0), t0
+	mov	r2, %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+L(mid):	mull	B2modb
+	and	B2modb, r2
+	add	r0, r2
+	decl	n
+	mov	(ap), r0
+	cmovc(	t0, r2)
+	lea	-4(ap), ap
+	jnz	L(top)
 
-L(end):	mov	%eax, %ebp
-	mov	%ecx, %eax
-	mul	%edi
-	mov	32(%esp), %edi
-	add	%eax, %ebp
-	adc	$0, %edx
-	mov	4(%edi), %ecx
-	mov	%edx, %eax		C rh
-	mov	%ebp, %esi		C rl
-	sal	%cl, %eax
-	mov	%ecx, %ebx
-	test	%ecx, %ecx
-	je	L(nrm)
-	neg	%ecx
-	shr	%cl, %esi
-	or	%esi, %eax
-	neg	%ecx
-L(nrm):	lea	1(%eax), %esi
-	mull	(%edi)
-	mov	%eax, %ebx
-	mov	%ebp, %eax
-	mov	28(%esp), %ebp
-	sal	%cl, %eax
-	add	%eax, %ebx
-	adc	%esi, %edx
-	imul	%ebp, %edx
-	sub	%edx, %eax
-	lea	(%eax,%ebp), %edx
-	cmp	%eax, %ebx
-	cmovc(	%edx, %eax)
-	mov	%eax, %edx
-	sub	%ebp, %eax
-	cmovc(	%edx, %eax)
+	add	%eax, r0
+	mov	r2, %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+
+L(reduce_three):
+	C Eliminate r2
+	and	b, r2
+	sub	r2, %eax
+
+L(reduce_two):
+	mov	pre, %ebp
+	movb	4(%ebp), %cl
+	test	%cl, %cl
+	jz	L(normalized)
+
+	C Unnormalized, use B1modb to reduce to size < B b
+	mull	8(%ebp)
+	xor	t0, t0
+	add	%eax, r0
+	adc	%edx, t0
+	mov	t0, %eax
+
+	C Left-shift to normalize
+	shll	%cl, b
+	shld	%cl, r0, %eax C Always use shld?
+
+	shl	%cl, r0
+	jmp	L(udiv)
+
+L(normalized):
+	mov	%eax, t0
+	sub	b, t0
+	cmovnc(	t0, %eax)
+
+L(udiv):
+	lea	1(%eax), t0
+	mull	(%ebp)
+	mov	b, %ebx		C Needed in register for lea
+	add	r0, %eax
+	adc	t0, %edx
+	imul	%ebx, %edx
+	sub	%edx, r0
+	cmp	r0, %eax
+	lea	(%ebx, r0), %eax
+	cmovnc(	r0, %eax)
+	cmp	%ebx, %eax
+	jnc	L(fix)
+L(ok):	shr	%cl, %eax
+
+	add	$4, %esp
 	pop	%ebx
 	pop	%esi
 	pop	%edi
 	pop	%ebp
-	shr	%cl, %eax
+
 	ret
+L(fix):	sub	%ebx, %eax
+	jmp	L(ok)
 EPILOGUE()
 
 PROLOGUE(mpn_mod_1_1p_cps)