[Gmp-commit] /var/hg/gmp: Small changes to atom/32 sqr

Mon Mar 7 22:05:18 CET 2011

details:   /var/hg/gmp/rev/f6ab07c011b0
changeset: 14008:f6ab07c011b0
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Mon Mar 07 22:05:14 2011 +0100
description:
Small changes to atom/32 sqr

diffstat:

 ChangeLog                          |   4 ++
 mpn/x86/atom/sse2/sqr_basecase.asm |  59 +++++++++++++++++--------------------
 2 files changed, 31 insertions(+), 32 deletions(-)

diffs (146 lines):

diff -r 23aac43a7ddc -r f6ab07c011b0 ChangeLog

--- a/ChangeLog	Mon Mar 07 17:11:10 2011 +0100
+++ b/ChangeLog	Mon Mar 07 22:05:14 2011 +0100
@@ -1,3 +1,7 @@
+2011-03-07 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+	* mpn/x86/atom/sse2/sqr_basecase.asm: Small code cleanup.
+
 2011-03-07  Torbjorn Granlund  <tege at gmplib.org>
 
 	* mpn/x86/atom/sse2/mul_basecase.asm: Replace addmul_1 loops.
diff -r 23aac43a7ddc -r f6ab07c011b0 mpn/x86/atom/sse2/sqr_basecase.asm
--- a/mpn/x86/atom/sse2/sqr_basecase.asm	Mon Mar 07 17:11:10 2011 +0100
+++ b/mpn/x86/atom/sse2/sqr_basecase.asm	Mon Mar 07 22:05:14 2011 +0100
@@ -60,19 +60,18 @@
 	mov	28(%esp), n
 
 	lea	4(rp), rp	C write triangular product starting at rp[1]
-	lea	-1(n), %eax
-	neg	n
+	dec	n
 	movd	(up), %mm7
 
-	test	%eax, %eax
 	jz	L(one)
+	lea	4(up), up
+	mov	n, %eax
 
-	movd	4(up), %mm0
-	lea	4(up), up
+	movd	(up), %mm0
+	neg	n
 	pmuludq	%mm7, %mm0
 	pxor	%mm6, %mm6
-	lea	1(n), un	C decr ABSOLUTE value
-	lea	1(n), n		C decr ABSOLUTE value
+	mov	n, un
 
 	and	$3, %eax
 	jz	L(of0)
@@ -246,9 +245,6 @@
 L(eq1):
 	psrlq	$32, %mm1
 	movd	%mm1, %eax
-	add	%ebx, 4(rp)
-	adc	un, %eax
-	mov	%eax, 8(rp)
 	jmp	L(cj1)
 
 L(la1):	adc	$0, %edx
@@ -303,10 +299,11 @@
 	adc	%edx, %ebx
 	movd	%mm1, %eax
 	adc	un, %eax
+L(cj1):
 	add	%ebx, 4(rp)
 	adc	un, %eax
 	mov	%eax, 8(rp)
-L(cj1):
+
 	inc	n
 	jz	L(done)
 
@@ -537,38 +534,36 @@
 
 C ================================================================
 
-L(done):
+L(done):			C n is zero here
 	mov	24(%esp), up
 	mov	28(%esp), %eax
 
 	movd	(up), %mm0
-	xor	%ebp, %ebp
+	inc	%eax
 	pmuludq	%mm0, %mm0
+	lea	4(up), up
 	mov	20(%esp), rp
 	shr	%eax
 	movd	%mm0, (rp)
 	psrlq	$32, %mm0
+	lea	-12(rp), rp
 	mov	%eax, 28(%esp)
-	jc	L(odd)
+	jnc	L(odd)
 
-	movd	%mm0, %ecx
-	movd	4(up), %mm0
-	lea	-4(rp), rp
+	movd	%mm0, %ebp
+	movd	(up), %mm0
+	lea	8(rp), rp
 	pmuludq	%mm0, %mm0
-	add	8(rp), %ecx
+	lea	-4(up), up
+	add	8(rp), %ebp
 	movd	%mm0, %edx
 	adc	12(rp), %edx
-	rcr	%ebp
+	rcr	n
 	jmp	L(ent)
 
-L(odd):	clc			C clear carry  FIXME: use test/and
-	lea	4(rp), rp
-C	jz	L(end)
-	lea	4(up), up
-
 C	ALIGN(16)		C alignment seems irrelevant
 L(top):	movd	(up), %mm1
-	adc	%ebp, %ebp
+	adc	n, n
 	movd	%mm0, %eax
 	pmuludq	%mm1, %mm1
 	movd	4(up), %mm0
@@ -577,24 +572,24 @@
 	pmuludq	%mm0, %mm0
 	psrlq	$32, %mm1
 	adc	4(rp), %ebx
-	movd	%mm1, %ecx
+	movd	%mm1, %ebp
 	movd	%mm0, %edx
-	adc	8(rp), %ecx
+	adc	8(rp), %ebp
 	adc	12(rp), %edx
-	rcr	%ebp		C FIXME: isn't this awfully slow on atom???
+	rcr	n		C FIXME: isn't this awfully slow on atom???
 	adc	%eax, (rp)
 	adc	%ebx, 4(rp)
 L(ent):	lea	8(up), up
-	adc	%ecx, 8(rp)
+	adc	%ebp, 8(rp)
 	psrlq	$32, %mm0
 	adc	%edx, 12(rp)
-	decl	28(%esp)
+L(odd):	decl	28(%esp)
 	lea	16(rp), rp
 	jnz	L(top)
 
-L(end):	adc	%ebp, %ebp
+L(end):	adc	n, n
 	movd	%mm0, %eax
-	adc	%ebp, %eax
+	adc	n, %eax
 	mov	%eax, (rp)
 
 L(rtn):	emms