[Gmp-commit] /var/hg/gmp: 5 new changesets

mercurial at gmplib.org
Thu Mar 10 14:08:10 CET 2011


details:   /var/hg/gmp/rev/47068e673ecd
changeset: 14025:47068e673ecd
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Mar 10 13:59:44 2011 +0100
description:
Move new aorrlsh_n.asm to new k8 dir.  Revert mpn/x86_64/aorrlsh_n.asm.
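
Background: aorrlsh_n.asm implements mpn_addlsh_n and mpn_rsblsh_n,
computing rp[] = up[] + (vp[] << cnt) and rp[] = (vp[] << cnt) - up[]
respectively.  A rough C model of the addlsh case (illustrative only;
ref_addlsh_n is not a GMP function):

    /* rp[] = up[] + (vp[] << cnt), 0 < cnt < GMP_NUMB_BITS;
       returns the bits carried out of the top limb.  */
    mp_limb_t
    ref_addlsh_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                  mp_size_t n, unsigned cnt)
    {
      mp_limb_t hi = 0, cy = 0;
      mp_size_t i;
      for (i = 0; i < n; i++)
        {
          mp_limb_t s = (vp[i] << cnt) | hi;    /* low part of vp[i] << cnt */
          mp_limb_t t, c1;
          hi = vp[i] >> (GMP_NUMB_BITS - cnt);  /* bits moving up one limb */
          t = up[i] + s;
          c1 = t < s;                           /* carry from up[i] + s */
          rp[i] = t + cy;
          cy = c1 + (rp[i] < t);                /* total carry is 0 or 1 */
        }
      return hi + cy;                           /* hi < 2^cnt, so no overflow */
    }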

details:   /var/hg/gmp/rev/63b28e8d6496
changeset: 14026:63b28e8d6496
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Mar 10 14:01:28 2011 +0100
description:
Set up path for new k8 directory.

details:   /var/hg/gmp/rev/2c2dbed32630
changeset: 14027:2c2dbed32630
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Mar 10 14:06:14 2011 +0100
description:
Suppress wind-down rp updates.
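
The wind-down change in mul_basecase.asm below drops the final
`lea 16(rp), rp' and instead folds the displacement into the remaining
stores, compensating in the outer-loop entry offsets.  In C terms
(illustrative only):

    /* Before: bump the pointer, then store at offset 0.   */
    rp += 4;       /* lea  16(rp), rp   (4-byte limbs)      */
    rp[0] += b;    /* add  %ebx, (rp)                       */
    rp[1]  = a;    /* mov  %eax, 4(rp)                      */

    /* After: leave rp alone and address with fixed offsets,
       saving one instruction per outer-loop wind-down.     */
    rp[4] += b;    /* add  %ebx, 16(rp)                     */
    rp[5]  = a;    /* mov  %eax, 20(rp)                     */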

details:   /var/hg/gmp/rev/4b0c70b6cf9c
changeset: 14028:4b0c70b6cf9c
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Mar 10 14:07:13 2011 +0100
description:
Use 'n' instead of 'r11' directly.

details:   /var/hg/gmp/rev/c6568fc594aa
changeset: 14029:c6568fc594aa
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Mar 10 14:07:49 2011 +0100
description:
Add some more table entries (blank for now).

diffstat:

 configure.in                       |    3 +-
 mpn/x86/aorsmul_1.asm              |    3 +-
 mpn/x86/atom/sse2/mul_basecase.asm |   42 ++---
 mpn/x86_64/aorrlsh_n.asm           |  237 +++++++++++++++---------------------
 mpn/x86_64/aorsmul_1.asm           |    2 +-
 mpn/x86_64/k8/aorrlsh_n.asm        |  200 +++++++++++++++++++++++++++++++
 6 files changed, 322 insertions(+), 165 deletions(-)

diffs (truncated from 695 to 300 lines):

diff -r c55f4c6e9cb1 -r c6568fc594aa configure.in
--- a/configure.in	Thu Mar 10 11:16:38 2011 +0100
+++ b/configure.in	Thu Mar 10 14:07:49 2011 +0100
@@ -1504,9 +1504,10 @@
 	  x86_64)
 	    ;;
 	  k10 | bobcat | bulldozer)
-	    path_64="x86_64/k10 $path_64"
+	    path_64="x86_64/k10 x86_64/k8 $path_64"
 	    ;;
 	  athlon64 | k8)
+	    path_64="x86_64/k8 $path_64"
 	    ;;
 	  pentium4)
 	    path_64="x86_64/pentium4 $path_64"
diff -r c55f4c6e9cb1 -r c6568fc594aa mpn/x86/aorsmul_1.asm
--- a/mpn/x86/aorsmul_1.asm	Thu Mar 10 11:16:38 2011 +0100
+++ b/mpn/x86/aorsmul_1.asm	Thu Mar 10 14:07:49 2011 +0100
@@ -21,7 +21,6 @@
 
 include(`../config.m4')
 
-
 C			    cycles/limb
 C P5				14.75
 C P6 model 0-8,10-12		 7.5
@@ -32,9 +31,11 @@
 C P4 model 2  (Northwood)	24.0
 C P4 model 3  (Prescott)
 C P4 model 4  (Nocona)
+C Intel Atom
 C AMD K6			12.5
 C AMD K7			 5.25
 C AMD K8
+C AMD K10
 
 
 ifdef(`OPERATION_addmul_1',`
diff -r c55f4c6e9cb1 -r c6568fc594aa mpn/x86/atom/sse2/mul_basecase.asm
--- a/mpn/x86/atom/sse2/mul_basecase.asm	Thu Mar 10 11:16:38 2011 +0100
+++ b/mpn/x86/atom/sse2/mul_basecase.asm	Thu Mar 10 14:07:49 2011 +0100
@@ -100,14 +100,14 @@
 
 	decl	vn
 	jz	L(done)
-	lea	8(rp), rp
+	lea	-8(rp), rp
 
 L(ol3):	mov	28(%esp), un
 	neg	un
 	lea	4(vp), vp
 	movd	(vp), %mm7	C read next V limb
 	mov	24(%esp), up
-	lea	(rp,un,4), rp
+	lea	16(rp,un,4), rp
 
 	movd	(up), %mm0
 	pmuludq	%mm7, %mm0
@@ -159,14 +159,13 @@
 	adc	un, %edx	C un is zero here
 	add	%eax, 12(rp)
 	movd	%mm0, %ebx
-	lea	16(rp), rp
 	psrlq	$32, %mm0
 	adc	%edx, %ebx
 	movd	%mm0, %eax
 	adc	un, %eax
-	add	%ebx, (rp)
+	add	%ebx, 16(rp)
 	adc	un, %eax
-	mov	%eax, 4(rp)
+	mov	%eax, 20(rp)
 
 	decl	vn
 	jnz	L(ol3)
@@ -204,24 +203,23 @@
 
 	decl	vn
 	jz	L(done)
-	lea	12(rp), rp
+	lea	-4(rp), rp
 
 L(ol0):	mov	28(%esp), un
 	neg	un
 	lea	4(vp), vp
 	movd	(vp), %mm7	C read next V limb
 	mov	24(%esp), up
-	lea	4(rp,un,4), rp
+	lea	20(rp,un,4), rp
 
 	movd	(up), %mm1
 	pmuludq	%mm7, %mm1
 	sar	$2, un
-	xor	%edx, %edx
 	movd	4(up), %mm0
 	lea	-4(up), up
 	movd	%mm1, %eax
 	pmuludq	%mm7, %mm0
-
+	xor	%edx, %edx	C zero edx and CF
 	jmp	L(a0)
 
 L(la0):	movd	4(up), %mm1
@@ -264,14 +262,13 @@
 	adc	un, %edx	C un is zero here
 	add	%eax, 12(rp)
 	movd	%mm0, %ebx
-	lea	16(rp), rp
 	psrlq	$32, %mm0
 	adc	%edx, %ebx
 	movd	%mm0, %eax
 	adc	un, %eax
-	add	%ebx, (rp)
+	add	%ebx, 16(rp)
 	adc	un, %eax
-	mov	%eax, 4(rp)
+	mov	%eax, 20(rp)
 
 	decl	vn
 	jnz	L(ol0)
@@ -309,13 +306,14 @@
 
 	decl	vn
 	jz	L(done)
+	lea	-16(rp), rp
 
 L(ol1):	mov	28(%esp), un
 	neg	un
 	lea	4(vp), vp
 	movd	(vp), %mm7	C read next V limb
 	mov	24(%esp), up
-	lea	8(rp,un,4), rp
+	lea	24(rp,un,4), rp
 
 	movd	(up), %mm0
 	pmuludq	%mm7, %mm0
@@ -364,17 +362,16 @@
 	inc	un
 	jnz	L(la1)
 
-	adc	un, %edx		C un is zero here
+	adc	un, %edx	C un is zero here
 	add	%eax, 12(rp)
 	movd	%mm0, %ebx
-	lea	16(rp), rp
 	psrlq	$32, %mm0
 	adc	%edx, %ebx
 	movd	%mm0, %eax
 	adc	un, %eax
-	add	%ebx, (rp)
+	add	%ebx, 16(rp)
 	adc	un, %eax
-	mov	%eax, 4(rp)
+	mov	%eax, 20(rp)
 
 	decl	vn
 	jnz	L(ol1)
@@ -412,14 +409,14 @@
 
 	decl	vn
 	jz	L(done)
-	lea	4(rp), rp
+	lea	-12(rp), rp
 
 L(ol2):	mov	28(%esp), un
 	neg	un
 	lea	4(vp), vp
 	movd	(vp), %mm7	C read next V limb
 	mov	24(%esp), up
-	lea	-4(rp,un,4), rp
+	lea	12(rp,un,4), rp
 
 	movd	(up), %mm1
 	pmuludq	%mm7, %mm1
@@ -467,17 +464,16 @@
 	inc	un
 	jnz	L(la2)
 
-	adc	un, %edx		C un is zero here
+	adc	un, %edx	C un is zero here
 	add	%eax, 12(rp)
 	movd	%mm0, %ebx
-	lea	16(rp), rp
 	psrlq	$32, %mm0
 	adc	%edx, %ebx
 	movd	%mm0, %eax
 	adc	un, %eax
-	add	%ebx, (rp)
+	add	%ebx, 16(rp)
 	adc	un, %eax
-	mov	%eax, 4(rp)
+	mov	%eax, 20(rp)
 
 	decl	vn
 	jnz	L(ol2)
diff -r c55f4c6e9cb1 -r c6568fc594aa mpn/x86_64/aorrlsh_n.asm
--- a/mpn/x86_64/aorrlsh_n.asm	Thu Mar 10 11:16:38 2011 +0100
+++ b/mpn/x86_64/aorrlsh_n.asm	Thu Mar 10 14:07:49 2011 +0100
@@ -19,36 +19,37 @@
 
 include(`../config.m4')
 
+
 C	     cycles/limb
-C AMD K8,K9	 2.87	< 3.85 for lshift + add_n
-C AMD K10	 2.75	< 3.85 for lshift + add_n
-C Intel P4	22	> 7.33 for lshift + add_n
-C Intel core2	 4.1	> 3.27 for lshift + add_n
-C Intel NHM	 4.4	> 3.75 for lshift + add_n
-C Intel SBR	 3.17	< 3.46 for lshift + add_n
-C Intel atom	 ?	? 8.75 for lshift + add_n
+C AMD K8,K9	 3.1	< 3.85 for lshift + add_n
+C AMD K10	 3.1	< 3.85 for lshift + add_n
+C Intel P4	14.6	> 7.33 for lshift + add_n
+C Intel core2	 3.87	> 3.27 for lshift + add_n
+C Intel NHM	 4	> 3.75 for lshift + add_n
+C Intel SBR	(5.8)	> 3.46 for lshift + add_n
+C Intel atom	(7.75)	< 8.75 for lshift + add_n
 C VIA nano	 4.7	< 6.25 for lshift + add_n
 
-C TODO
-C  * Can we propagate carry into rdx instead of using a special carry register?
-C    That could save enough insns to get to 10 cycles/iteration.
+C This was written quickly and not optimized at all.  Surely one could get
+C closer to 3 c/l or perhaps even under 3 c/l.  Ideas:
+C   1) Use indexing to save the 3 LEA
+C   2) Write reasonable feed-in code
+C   3) Be more clever about register usage
+C   4) Unroll more, handling CL negation, carry save/restore cost much now
+C   5) Reschedule
 
-define(`rp',       `%rdi')
-define(`up',       `%rsi')
-define(`vp_param', `%rdx')
-define(`n_param',  `%rcx')
-define(`cnt',      `%r8')
-
-define(`vp',    `%r12')
-define(`n',     `%rbp')
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cnt',	`%r8')
 
 ifdef(`OPERATION_addlsh_n',`
-  define(ADDSUB,       `add')
   define(ADCSBB,       `adc')
   define(func, mpn_addlsh_n)
 ')
 ifdef(`OPERATION_rsblsh_n',`
-  define(ADDSUB,       `sub')
   define(ADCSBB,       `sbb')
   define(func, mpn_rsblsh_n)
 ')
@@ -56,145 +57,103 @@
 MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
 
 ASM_START()
-        TEXT
-        ALIGN(16)
+	TEXT
+	ALIGN(16)
 PROLOGUE(func)
 	push	%r12
+	push	%r13
+	push	%r14
 	push	%rbp
 	push	%rbx
 
-	mov	(vp_param), %rax	C load first V limb early
+	mov	n, %rax
+	xor	R32(%rbx), R32(%rbx)	C clear carry save register
+	mov	R32(%r8), R32(%rcx)	C shift count
+	xor	R32(%rbp), R32(%rbp)	C limb carry
 
-	mov	$0, R32(n)
-	sub	n_param, n
+	mov	R32(%rax), R32(%r11)
+	and	$3, R32(%r11)
+	je	L(4)
+	sub	$1, R32(%r11)
 
-	lea	-16(up,n_param,8), up
-	lea	-16(rp,n_param,8), rp
-	lea	16(vp_param,n_param,8), vp
+L(012):	mov	(vp), %r8
+	mov	%r8, %r12
+	shl	R8(%rcx), %r8
+	or	%rbp, %r8
+	neg	R8(%rcx)
+	mov	%r12, %rbp
+	shr	R8(%rcx), %rbp
+	neg	R8(%rcx)
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	(up), %r8
+	mov	%r8, (rp)
+	sbb	R32(%rbx), R32(%rbx)
+	lea	8(up), up
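
The new k8 loop parks the adc/sbb carry in %rbx while the shifts run,
because shl/shr clobber the carry flag: `sbb R32(%rbx), R32(%rbx)' saves
CF as 0 or -1, and `add R32(%rbx), R32(%rbx)' recreates it.  One
iteration modeled in C (illustrative only; the variable names are not
from the source):

    s  = (v << cnt) | hi;                /* shl: clobbers CF           */
    hi = v >> (GMP_NUMB_BITS - cnt);     /* shr via negated count      */
    cf = cf_save & 1;                    /* add %ebx,%ebx: reload CF   */
    t  = s + u;                          /* adc (up), %r8              */
    r  = t + cf;
    rp[i] = r;                           /* mov %r8, (rp)              */
    cf_save = -(mp_limb_t) ((t < s) + (r < t));  /* sbb %ebx,%ebx: save CF */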

