[Gmp-commit] /var/hg/gmp: 5 new changesets

Sun Feb 13 00:37:50 CET 2011

details:   /var/hg/gmp/rev/3cb34e9e9bf2
changeset: 13847:3cb34e9e9bf2
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Feb 12 19:11:54 2011 +0100
description:
Add more c/l numbers.

details:   /var/hg/gmp/rev/091da29d9813
changeset: 13848:091da29d9813
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Feb 12 19:12:51 2011 +0100
description:
Add comment.

details:   /var/hg/gmp/rev/7521314d019c
changeset: 13849:7521314d019c
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Feb 12 19:15:27 2011 +0100
description:
Minor tweaks, update c/l numbers.

details:   /var/hg/gmp/rev/fad5399da6b8
changeset: 13850:fad5399da6b8
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Feb 13 00:11:22 2011 +0100
description:
New file for Atom/64.

details:   /var/hg/gmp/rev/87361d967b0f
changeset: 13851:87361d967b0f
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Feb 13 00:12:12 2011 +0100
description:
*** empty log message ***

diffstat:

 ChangeLog                       |    8 +-
 mpn/x86_64/aorrlshC_n.asm       |    3 +-
 mpn/x86_64/aorrlsh_n.asm        |  121 +++++++++++++--------------
 mpn/x86_64/aors_n.asm           |    3 +-
 mpn/x86_64/atom/aorrlsh2_n.asm  |  174 ++++++++++++++++++++++++++++++++++++++++
 mpn/x86_64/core2/aors_n.asm     |    3 +-
 mpn/x86_64/core2/rsh1aors_n.asm |    3 +-
 mpn/x86_64/core2/sublshC_n.asm  |    3 +-
 mpn/x86_64/divrem_2.asm         |    2 +-
 mpn/x86_64/rsh1aors_n.asm       |    1 +
 10 files changed, 252 insertions(+), 69 deletions(-)

diffs (truncated from 489 to 300 lines):

diff -r ebcc7e666700 -r 87361d967b0f ChangeLog

--- a/ChangeLog	Sat Feb 12 16:08:29 2011 +0100
+++ b/ChangeLog	Sun Feb 13 00:12:12 2011 +0100
@@ -1,5 +1,11 @@
+2011-02-13  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86_64/atom/aorrlsh2_n.asm: New file.
+
 2011-02-12  Torbjorn Granlund  <tege at gmplib.org>
 
+	* mpn/x86_64/aorrlsh_n.asm: Minor tweaks, update c/l numbers.
+
 	* mpn/x86_64/atom/sublsh1_n.asm: New file.
 
 	* mpn/x86_64/atom/aorrlsh1_n.asm: New file.
@@ -55,7 +61,7 @@
 	* tests/mpn/t-toom8h.c: No tests below MPN_TOOM8H_MIN.
 
 	* mpz/lucnum_ui.c: Use mpn_addlsh2_n.
-	
+
 2011-02-04  Torbjorn Granlund  <tege at gmplib.org>
 
 	* mpn/x86_64/atom/rsh1aors_n.asm: Add a MULFUNC_PROLOGUE.
diff -r ebcc7e666700 -r 87361d967b0f mpn/x86_64/aorrlshC_n.asm
--- a/mpn/x86_64/aorrlshC_n.asm	Sat Feb 12 16:08:29 2011 +0100
+++ b/mpn/x86_64/aorrlshC_n.asm	Sun Feb 13 00:12:12 2011 +0100
@@ -24,7 +24,8 @@
 C AMD K10	 2
 C Intel P4	 ?
 C Intel core2	 3
-C Intel corei	 2.75
+C Intel NHM	 2.75
+C Intel SBR	 2.55
 C Intel atom	 ?
 C VIA nano	 ?
 
diff -r ebcc7e666700 -r 87361d967b0f mpn/x86_64/aorrlsh_n.asm
--- a/mpn/x86_64/aorrlsh_n.asm	Sat Feb 12 16:08:29 2011 +0100
+++ b/mpn/x86_64/aorrlsh_n.asm	Sun Feb 13 00:12:12 2011 +0100
@@ -2,7 +2,7 @@
 dnl  ("rsb" means reversed subtract, name mandated by mpn_sublsh1_n which
 dnl  subtracts the shifted operand from the unshifted operand.)
 
-dnl  Copyright 2006, 2010 Free Software Foundation, Inc.
+dnl  Copyright 2006, 2010, 2011 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -23,13 +23,14 @@
 
 
 C	     cycles/limb
-C AMD K8,K9	 3.25	(mpn_lshift + mpn_add_n costs 3.85 c/l)
-C AMD K10	 3.25	(mpn_lshift + mpn_add_n costs 3.85 c/l)
-C Intel P4	15	(mpn_lshift + mpn_add_n costs 7.33 c/l)
-C Intel core2	 4	(mpn_lshift + mpn_add_n costs 3.27 c/l)
-C Intel corei	 4	(mpn_lshift + mpn_add_n costs 3.75 c/l)
-C Intel atom	 ?
-C VIA nano	 4.7	(mpn_lshift + mpn_add_n costs 6.25 c/l)
+C AMD K8,K9	 3.1	< 3.85 for lshift + add_n, using mul might reach 2.83
+C AMD K10	 3.1	< 3.85 for lshift + add_n, using mul might reach 2.83
+C Intel P4	14.6	> 7.33 for lshift + add_n
+C Intel core2	 3.87	> 3.27 for lshift + add_n
+C Intel NHM	 4	> 3.75 for lshift + add_n
+C Intel SBR	(5.8)	> 3.46 for lshift + add_n
+C Intel atom	(7.75)	< 8.75 for lshift + add_n
+C VIA nano	 4.7	< 6.25 for lshift + add_n
 
 C This was written quickly and not optimized at all.  Surely one could get
 C closer to 3 c/l or perhaps even under 3 c/l.  Ideas:
@@ -48,11 +49,11 @@
 define(`cnt',	`%r8')
 
 ifdef(`OPERATION_addlsh_n',`
-  define(ADDSUBC,       `adc')
+  define(ADCSBB,       `adc')
   define(func, mpn_addlsh_n)
 ')
 ifdef(`OPERATION_rsblsh_n',`
-  define(ADDSUBC,       `sbb')
+  define(ADCSBB,       `sbb')
   define(func, mpn_rsblsh_n)
 ')
 
@@ -62,48 +63,45 @@
 	TEXT
 	ALIGN(16)
 PROLOGUE(func)
-
 	push	%r12
 	push	%r13
 	push	%r14
-	push	%r15
+	push	%rbp
 	push	%rbx
 
 	mov	n, %rax
-	xor	%ebx, %ebx		C clear carry save register
-	mov	%r8d, %ecx		C shift count
-	xor	%r15d, %r15d		C limb carry
+	xor	R32(%rbx), R32(%rbx)	C clear carry save register
+	mov	R32(%r8), R32(%rcx)	C shift count
+	xor	R32(%rbp), R32(%rbp)	C limb carry
 
-	mov	%eax, %r11d
-	and	$3, %r11d
+	mov	R32(%rax), R32(%r11)
+	and	$3, R32(%r11)
 	je	L(4)
-	sub	$1, %r11d
+	sub	$1, R32(%r11)
 
-L(oopette):
-	mov	0(vp), %r8
+L(012):	mov	(vp), %r8
 	mov	%r8, %r12
-	shl	%cl, %r8
-	or	%r15, %r8
-	neg	%cl
-	mov	%r12, %r15
-	shr	%cl, %r15
-	neg	%cl
-	add	%ebx, %ebx
-	ADDSUBC	0(up), %r8
-	mov	%r8, 0(rp)
-	sbb	%ebx, %ebx
+	shl	R8(%rcx), %r8
+	or	%rbp, %r8
+	neg	R8(%rcx)
+	mov	%r12, %rbp
+	shr	R8(%rcx), %rbp
+	neg	R8(%rcx)
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	(up), %r8
+	mov	%r8, (rp)
+	sbb	R32(%rbx), R32(%rbx)
 	lea	8(up), up
 	lea	8(vp), vp
 	lea	8(rp), rp
-	sub	$1, %r11d
-	jnc	L(oopette)
+	sub	$1, R32(%r11)
+	jnc	L(012)
 
-L(4):
-	sub	$4, %rax
+L(4):	sub	$4, %rax
 	jc	L(end)
 
-L(oop):
-	mov	0(vp), %r8
+	ALIGN(16)
+L(top):	mov	(vp), %r8
 	mov	%r8, %r12
 	mov	8(vp), %r9
 	mov	%r9, %r13
@@ -111,55 +109,54 @@
 	mov	%r10, %r14
 	mov	24(vp), %r11
 
-	shl	%cl, %r8
-	shl	%cl, %r9
-	shl	%cl, %r10
-	or	%r15, %r8
-	mov	%r11, %r15
-	shl	%cl, %r11
+	shl	R8(%rcx), %r8
+	shl	R8(%rcx), %r9
+	shl	R8(%rcx), %r10
+	or	%rbp, %r8
+	mov	%r11, %rbp
+	shl	R8(%rcx), %r11
 
-	neg	%cl
+	neg	R8(%rcx)
 
-	shr	%cl, %r12
-	shr	%cl, %r13
-	shr	%cl, %r14
-	shr	%cl, %r15		C used next loop
+	shr	R8(%rcx), %r12
+	shr	R8(%rcx), %r13
+	shr	R8(%rcx), %r14
+	shr	R8(%rcx), %rbp		C used next iteration
 
 	or	%r12, %r9
 	or	%r13, %r10
 	or	%r14, %r11
 
-	neg	%cl
+	neg	R8(%rcx)
 
-	add	%ebx, %ebx		C restore carry flag
+	add	R32(%rbx), R32(%rbx)	C restore carry flag
 
-	ADDSUBC	0(up), %r8
-	ADDSUBC	8(up), %r9
-	ADDSUBC	16(up), %r10
-	ADDSUBC	24(up), %r11
+	ADCSBB	(up), %r8
+	ADCSBB	8(up), %r9
+	ADCSBB	16(up), %r10
+	ADCSBB	24(up), %r11
 
-	mov	%r8, 0(rp)
+	mov	%r8, (rp)
 	mov	%r9, 8(rp)
 	mov	%r10, 16(rp)
 	mov	%r11, 24(rp)
 
-	sbb	%ebx, %ebx		C save carry flag
+	sbb	R32(%rbx), R32(%rbx)	C save carry flag
 
 	lea	32(up), up
 	lea	32(vp), vp
 	lea	32(rp), rp
 
 	sub	$4, %rax
-	jnc	L(oop)
-L(end):
-	add	%ebx, %ebx
-	ADDSUBC	$0, %r15
-	mov	%r15, %rax
+	jnc	L(top)
+
+L(end):	add	R32(%rbx), R32(%rbx)
+	ADCSBB	$0, %rbp
+	mov	%rbp, %rax
 	pop	%rbx
-	pop	%r15
+	pop	%rbp
 	pop	%r14
 	pop	%r13
 	pop	%r12
-
 	ret
 EPILOGUE()
diff -r ebcc7e666700 -r 87361d967b0f mpn/x86_64/aors_n.asm
--- a/mpn/x86_64/aors_n.asm	Sat Feb 12 16:08:29 2011 +0100
+++ b/mpn/x86_64/aors_n.asm	Sun Feb 13 00:12:12 2011 +0100
@@ -25,7 +25,8 @@
 C AMD K10	 1.5
 C Intel P4	 ?
 C Intel core2	 4.9
-C Intel corei	 ?
+C Intel NHM	 5.5
+C Intel SBR	 1.59
 C Intel atom	 4
 C VIA nano	 3.25
 
diff -r ebcc7e666700 -r 87361d967b0f mpn/x86_64/atom/aorrlsh2_n.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/atom/aorrlsh2_n.asm	Sun Feb 13 00:12:12 2011 +0100
@@ -0,0 +1,174 @@
+dnl  AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl  AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+dnl  Optimised for Intel Atom.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 5.75
+C VIA nano	 ?
+
+C INPUT PARAMETERS
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`vp',       `%rdx')
+define(`n',        `%rcx')
+define(`cy',       `%r8')
+
+define(`LSH', 2)
+define(`RSH', 62)
+define(M, eval(m4_lshift(1,LSH)))
+
+ifdef(`OPERATION_addlsh2_n', `