[Gmp-commit] /var/hg/gmp: 3 new changesets

mercurial at gmplib.org
Mon Feb 27 23:23:16 CET 2012


details:   /var/hg/gmp/rev/fdd2e51d4cc6
changeset: 14693:fdd2e51d4cc6
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Feb 27 23:21:50 2012 +0100
description:
More x86_64/fastsse code.

details:   /var/hg/gmp/rev/07cb807cd3eb
changeset: 14694:07cb807cd3eb
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Feb 27 23:22:38 2012 +0100
description:
Add basic fastsse docs.

details:   /var/hg/gmp/rev/af165a15e6e7
changeset: 14695:af165a15e6e7
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Feb 27 23:23:12 2012 +0100
description:
Remove an unused label.

diffstat:

 ChangeLog                      |    3 +
 mpn/x86_64/fastsse/README      |   19 ++++
 mpn/x86_64/fastsse/com.asm     |  148 ++++++++++++++++++++++++++++++++++++
 mpn/x86_64/fastsse/lshift.asm  |    2 +-
 mpn/x86_64/fastsse/lshiftc.asm |  168 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 339 insertions(+), 1 deletions(-)

diffs (truncated from 371 to 300 lines):

diff -r a1ec003a1124 -r af165a15e6e7 ChangeLog
--- a/ChangeLog	Mon Feb 27 14:57:45 2012 +0100
+++ b/ChangeLog	Mon Feb 27 23:23:12 2012 +0100
@@ -1,5 +1,8 @@
 2012-02-27  Torbjorn Granlund  <tege at gmplib.org>
 
+	* mpn/x86_64/fastsse/lshiftc.asm: New file.
+	* mpn/x86_64/fastsse/com.asm: New file.
+
 	* mpn/x86_64/bd1/popcount.asm: New file.
 	* mpn/x86_64/bd1/hamdist.asm: New file.
 
diff -r a1ec003a1124 -r af165a15e6e7 mpn/x86_64/fastsse/README
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/fastsse/README	Mon Feb 27 23:23:12 2012 +0100
@@ -0,0 +1,19 @@
+This directory contains code for x86-64 processors with fast
+implementations of SSE operations, hence the name "fastsse".
+
+Current processors that might benefit from this code are:
+  AMD K10
+  AMD Bulldozer
+  Intel Nocona
+  Intel Nehalem/Westmere
+  Intel Sandybridge/Ivybridge
+  VIA Nano
+
+Current processors that do not benefit from this code are:
+
+  AMD K8
+  AMD Bobcat
+  Intel Atom
+
+Intel Conroe/Penryn is a border case; its handling of non-aligned
+memory operands is poor.
diff -r a1ec003a1124 -r af165a15e6e7 mpn/x86_64/fastsse/com.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/fastsse/com.asm	Mon Feb 27 23:23:12 2012 +0100
@@ -0,0 +1,148 @@
+dnl  AMD64 mpn_com optimised for CPUs with fast SSE.
+
+dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	    cycles/limb		  good for cpu?
+C AMD K8,K9
+C AMD K10	 0.85			Y
+C AMD bd1	 0.92			Y
+C AMD bobcat
+C Intel P4	 2.28			Y
+C Intel core2	 1
+C Intel NHM	 0.5			Y
+C Intel SBR	 0.5			Y
+C Intel atom
+C VIA nano	 1.1			Y
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU.  This seems to work great except for core2; there performance
+C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
+C best handle the unaligned case there.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_com)
+	DOS64_ENTRY(3)
+
+	test	n, n
+	jz	L(don)
+
+	pcmpeqb	%xmm7, %xmm7		C set to 111...111
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(ali)			C jump if rp aligned
+	mov	(up), %rax
+	lea	8(up), up
+	not	%rax
+	mov	%rax, (rp)
+	lea	8(rp), rp
+	dec	n
+
+	sub	$14, n
+	jc	L(sma)
+
+	ALIGN(16)
+L(top):	movdqu	(up), %xmm0
+	movdqu	16(up), %xmm1
+	movdqu	32(up), %xmm2
+	movdqu	48(up), %xmm3
+	movdqu	64(up), %xmm4
+	movdqu	80(up), %xmm5
+	movdqu	96(up), %xmm6
+	lea	112(up), up
+	pxor	%xmm7, %xmm0
+	pxor	%xmm7, %xmm1
+	pxor	%xmm7, %xmm2
+	pxor	%xmm7, %xmm3
+	pxor	%xmm7, %xmm4
+	pxor	%xmm7, %xmm5
+	pxor	%xmm7, %xmm6
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	movdqa	%xmm2, 32(rp)
+	movdqa	%xmm3, 48(rp)
+	movdqa	%xmm4, 64(rp)
+	movdqa	%xmm5, 80(rp)
+	movdqa	%xmm6, 96(rp)
+	lea	112(rp), rp
+L(ali):	sub	$14, n
+	jnc	L(top)
+
+L(sma):	add	$14, n
+	test	$8, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	movdqu	16(up), %xmm1
+	movdqu	32(up), %xmm2
+	movdqu	48(up), %xmm3
+	lea	64(up), up
+	pxor	%xmm7, %xmm0
+	pxor	%xmm7, %xmm1
+	pxor	%xmm7, %xmm2
+	pxor	%xmm7, %xmm3
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	movdqa	%xmm2, 32(rp)
+	movdqa	%xmm3, 48(rp)
+	lea	64(rp), rp
+1:
+	test	$4, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	movdqu	16(up), %xmm1
+	lea	32(up), up
+	pxor	%xmm7, %xmm0
+	pxor	%xmm7, %xmm1
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	lea	32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	lea	16(up), up
+	pxor	%xmm7, %xmm0
+	movdqa	%xmm0, (rp)
+	lea	16(rp), rp
+1:
+	test	$1, R8(n)
+	jz	1f
+	mov	(up), %rax
+	not	%rax
+	mov	%rax, (rp)
+1:
+L(don):	DOS64_EXIT()
+	ret
+EPILOGUE()
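
The comments in com.asm above describe the implementation strategy; the net
effect of mpn_com itself is just the limb-wise one's complement of the source
operand.  The portable C fragment below is a rough sketch of that semantics
only (ref_com is an illustrative name, not part of this diff, and nails are
ignored); the assembly additionally keeps all 16-byte stores aligned and does
its reads with movdqu:

    #include <gmp.h>

    /* Sketch of what mpn_com computes: rp[i] = ~up[i] for 0 <= i < n.
       The SSE code above gets the same result 16 bytes (two limbs) per
       operation, peeling off single 8-byte limbs when rp is not 16-byte
       aligned or when an odd limb remains at the end. */
    static void
    ref_com (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      mp_size_t i;
      for (i = 0; i < n; i++)
        rp[i] = ~up[i];
    }
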
diff -r a1ec003a1124 -r af165a15e6e7 mpn/x86_64/fastsse/lshift.asm
--- a/mpn/x86_64/fastsse/lshift.asm	Mon Feb 27 14:57:45 2012 +0100
+++ b/mpn/x86_64/fastsse/lshift.asm	Mon Feb 27 23:23:12 2012 +0100
@@ -145,7 +145,7 @@
 	ALIGN(16)
 L(le2):	jne	L(end8)
 
-L(2):	movq	8(ap), %xmm0
+	movq	8(ap), %xmm0
 	movq	(ap), %xmm1
 	psllq	%xmm4, %xmm0
 	psrlq	%xmm5, %xmm1
diff -r a1ec003a1124 -r af165a15e6e7 mpn/x86_64/fastsse/lshiftc.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/fastsse/lshiftc.asm	Mon Feb 27 23:23:12 2012 +0100
@@ -0,0 +1,168 @@
+dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
+
+dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb	     cycles/limb	      good
+C          16-byte aligned         16-byte unaligned	     for cpu
+C AMD K8,K9	 ?			 ?
+C AMD K10	 1.85			 1.9			Y
+C AMD bd1	 1.82			 1.82			Y
+C AMD bobcat	 4.5			 4.5
+C Intel P4	 3.6			 3.6			Y
+C Intel core2	 2.05			 2.55
+C Intel NHM	 2.05			 2.6
+C Intel SBR	 1.55			 2			Y
+C Intel atom	 ?			 ?
+C VIA nano	 2.55			 2.55			Y
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  We can always write using
+C 16-byte operations; we read with both 8-byte and 16-byte operations.
+
+C There are two inner loops, one for when rp = ap (mod 16) and one when this is
+C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C   (1) The unaligned case makes too many reads.
+C   (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
+C INPUT PARAMETERS
+define(`rp',  `%rdi')
+define(`ap',  `%rsi')
+define(`n',   `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+	movd	R32(%rcx), %xmm4
+	mov	$64, R32(%rax)
+	sub	R32(%rcx), R32(%rax)
+	movd	R32(%rax), %xmm5
+
+	neg	R32(%rcx)
+	mov	-8(ap,n,8), %rax
+	shr	R8(%rcx), %rax
+
+	pcmpeqb	%xmm7, %xmm7		C set to 111...111
+
+	cmp	$2, n
+	jle	L(le2)
+
+	lea	(rp,n,8), R32(%rcx)
+	test	$8, R8(%rcx)
+	je	L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+	movq	-8(ap,n,8), %xmm0
+	movq	-16(ap,n,8), %xmm1
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm7, %xmm0
+	movq	%xmm0, -8(rp,n,8)
+	dec	n
+
+L(rp_aligned):
+	lea	(ap,n,8), R32(%rcx)
+	test	$8, R8(%rcx)
+	je	L(aent)
+	jmp	L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).

