[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
Mon Feb 27 23:23:16 CET 2012
details: /var/hg/gmp/rev/fdd2e51d4cc6
changeset: 14693:fdd2e51d4cc6
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Feb 27 23:21:50 2012 +0100
description:
More x86_64/fastsse code.
details: /var/hg/gmp/rev/07cb807cd3eb
changeset: 14694:07cb807cd3eb
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Feb 27 23:22:38 2012 +0100
description:
Add basic fastsse docs.
details: /var/hg/gmp/rev/af165a15e6e7
changeset: 14695:af165a15e6e7
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Feb 27 23:23:12 2012 +0100
description:
Remove an unused label.
diffstat:
 ChangeLog                      |    3 +
 mpn/x86_64/fastsse/README      |   19 ++++
 mpn/x86_64/fastsse/com.asm     |  148 ++++++++++++++++++++++++++++++++++++
 mpn/x86_64/fastsse/lshift.asm  |    2 +-
 mpn/x86_64/fastsse/lshiftc.asm |  168 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 339 insertions(+), 1 deletions(-)
diffs (truncated from 371 to 300 lines):
diff -r a1ec003a1124 -r af165a15e6e7 ChangeLog
--- a/ChangeLog Mon Feb 27 14:57:45 2012 +0100
+++ b/ChangeLog Mon Feb 27 23:23:12 2012 +0100
@@ -1,5 +1,8 @@
2012-02-27 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/x86_64/fastsse/lshiftc.asm: New file.
+ * mpn/x86_64/fastsse/com.asm: New file.
+
* mpn/x86_64/bd1/popcount.asm: New file.
* mpn/x86_64/bd1/hamdist.asm: New file.
diff -r a1ec003a1124 -r af165a15e6e7 mpn/x86_64/fastsse/README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/fastsse/README Mon Feb 27 23:23:12 2012 +0100
@@ -0,0 +1,19 @@
+This directory contains code for x86-64 processors with fast
+implementations of SSE operations, hence the name "fastsse".
+
+Current processors that might benefit from this code are:
+ AMD K10
+ AMD Bulldozer
+ Intel Nocona
+ Intel Nehalem/Westmere
+ Intel Sandybridge/Ivybridge
+ VIA Nano
+
+Current processors that do not benefit from this code are:
+
+ AMD K8
+ AMD Bobcat
+ Intel Atom
+
+Intel Conroe/Penryn is a border case; its handling of non-aligned
+memory operands is poor.
diff -r a1ec003a1124 -r af165a15e6e7 mpn/x86_64/fastsse/com.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/fastsse/com.asm Mon Feb 27 23:23:12 2012 +0100
@@ -0,0 +1,148 @@
+dnl AMD64 mpn_com optimised for CPUs with fast SSE.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C              cycles/limb    good for cpu?
+C AMD K8,K9
+C AMD K10         0.85             Y
+C AMD bd1         0.92             Y
+C AMD bobcat
+C Intel P4        2.28             Y
+C Intel core2     1
+C Intel NHM       0.5              Y
+C Intel SBR       0.5              Y
+C Intel atom
+C VIA nano        1.1              Y
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. We can always write using
+C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU. This seems to work great except for core2; there performance
+C doubles when reading using MOVDQA (for aligned source). It is unclear how to
+C best handle the unaligned case there.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_com)
+ DOS64_ENTRY(3)
+
+ test n, n
+ jz L(don)
+
+ pcmpeqb %xmm7, %xmm7 C set to 111...111
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(ali) C jump if rp aligned
+ mov (up), %rax
+ lea 8(up), up
+ not %rax
+ mov %rax, (rp)
+ lea 8(rp), rp
+ dec n
+
+ sub $14, n
+ jc L(sma)
+
+ ALIGN(16)
+L(top): movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ movdqu 64(up), %xmm4
+ movdqu 80(up), %xmm5
+ movdqu 96(up), %xmm6
+ lea 112(up), up
+ pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm1
+ pxor %xmm7, %xmm2
+ pxor %xmm7, %xmm3
+ pxor %xmm7, %xmm4
+ pxor %xmm7, %xmm5
+ pxor %xmm7, %xmm6
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ movdqa %xmm4, 64(rp)
+ movdqa %xmm5, 80(rp)
+ movdqa %xmm6, 96(rp)
+ lea 112(rp), rp
+L(ali): sub $14, n
+ jnc L(top)
+
+L(sma): add $14, n
+ test $8, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ lea 64(up), up
+ pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm1
+ pxor %xmm7, %xmm2
+ pxor %xmm7, %xmm3
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ lea 64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ lea 32(up), up
+ pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm1
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ lea 32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ lea 16(up), up
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov (up), %rax
+ not %rax
+ mov %rax, (rp)
+1:
+L(don): DOS64_EXIT()
+ ret
+EPILOGUE()
diff -r a1ec003a1124 -r af165a15e6e7 mpn/x86_64/fastsse/lshift.asm
--- a/mpn/x86_64/fastsse/lshift.asm Mon Feb 27 14:57:45 2012 +0100
+++ b/mpn/x86_64/fastsse/lshift.asm Mon Feb 27 23:23:12 2012 +0100
@@ -145,7 +145,7 @@
ALIGN(16)
L(le2): jne L(end8)
-L(2): movq 8(ap), %xmm0
+ movq 8(ap), %xmm0
movq (ap), %xmm1
psllq %xmm4, %xmm0
psrlq %xmm5, %xmm1
diff -r a1ec003a1124 -r af165a15e6e7 mpn/x86_64/fastsse/lshiftc.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/fastsse/lshiftc.asm Mon Feb 27 23:23:12 2012 +0100
@@ -0,0 +1,168 @@
+dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
+
+dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                  cycles/limb        cycles/limb      good
+C                16-byte aligned   16-byte unaligned  for cpu
+C AMD K8,K9           ?                  ?
+C AMD K10             1.85               1.9             Y
+C AMD bd1             1.82               1.82            Y
+C AMD bobcat          4.5                4.5
+C Intel P4            3.6                3.6             Y
+C Intel core2         2.05               2.55
+C Intel NHM           2.05               2.6
+C Intel SBR           1.55               2               Y
+C Intel atom          ?                  ?
+C VIA nano            2.55               2.55            Y
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. We can always write using
+C 16-byte operations, we read with both 8-byte and 16-byte operations.
+
+C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
+C not true. The aligned case reads 16+8 bytes, the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C (1) The unaligned case makes too many reads.
+C (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov -8(ap,n,8), %rax
+ shr R8(%rcx), %rax
+
+ pcmpeqb %xmm7, %xmm7 C set to 111...111
+
+ cmp $2, n
+ jle L(le2)
+
+ lea (rp,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq -8(ap,n,8), %xmm0
+ movq -16(ap,n,8), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm7, %xmm0
+ movq %xmm0, -8(rp,n,8)
+ dec n
+
+L(rp_aligned):
+ lea (ap,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(aent)
+ jmp L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).
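
The header comments in com.asm above summarise the technique: optionally complement one limb with a scalar NOT so that rp becomes 16-byte aligned, then complement sixteen bytes at a time by reading with unaligned loads (movdqu), XORing against an all-ones register prepared with pcmpeqb, and writing with aligned stores (movdqa), finishing with scalar code for any leftover limb. The following C sketch with SSE2 intrinsics illustrates that structure; the function name and scalar head/tail handling are illustrative, not part of the committed code, and it assumes limb pointers are at least 8-byte aligned, as GMP limb pointers are.

  #include <emmintrin.h>          /* SSE2 intrinsics */
  #include <stddef.h>
  #include <stdint.h>

  typedef uint64_t limb_t;        /* one 64-bit limb */

  /* Illustrative sketch of the com.asm strategy: complement n limbs from up
     into rp, 16 bytes at a time, with unaligned reads and aligned writes.
     Assumes rp and up are at least 8-byte aligned, as GMP limb pointers are.  */
  static void
  com_sse2_sketch (limb_t *rp, const limb_t *up, size_t n)
  {
    if (n == 0)
      return;

    /* Head: one scalar limb if rp is not 16-byte aligned
       (cf. "test $8, R8(rp)" in com.asm).  */
    if (((uintptr_t) rp & 8) != 0)
      {
        *rp++ = ~*up++;
        n--;
      }

    __m128i ones = _mm_set1_epi32 (-1);   /* all ones, like pcmpeqb %xmm7,%xmm7 */

    /* Body: two limbs (16 bytes) per iteration.  */
    for (; n >= 2; n -= 2)
      {
        __m128i x = _mm_loadu_si128 ((const __m128i *) up);   /* movdqu */
        x = _mm_xor_si128 (x, ones);                          /* pxor   */
        _mm_store_si128 ((__m128i *) rp, x);                  /* movdqa */
        up += 2;
        rp += 2;
      }

    /* Tail: at most one leftover limb.  */
    if (n != 0)
      *rp = ~*up;
  }

The committed assembly goes further than this sketch: its main loop is unrolled seven-fold (112 bytes, i.e. 14 limbs, per iteration, hence "sub $14, n"), and the remaining limbs are dispatched on the low bits of n by the test $8 / $4 / $2 / $1 tail sequence.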