[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Mon Sep 9 01:17:24 CEST 2013
details: /var/hg/gmp/rev/9eb5c79ca7a2
changeset: 15974:9eb5c79ca7a2
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Sep 09 01:08:24 2013 +0200
description:
Provide basic AVX copyi/copyd.
details: /var/hg/gmp/rev/f8a778d9a508
changeset: 15975:f8a778d9a508
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Sep 09 01:12:45 2013 +0200
description:
Use some fastsse code for bd1.
details: /var/hg/gmp/rev/8ca5b95c966f
changeset: 15976:8ca5b95c966f
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Sep 09 01:15:06 2013 +0200
description:
Use fastsse com for atom.
details: /var/hg/gmp/rev/d0d816cb0b8e
changeset: 15977:d0d816cb0b8e
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Sep 09 01:16:54 2013 +0200
description:
Remove accidentally left-in debugging code.
diffstat:
mpn/x86_64/atom/com.asm | 26 +++++
mpn/x86_64/bd1/README | 11 ++
mpn/x86_64/bd1/com.asm | 26 +++++
mpn/x86_64/bd1/copyd.asm | 26 +++++
mpn/x86_64/bd1/copyi.asm | 26 +++++
mpn/x86_64/fastavx/copyd.asm | 159 +++++++++++++++++++++++++++++++++++
mpn/x86_64/fastavx/copyi.asm | 156 ++++++++++++++++++++++++++++++++++
mpn/x86_64/fastsse/copyi-palignr.asm | 4 +-
8 files changed, 432 insertions(+), 2 deletions(-)
diffs (truncated from 475 to 300 lines):
diff -r fb4b3934029f -r d0d816cb0b8e mpn/x86_64/atom/com.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/atom/com.asm Mon Sep 09 01:16:54 2013 +0200
@@ -0,0 +1,26 @@
+dnl X86-64 mpn_com optimised for Intel Atom.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff -r fb4b3934029f -r d0d816cb0b8e mpn/x86_64/bd1/README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bd1/README Mon Sep 09 01:16:54 2013 +0200
@@ -0,0 +1,11 @@
+This directory contains code for AMD bulldozer including its piledriver update.
+
+We currently make limited use of SIMD instructions, both via the MPN_PATH and
+via inclusion of x86_64/fastsse files.
+
+The bd1 cores share one SIMD/FPU pipeline for two integer units. This probably
+means that an all-core GMP load (such as an HPC load) might run slower if there
+is significant SIMD dependency.
+
+We should perhaps allow a special 'bd1nosimd' pseudo cpu-name excluding any
+SIMD code.
diff -r fb4b3934029f -r d0d816cb0b8e mpn/x86_64/bd1/com.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bd1/com.asm Mon Sep 09 01:16:54 2013 +0200
@@ -0,0 +1,26 @@
+dnl X86-64 mpn_com optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff -r fb4b3934029f -r d0d816cb0b8e mpn/x86_64/bd1/copyd.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bd1/copyd.asm Mon Sep 09 01:16:54 2013 +0200
@@ -0,0 +1,26 @@
+dnl X86-64 mpn_copyd optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff -r fb4b3934029f -r d0d816cb0b8e mpn/x86_64/bd1/copyi.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bd1/copyi.asm Mon Sep 09 01:16:54 2013 +0200
@@ -0,0 +1,26 @@
+dnl X86-64 mpn_copyi optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff -r fb4b3934029f -r d0d816cb0b8e mpn/x86_64/fastavx/copyd.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/fastavx/copyd.asm Mon Sep 09 01:16:54 2013 +0200
@@ -0,0 +1,159 @@
+dnl AMD64 mpn_copyd optimised for CPUs with fast AVX.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012, 2013 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb aligned unaligned best seen for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bull n/a
+C AMD pile 4.87 4.87 N
+C AMD steam ? ?
+C AMD bobcat n/a
+C AMD jaguar n/a
+C Intel P4 n/a
+C Intel core n/a
+C Intel NHM n/a
+C Intel SBR 0.50 0.91 N
+C Intel IBR ?
+C Intel HWL 0.25 0.30 Y
+C Intel BWL ?
+C Intel atom n/a
+C VIA nano n/a
+
+C We try to do as many 32-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte or 16-byte operations. For the bulk
+C copying, we write using aligned 32-byte operations, but the 32-byte reads
+C may be either aligned or unaligned.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`vmovdqu', vlddqu)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+C mpn_copyd: copy n 8-byte limbs from up to rp, working from the highest
+C address towards the lowest.
+C
+C Inputs, per the defines earlier in this file:
+C   rp = %rdi  destination, up = %rsi  source, n = %rdx  limb count.
+C Registers written below: %rax, %rcx, %r8, %r9, %ymm0-%ymm3, flags, plus
+C rp, up and n themselves.
+C NOTE(review): ymm registers are written and no vzeroupper is issued before
+C ret in this code -- confirm whether the AVX/SSE transition cost matters
+C for callers.
+PROLOGUE(mpn_copyd)
+ FUNC_ENTRY(3) C macro defined elsewhere; presumably ABI entry fixup for 3 args -- confirm
+
+ lea -32(rp,n,8), rp C rp = rp + 8*n - 32, i.e. 32 bytes below the end of dst
+ lea -32(up,n,8), up C up = up + 8*n - 32, i.e. 32 bytes below the end of src
+
+ cmp $7, n C basecase needed for correctness
+ jbe L(bc) C n <= 7 (unsigned): take the path that uses only 8-byte moves
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(a2) C jump if rp aligned
+ mov 24(up), %rax C peel the top-most limb so that rp becomes 16-byte aligned
+ lea -8(up), up
+ mov %rax, 24(rp)
+ lea -8(rp), rp
+ dec n C one limb done
+L(a2): test $16, R8(rp) C is rp 32-byte aligned?
+ jz L(a3) C jump if rp aligned
+ vmovdqu 16(up), %xmm0 C peel two limbs; the source may be unaligned
+ lea -16(up), up
+ vmovdqa %xmm0, 16(rp) C aligned 16-byte store; rp is 16-byte aligned here
+ lea -16(rp), rp
+ sub $2, n C two limbs done
+L(a3): sub $16, n C reserve 16 limbs for one pass of the main loop
+ jc L(sma) C borrow: fewer than 16 limbs remain, go to the tail
+
+ ALIGN(16)
+L(top): vmovdqu (up), %ymm0 C main loop: 4 x 32 bytes = 16 limbs per pass
+ vmovdqu -32(up), %ymm1
+ vmovdqu -64(up), %ymm2
+ vmovdqu -96(up), %ymm3
+ lea -128(up), up C step the source down by 128 bytes
+ vmovdqa %ymm0, (rp) C aligned 32-byte stores; rp is 32-byte aligned here
+ vmovdqa %ymm1, -32(rp)
+ vmovdqa %ymm2, -64(rp)
+ vmovdqa %ymm3, -96(rp)
+ lea -128(rp), rp C step the destination down by 128 bytes
+L(ali): sub $16, n C L(ali) is not referenced anywhere in the visible code
+ jnc L(top) C loop while at least 16 more limbs remain
+
+L(sma): test $8, R8(n) C tail: subtracting 16 left the low 4 bits of n intact, they hold the leftover count
+ jz 1f
+ vmovdqu (up), %ymm0 C bit 3 set: copy 8 limbs
+ vmovdqu -32(up), %ymm1
+ lea -64(up), up
+ vmovdqa %ymm0, (rp)
+ vmovdqa %ymm1, -32(rp)
+ lea -64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ vmovdqu (up), %ymm0 C bit 2 set: copy 4 limbs
+ lea -32(up), up
+ vmovdqa %ymm0, (rp)
+ lea -32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ vmovdqu 16(up), %xmm0 C bit 1 set: copy 2 limbs with a 16-byte move
+ lea -16(up), up
+ vmovdqa %xmm0, 16(rp)
+ lea -16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov 24(up), %r8 C bit 0 set: copy the last limb; pointers need no update
+ mov %r8, 24(rp)
+1:
+ FUNC_EXIT() C macro defined elsewhere; presumably undoes FUNC_ENTRY -- confirm
+ ret
+
+ ALIGN(16)
+L(bc): test $4, R8(n) C basecase: n is 0..7 here, handled with 8-byte moves only
+ jz 1f
+ mov 24(up), %rax C bit 2 set: copy 4 limbs, all loads before any store
+ mov 16(up), %rcx
+ mov 8(up), %r8
+ mov (up), %r9
+ lea -32(up), up
+ mov %rax, 24(rp)
+ mov %rcx, 16(rp)
+ mov %r8, 8(rp)
+ mov %r9, (rp)
+ lea -32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ mov 24(up), %rax C bit 1 set: copy 2 limbs
+ mov 16(up), %rcx
+ lea -16(up), up
+ mov %rax, 24(rp)
+ mov %rcx, 16(rp)
+ lea -16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov 24(up), %rax C bit 0 set: copy the last limb; pointers need no update
+ mov %rax, 24(rp)
+1:
+ FUNC_EXIT() C macro defined elsewhere; presumably undoes FUNC_ENTRY -- confirm
+ ret
+EPILOGUE()
diff -r fb4b3934029f -r d0d816cb0b8e mpn/x86_64/fastavx/copyi.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
More information about the gmp-commit
mailing list