[Gmp-commit] /var/hg/gmp: 8 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sun Aug 4 22:16:36 CEST 2013
details: /var/hg/gmp/rev/f056e81865d6
changeset: 15918:f056e81865d6
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Aug 04 13:40:19 2013 +0200
description:
Provide Haswell param file.
details: /var/hg/gmp/rev/278d8d37d000
changeset: 15919:278d8d37d000
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Aug 04 19:52:10 2013 +0200
description:
Provide bulldozer/piledriver mul_2.
details: /var/hg/gmp/rev/d9b6f610ddb1
changeset: 15920:d9b6f610ddb1
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Aug 04 19:53:28 2013 +0200
description:
Misc.
details: /var/hg/gmp/rev/716e54116b92
changeset: 15921:716e54116b92
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Aug 04 20:18:59 2013 +0200
description:
Fix typo.
details: /var/hg/gmp/rev/813f18cea7f7
changeset: 15922:813f18cea7f7
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Aug 04 20:22:21 2013 +0200
description:
Update comment.
details: /var/hg/gmp/rev/eec24cf4ec44
changeset: 15923:eec24cf4ec44
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Aug 04 20:22:42 2013 +0200
description:
Remove obsolete comment.
details: /var/hg/gmp/rev/0618a268eb03
changeset: 15924:0618a268eb03
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Aug 04 22:16:20 2013 +0200
description:
Provide haswell mul_basecase.
details: /var/hg/gmp/rev/3b7f3825b746
changeset: 15925:3b7f3825b746
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Aug 04 22:16:30 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 8 +
mpn/x86/p6/lshsub_n.asm | 2 +-
mpn/x86_64/bd1/mul_2.asm | 181 ++++++++++++
mpn/x86_64/bd1/mul_basecase.asm | 3 +-
mpn/x86_64/coreihwl/gmp-mparam.h | 152 ++++++++++
mpn/x86_64/coreihwl/mulx/mul_basecase.asm | 431 ++++++++++++++++++++++++++++++
mpn/x86_64/coreisbr/popcount.asm | 3 -
7 files changed, 774 insertions(+), 6 deletions(-)
diffs (truncated from 829 to 300 lines):
diff -r 4e9337c30cf4 -r 3b7f3825b746 ChangeLog
--- a/ChangeLog Sun Aug 04 02:13:11 2013 +0200
+++ b/ChangeLog Sun Aug 04 22:16:30 2013 +0200
@@ -1,3 +1,11 @@
+2013-08-04 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/coreihwl/mulx/mul_basecase.asm: New file.
+
+ * mpn/x86_64/bd1/mul_2.asm: New file.
+
+ * mpn/x86_64/coreihwl/gmp-mparam.h: New file.
+
2013-08-03 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/coreihwl/mulx/mul_2.asm: New file.
diff -r 4e9337c30cf4 -r 3b7f3825b746 mpn/x86/p6/lshsub_n.asm
--- a/mpn/x86/p6/lshsub_n.asm Sun Aug 04 02:13:11 2013 +0200
+++ b/mpn/x86/p6/lshsub_n.asm Sun Aug 04 22:16:30 2013 +0200
@@ -21,7 +21,7 @@
C P6/13: 3.35 cycles/limb (separate mpn_sub_n + mpn_lshift needs 4.12)
-C (1) The loop is is not scheduled in any way, and scheduling attempts have not
+C (1) The loop is not scheduled in any way, and scheduling attempts have not
C improved speed on P6/13. Presumably, the K7 will want scheduling, if it
C at all wants to use MMX.
C (2) We could save a register by not alternatingly using eax and edx in the
diff -r 4e9337c30cf4 -r 3b7f3825b746 mpn/x86_64/bd1/mul_2.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bd1/mul_2.asm Sun Aug 04 22:16:30 2013 +0200
@@ -0,0 +1,181 @@
+dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2011, 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bull 4.36 average, quite fluctuating
+C AMD pile 4.38 slightly fluctuating
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+C Scheme: genxmul --mul
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_2)  C {up,n} times {vp,2}: low n+1 limbs stored at rp, top limb returned in rax
+ FUNC_ENTRY(4)  C NOTE(review): presumably moves the 4 DOS64 argument registers into rdi/rsi/rdx/rcx - confirm in the m4 support files
+ push %rbx  C w0 lives in callee-saved rbx
+ push %rbp  C w2 lives in callee-saved rbp
+
+ mov (up), %rax  C rax = up[0]
+
+ mov (vp), v0  C v0 = vp[0]
+ mov 8(vp), v1  C v1 = vp[1]; vp is dead from here on, its register rcx is reused as w1
+
+ lea (up,n_param,8), up  C up = one past the last source limb
+ lea (rp,n_param,8), rp  C rp advanced by the same n limbs
+
+ mov n_param, n  C copy the count out of rdx before mul overwrites it
+ mul v0  C rdx:rax = up[0] * v0
+ neg n  C n = -n_param: negative index that counts up towards zero
+
+ test $1, R8(n)  C dispatch on the low two bits of the count to pick the loop entry
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): mov %rax, w0  C n_param = 0 mod 4: up[0]*v0 goes to w1:w0
+ mov %rdx, w1
+ xor R32(w2), R32(w2)  C w2 = 0, next accumulator limb
+ mov (up,n,8), %rax  C reload up[0] for the v1 product done at lo0
+ jmp L(lo0)
+
+L(b10): mov %rax, w2  C n_param = 2 mod 4: up[0]*v0 goes to w3:w2
+ mov %rdx, w3
+ mov (up,n,8), %rax  C reload up[0]
+ xor R32(w0), R32(w0)  C w0 = 0, next accumulator limb
+ mul v1  C rdx:rax = up[0] * v1, accumulated at lo2
+ add $-2, n  C bias n so the indexed offsets used from lo2 onwards line up
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n)
+ jz L(b11)
+
+L(b01): mov %rax, w3  C n_param = 1 mod 4: up[0]*v0 goes to w0:w3
+ mov %rdx, w0
+ mov (up,n,8), %rax  C reload up[0]
+ mul v1  C rdx:rax = up[0] * v1, accumulated at lo1
+ xor R32(w1), R32(w1)  C w1 = 0; xor changes flags only, rdx:rax stay intact
+ inc n  C NOTE(review): n_param = 1 would reach lo1 with n = 0 and the jnc exit would never fire; presumably callers ensure n >= 2 - confirm
+ jmp L(lo1)
+
+L(b11): mov %rax, w1  C n_param = 3 mod 4: up[0]*v0 goes to w2:w1
+ mov %rdx, w2
+ mov (up,n,8), %rax  C reload up[0] for the v1 product done at lo3
+ xor R32(w3), R32(w3)  C w3 = 0, next accumulator limb
+ dec n  C bias n so the indexed offsets used from lo3 onwards line up
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): mov -8(up,n,8), %rax  C main loop: four source limbs per pass, w0..w3 rotate as accumulator
+ mul v1
+ mov w2, -16(rp,n,8)  C store a finished result limb
+L(lo1): add %rax, w0
+ mov w3, -8(rp,n,8)  C store a finished result limb
+ adc %rdx, w1
+ mov (up,n,8), %rax
+ mul v0
+ mov $0, R32(w2)  C w2 restarts at zero as the new top accumulator limb
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, R32(w2)  C catch the carry out of w1
+ mov (up,n,8), %rax  C same source limb again, now for v1
+L(lo0): mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,n,8), %rax
+ mul v0
+ add %rax, w1
+ mov w0, (rp,n,8)  C store a finished result limb
+ mov $0, R32(w3)  C mov, not xor: CF from the add above must survive until the adc below
+ mov 8(up,n,8), %rax  C same source limb again, now for v1
+ adc %rdx, w2
+ adc $0, R32(w3)  C catch the carry out of w2
+L(lo3): mul v1
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ adc %rdx, w3
+ mul v0
+ add %rax, w2
+ mov 16(up,n,8), %rax  C same source limb again, now for v1
+ mov $0, R32(w0)  C mov, not xor: CF from the add above must survive until the adc below
+ adc %rdx, w3
+ adc $0, R32(w0)  C catch the carry out of w3
+ mul v1
+ mov w1, 8(rp,n,8)  C store a finished result limb
+L(lo2): add %rax, w3
+ adc %rdx, w0
+ mov 24(up,n,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ mov $0, R32(w1)  C mov, not xor: CF from the adc above feeds the adc below
+ adc $0, R32(w1)  C catch the carry out of w0
+ add $4, n  C advance four limbs
+ jnc L(top)  C the add carries once n has climbed from negative to zero
+
+L(end): mov -8(up,n,8), %rax  C wind-down: last source limb times v1
+ mul v1
+ mov w2, -16(rp,n,8)
+ add %rax, w0
+ mov w3, -8(rp,n,8)
+ adc %rdx, w1
+ mov w0, (rp,n,8)  C last stored limb, at index n_param of the result
+ mov w1, %rax  C return value: most significant limb of the product
+
+ pop %rbp  C restore callee-saved registers in reverse push order
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff -r 4e9337c30cf4 -r 3b7f3825b746 mpn/x86_64/bd1/mul_basecase.asm
--- a/mpn/x86_64/bd1/mul_basecase.asm Sun Aug 04 02:13:11 2013 +0200
+++ b/mpn/x86_64/bd1/mul_basecase.asm Sun Aug 04 22:16:30 2013 +0200
@@ -46,8 +46,7 @@
C * Merge bull-specific mul_1, if it is not slower the TOOM22 range.
C Alternatively, we could tweak the present code (which was loopmixed for a
C different CPU).
-C * Merge faster mul_2. Current fastest mul_2 code is non-indexed, causing
-C some structure headaches.
+C * Merge faster mul_2, such as the one in the same directory as this file.
C * Further micro-optimise.
C When playing with pointers, set this to $2 to fall back to conservative
diff -r 4e9337c30cf4 -r 3b7f3825b746 mpn/x86_64/coreihwl/gmp-mparam.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreihwl/gmp-mparam.h Sun Aug 04 22:16:30 2013 +0200
@@ -0,0 +1,152 @@
+/* Haswell gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define BYTES_PER_MP_LIMB 8
+
+/* 2900 MHz Core i5 Haswell */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD 35
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 25
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 66
+#define MUL_TOOM44_THRESHOLD 181
+#define MUL_TOOM6H_THRESHOLD 274
+#define MUL_TOOM8H_THRESHOLD 333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 107
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 125
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 128
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 154
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 22
+#define SQR_TOOM3_THRESHOLD 85
+#define SQR_TOOM4_THRESHOLD 226
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 406
+
+#define MULMID_TOOM42_THRESHOLD 20
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 14
+
+#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
More information about the gmp-commit
mailing list