[Gmp-commit] /var/hg/gmp: 8 new changesets
    mercurial at gmplib.org 
    mercurial at gmplib.org
       
    Sun Aug  4 22:16:36 CEST 2013
    
    
  
details:   /var/hg/gmp/rev/f056e81865d6
changeset: 15918:f056e81865d6
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Aug 04 13:40:19 2013 +0200
description:
Provide Haswell param file.
details:   /var/hg/gmp/rev/278d8d37d000
changeset: 15919:278d8d37d000
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Aug 04 19:52:10 2013 +0200
description:
Provide bulldozer/piledriver mul_2.
details:   /var/hg/gmp/rev/d9b6f610ddb1
changeset: 15920:d9b6f610ddb1
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Aug 04 19:53:28 2013 +0200
description:
Misc.
details:   /var/hg/gmp/rev/716e54116b92
changeset: 15921:716e54116b92
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Aug 04 20:18:59 2013 +0200
description:
Fix typo.
details:   /var/hg/gmp/rev/813f18cea7f7
changeset: 15922:813f18cea7f7
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Aug 04 20:22:21 2013 +0200
description:
Update comment.
details:   /var/hg/gmp/rev/eec24cf4ec44
changeset: 15923:eec24cf4ec44
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Aug 04 20:22:42 2013 +0200
description:
Remove obsolete comment.
details:   /var/hg/gmp/rev/0618a268eb03
changeset: 15924:0618a268eb03
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Aug 04 22:16:20 2013 +0200
description:
Provide haswell mul_basecase.
details:   /var/hg/gmp/rev/3b7f3825b746
changeset: 15925:3b7f3825b746
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Aug 04 22:16:30 2013 +0200
description:
ChangeLog
diffstat:
 ChangeLog                                 |    8 +
 mpn/x86/p6/lshsub_n.asm                   |    2 +-
 mpn/x86_64/bd1/mul_2.asm                  |  181 ++++++++++++
 mpn/x86_64/bd1/mul_basecase.asm           |    3 +-
 mpn/x86_64/coreihwl/gmp-mparam.h          |  152 ++++++++++
 mpn/x86_64/coreihwl/mulx/mul_basecase.asm |  431 ++++++++++++++++++++++++++++++
 mpn/x86_64/coreisbr/popcount.asm          |    3 -
 7 files changed, 774 insertions(+), 6 deletions(-)
diffs (truncated from 829 to 300 lines):
diff -r 4e9337c30cf4 -r 3b7f3825b746 ChangeLog
--- a/ChangeLog	Sun Aug 04 02:13:11 2013 +0200
+++ b/ChangeLog	Sun Aug 04 22:16:30 2013 +0200
@@ -1,3 +1,11 @@
+2013-08-04  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86_64/coreihwl/mulx/mul_basecase.asm: New file.
+
+	* mpn/x86_64/bd1/mul_2.asm: New file.
+
+	* mpn/x86_64/coreihwl/gmp-mparam.h: New file.
+
 2013-08-03  Torbjorn Granlund  <tege at gmplib.org>
 
 	* mpn/x86_64/coreihwl/mulx/mul_2.asm: New file.
diff -r 4e9337c30cf4 -r 3b7f3825b746 mpn/x86/p6/lshsub_n.asm
--- a/mpn/x86/p6/lshsub_n.asm	Sun Aug 04 02:13:11 2013 +0200
+++ b/mpn/x86/p6/lshsub_n.asm	Sun Aug 04 22:16:30 2013 +0200
@@ -21,7 +21,7 @@
 
 C P6/13: 3.35 cycles/limb	(separate mpn_sub_n + mpn_lshift needs 4.12)
 
-C (1) The loop is is not scheduled in any way, and scheduling attempts have not
+C (1) The loop is not scheduled in any way, and scheduling attempts have not
 C     improved speed on P6/13.  Presumably, the K7 will want scheduling, if it
 C     at all wants to use MMX.
 C (2) We could save a register by not alternatingly using eax and edx in the
diff -r 4e9337c30cf4 -r 3b7f3825b746 mpn/x86_64/bd1/mul_2.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bd1/mul_2.asm	Sun Aug 04 22:16:30 2013 +0200
@@ -0,0 +1,181 @@
+dnl  AMD64 mpn_mul_2 optimised for AMD Bulldozer.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2008, 2011, 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bull	4.36		average, quite fluctuating
+C AMD pile	4.38		slightly fluctuating
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+C Scheme: genxmul --mul
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vp',      `%rcx')   C r9
+
+define(`v0', `%r8')		C multiplier limb loaded from (vp)
+define(`v1', `%r9')		C multiplier limb loaded from 8(vp)
+define(`w0', `%rbx')		C w0..w3 are the accumulator words; %rbx is pushed/popped by the function
+define(`w1', `%rcx')		C same register as vp: both vp limbs are loaded before w1 is first written
+define(`w2', `%rbp')		C %rbp is pushed/popped by the function
+define(`w3', `%r10')
+define(`n',  `%r11')		C negated limb count, used as the index that runs up towards zero
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_2)		C forms up[i]*v0 and up[i]*v1 for each limb of up, stores result limbs via rp, returns the top word in %rax
+	FUNC_ENTRY(4)		C NOTE(review): presumably remaps the 4 DOS64 argument registers to the STD64 ones; defined outside this file, confirm
+	push	%rbx		C w0 lives in %rbx; restored before ret
+	push	%rbp		C w2 lives in %rbp; restored before ret
+
+	mov	(up), %rax		C rax = up[0]
+
+	mov	(vp), v0		C v0 = vp[0]
+	mov	8(vp), v1		C v1 = vp[1]; last use of vp, so %rcx can now serve as w1
+
+	lea	(up,n_param,8), up	C point up past its last limb; limbs are then reached with negative n
+	lea	(rp,n_param,8), rp	C same adjustment for rp
+
+	mov	n_param, n		C copy the count before mul overwrites %rdx (n_param)
+	mul	v0			C rdx:rax = up[0] * v0
+	neg	n			C n = -n_param
+
+	test	$1, R8(n)		C the low two bits of n select one of four loop entry points
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	mov	%rax, w0		C low bits 00: first product goes to w1:w0
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mov	(up,n,8), %rax		C reload up[0] for the v1 multiply at L(lo0)
+	jmp	L(lo0)
+
+L(b10):	mov	%rax, w2		C low bits 10: first product goes to w3:w2
+	mov	%rdx, w3
+	mov	(up,n,8), %rax		C reload up[0]
+	xor	R32(w0), R32(w0)
+	mul	v1			C rdx:rax = up[0] * v1, added in at L(lo2)
+	add	$-2, n			C bias n so the displacements used from L(lo2) onwards line up
+	jmp	L(lo2)
+
+L(bx1):	test	$2, R8(n)
+	jz	L(b11)
+
+L(b01):	mov	%rax, w3		C low bits 11: first product goes to w0:w3
+	mov	%rdx, w0
+	mov	(up,n,8), %rax		C reload up[0]
+	mul	v1			C rdx:rax = up[0] * v1, added in at L(lo1)
+	xor	R32(w1), R32(w1)
+	inc	n			C bias n for entry at L(lo1)
+	jmp	L(lo1)
+
+L(b11):	mov	%rax, w1		C low bits 01: first product goes to w2:w1
+	mov	%rdx, w2
+	mov	(up,n,8), %rax		C reload up[0] for the v1 multiply at L(lo3)
+	xor	R32(w3), R32(w3)
+	dec	n			C bias n for entry at L(lo3)
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):	mov	-8(up,n,8), %rax	C main loop: n advances by 4 per pass, w0..w3 rotate as accumulator words
+	mul	v1
+	mov	w2, -16(rp,n,8)		C store a finished result limb
+L(lo1):	add	%rax, w0
+	mov	w3, -8(rp,n,8)		C store a finished result limb
+	adc	%rdx, w1
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	$0, R32(w2)		C clear w2, which was stored above
+	add	%rax, w0
+	adc	%rdx, w1
+	adc	$0, R32(w2)		C catch the carry out of w1 in w2
+	mov	(up,n,8), %rax		C same limb again, this time for v1
+L(lo0):	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,n,8), %rax
+	mul	v0
+	add	%rax, w1
+	mov	w0, (rp,n,8)		C store a finished result limb
+	mov	$0, R32(w3)		C mov, not xor: the carry from the add above must survive until the adc below
+	mov	8(up,n,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+L(lo3):	mul	v1
+	add	%rax, w2
+	mov	16(up,n,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	add	%rax, w2
+	mov	16(up,n,8), %rax
+	mov	$0, R32(w0)		C mov, not xor: keeps the pending carry for the adc below
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	mov	w1, 8(rp,n,8)		C store a finished result limb
+L(lo2):	add	%rax, w3
+	adc	%rdx, w0
+	mov	24(up,n,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	mov	$0, R32(w1)		C mov, not xor: keeps the pending carry for the adc below
+	adc	$0, R32(w1)
+	add	$4, n
+	jnc	L(top)			C no carry means n is still negative, so keep looping
+
+L(end):	mov	-8(up,n,8), %rax	C wind-down: only the last up limb times v1 remains
+	mul	v1
+	mov	w2, -16(rp,n,8)
+	add	%rax, w0
+	mov	w3, -8(rp,n,8)
+	adc	%rdx, w1
+	mov	w0, (rp,n,8)		C last limb written through rp
+	mov	w1, %rax		C return the word above the last stored limb
+
+	pop	%rbp			C restore in reverse order of the pushes
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff -r 4e9337c30cf4 -r 3b7f3825b746 mpn/x86_64/bd1/mul_basecase.asm
--- a/mpn/x86_64/bd1/mul_basecase.asm	Sun Aug 04 02:13:11 2013 +0200
+++ b/mpn/x86_64/bd1/mul_basecase.asm	Sun Aug 04 22:16:30 2013 +0200
@@ -46,8 +46,7 @@
 C  * Merge bull-specific mul_1, if it is not slower the TOOM22 range.
 C    Alternatively, we could tweak the present code (which was loopmixed for a
 C    different CPU).
-C  * Merge faster mul_2.  Current fastest mul_2 code is non-indexed, causing
-C    some structure headaches.
+C  * Merge faster mul_2, such as the one in the same directory as this file.
 C  * Further micro-optimise.
 
 C When playing with pointers, set this to $2 to fall back to conservative
diff -r 4e9337c30cf4 -r 3b7f3825b746 mpn/x86_64/coreihwl/gmp-mparam.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreihwl/gmp-mparam.h	Sun Aug 04 22:16:30 2013 +0200
@@ -0,0 +1,152 @@
+/* Haswell gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define BYTES_PER_MP_LIMB 8
+
+/* 2900 MHz Core i5 Haswell */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        28
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD              35
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           25
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                66
+#define MUL_TOOM44_THRESHOLD               181
+#define MUL_TOOM6H_THRESHOLD               274
+#define MUL_TOOM8H_THRESHOLD               333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     107
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     125
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     128
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     154
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 22
+#define SQR_TOOM3_THRESHOLD                 85
+#define SQR_TOOM4_THRESHOLD                226
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                406
+
+#define MULMID_TOOM42_THRESHOLD             20
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               14
+
+#define MUL_FFT_MODF_THRESHOLD             380  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    380, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
    
    
More information about the gmp-commit
mailing list