[Gmp-commit] /var/hg/gmp: 9 new changesets

Sat May 2 11:01:33 UTC 2015

details:   /var/hg/gmp/rev/9b735677050d
changeset: 16604:9b735677050d
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Fri May 01 14:31:31 2015 +0200
description:
(all): Make GCD tuning last since it is not robust.

details:   /var/hg/gmp/rev/9561f6f74019
changeset: 16605:9561f6f74019
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Sat May 02 12:53:10 2015 +0200
description:
Add cycle counts for new CPUs.

details:   /var/hg/gmp/rev/ed237cf7cfca
changeset: 16606:ed237cf7cfca
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Sat May 02 12:55:14 2015 +0200
description:
Add cycle counts for new CPUs.

details:   /var/hg/gmp/rev/8b051e6d9b83
changeset: 16607:8b051e6d9b83
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Sat May 02 12:55:43 2015 +0200
description:
Add new CPUs.

details:   /var/hg/gmp/rev/c0258b7e9585
changeset: 16608:c0258b7e9585
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Sat May 02 12:56:30 2015 +0200
description:
Comment.

details:   /var/hg/gmp/rev/5ceaed930971
changeset: 16609:5ceaed930971
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Sat May 02 12:58:39 2015 +0200
description:
Comment.

details:   /var/hg/gmp/rev/d5c6fb42e882
changeset: 16610:d5c6fb42e882
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Sat May 02 12:58:57 2015 +0200
description:
Spacing fix.

details:   /var/hg/gmp/rev/2d492987a0a9
changeset: 16611:2d492987a0a9
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Sat May 02 12:59:42 2015 +0200
description:
(JMPENT): Suppress redundant newlines.

details:   /var/hg/gmp/rev/4fe5cd26ebf2
changeset: 16612:4fe5cd26ebf2
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Sat May 02 13:01:30 2015 +0200
description:
ChangeLog

diffstat:

 ChangeLog                            |   4 ++++
 mpn/x86_64/bobcat/gmp-mparam.h       |   1 +
 mpn/x86_64/coreibwl/addmul_1.asm     |   3 ++-
 mpn/x86_64/coreibwl/mul_1.asm        |   2 +-
 mpn/x86_64/fastavx/copyd.asm         |  11 ++++++-----
 mpn/x86_64/fastavx/copyi.asm         |  11 ++++++-----
 mpn/x86_64/fastsse/README            |   3 ++-
 mpn/x86_64/fastsse/com-palignr.asm   |  16 ++++++++++++----
 mpn/x86_64/fastsse/com.asm           |  15 ++++++++++++---
 mpn/x86_64/fastsse/copyd-palignr.asm |  17 ++++++++++-------
 mpn/x86_64/fastsse/copyd.asm         |  33 +++++++++++++++++++++++----------
 mpn/x86_64/fastsse/copyi-palignr.asm |  11 +++++++----
 mpn/x86_64/fastsse/copyi.asm         |  31 ++++++++++++++++++++-----------
 mpn/x86_64/gmp-mparam.h              |   1 -
 mpn/x86_64/x86_64-defs.m4            |   4 ++--
 tune/tuneup.c                        |  14 +++++++-------
 16 files changed, 115 insertions(+), 62 deletions(-)

diffs (truncated from 445 to 300 lines):

diff -r 6e11cd70e19e -r 4fe5cd26ebf2 ChangeLog

--- a/ChangeLog	Mon Apr 27 22:46:53 2015 +0200
+++ b/ChangeLog	Sat May 02 13:01:30 2015 +0200
@@ -1,3 +1,7 @@
+2015-05-01    <torbjorng at google.com>
+
+	* tune/tuneup.c (all): Make GCD tuning last since it is not robust.
+
 2015-04-27    <torbjorng at google.com>
 
 	* mpn/x86_64/coreibwl/gmp-mparam.h: New file.
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/bobcat/gmp-mparam.h
--- a/mpn/x86_64/bobcat/gmp-mparam.h	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/bobcat/gmp-mparam.h	Sat May 02 13:01:30 2015 +0200
@@ -31,6 +31,7 @@
 #define GMP_LIMB_BITS 64
 #define GMP_LIMB_BYTES 8
 
+/* Disable use of slow functions.  FIXME: We should disable lib inclusion.  */
 #undef HAVE_NATIVE_mpn_mul_2
 #undef HAVE_NATIVE_mpn_addmul_2
 
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/coreibwl/addmul_1.asm
--- a/mpn/x86_64/coreibwl/addmul_1.asm	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/coreibwl/addmul_1.asm	Sat May 02 13:01:30 2015 +0200
@@ -1,4 +1,4 @@
-dnl  AMD64 mpn_addmul_1 optimised for Broadwell.
+dnl  AMD64 mpn_addmul_1 optimised for Intel Broadwell.
 
 dnl  Copyright 2015 Free Software Foundation, Inc.
 
@@ -36,6 +36,7 @@
 C AMD bull	n/a
 C AMD pile	n/a
 C AMD steam	n/a
+C AMD excavator	 ?
 C AMD bobcat	n/a
 C AMD jaguar	n/a
 C Intel P4	n/a
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/coreibwl/mul_1.asm
--- a/mpn/x86_64/coreibwl/mul_1.asm	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/coreibwl/mul_1.asm	Sat May 02 13:01:30 2015 +0200
@@ -1,4 +1,4 @@
-dnl  AMD64 mpn_mul_1 optimised for Broadwell.
+dnl  AMD64 mpn_mul_1 optimised for Intel Broadwell.
 
 dnl  Copyright 2015 Free Software Foundation, Inc.
 
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/fastavx/copyd.asm
--- a/mpn/x86_64/fastavx/copyd.asm	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/fastavx/copyd.asm	Sat May 02 13:01:30 2015 +0200
@@ -1,9 +1,9 @@
 dnl  AMD64 mpn_copyd optimised for CPUs with fast AVX.
 
+dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
 dnl  Contributed to the GNU project by Torbjörn Granlund.
 
-dnl  Copyright 2003, 2005, 2007, 2011-2013 Free Software Foundation, Inc.
-
 dnl  This file is part of the GNU MP Library.
 dnl
 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
@@ -32,7 +32,8 @@
 
 include(`../config.m4')
 
-C cycles/limb aligned	      unaligned	      best seen	     for cpu?
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
 C AMD K8,K9	n/a
 C AMD K10	n/a
 C AMD bull	n/a
@@ -44,9 +45,9 @@
 C Intel core	n/a
 C Intel NHM	n/a
 C Intel SBR	 0.50		 0.91				N
-C Intel IBR	 ?
+C Intel IBR	 0.50		 0.65				N
 C Intel HWL	 0.25		 0.30				Y
-C Intel BWL	 ?
+C Intel BWL	 0.28		 0.37				Y
 C Intel atom	n/a
 C VIA nano	n/a
 
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/fastavx/copyi.asm
--- a/mpn/x86_64/fastavx/copyi.asm	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/fastavx/copyi.asm	Sat May 02 13:01:30 2015 +0200
@@ -1,9 +1,9 @@
 dnl  AMD64 mpn_copyi optimised for CPUs with fast AVX.
 
+dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
 dnl  Contributed to the GNU project by Torbjörn Granlund.
 
-dnl  Copyright 2003, 2005, 2007, 2011-2013 Free Software Foundation, Inc.
-
 dnl  This file is part of the GNU MP Library.
 dnl
 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
@@ -32,7 +32,8 @@
 
 include(`../config.m4')
 
-C cycles/limb aligned	      unaligned	      best seen	     for cpu?
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
 C AMD K8,K9	n/a
 C AMD K10	n/a
 C AMD bull	n/a
@@ -44,9 +45,9 @@
 C Intel core	n/a
 C Intel NHM	n/a
 C Intel SBR	 0.50		 0.91				N
-C Intel IBR	 ?
+C Intel IBR	 0.50		 0.65				N
 C Intel HWL	 0.25		 0.30				Y
-C Intel BWL	 ?
+C Intel BWL	 0.28		 0.37				Y
 C Intel atom	n/a
 C VIA nano	n/a
 
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/fastsse/README
--- a/mpn/x86_64/fastsse/README	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/fastsse/README	Sat May 02 13:01:30 2015 +0200
@@ -4,10 +4,11 @@
 Current processors that might benefit from this code are:
 
   AMD K10
-  AMD Bulldozer
+  AMD Bulldozer/Piledriver/Steamroller/Excavator
   Intel Nocona
   Intel Nehalem/Westmere
   Intel Sandybridge/Ivybridge
+  Intel Haswell/Broadwell
   VIA Nano
 
 Current processors that do not benefit from this code are:
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/fastsse/com-palignr.asm
--- a/mpn/x86_64/fastsse/com-palignr.asm	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/fastsse/com-palignr.asm	Sat May 02 13:01:30 2015 +0200
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.
 
-dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
@@ -36,13 +36,21 @@
 C              aligned	      unaligned	      best seen	     for cpu?
 C AMD K8,K9	 2.0		 illop		1.0/1.0		N
 C AMD K10	 0.85		 illop				Y/N
-C AMD bd1	 1.39		 ? 1.45				Y/N
+C AMD bull	 1.39		 ? 1.45				Y/N
+C AMD pile     0.8-1.4	       0.7-1.4				Y
+C AMD steam
+C AMD excavator
 C AMD bobcat	 1.97		 ? 8.17		1.5/1.5		N
+C AMD jaguar	 1.02		 1.02		0.91/0.91	N
 C Intel P4	 2.26		 illop				Y/N
-C Intel core2	 0.52		 0.82		opt/0.74	Y
+C Intel core	 0.52		 0.95		opt/0.74	Y
 C Intel NHM	 0.52		 0.65		opt/opt		Y
-C Intel SBR	 0.51		 0.55		opt/0.51	Y
+C Intel SBR	 0.51		 0.65		opt/opt		Y
+C Intel IBR	 0.50		 0.64		opt/0.57	Y
+C Intel HWL	 0.51		 0.58		opt/opt		Y
+C Intel BWL	 0.57		 0.69		opt/0.65	Y
 C Intel atom	 1.16		 1.70		opt/opt		Y
+C Intel SLM	 1.02		 1.52				N
 C VIA nano	 1.09		 1.10		opt/opt		Y
 
 C We use only 16-byte operations, except for unaligned top-most and bottom-most
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/fastsse/com.asm
--- a/mpn/x86_64/fastsse/com.asm	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/fastsse/com.asm	Sat May 02 13:01:30 2015 +0200
@@ -1,6 +1,7 @@
 dnl  AMD64 mpn_com optimised for CPUs with fast SSE.
 
-dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl  Inc.
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
@@ -36,13 +37,21 @@
 C              aligned	      unaligned	      best seen	     for cpu?
 C AMD K8,K9	 2.0		 2.0				N
 C AMD K10	 0.85		 1.3				Y/N
-C AMD bd1	 1.40		 1.40				Y
+C AMD bull	 1.40		 1.40				Y
+C AMD pile     0.9-1.4	       0.9-1.4				Y
+C AMD steam
+C AMD excavator
 C AMD bobcat	 3.1		 3.1				N
+C AMD jaguar	 0.91		 0.91		opt/opt		Y
 C Intel P4	 2.28		 illop				Y
 C Intel core2	 1.02		 1.02				N
 C Intel NHM	 0.53		 0.68				Y
-C Intel SBR	 0.51		 0.75				Y
+C Intel SBR	 0.51		 0.75		opt/0.65	Y/N
+C Intel IBR	 0.50		 0.57		opt/opt		Y
+C Intel HWL	 0.51		 0.64		opt/0.58	Y
+C Intel BWL	 0.61		 0.65		0.57/opt	Y
 C Intel atom	 3.68		 3.68				N
+C Intel SLM	 1.09		 1.35				N
 C VIA nano	 1.17		 5.09				Y/N
 
 C We try to do as many 16-byte operations as possible.  The top-most and
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/fastsse/copyd-palignr.asm
--- a/mpn/x86_64/fastsse/copyd-palignr.asm	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/fastsse/copyd-palignr.asm	Sat May 02 13:01:30 2015 +0200
@@ -1,6 +1,6 @@
 dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.
 
-dnl  Copyright 2012 Free Software Foundation, Inc.
+dnl  Copyright 2012, 2015 Free Software Foundation, Inc.
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
@@ -38,16 +38,19 @@
 C AMD K10	 0.85		 illop				Y/N
 C AMD bull	 0.70		 0.70				Y
 C AMD pile	 0.68		 0.68				Y
-C AMD steam	 ?		 ?
+C AMD steam
+C AMD excavator
 C AMD bobcat	 1.97		 8.24		1.5/1.5		N
-C AMD jaguar	 ?		 ?
+C AMD jaguar	 0.77		 0.89		0.65/opt	N/Y
 C Intel P4	 2.26		 illop				Y/N
-C Intel core	 0.52		0.68-0.80	opt/0.64	Y
+C Intel core	 0.52		 0.80		opt/opt		Y
 C Intel NHM	 0.52		 0.64		opt/opt		Y
-C Intel SBR	 0.51		 0.51		opt/0.51	Y
-C Intel IBR	 ?		 ?				Y
-C Intel HWL	 0.51		 0.51		0.25/0.25	N
+C Intel SBR	 0.51		 0.51		opt/opt		Y
+C Intel IBR	 0.50		 0.50		opt/opt		Y
+C Intel HWL	 0.50		 0.51		opt/opt		Y
+C Intel BWL	 0.55		 0.55		opt/opt		Y
 C Intel atom	 1.16		 1.66		opt/opt		Y
+C Intel SLM	 1.02		 1.04		opt/opt		Y
 C VIA nano	 1.08		 1.06		opt/opt		Y
 
 C We use only 16-byte operations, except for unaligned top-most and bottom-most
diff -r 6e11cd70e19e -r 4fe5cd26ebf2 mpn/x86_64/fastsse/copyd.asm
--- a/mpn/x86_64/fastsse/copyd.asm	Mon Apr 27 22:46:53 2015 +0200
+++ b/mpn/x86_64/fastsse/copyd.asm	Sat May 02 13:01:30 2015 +0200
@@ -1,6 +1,9 @@
 dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE.
 
-dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl  Inc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -30,18 +33,26 @@
 
 include(`../config.m4')
 
-
-C	    cycles/limb		  good for cpu?
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
 C AMD K8,K9
-C AMD K10	 0.85			Y
-C AMD bd1	 0.8			Y
+C AMD K10	 0.85		 1.64				Y/N
+C AMD bull	 1.4		 1.4				Y
+C AMD pile	 0.68		 0.98				Y/N
+C AMD steam
+C AMD excavator
 C AMD bobcat
-C Intel P4	 2.28			Y
-C Intel core2	 1
-C Intel NHM	 0.5			Y
-C Intel SBR	 0.5			Y
+C AMD jaguar	 0.65		 1.02		opt/0.93	Y/N
+C Intel P4	 2.3		 2.3				Y
+C Intel core	 1.0		 1.0		0.52/0.80	N
+C Intel NHM	 0.5		 0.67				Y
+C Intel SBR	 0.51		 0.75		opt/0.54	Y/N
+C Intel IBR	 0.50		 0.57		opt/0.50	Y
+C Intel HWL	 0.50		 0.57		opt/0.51	Y
+C Intel BWL	 0.55		 0.62		opt/0.55	Y
 C Intel atom
-C VIA nano	 1.1			Y
+C Intel SLM	 1.02		 1.27		opt/1.04	Y/N
+C VIA nano	 1.16		 5.16				Y/N
 
 C We try to do as many 16-byte operations as possible.  The top-most and
 C bottom-most writes might need 8-byte operations.  We can always write using
@@ -61,6 +72,8 @@
 ABI_SUPPORT(DOS64)
 ABI_SUPPORT(STD64)