[Gmp-commit] /var/hg/gmp: 4 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Sat Sep 14 11:12:17 UTC 2019


details:   /var/hg/gmp/rev/fdb32090830c
changeset: 17894:fdb32090830c
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Sep 14 13:09:14 2019 +0200
description:
(GMP_ASM_X86_MULX): Define X86_ASM_MULX in config.h.

details:   /var/hg/gmp/rev/59d8cef5e18e
changeset: 17895:59d8cef5e18e
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Sep 14 13:10:15 2019 +0200
description:
Update use of GMP_ASM_X86_MULX.

details:   /var/hg/gmp/rev/ce6516e7d7c8
changeset: 17896:ce6516e7d7c8
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Sep 14 13:10:36 2019 +0200
description:
(x86 umul_ppmm): Test also X86_ASM_MULX for when to use mulx variant.

details:   /var/hg/gmp/rev/0bc765c5b024
changeset: 17897:0bc765c5b024
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Sep 14 13:12:15 2019 +0200
description:
Trivial merge.

diffstat:

 ChangeLog     |    6 ++
 acinclude.m4  |    2 +
 configure.ac  |   14 ++--
 longlong.h    |    5 +-
 tune/tuneup.c |  172 +++++++++++++++++++++++----------------------------------
 5 files changed, 90 insertions(+), 109 deletions(-)

diffs (truncated from 333 to 300 lines):

diff -r edfbbc5d4e48 -r 0bc765c5b024 ChangeLog
--- a/ChangeLog	Fri Sep 13 14:32:41 2019 +0200
+++ b/ChangeLog	Sat Sep 14 13:12:15 2019 +0200
@@ -1,3 +1,9 @@
+2019-09-13  Niels Möller  <nisse at lysator.liu.se>
+
+	* tune/tuneup.c (one_method): New helper function, to measure
+	several functions for a fixed size.
+	(tune_hgcd2, tune_div_qr_1, tune_mod_1, tune_jacobi_base): Use it.
+
 2019-09-05  Torbjörn Granlund  <tg at gmplib.org>
 
 	* mpn/arm64/gcd_22.asm: Rewrite to make better use of Arm conditional
diff -r edfbbc5d4e48 -r 0bc765c5b024 acinclude.m4
--- a/acinclude.m4	Fri Sep 13 14:32:41 2019 +0200
+++ b/acinclude.m4	Sat Sep 14 13:12:15 2019 +0200
@@ -2659,6 +2659,8 @@
 ])
 case $gmp_cv_asm_x86_mulx in
 yes)
+  AC_DEFINE(X86_ASM_MULX, 1,
+  [Define to 1 if the assembler understands the mulx instruction])
   ifelse([$1],,:,[$1])
   ;;
 *)
diff -r edfbbc5d4e48 -r 0bc765c5b024 configure.ac
--- a/configure.ac	Fri Sep 13 14:32:41 2019 +0200
+++ b/configure.ac	Sat Sep 14 13:12:15 2019 +0200
@@ -1873,11 +1873,13 @@
 	gcc_cflags_arch="-march=bdver4 -march=bdver3 -march=bdver2 -march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
 	path="x86/bd4 x86/bd3 x86/bd2 x86/bd1 x86/k7/mmx x86/k7 x86/mmx x86"
 	path_64="x86_64/bd4 x86_64/bd3 x86_64/bd2 x86_64/bd1 x86_64/k10 x86_64/k8 x86_64"
+	x86_have_mulx=1
 	;;
       zen | zennoavx)
 	gcc_cflags_cpu="-mtune=znver1 -mtune=amdfam10 -mtune=k8"
 	gcc_cflags_arch="-march=znver1 -march=amdfam10 -march=k8"
 	path="x86/k7/mmx x86/k7 x86/mmx x86"
+	x86_have_mulx=1
 	path_64="x86_64/zen x86_64"
 	;;
       core2)
@@ -1904,6 +1906,7 @@
 	gcc_cflags_arch="-march=haswell -march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
 	path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
 	path_64="x86_64/coreihwl x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64"
+	x86_have_mulx=1
 	;;
       coreibwl | coreibwlnoavx | broadwell | broadwellnoavx)
 	gcc_cflags_cpu="-mtune=broadwell -mtune=corei7 -mtune=core2 -mtune=k8"
@@ -1911,6 +1914,7 @@
 	path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
 	path_64="x86_64/coreibwl x86_64/coreihwl x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64"
 	# extra_functions_64="missing"	 # enable for bmi2/adx simulation
+	x86_have_mulx=1
 	;;
       skylake | skylakenoavx | kabylake | kabylakenoavx)
 	gcc_cflags_cpu="-mtune=skylake -mtune=broadwell -mtune=corei7 -mtune=core2 -mtune=k8"
@@ -1919,6 +1923,7 @@
 	path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
 	path_64="x86_64/skylake x86_64/coreibwl x86_64/coreihwl x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64"
 	# extra_functions_64="missing"	 # enable for bmi2/adx simulation
+	x86_have_mulx=1
 	;;
       atom)			# in-order pipeline atom
 	gcc_cflags_cpu="-mtune=atom -mtune=pentium3"
@@ -2581,12 +2586,6 @@
         *sse2*)  GMP_ASM_X86_SSE2( , [GMP_STRIP_PATH(sse2)]) ;;
       esac
     fi
-    case "$path $fat_path" in
-      *mulx*)  GMP_ASM_X86_MULX( , [GMP_STRIP_PATH(mulx)]) ;;
-    esac
-    case "$path $fat_path" in
-      *adx*)   GMP_ASM_X86_ADX( , [GMP_STRIP_PATH(adx)]) ;;
-    esac
     ;;
 esac
 
@@ -3756,6 +3755,9 @@
       ;;
     X86_PATTERN | X86_64_PATTERN)
       GMP_ASM_ALIGN_FILL_0x90
+      if test $x86_have_mulx = 1; then
+        GMP_ASM_X86_MULX
+      fi
       case $ABI in
         32)
           GMP_INCLUDE_MPN(x86/x86-defs.m4)
diff -r edfbbc5d4e48 -r 0bc765c5b024 longlong.h
--- a/longlong.h	Fri Sep 13 14:32:41 2019 +0200
+++ b/longlong.h	Sat Sep 14 13:12:15 2019 +0200
@@ -1058,8 +1058,9 @@
 	   : "=r" (sh), "=&r" (sl)					\
 	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
 	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
-#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
-  || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen
+#if X86_ASM_MULX
+   && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell
+       || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
 #define umul_ppmm(w1, w0, u, v) \
   __asm__ ("mulx\t%3, %0, %1"						\
 	   : "=r" (w0), "=r" (w1)					\
diff -r edfbbc5d4e48 -r 0bc765c5b024 tune/tuneup.c
--- a/tune/tuneup.c	Fri Sep 13 14:32:41 2019 +0200
+++ b/tune/tuneup.c	Sat Sep 14 13:12:15 2019 +0200
@@ -519,8 +519,8 @@
 }
 
 void
-print_define_with_margin (const char *name, mp_size_t value,
-			  mp_size_t runner_up, double speedup)
+print_define_with_speedup (const char *name, mp_size_t value,
+			   mp_size_t runner_up, double speedup)
 {
   char buf[100];
   snprintf (buf, sizeof(buf), "%.2f%% faster than %ld",
@@ -712,6 +712,48 @@
     print_define_end (param->name, *threshold);
 }
 
+void
+one_method (int n, speed_function_t *functions,
+	    const char *name, const char *define,
+	    const struct param_t *param)
+{
+  double *t;
+  int i;
+  int method;
+  int method_runner_up;
+
+  TMP_DECL;
+  TMP_MARK;
+  t = TMP_ALLOC (n * sizeof (*t));
+
+  for (i = 0; i < n; i++)
+    {
+      t[i] = tuneup_measure (functions[i], param, &s);
+      if (option_trace >= 1)
+	printf ("size=%ld, %s, method %d %.9f\n",
+		(long) s.size, name, i + 1, t[i]);
+      if (t[i] == -1.0)
+	{
+	  printf ("Oops, can't measure all %s methods\n", name);
+	  abort ();
+	}
+    }
+  method = 0;
+  for (i = 1; i < n; i++)
+    if (t[i] < t[method])
+      method = i;
+
+  method_runner_up = (method == 0);
+  for (i = 0; i < n; i++)
+    if (i != method && t[i] < t[method_runner_up])
+      method_runner_up = i;
+
+  print_define_with_speedup (define, method + 1, method_runner_up + 1,
+			     t[method_runner_up] / t[method]);
+
+  TMP_FREE;
+}
+
 
 /* Special probing for the fft thresholds.  The size restrictions on the
    FFTs mean the graph of time vs size has a step effect.  See this for
@@ -1911,47 +1953,15 @@
 tune_hgcd2 (void)
 {
   static struct param_t  param;
-  double   t[3+1];
-  int      method;
-  int      runner_up_method;
-  double   runner_up_ratio;
+  speed_function_t f[3] =
+    {
+     speed_mpn_hgcd2_1,
+     speed_mpn_hgcd2_2,
+     speed_mpn_hgcd2_3,
+    };
 
   s.size = 1;
-  t[1] = tuneup_measure (speed_mpn_hgcd2_1, &param, &s);
-  if (option_trace >= 1)
-    printf ("size=%ld, mpn_hgcd2_1 %.9f\n", (long) s.size, t[1]);
-
-  t[2] = tuneup_measure (speed_mpn_hgcd2_2, &param, &s);
-  if (option_trace >= 1)
-    printf ("size=%ld, mpn_hgcd2_2 %.9f\n", (long) s.size, t[2]);
-
-  t[3] = tuneup_measure (speed_mpn_hgcd2_3, &param, &s);
-  if (option_trace >= 1)
-    printf ("size=%ld, mpn_hgcd2_3 %.9f\n", (long) s.size, t[3]);
-
-  if (t[1] == -1.0 || t[2] == -1.0 || t[3] == -1.0)
-    {
-      printf ("Oops, can't measure all mpn_hgcd2 methods\n");
-      abort ();
-    }
-
-  if (t[1] < t[2] && t[1] < t[3])
-    {
-      method = 1;
-      runner_up_method = (t[2] < t[3]) ? 2 : 3;
-    }
-  else if (t[2] < t[3])
-    {
-      method = 2;
-      runner_up_method = (t[1] < t[3]) ? 1 : 3;
-    }
-  else
-    {
-      method = 3;
-      runner_up_method = (t[1] < t[2]) ? 1 : 2;
-    }
-  print_define_with_margin ("HGCD2_METHOD", method, runner_up_method,
-			    t[runner_up_method] / t[method]);
+  one_method (3, f, "mpn_hgcd2", "HGCD2_METHOD", &param);
 }
 
 void
@@ -2227,22 +2237,16 @@
   if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
     {
       static struct param_t  param;
-      double   t1, t2;
+      speed_function_t f[2] =
+	{
+	 speed_mpn_div_qr_1n_pi1_1,
+	 speed_mpn_div_qr_1n_pi1_2,
+	};
 
       s.size = 10;
       s.r = randlimb_norm ();
 
-      t1 = tuneup_measure (speed_mpn_div_qr_1n_pi1_1, &param, &s);
-      t2 = tuneup_measure (speed_mpn_div_qr_1n_pi1_2, &param, &s);
-
-      if (t1 == -1.0 || t2 == -1.0)
-	{
-	  printf ("Oops, can't measure all mpn_div_qr_1n_pi1 methods at %ld\n",
-		  (long) s.size);
-	  abort ();
-	}
-      div_qr_1n_pi1_method = (t1 < t2) ? 1 : 2;
-      print_define ("DIV_QR_1N_PI1_METHOD", div_qr_1n_pi1_method);
+      one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
     }
 
   {
@@ -2289,22 +2293,15 @@
   if (!HAVE_NATIVE_mpn_mod_1_1p)
     {
       static struct param_t  param;
-      double   t1, t2;
+      speed_function_t f[2] =
+	{
+	 speed_mpn_mod_1_1_1,
+	 speed_mpn_mod_1_1_2,
+	};
 
       s.size = 10;
       s.r = randlimb_half ();
-
-      t1 = tuneup_measure (speed_mpn_mod_1_1_1, &param, &s);
-      t2 = tuneup_measure (speed_mpn_mod_1_1_2, &param, &s);
-
-      if (t1 == -1.0 || t2 == -1.0)
-	{
-	  printf ("Oops, can't measure all mpn_mod_1_1 methods at %ld\n",
-		  (long) s.size);
-	  abort ();
-	}
-      mod_1_1p_method = (t1 < t2) ? 1 : 2;
-      print_define ("MOD_1_1P_METHOD", mod_1_1p_method);
+      one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", &param);
     }
 
   if (UDIV_PREINV_ALWAYS)
@@ -2680,44 +2677,17 @@
 tune_jacobi_base (void)
 {
   static struct param_t  param;
-  double   t1, t2, t3, t4;
-  int      method;
+  speed_function_t f[4] =
+    {
+     speed_mpn_jacobi_base_1,
+     speed_mpn_jacobi_base_2,
+     speed_mpn_jacobi_base_3,
+     speed_mpn_jacobi_base_4,
+    };
 
   s.size = GMP_LIMB_BITS * 3 / 4;
 
-  t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
-  if (option_trace >= 1)
-    printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
-


More information about the gmp-commit mailing list