[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
Mon Apr 16 00:37:15 CEST 2012
details: /var/hg/gmp/rev/9ca269947f8f
changeset: 14832:9ca269947f8f
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Apr 15 23:27:51 2012 +0200
description:
Update c/l table.

details: /var/hg/gmp/rev/1723a489bf78
changeset: 14833:1723a489bf78
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Apr 16 00:36:05 2012 +0200
description:
Minor changes for stable core2 performance.

details: /var/hg/gmp/rev/aa3f6b4acebe
changeset: 14834:aa3f6b4acebe
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Apr 16 00:37:12 2012 +0200
description:
Trivial merge.

diffstat:
ChangeLog | 5 ++
mpn/x86_64/fastsse/com.asm | 26 +++++----
mpn/x86_64/fastsse/copyd-palignr.asm | 5 +-
mpn/x86_64/fastsse/copyi-palignr.asm | 10 +--
mpz/bin_uiui.c | 7 +-
tests/mpz/t-bin.c | 87 +++++++++++++++--------------------
6 files changed, 66 insertions(+), 74 deletions(-)
diffs (264 lines):
diff -r dbf44b5ce670 -r aa3f6b4acebe ChangeLog
--- a/ChangeLog Sun Apr 15 16:02:37 2012 +0200
+++ b/ChangeLog Mon Apr 16 00:37:12 2012 +0200
@@ -1,3 +1,8 @@
+2012-04-15 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * tests/mpz/t-bin.c: Add more tests on small values.
+ * mpz/bin_uiui.c (mpz_bdiv_bin_uiui): Smaller temporary areas.
+
2012-04-15 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/fastsse/copyd-palignr.asm: New file.
diff -r dbf44b5ce670 -r aa3f6b4acebe mpn/x86_64/fastsse/com.asm
--- a/mpn/x86_64/fastsse/com.asm Sun Apr 15 16:02:37 2012 +0200
+++ b/mpn/x86_64/fastsse/com.asm Mon Apr 16 00:37:12 2012 +0200
@@ -2,6 +2,8 @@
dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
@@ -19,18 +21,18 @@
include(`../config.m4')
-
-C cycles/limb good for cpu?
-C AMD K8,K9
-C AMD K10 0.85 Y
-C AMD bd1 0.92 Y
-C AMD bobcat
-C Intel P4 2.28 Y
-C Intel core2 1
-C Intel NHM 0.5 Y
-C Intel SBR 0.5 Y
-C Intel atom
-C VIA nano 1.1 Y
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 2.0 N
+C AMD K10 0.85 1.3 Y/N
+C AMD bd1 1.40 1.40 Y
+C AMD bobcat 3.1 3.1 N
+C Intel P4 2.28 illop Y
+C Intel core2 1.02 1.02 N
+C Intel NHM 0.53 0.68 Y
+C Intel SBR 0.51 0.75 Y
+C Intel atom 3.68 3.68 N
+C VIA nano 1.17 5.09 Y/N
C We try to do as many 16-byte operations as possible. The top-most and
C bottom-most writes might need 8-byte operations. We can always write using
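
(The c/l figures in the table above are cycles per limb, reported separately for aligned and unaligned operands.) For readers who do not want to trace the assembly, a minimal C sketch of the approach the comment describes, complementing two 64-bit limbs per 16-byte SSE operation with a plain 8-byte operation for any leftover limb, might look as follows. It uses SSE2 intrinsics with illustrative names and is not the GMP code.

#include <emmintrin.h>          /* SSE2 intrinsics */
#include <stdint.h>
#include <stddef.h>

/* Sketch only: rp[i] = ~up[i] for i < n, doing 16 bytes per step where
   possible.  The real com.asm additionally arranges for the stores to be
   aligned, as the comment above says. */
static void
com_sketch (uint64_t *rp, const uint64_t *up, size_t n)
{
  const __m128i ones = _mm_set1_epi32 (-1);   /* all bits set */
  size_t i = 0;

  for (; i + 2 <= n; i += 2)                  /* two limbs (16 bytes) at a time */
    {
      __m128i x = _mm_loadu_si128 ((const __m128i *) (up + i));
      _mm_storeu_si128 ((__m128i *) (rp + i), _mm_xor_si128 (x, ones));
    }

  if (i < n)                                  /* leftover limb: 8-byte operation */
    rp[i] = ~up[i];
}
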
diff -r dbf44b5ce670 -r aa3f6b4acebe mpn/x86_64/fastsse/copyd-palignr.asm
--- a/mpn/x86_64/fastsse/copyd-palignr.asm Sun Apr 15 16:02:37 2012 +0200
+++ b/mpn/x86_64/fastsse/copyd-palignr.asm Mon Apr 16 00:37:12 2012 +0200
@@ -28,7 +28,7 @@
C AMD bd1 1.39 1.40 Y
C AMD bobcat 1.97 8.35 1.5/1.5 N
C Intel P4 2.26 illop Y/N
-C Intel core2 0.52 0.68-0.80 0.52/0.68 Y
+C Intel core2 0.52 0.68-0.80 opt/0.68 Y
C Intel NHM 0.52 0.64 opt/opt Y
C Intel SBR 0.51 0.54 opt/0.51 Y
C Intel atom 1.16 1.66 opt/opt Y
@@ -45,9 +45,6 @@
define(`up', `%rsi')
define(`n', `%rdx')
-dnl ABI_SUPPORT(DOS64) C pointless decl since file is for grabbing
-ABI_SUPPORT(STD64) C pointless decl since file is for grabbing
-
C There are three instructions for loading an aligned 128-bit quantity. We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')
diff -r dbf44b5ce670 -r aa3f6b4acebe mpn/x86_64/fastsse/copyi-palignr.asm
--- a/mpn/x86_64/fastsse/copyi-palignr.asm Sun Apr 15 16:02:37 2012 +0200
+++ b/mpn/x86_64/fastsse/copyi-palignr.asm Mon Apr 16 00:37:12 2012 +0200
@@ -28,11 +28,11 @@
C AMD bd1 1.39 1.45 Y/N
C AMD bobcat 1.97 8.17 1.5/1.5 N
C Intel P4 2.26 illop Y/N
-C Intel core2 0.52 0.78 0.52/0.76 Y
+C Intel core2 0.52 0.80 opt/0.74 Y
C Intel NHM 0.52 0.64 opt/opt Y
C Intel SBR 0.51 0.54 opt/0.51 Y
C Intel atom 1.16 1.66 opt/opt Y
-C VIA nano 1.11 1.10 opt/opt Y
+C VIA nano 1.09 1.10 opt/opt Y
C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
@@ -47,9 +47,6 @@
define(`up', `%rsi')
define(`n', `%rdx')
-dnl ABI_SUPPORT(DOS64) C pointless decl since file is for grabbing
-ABI_SUPPORT(STD64) C pointless decl since file is for grabbing
-
C There are three instructions for loading an aligned 128-bit quantity. We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')
@@ -122,14 +119,15 @@
movdqa 120(up), %xmm3
movdqa 104(up), %xmm2
+ sub $16, n
jmp L(um)
ALIGN(16)
L(utop):movdqa 120(up), %xmm3
+ sub $16, n
movdqa 104(up), %xmm2
movdqa %xmm0, -128(rp)
L(um): palignr $8, %xmm2, %xmm3
- sub $16, n
movdqa 88(up), %xmm1
movdqa %xmm3, 112(rp)
palignr $8, %xmm1, %xmm2
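
Both palignr copy files (copyd-palignr.asm and copyi-palignr.asm) rest on the trick stated in the comment above: when rp - up = 8 (mod 16), two adjacent aligned 16-byte loads can be combined with the SSSE3 palignr instruction so that loads and stores are all aligned. A hedged, self-contained C sketch of that idea with intrinsics follows; names are illustrative, edge handling is simplified, and it assumes up is 16-byte aligned, rp sits 8 bytes past a 16-byte boundary, and n >= 2, whereas the real routines handle every alignment case.

#include <tmmintrin.h>          /* SSSE3: _mm_alignr_epi8 */
#include <stdint.h>
#include <stddef.h>

/* Copy n 64-bit limbs upward when rp and up are mutually misaligned by 8
   bytes.  Each aligned 16-byte store is built from two adjacent aligned
   16-byte loads. */
static void
copyi_palignr_sketch (uint64_t *rp, const uint64_t *up, size_t n)
{
  size_t i;

  rp[0] = up[0];                              /* bottom-most limb, 8-byte copy */

  __m128i prev = _mm_load_si128 ((const __m128i *) up);       /* {up[0], up[1]} */
  for (i = 1; i + 2 < n; i += 2)
    {
      __m128i next = _mm_load_si128 ((const __m128i *) (up + i + 1)); /* {up[i+1], up[i+2]} */
      __m128i out = _mm_alignr_epi8 (next, prev, 8);          /* {up[i], up[i+1]} */
      _mm_store_si128 ((__m128i *) (rp + i), out);            /* rp + i is 16-byte aligned */
      prev = next;
    }
  for (; i < n; i++)                          /* top-most leftover limb(s) */
    rp[i] = up[i];
}

The hunk directly above also moves the "sub $16, n" bookkeeping earlier in the software-pipelined loop; that scheduling tweak is the kind of change the second changeset describes as "Minor changes for stable core2 performance."
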
diff -r dbf44b5ce670 -r aa3f6b4acebe mpz/bin_uiui.c
--- a/mpz/bin_uiui.c Sun Apr 15 16:02:37 2012 +0200
+++ b/mpz/bin_uiui.c Mon Apr 16 00:37:12 2012 +0200
@@ -198,9 +198,10 @@
/* FIXME: This allocation might be insufficient, but is usually way too
large. */
- alloc = SOME_THRESHOLD + MAX (3 * maxn / 2, SOME_THRESHOLD);
+ alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);
+ alloc = MIN (alloc, k) + 1;
np = TMP_ALLOC_LIMBS (alloc);
- kp = TMP_ALLOC_LIMBS (alloc);
+ kp = TMP_ALLOC_LIMBS (SOME_THRESHOLD + 1);
MAXFACS (nmax, n);
nmax = MIN (nmax, M);
@@ -232,7 +233,7 @@
t = k - j + 1;
kmax = MIN (kmax, t);
- while (kmax != 0 && kn < SOME_THRESHOLD)
+ while (kmax != 0 && kn < SOME_THRESHOLD)
{
jjj = mulfunc[kmax] (j);
j += kmax; /* number of factors used */
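
As context for the hunk above, the surrounding loop appears to fold runs of consecutive integer factors into single limb-sized products through a function table: mulfunc[kmax] (j) seems to multiply kmax consecutive integers starting at j, since the code then advances j by kmax ("number of factors used"). A purely speculative sketch of such a table, with invented names and types that are not GMP's internals:

typedef unsigned long limb_t;                    /* stand-in for mp_limb_t */
typedef limb_t (*mulfunc_t) (limb_t);

static limb_t mul1 (limb_t j) { return j; }
static limb_t mul2 (limb_t j) { return j * (j + 1); }
static limb_t mul3 (limb_t j) { return j * (j + 1) * (j + 2); }
static limb_t mul4 (limb_t j) { return j * (j + 1) * (j + 2) * (j + 3); }

/* mulfunc[i] (j) == j * (j+1) * ... * (j+i-1); e.g. mulfunc[3] (5) == 210. */
static const mulfunc_t mulfunc[] = { 0, mul1, mul2, mul3, mul4 };
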
diff -r dbf44b5ce670 -r aa3f6b4acebe tests/mpz/t-bin.c
--- a/tests/mpz/t-bin.c Sun Apr 15 16:02:37 2012 +0200
+++ b/tests/mpz/t-bin.c Mon Apr 16 00:37:12 2012 +0200
@@ -1,6 +1,6 @@
/* Exercise mpz_bin_ui and mpz_bin_uiui.
-Copyright 2000, 2001, 2010 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2010, 2012 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -77,55 +77,11 @@
const char *want;
} data[] = {
- { "0", 0, "1" },
- { "0", 1, "0" },
- { "0", 2, "0" },
- { "0", 3, "0" },
- { "0", 4, "0" },
{ "0", 123456, "0" },
-
- { "1", 0, "1" },
- { "1", 1, "1" },
- { "1", 2, "0" },
- { "1", 3, "0" },
- { "1", 4, "0" },
- { "1", 123456, "0" },
-
- { "2", 0, "1" },
- { "2", 2, "1" },
- { "2", 3, "0" },
- { "2", 4, "0" },
- { "2", 123456, "0" },
-
- { "3", 0, "1" },
- { "3", 1, "3" },
- { "3", 2, "3" },
- { "3", 3, "1" },
- { "3", 4, "0" },
- { "3", 5, "0" },
- { "3", 123456, "0" },
-
- { "4", 0, "1" },
- { "4", 1, "4" },
- { "4", 3, "4" },
- { "4", 4, "1" },
- { "4", 5, "0" },
- { "4", 6, "0" },
- { "4", 123456, "0" },
-
- { "10", 0, "1" },
- { "10", 1, "10" },
- { "10", 2, "45" },
- { "10", 3, "120" },
- { "10", 4, "210" },
- { "10", 6, "210" },
- { "10", 7, "120" },
- { "10", 8, "45" },
- { "10", 9, "10" },
- { "10", 10, "1" },
- { "10", 11, "0" },
- { "10", 12, "0" },
- { "10", 123456, "0" },
+ { "1", 543210, "0" },
+ { "2", 123321, "0" },
+ { "3", 234567, "0" },
+ { "10", 23456, "0" },
/* negatives, using bin(-n,k)=bin(n+k-1,k) */
{ "-1", 0, "1" },
@@ -250,6 +206,38 @@
mpz_clear (want);
}
+
+/* Test all bin(n,k) cases, with 0 <= k <= n + 1 <= count. */
+void
+smallexaustive (unsigned int count)
+{
+ mpz_t n_z, want;
+ unsigned long n, k, i, r;
+ int tests;
+ gmp_randstate_ptr rands;
+
+ mpz_init (n_z);
+ mpz_init (want);
+
+ for (n = 0; n < count; n++)
+ {
+ mpz_set_ui (want, (unsigned long) 1);
+ mpz_set_ui (n_z, n);
+ for (k = 0; k <= n; k++)
+ {
+ try_mpz_bin_ui (want, n_z, k);
+ try_mpz_bin_uiui (want, n, k);
+ mpz_mul_ui (want, want, n - k);
+ mpz_fdiv_q_ui (want, want, k + 1);
+ }
+ try_mpz_bin_ui (want, n_z, k);
+ try_mpz_bin_uiui (want, n, k);
+ }
+
+ mpz_clear (n_z);
+ mpz_clear (want);
+}
+
int
main (int argc, char **argv)
{
@@ -271,6 +259,7 @@
tests_start ();
samples ();
+ smallexaustive (count >> 3);
twos (count >> 1);
randomwalk (count - (count >> 1));
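
The new smallexaustive() test walks each row of Pascal's triangle with the identity bin(n,k+1) = bin(n,k) * (n-k) / (k+1), which is why one mpz_mul_ui plus one mpz_fdiv_q_ui per step is enough (the division is always exact), and why the value left after the inner loop is the expected 0 for the extra k = n+1 check. A small standalone C illustration of the same recurrence, outside the GMP test harness and restricted to small n so unsigned long cannot overflow:

#include <stdio.h>

int
main (void)
{
  unsigned long n, k, want;

  for (n = 0; n < 8; n++)
    {
      want = 1;                                /* bin(n,0) */
      for (k = 0; k <= n; k++)
        {
          printf ("bin(%lu,%lu) = %lu\n", n, k, want);
          want = want * (n - k) / (k + 1);     /* exact: bin(n,k)*(n-k) == bin(n,k+1)*(k+1) */
        }
      /* here want == 0 == bin(n, n+1) */
    }
  return 0;
}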