[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
Mon Apr 16 00:37:15 CEST 2012
details: /var/hg/gmp/rev/9ca269947f8f
changeset: 14832:9ca269947f8f
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Apr 15 23:27:51 2012 +0200
description:
Update c/l table.

details: /var/hg/gmp/rev/1723a489bf78
changeset: 14833:1723a489bf78
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Apr 16 00:36:05 2012 +0200
description:
Minor changes for stable core2 performance.

details: /var/hg/gmp/rev/aa3f6b4acebe
changeset: 14834:aa3f6b4acebe
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Apr 16 00:37:12 2012 +0200
description:
Trivial merge.

diffstat:
ChangeLog | 5 ++
mpn/x86_64/fastsse/com.asm | 26 +++++----
mpn/x86_64/fastsse/copyd-palignr.asm | 5 +-
mpn/x86_64/fastsse/copyi-palignr.asm | 10 +--
mpz/bin_uiui.c | 7 +-
tests/mpz/t-bin.c | 87 +++++++++++++++--------------------
6 files changed, 66 insertions(+), 74 deletions(-)
diffs (264 lines):
diff -r dbf44b5ce670 -r aa3f6b4acebe ChangeLog
--- a/ChangeLog Sun Apr 15 16:02:37 2012 +0200
+++ b/ChangeLog Mon Apr 16 00:37:12 2012 +0200
@@ -1,3 +1,8 @@
+2012-04-15 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * tests/mpz/t-bin.c: Add more tests on small values.
+ * mpz/bin_uiui.c (mpz_bdiv_bin_uiui): Smaller temporary areas.
+
2012-04-15 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/fastsse/copyd-palignr.asm: New file.
diff -r dbf44b5ce670 -r aa3f6b4acebe mpn/x86_64/fastsse/com.asm
--- a/mpn/x86_64/fastsse/com.asm Sun Apr 15 16:02:37 2012 +0200
+++ b/mpn/x86_64/fastsse/com.asm Mon Apr 16 00:37:12 2012 +0200
@@ -2,6 +2,8 @@
dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
@@ -19,18 +21,18 @@
include(`../config.m4')
-
-C cycles/limb good for cpu?
-C AMD K8,K9
-C AMD K10 0.85 Y
-C AMD bd1 0.92 Y
-C AMD bobcat
-C Intel P4 2.28 Y
-C Intel core2 1
-C Intel NHM 0.5 Y
-C Intel SBR 0.5 Y
-C Intel atom
-C VIA nano 1.1 Y
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 2.0 N
+C AMD K10 0.85 1.3 Y/N
+C AMD bd1 1.40 1.40 Y
+C AMD bobcat 3.1 3.1 N
+C Intel P4 2.28 illop Y
+C Intel core2 1.02 1.02 N
+C Intel NHM 0.53 0.68 Y
+C Intel SBR 0.51 0.75 Y
+C Intel atom 3.68 3.68 N
+C VIA nano 1.17 5.09 Y/N
C We try to do as many 16-byte operations as possible. The top-most and
C bottom-most writes might need 8-byte operations. We can always write using
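
(The c/l figures in the table above are cycles per limb, reported separately for aligned and unaligned operands.) For readers who do not want to trace the assembly, a minimal C sketch of the approach the comment describes, complementing two 64-bit limbs per 16-byte SSE operation with a plain 8-byte operation for any leftover limb, might look as follows. It uses SSE2 intrinsics with illustrative names and is not the GMP code.

#include <emmintrin.h>          /* SSE2 intrinsics */
#include <stdint.h>
#include <stddef.h>

/* Sketch only: rp[i] = ~up[i] for i < n, doing 16 bytes per step where
   possible.  The real com.asm additionally arranges for the stores to be
   aligned, as the comment above says. */
static void
com_sketch (uint64_t *rp, const uint64_t *up, size_t n)
{
  const __m128i ones = _mm_set1_epi32 (-1);   /* all bits set */
  size_t i = 0;

  for (; i + 2 <= n; i += 2)                  /* two limbs (16 bytes) at a time */
    {
      __m128i x = _mm_loadu_si128 ((const __m128i *) (up + i));
      _mm_storeu_si128 ((__m128i *) (rp + i), _mm_xor_si128 (x, ones));
    }

  if (i < n)                                  /* leftover limb: 8-byte operation */
    rp[i] = ~up[i];
}
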
diff -r dbf44b5ce670 -r aa3f6b4acebe mpn/x86_64/fastsse/copyd-palignr.asm
--- a/mpn/x86_64/fastsse/copyd-palignr.asm Sun Apr 15 16:02:37 2012 +0200
+++ b/mpn/x86_64/fastsse/copyd-palignr.asm Mon Apr 16 00:37:12 2012 +0200
@@ -28,7 +28,7 @@
C AMD bd1 1.39 1.40 Y
C AMD bobcat 1.97 8.35 1.5/1.5 N
C Intel P4 2.26 illop Y/N
-C Intel core2 0.52 0.68-0.80 0.52/0.68 Y
+C Intel core2 0.52 0.68-0.80 opt/0.68 Y
C Intel NHM 0.52 0.64 opt/opt Y
C Intel SBR 0.51 0.54 opt/0.51 Y
C Intel atom 1.16 1.66 opt/opt Y
@@ -45,9 +45,6 @@
define(`up', `%rsi')
define(`n', `%rdx')
-dnl ABI_SUPPORT(DOS64) C pointless decl since file is for grabbing
-ABI_SUPPORT(STD64) C pointless decl since file is for grabbing
-
C There are three instructions for loading an aligned 128-bit quantity. We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')
diff -r dbf44b5ce670 -r aa3f6b4acebe mpn/x86_64/fastsse/copyi-palignr.asm
--- a/mpn/x86_64/fastsse/copyi-palignr.asm Sun Apr 15 16:02:37 2012 +0200
+++ b/mpn/x86_64/fastsse/copyi-palignr.asm Mon Apr 16 00:37:12 2012 +0200
@@ -28,11 +28,11 @@
C AMD bd1 1.39 1.45 Y/N
C AMD bobcat 1.97 8.17 1.5/1.5 N
C Intel P4 2.26 illop Y/N
-C Intel core2 0.52 0.78 0.52/0.76 Y
+C Intel core2 0.52 0.80 opt/0.74 Y
C Intel NHM 0.52 0.64 opt/opt Y
C Intel SBR 0.51 0.54 opt/0.51 Y
C Intel atom 1.16 1.66 opt/opt Y
-C VIA nano 1.11 1.10 opt/opt Y
+C VIA nano 1.09 1.10 opt/opt Y
C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
@@ -47,9 +47,6 @@
define(`up', `%rsi')
define(`n', `%rdx')
-dnl ABI_SUPPORT(DOS64) C pointless decl since file is for grabbing
-ABI_SUPPORT(STD64) C pointless decl since file is for grabbing
-
C There are three instructions for loading an aligned 128-bit quantity. We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')
@@ -122,14 +119,15 @@
movdqa 120(up), %xmm3
movdqa 104(up), %xmm2
+ sub $16, n
jmp L(um)
ALIGN(16)
L(utop):movdqa 120(up), %xmm3
+ sub $16, n
movdqa 104(up), %xmm2
movdqa %xmm0, -128(rp)
L(um): palignr $8, %xmm2, %xmm3
- sub $16, n
movdqa 88(up), %xmm1
movdqa %xmm3, 112(rp)
palignr $8, %xmm1, %xmm2
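
Both palignr copy files (copyd-palignr.asm and copyi-palignr.asm) rest on the trick stated in the comment above: when rp - up = 8 (mod 16), two adjacent aligned 16-byte loads can be combined with the SSSE3 palignr instruction so that loads and stores are all aligned. A hedged, self-contained C sketch of that idea with intrinsics follows; names are illustrative, edge handling is simplified, and it assumes up is 16-byte aligned, rp sits 8 bytes past a 16-byte boundary, and n >= 2, whereas the real routines handle every alignment case.

#include <tmmintrin.h>          /* SSSE3: _mm_alignr_epi8 */
#include <stdint.h>
#include <stddef.h>

/* Copy n 64-bit limbs upward when rp and up are mutually misaligned by 8
   bytes.  Each aligned 16-byte store is built from two adjacent aligned
   16-byte loads. */
static void
copyi_palignr_sketch (uint64_t *rp, const uint64_t *up, size_t n)
{
  size_t i;

  rp[0] = up[0];                              /* bottom-most limb, 8-byte copy */

  __m128i prev = _mm_load_si128 ((const __m128i *) up);       /* {up[0], up[1]} */
  for (i = 1; i + 2 < n; i += 2)
    {
      __m128i next = _mm_load_si128 ((const __m128i *) (up + i + 1)); /* {up[i+1], up[i+2]} */
      __m128i out = _mm_alignr_epi8 (next, prev, 8);          /* {up[i], up[i+1]} */
      _mm_store_si128 ((__m128i *) (rp + i), out);            /* rp + i is 16-byte aligned */
      prev = next;
    }
  for (; i < n; i++)                          /* top-most leftover limb(s) */
    rp[i] = up[i];
}

The hunk directly above also moves the "sub $16, n" bookkeeping earlier in the software-pipelined loop; that scheduling tweak is the kind of change the second changeset describes as "Minor changes for stable core2 performance."
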
diff -r dbf44b5ce670 -r aa3f6b4acebe mpz/bin_uiui.c
--- a/mpz/bin_uiui.c Sun Apr 15 16:02:37 2012 +0200
+++ b/mpz/bin_uiui.c Mon Apr 16 00:37:12 2012 +0200
@@ -198,9 +198,10 @@
/* FIXME: This allocation might be insufficient, but is usually way too
large. */
- alloc = SOME_THRESHOLD + MAX (3 * maxn / 2, SOME_THRESHOLD);
+ alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);
+ alloc = MIN (alloc, k) + 1;
np = TMP_ALLOC_LIMBS (alloc);
- kp = TMP_ALLOC_LIMBS (alloc);
+ kp = TMP_ALLOC_LIMBS (SOME_THRESHOLD + 1);
MAXFACS (nmax, n);
nmax = MIN (nmax, M);
@@ -232,7 +233,7 @@
t = k - j + 1;
kmax = MIN (kmax, t);
- while (kmax != 0 && kn < SOME_THRESHOLD)
+ while (kmax != 0 && kn < SOME_THRESHOLD)
{
jjj = mulfunc[kmax] (j);
j += kmax; /* number of factors used */
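
As context for the hunk above, the surrounding loop appears to fold runs of consecutive integer factors into single limb-sized products through a function table: mulfunc[kmax] (j) seems to multiply kmax consecutive integers starting at j, since the code then advances j by kmax ("number of factors used"). A purely speculative sketch of such a table, with invented names and types that are not GMP's internals:

typedef unsigned long limb_t;                    /* stand-in for mp_limb_t */
typedef limb_t (*mulfunc_t) (limb_t);

static limb_t mul1 (limb_t j) { return j; }
static limb_t mul2 (limb_t j) { return j * (j + 1); }
static limb_t mul3 (limb_t j) { return j * (j + 1) * (j + 2); }
static limb_t mul4 (limb_t j) { return j * (j + 1) * (j + 2) * (j + 3); }

/* mulfunc[i] (j) == j * (j+1) * ... * (j+i-1); e.g. mulfunc[3] (5) == 210. */
static const mulfunc_t mulfunc[] = { 0, mul1, mul2, mul3, mul4 };
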
diff -r dbf44b5ce670 -r aa3f6b4acebe tests/mpz/t-bin.c
--- a/tests/mpz/t-bin.c Sun Apr 15 16:02:37 2012 +0200
+++ b/tests/mpz/t-bin.c Mon Apr 16 00:37:12 2012 +0200
@@ -1,6 +1,6 @@
/* Exercise mpz_bin_ui and mpz_bin_uiui.
-Copyright 2000, 2001, 2010 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2010, 2012 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -77,55 +77,11 @@
const char *want;
} data[] = {
- { "0", 0, "1" },
- { "0", 1, "0" },
- { "0", 2, "0" },
- { "0", 3, "0" },
- { "0", 4, "0" },
{ "0", 123456, "0" },
-
- { "1", 0, "1" },
- { "1", 1, "1" },
- { "1", 2, "0" },
- { "1", 3, "0" },
- { "1", 4, "0" },
- { "1", 123456, "0" },
-
- { "2", 0, "1" },
- { "2", 2, "1" },
- { "2", 3, "0" },
- { "2", 4, "0" },
- { "2", 123456, "0" },
-
- { "3", 0, "1" },
- { "3", 1, "3" },
- { "3", 2, "3" },
- { "3", 3, "1" },
- { "3", 4, "0" },
- { "3", 5, "0" },
- { "3", 123456, "0" },
-
- { "4", 0, "1" },
- { "4", 1, "4" },
- { "4", 3, "4" },
- { "4", 4, "1" },
- { "4", 5, "0" },
- { "4", 6, "0" },
- { "4", 123456, "0" },
-
- { "10", 0, "1" },
- { "10", 1, "10" },
- { "10", 2, "45" },
- { "10", 3, "120" },
- { "10", 4, "210" },
- { "10", 6, "210" },
- { "10", 7, "120" },
- { "10", 8, "45" },
- { "10", 9, "10" },
- { "10", 10, "1" },
- { "10", 11, "0" },
- { "10", 12, "0" },
- { "10", 123456, "0" },
+ { "1", 543210, "0" },
+ { "2", 123321, "0" },
+ { "3", 234567, "0" },
+ { "10", 23456, "0" },
/* negatives, using bin(-n,k)=bin(n+k-1,k) */
{ "-1", 0, "1" },
@@ -250,6 +206,38 @@
mpz_clear (want);
}
+
+/* Test all bin(n,k) cases, with 0 <= k <= n + 1 <= count. */
+void
+smallexaustive (unsigned int count)
+{
+ mpz_t n_z, want;
+ unsigned long n, k, i, r;
+ int tests;
+ gmp_randstate_ptr rands;
+
+ mpz_init (n_z);
+ mpz_init (want);
+
+ for (n = 0; n < count; n++)
+ {
+ mpz_set_ui (want, (unsigned long) 1);
+ mpz_set_ui (n_z, n);
+ for (k = 0; k <= n; k++)
+ {
+ try_mpz_bin_ui (want, n_z, k);
+ try_mpz_bin_uiui (want, n, k);
+ mpz_mul_ui (want, want, n - k);
+ mpz_fdiv_q_ui (want, want, k + 1);
+ }
+ try_mpz_bin_ui (want, n_z, k);
+ try_mpz_bin_uiui (want, n, k);
+ }
+
+ mpz_clear (n_z);
+ mpz_clear (want);
+}
+
int
main (int argc, char **argv)
{
@@ -271,6 +259,7 @@
tests_start ();
samples ();
+ smallexaustive (count >> 3);
twos (count >> 1);
randomwalk (count - (count >> 1));
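
The new smallexaustive() test walks each row of Pascal's triangle with the identity bin(n,k+1) = bin(n,k) * (n-k) / (k+1), which is why one mpz_mul_ui plus one mpz_fdiv_q_ui per step is enough (the division is always exact), and why the value left after the inner loop is the expected 0 for the extra k = n+1 check. A small standalone C illustration of the same recurrence, outside the GMP test harness and restricted to small n so unsigned long cannot overflow:

#include <stdio.h>

int
main (void)
{
  unsigned long n, k, want;

  for (n = 0; n < 8; n++)
    {
      want = 1;                                /* bin(n,0) */
      for (k = 0; k <= n; k++)
        {
          printf ("bin(%lu,%lu) = %lu\n", n, k, want);
          want = want * (n - k) / (k + 1);     /* exact: bin(n,k)*(n-k) == bin(n,k+1)*(k+1) */
        }
      /* here want == 0 == bin(n, n+1) */
    }
  return 0;
}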