[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Mon Apr 15 21:56:30 CEST 2013
details: /var/hg/gmp/rev/149b7cb35a5a
changeset: 15719:149b7cb35a5a
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Apr 15 21:32:51 2013 +0200
description:
Add cycle counts.
details: /var/hg/gmp/rev/b30bd9ea2c13
changeset: 15720:b30bd9ea2c13
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Apr 15 21:41:12 2013 +0200
description:
Add an x86/mmx tabselect.
details: /var/hg/gmp/rev/a4bffd7a3cc0
changeset: 15721:a4bffd7a3cc0
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Apr 15 21:41:52 2013 +0200
description:
Add an x86/mmx tabselect.
details: /var/hg/gmp/rev/0b90b31bd2b7
changeset: 15722:0b90b31bd2b7
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Apr 15 21:53:23 2013 +0200
description:
Add generic sparc64 tabselect.
diffstat:
ChangeLog | 5 +
configure.ac | 46 ++++++------
mpn/powerpc64/tabselect.asm | 2 +-
mpn/sparc64/tabselect.asm | 151 +++++++++++++++++++++++++++++++++++++++++++
mpn/x86/mmx/tabselect.asm | 152 ++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 332 insertions(+), 24 deletions(-)
diffs (truncated from 505 to 300 lines):
diff -r 827551142828 -r 0b90b31bd2b7 ChangeLog
--- a/ChangeLog Mon Apr 15 21:32:16 2013 +0200
+++ b/ChangeLog Mon Apr 15 21:53:23 2013 +0200
@@ -1,5 +1,10 @@
2013-04-15 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/sparc64/tabselect.asm: New file.
+
+ * mpn/x86/mmx/tabselect.asm: New file.
+ * configure.ac (x86): Add x86/mmx to path for relevant CPUs.
+
* mpn/sparc64/gcd_1.asm: Use rdpc for PIC.
* mpn/sparc64/ultrasparct3/mode1o.asm: Use rdpc for PIC.
* mpn/sparc64/ultrasparct3/dive_1.asm: Use rdpc for PIC.
diff -r 827551142828 -r 0b90b31bd2b7 configure.ac
--- a/configure.ac Mon Apr 15 21:32:16 2013 +0200
+++ b/configure.ac Mon Apr 15 21:53:23 2013 +0200
@@ -1572,7 +1572,7 @@
pentiummmx)
gcc_cflags_cpu="-mtune=pentium-mmx -mcpu=pentium-mmx -mcpu=pentium -m486"
gcc_cflags_arch="-march=pentium-mmx -march=pentium"
- path="x86/pentium/mmx x86/pentium x86"
+ path="x86/pentium/mmx x86/pentium x86/mmx x86"
;;
i686 | pentiumpro)
gcc_cflags_cpu="-mtune=pentiumpro -mcpu=pentiumpro -mcpu=i486 -m486"
@@ -1582,37 +1582,37 @@
pentium2)
gcc_cflags_cpu="-mtune=pentium2 -mcpu=pentium2 -mcpu=pentiumpro -mcpu=i486 -m486"
gcc_cflags_arch="-march=pentium2 -march=pentiumpro -march=pentium"
- path="x86/p6/mmx x86/p6 x86"
+ path="x86/p6/mmx x86/p6 x86/mmx x86"
;;
pentium3)
gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
- path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
;;
pentiumm)
gcc_cflags_cpu="-mtune=pentium3 -mcpu=pentium3 -mcpu=pentiumpro -mcpu=i486 -m486"
gcc_cflags_arch="-march=pentium3 -march=pentiumpro -march=pentium"
- path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path="x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
;;
k6)
gcc_cflags_cpu="-mtune=k6 -mcpu=k6 -mcpu=i486 -m486"
gcc_cflags_arch="-march=k6"
- path="x86/k6/mmx x86/k6 x86"
+ path="x86/k6/mmx x86/k6 x86/mmx x86"
;;
k62)
gcc_cflags_cpu="-mtune=k6-2 -mcpu=k6-2 -mcpu=k6 -mcpu=i486 -m486"
gcc_cflags_arch="-march=k6-2 -march=k6"
- path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
+ path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86/mmx x86"
;;
k63)
gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
gcc_cflags_arch="-march=k6-3 -march=k6"
- path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
+ path="x86/k6/k62mmx x86/k6/mmx x86/k6 x86/mmx x86"
;;
geode)
gcc_cflags_cpu="-mtune=k6-3 -mcpu=k6-3 -mcpu=k6 -mcpu=i486 -m486"
gcc_cflags_arch="-march=k6-3 -march=k6"
- path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86"
+ path="x86/geode x86/k6/k62mmx x86/k6/mmx x86/k6 x86/mmx x86"
;;
athlon)
# Athlon instruction costs are close to P6 (3 cycle load latency,
@@ -1620,7 +1620,7 @@
# know athlon (eg. 2.95.2 doesn't) then fall back on pentiumpro.
gcc_cflags_cpu="-mtune=athlon -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
gcc_cflags_arch="-march=athlon -march=pentiumpro -march=pentium"
- path="x86/k7/mmx x86/k7 x86"
+ path="x86/k7/mmx x86/k7 x86/mmx x86"
;;
i786 | pentium4)
# pentiumpro is the primary fallback when gcc doesn't know pentium4.
@@ -1630,7 +1630,7 @@
gcc_cflags_cpu="-mtune=pentium4 -mcpu=pentium4 -mcpu=pentiumpro -mcpu=i486 -m486"
gcc_cflags_arch="-march=pentium4 -march=pentium4~-mno-sse2 -march=pentiumpro -march=pentium"
gcc_64_cflags_cpu="-mtune=nocona"
- path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86"
+ path="x86/pentium4/sse2 x86/pentium4/mmx x86/pentium4 x86/mmx x86"
path_64="x86_64/pentium4 x86_64"
;;
viac32)
@@ -1638,79 +1638,79 @@
# c3-2 has sse and mmx, so pentium3 is good for -march.
gcc_cflags_cpu="-mtune=c3-2 -mcpu=c3-2 -mcpu=i486 -m486"
gcc_cflags_arch="-march=c3-2 -march=pentium3 -march=pentiumpro -march=pentium"
- path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path="x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
;;
viac3*)
# Not sure of the best fallbacks here.
gcc_cflags_cpu="-mtune=c3 -mcpu=c3 -mcpu=i486 -m486"
gcc_cflags_arch="-march=c3 -march=pentium-mmx -march=pentium"
- path="x86/pentium/mmx x86/pentium x86"
+ path="x86/pentium/mmx x86/pentium x86/mmx x86"
;;
athlon64 | k8 | x86_64)
gcc_cflags_cpu="-mtune=k8 -mcpu=athlon -mcpu=pentiumpro -mcpu=i486 -m486"
gcc_cflags_arch="-march=k8 -march=k8~-mno-sse2 -march=athlon -march=pentiumpro -march=pentium"
- path="x86/k8 x86/k7/mmx x86/k7 x86"
+ path="x86/k8 x86/k7/mmx x86/k7 x86/mmx x86"
path_64="x86_64/k8 x86_64"
;;
k10)
gcc_cflags_cpu="-mtune=amdfam10 -mtune=k8"
gcc_cflags_arch="-march=amdfam10 -march=k8 -march=k8~-mno-sse2"
- path="x86/k10 x86/k8 x86/k7/mmx x86/k7 x86"
+ path="x86/k10 x86/k8 x86/k7/mmx x86/k7 x86/mmx x86"
path_64="x86_64/k10 x86_64/k8 x86_64"
;;
bobcat)
gcc_cflags_cpu="-mtune=btver1 -mtune=amdfam10 -mtune=k8"
gcc_cflags_arch="-march=btver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
- path="x86/bobcat x86/k7/mmx x86/k7 x86"
+ path="x86/bobcat x86/k7/mmx x86/k7 x86/mmx x86"
path_64="x86_64/bobcat x86_64/k10 x86_64/k8 x86_64"
;;
bulldozer | bd1)
gcc_cflags_cpu="-mtune=bdver1 -mtune=amdfam10 -mtune=k8"
gcc_cflags_arch="-march=bdver1 -march=amdfam10 -march=k8 -march=k8~-mno-sse2"
- path="x86/bd1 x86/k7/mmx x86/k7 x86"
+ path="x86/bd1 x86/k7/mmx x86/k7 x86/mmx x86"
path_64="x86_64/bd1 x86_64/k10 x86_64/k8 x86_64"
;;
core2)
gcc_cflags_cpu="-mtune=core2 -mtune=k8"
gcc_cflags_arch="-march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
- path="x86/core2 x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path="x86/core2 x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
path_64="x86_64/core2 x86_64"
;;
corei | coreinhm | coreiwsm)
gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
- path="x86/coreinhm x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path="x86/coreinhm x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
path_64="x86_64/coreinhm x86_64/core2 x86_64"
;;
coreisbr)
gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
- path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
path_64="x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64"
;;
coreihwl)
gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
- path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
path_64="x86_64/mulx x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64"
;;
coreibwl)
gcc_cflags_cpu="-mtune=corei7 -mtune=core2 -mtune=k8"
gcc_cflags_arch="-march=corei7 -march=core2 -march=core2~-mno-sse2 -march=k8 -march=k8~-mno-sse2"
- path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86"
+ path="x86/coreisbr x86/p6/sse2 x86/p6/p3mmx x86/p6/mmx x86/p6 x86/mmx x86"
path_64="x86_64/mulx/adx x86_64/mulx x86_64/coreisbr x86_64/coreinhm x86_64/core2 x86_64"
extra_functions_64="missing" # FIXME: remove when qemu's adx flags handling works
;;
atom)
gcc_cflags_cpu="-mtune=atom -mtune=pentium3"
gcc_cflags_arch="-march=atom -march=pentium3"
- path="x86/atom/sse2 x86/atom/mmx x86/atom x86"
+ path="x86/atom/sse2 x86/atom/mmx x86/atom x86/mmx x86"
path_64="x86_64/atom x86_64"
;;
nano)
gcc_cflags_cpu="-mtune=nano"
gcc_cflags_arch="-march=nano"
- path="x86/nano x86"
+ path="x86/nano x86/mmx x86"
path_64="x86_64/nano x86_64"
;;
*)
diff -r 827551142828 -r 0b90b31bd2b7 mpn/powerpc64/tabselect.asm
--- a/mpn/powerpc64/tabselect.asm Mon Apr 15 21:32:16 2013 +0200
+++ b/mpn/powerpc64/tabselect.asm Mon Apr 15 21:53:23 2013 +0200
@@ -22,7 +22,7 @@
include(`../config.m4')
C cycles/limb
-C POWER3/PPC630 ?
+C POWER3/PPC630 1.75
C POWER4/PPC970 2.0
C POWER5 ?
C POWER6 5.0
diff -r 827551142828 -r 0b90b31bd2b7 mpn/sparc64/tabselect.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/sparc64/tabselect.asm Mon Apr 15 21:53:23 2013 +0200
@@ -0,0 +1,151 @@
+dnl SPARC v9 mpn_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund and David Miller.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 2 hopefully
+C UltraSPARC 3: 3
+C UltraSPARC T1: 17
+C UltraSPARC T3: ?
+C UltraSPARC T4/T5: 2.25 hopefully
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`tp', `%i1')
+define(`n', `%i2')
+define(`nents', `%i3')
+define(`which', `%i4')
+
+define(`i', `%g1')
+define(`j', `%g3')
+define(`stride', `%g4')
+define(`tporig', `%g5')
+define(`mask', `%o0')
+
+define(`data0', `%l0')
+define(`data1', `%l1')
+define(`data2', `%l2')
+define(`data3', `%l3')
+define(`t0', `%l4')
+define(`t1', `%l5')
+define(`t2', `%l6')
+define(`t3', `%l7')
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_tabselect)
+ save %sp, -176, %sp
+
+ sllx n, 3, stride
+ sub n, 4, j
+ brlz j, L(outer_end)
+ mov tp, tporig
+
+L(outer_loop):
+ clr data0
+ clr data1
+ clr data2
+ clr data3
+ mov tporig, tp
+ mov nents, i
+ mov which, %o1
+
+L(top): subcc %o1, 1, %o1 C set carry iff o1 = 0
+ ldx [tp + 0], t0
+ subc %g0, %g0, mask
+ ldx [tp + 8], t1
+ sub i, 1, i
+ ldx [tp + 16], t2
+ ldx [tp + 24], t3
+ add tp, stride, tp
+ and t0, mask, t0
+ and t1, mask, t1
+ or t0, data0, data0
+ and t2, mask, t2
+ or t1, data1, data1
+ and t3, mask, t3
+ or t2, data2, data2
+ brnz i, L(top)
+ or t3, data3, data3
+
+ stx data0, [rp + 0]
+ subcc j, 4, j
+ stx data1, [rp + 8]
+ stx data2, [rp + 16]
+ stx data3, [rp + 24]
+ add tporig, (4 * 8), tporig
+
+ brgez j, L(outer_loop)
+ add rp, (4 * 8), rp
+L(outer_end):
+
+
More information about the gmp-commit mailing list