[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Nov 16 21:55:28 CET 2011
details: /var/hg/gmp/rev/0f0b00869a7d
changeset: 14448:0f0b00869a7d
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Nov 16 21:49:38 2011 +0100
description:
Slight tweak of new code.
details: /var/hg/gmp/rev/7cd202214b9c
changeset: 14449:7cd202214b9c
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Nov 16 21:50:51 2011 +0100
description:
Add cycle counts.
details: /var/hg/gmp/rev/852607453cb1
changeset: 14450:852607453cb1
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Nov 16 21:51:17 2011 +0100
description:
Add cycle counts.
details: /var/hg/gmp/rev/c1b7e36d824b
changeset: 14451:c1b7e36d824b
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Nov 16 21:55:23 2011 +0100
description:
New file.
diffstat:
ChangeLog | 2 +
mpn/ia64/tabselect.asm | 8 +-
mpn/powerpc32/tabselect.asm | 98 +++++++++++++++++++++++++++++++++++++++++++++
mpn/powerpc64/tabselect.asm | 11 ++--
mpn/x86_64/tabselect.asm | 14 +++---
5 files changed, 117 insertions(+), 16 deletions(-)
diffs (211 lines):
diff -r fc2167681b3e -r c1b7e36d824b ChangeLog
--- a/ChangeLog Wed Nov 16 21:46:58 2011 +0100
+++ b/ChangeLog Wed Nov 16 21:55:23 2011 +0100
@@ -1,5 +1,7 @@
2011-11-16 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/powerpc32/tabselect.asm: New file.
+
* mpn/powerpc64/mode64/aorscnd_n.asm: New file.
2011-11-15 Niels Möller <nisse at lysator.liu.se>
diff -r fc2167681b3e -r c1b7e36d824b mpn/ia64/tabselect.asm
--- a/mpn/ia64/tabselect.asm Wed Nov 16 21:46:58 2011 +0100
+++ b/mpn/ia64/tabselect.asm Wed Nov 16 21:55:23 2011 +0100
@@ -21,12 +21,12 @@
C cycles/limb
C Itanium: ?
-C Itanium 2: 5 (estimated)
+C Itanium 2: 2.5
C NOTES
-C * Using software pipelining could trivially yield 3 c/l even without
-C unrolling. (This code was modelled after the powerpc64 code, for
-C simplicity.)
+C * Using software pipelining could trivially yield 2 c/l without unrolling,
+C or 1+epsilon with unrolling. (This code was modelled after the powerpc64
+C code, for simplicity.)
C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
define(`rp', `r32')
diff -r fc2167681b3e -r c1b7e36d824b mpn/powerpc32/tabselect.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc32/tabselect.asm Wed Nov 16 21:55:23 2011 +0100
@@ -0,0 +1,98 @@
+dnl PowerPC-32 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 603e: ?
+C 604e: ?
+C 75x (G3): ?
+C 7400,7410 (G4): ?
+C 744x,745x (G4+): ?
+C power4/ppc970: 3.3
+C power5: ?
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `r3')		C result area: each limb is loaded, merged and stored back
+define(`tp', `r4')		C table pointer, advanced across every entry in turn
+define(`n', `r5')		C limbs per entry on entry, later scaled to a byte count
+define(`nents', `r6')		C number of table entries, counted down by the outer loop
+define(`which', `r7')		C index of the entry to select, rebased by -nents on entry
+
+define(`mask', `r8')		C all ones while on the selected entry, zero otherwise
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	addi	r0, n, 1
+	srwi	r0, r0, 1		C inner loop count, ceil(n/2); word shift since this is 32-bit code
+	andi.	r9, n, 1		C set cr0 for use in inner loop (nothing below rewrites cr0)
+	subf	which, nents, which	C which -= nents, so which+nents hits 0 at the selected entry
+	slwi	n, n, 2			C n = size of one entry in bytes (4-byte limbs)
+
+L(outer):
+	mtctr	r0			C put inner loop count in ctr
+
+	add	r9, which, nents	C are we at the selected table entry?
+	addic	r9, r9, -1		C set CF iff not selected entry
+	subfe	mask, r0, r0		C mask = CF - 1, i.e. all ones iff this is the selected entry
+
+	beq	cr0, L(top)		C branch to loop entry if n even
+
+	lwz	r9, 0(tp)		C n odd: peel one limb ahead of the 2-way unrolled loop
+	addi	tp, tp, 4
+	and	r9, r9, mask		C keep the table limb only for the selected entry
+	lwz	r11, 0(rp)
+	andc	r11, r11, mask		C otherwise keep the limb already in rp
+	or	r9, r9, r11
+	stw	r9, 0(rp)		C rp is rewritten for every entry, selected or not
+	addi	rp, rp, 4
+	bdz	L(end)			C n = 1: the peeled limb was the whole entry
+
+	ALIGN(16)
+L(top):	lwz	r9, 0(tp)		C two table limbs per iteration
+	lwz	r10, 4(tp)
+	addi	tp, tp, 8
+	nop
+	and	r9, r9, mask
+	and	r10, r10, mask
+	lwz	r11, 0(rp)
+	lwz	r12, 4(rp)
+	andc	r11, r11, mask
+	andc	r12, r12, mask
+	or	r9, r9, r11		C merge: table limb if selected, old rp limb if not
+	or	r10, r10, r12
+	stw	r9, 0(rp)
+	stw	r10, 4(rp)
+	addi	rp, rp, 8
+	bdnz	L(top)
+
+L(end):	subf	rp, n, rp		C move rp back to beginning
+	cmpwi	cr6, nents, 1		C was this the last entry? word compare, tested before the decrement
+	addi	nents, nents, -1
+	bne	cr6, L(outer)
+
+	blr
+EPILOGUE()
diff -r fc2167681b3e -r c1b7e36d824b mpn/powerpc64/tabselect.asm
--- a/mpn/powerpc64/tabselect.asm Wed Nov 16 21:46:58 2011 +0100
+++ b/mpn/powerpc64/tabselect.asm Wed Nov 16 21:55:23 2011 +0100
@@ -21,10 +21,10 @@
C cycles/limb
C POWER3/PPC630 ?
-C POWER4/PPC970 ?
+C POWER4/PPC970 3.3
C POWER5 ?
C POWER6 ?
-C POWER7 ?
+C POWER7 2.5
C NOTES
C * This has not been tuned for any specific processor. Its speed should not
@@ -60,18 +60,20 @@
beq cr0, L(top) C branch to loop entry if n even
ld r9, 0(tp)
+ addi tp, tp, 8
and r9, r9, mask
ld r11, 0(rp)
andc r11, r11, mask
or r9, r9, r11
std r9, 0(rp)
- addi tp, tp, 8
addi rp, rp, 8
bdz L(end)
ALIGN(16)
L(top): ld r9, 0(tp)
ld r10, 8(tp)
+ addi tp, tp, 16
+ nop
and r9, r9, mask
and r10, r10, mask
ld r11, 0(rp)
@@ -82,13 +84,12 @@
or r10, r10, r12
std r9, 0(rp)
std r10, 8(rp)
- addi tp, tp, 16
addi rp, rp, 16
bdnz L(top)
L(end): subf rp, n, rp C move rp back to beginning
+ cmpdi cr6, nents, 1
addi nents, nents, -1
- cmpdi cr6, nents, 0
bne cr6, L(outer)
blr
diff -r fc2167681b3e -r c1b7e36d824b mpn/x86_64/tabselect.asm
--- a/mpn/x86_64/tabselect.asm Wed Nov 16 21:46:58 2011 +0100
+++ b/mpn/x86_64/tabselect.asm Wed Nov 16 21:55:23 2011 +0100
@@ -21,14 +21,14 @@
C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel NHM ?
-C Intel SBR ?
+C AMD K8,K9 2.5
+C AMD K10 2.5
+C Intel P4 4
+C Intel core2 2.3
+C Intel NHM 2.5
+C Intel SBR 2.2
C Intel atom ?
-C VIA nano ?
+C VIA nano 3.5
C NOTES
C * This has not been tuned for any specific processor. Its speed should not
More information about the gmp-commit
mailing list