[Gmp-commit] /var/hg/gmp: 4 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Wed Nov 16 21:55:28 CET 2011


details:   /var/hg/gmp/rev/0f0b00869a7d
changeset: 14448:0f0b00869a7d
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Wed Nov 16 21:49:38 2011 +0100
description:
Slight tweak of new code.

details:   /var/hg/gmp/rev/7cd202214b9c
changeset: 14449:7cd202214b9c
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Wed Nov 16 21:50:51 2011 +0100
description:
Add cycle counts.

details:   /var/hg/gmp/rev/852607453cb1
changeset: 14450:852607453cb1
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Wed Nov 16 21:51:17 2011 +0100
description:
Add cycle counts.

details:   /var/hg/gmp/rev/c1b7e36d824b
changeset: 14451:c1b7e36d824b
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Wed Nov 16 21:55:23 2011 +0100
description:
New file.

diffstat:

 ChangeLog                   |   2 +
 mpn/ia64/tabselect.asm      |   8 +-
 mpn/powerpc32/tabselect.asm |  98 +++++++++++++++++++++++++++++++++++++++++++++
 mpn/powerpc64/tabselect.asm |  11 ++--
 mpn/x86_64/tabselect.asm    |  14 +++---
 5 files changed, 117 insertions(+), 16 deletions(-)

diffs (211 lines):

diff -r fc2167681b3e -r c1b7e36d824b ChangeLog
--- a/ChangeLog	Wed Nov 16 21:46:58 2011 +0100
+++ b/ChangeLog	Wed Nov 16 21:55:23 2011 +0100
@@ -1,5 +1,7 @@
 2011-11-16  Torbjorn Granlund  <tege at gmplib.org>
 
+	* mpn/powerpc32/tabselect.asm: New file.
+
 	* mpn/powerpc64/mode64/aorscnd_n.asm: New file.
 
 2011-11-15  Niels Möller  <nisse at lysator.liu.se>
diff -r fc2167681b3e -r c1b7e36d824b mpn/ia64/tabselect.asm
--- a/mpn/ia64/tabselect.asm	Wed Nov 16 21:46:58 2011 +0100
+++ b/mpn/ia64/tabselect.asm	Wed Nov 16 21:55:23 2011 +0100
@@ -21,12 +21,12 @@
 
 C           cycles/limb
 C Itanium:       ?
-C Itanium 2:     5  (estimated)
+C Itanium 2:     2.5
 
 C NOTES
-C  * Using software pipelining could trivially yield 3 c/l even without
-C    unrolling.  (This code was modelled after the powerpc64 code, for
-C    simplicity.)
+C  * Using software pipelining could trivially yield 2 c/l without unrolling,
+C    or 1+epsilon with unrolling.  (This code was modelled after the powerpc64
+C    code, for simplicity.)
 
 C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
 define(`rp',     `r32')
diff -r fc2167681b3e -r c1b7e36d824b mpn/powerpc32/tabselect.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc32/tabselect.asm	Wed Nov 16 21:55:23 2011 +0100
@@ -0,0 +1,98 @@
+dnl  PowerPC-32 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C 603e:              ?
+C 604e:              ?
+C 75x (G3):          ?
+C 7400,7410 (G4):    ?
+C 744x,745x (G4+):   ?
+C power4/ppc970:     3.3
+C power5:            ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `r3')
+define(`tp',     `r4')
+define(`n',      `r5')
+define(`nents',  `r6')
+define(`which',  `r7')
+
+define(`mask',   `r8')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	addi	r0, n, 1
+	srwi	r0, r0, 1		C inner loop count = (n+1)/2, 32-bit shift
+	andi.	r9, n, 1		C set cr0 for use in inner loop
+	subf	which, nents, which	C which -= nents; which+nents below hits 0 at selected entry
+	slwi	n, n, 2			C n = size of one entry in bytes (4-byte limbs)
+
+L(outer):
+	mtctr	r0			C put inner loop count in ctr
+
+	add	r9, which, nents	C are we at the selected table entry?
+	addic	r9, r9, -1		C set CF iff not selected entry
+	subfe	mask, r0, r0		C mask = CF - 1: all ones iff selected entry, else 0
+
+	beq	cr0, L(top)		C branch to loop entry if n even
+
+	lwz	r9, 0(tp)		C n odd: handle one limb before the 2-way loop
+	addi	tp, tp, 4
+	and	r9, r9, mask
+	lwz	r11, 0(rp)
+	andc	r11, r11, mask
+	or	r9, r9, r11		C (tp[0] & mask) | (rp[0] & ~mask)
+	stw	r9, 0(rp)
+	addi	rp, rp, 4
+	bdz	L(end)			C ctr reached 0 (n = 1): skip the 2-way loop
+
+	ALIGN(16)
+L(top):	lwz	r9, 0(tp)		C 2-way unrolled: two limbs per iteration
+	lwz	r10, 4(tp)
+	addi	tp, tp, 8
+	nop
+	and	r9, r9, mask		C keep table limbs only for the selected entry
+	and	r10, r10, mask
+	lwz	r11, 0(rp)
+	lwz	r12, 4(rp)
+	andc	r11, r11, mask		C keep current rp limbs for all other entries
+	andc	r12, r12, mask
+	or	r9, r9, r11
+	or	r10, r10, r12
+	stw	r9, 0(rp)
+	stw	r10, 4(rp)
+	addi	rp, rp, 8
+	bdnz	L(top)
+
+L(end):	subf	rp, n, rp		C move rp back to beginning
+	cmpwi	cr6, nents, 1		C 32-bit compare: was this the last entry?
+	addi	nents, nents, -1
+	bne	cr6, L(outer)
+
+	blr
+EPILOGUE()
diff -r fc2167681b3e -r c1b7e36d824b mpn/powerpc64/tabselect.asm
--- a/mpn/powerpc64/tabselect.asm	Wed Nov 16 21:46:58 2011 +0100
+++ b/mpn/powerpc64/tabselect.asm	Wed Nov 16 21:55:23 2011 +0100
@@ -21,10 +21,10 @@
 
 C                  cycles/limb
 C POWER3/PPC630          ?
-C POWER4/PPC970          ?
+C POWER4/PPC970          3.3
 C POWER5                 ?
 C POWER6                 ?
-C POWER7                 ?
+C POWER7                 2.5
 
 C NOTES
 C  * This has not been tuned for any specific processor.  Its speed should not
@@ -60,18 +60,20 @@
 	beq	cr0, L(top)		C branch to loop entry if n even
 
 	ld	r9, 0(tp)
+	addi	tp, tp, 8
 	and	r9, r9, mask
 	ld	r11, 0(rp)
 	andc	r11, r11, mask
 	or	r9, r9, r11
 	std	r9, 0(rp)
-	addi	tp, tp, 8
 	addi	rp, rp, 8
 	bdz	L(end)
 
 	ALIGN(16)
 L(top):	ld	r9, 0(tp)
 	ld	r10, 8(tp)
+	addi	tp, tp, 16
+	nop
 	and	r9, r9, mask
 	and	r10, r10, mask
 	ld	r11, 0(rp)
@@ -82,13 +84,12 @@
 	or	r10, r10, r12
 	std	r9, 0(rp)
 	std	r10, 8(rp)
-	addi	tp, tp, 16
 	addi	rp, rp, 16
 	bdnz	L(top)
 
 L(end):	subf	rp, n, rp		C move rp back to beginning
+	cmpdi	cr6, nents, 1
 	addi	nents, nents, -1
-	cmpdi	cr6, nents, 0
 	bne	cr6, L(outer)
 
 	blr
diff -r fc2167681b3e -r c1b7e36d824b mpn/x86_64/tabselect.asm
--- a/mpn/x86_64/tabselect.asm	Wed Nov 16 21:46:58 2011 +0100
+++ b/mpn/x86_64/tabselect.asm	Wed Nov 16 21:55:23 2011 +0100
@@ -21,14 +21,14 @@
 
 
 C	     cycles/limb
-C AMD K8,K9	 ?
-C AMD K10	 ?
-C Intel P4	 ?
-C Intel core2	 ?
-C Intel NHM	 ?
-C Intel SBR	 ?
+C AMD K8,K9	 2.5
+C AMD K10	 2.5
+C Intel P4	 4
+C Intel core2	 2.3
+C Intel NHM	 2.5
+C Intel SBR	 2.2
 C Intel atom	 ?
-C VIA nano	 ?
+C VIA nano	 3.5
 
 C NOTES
 C  * This has not been tuned for any specific processor.  Its speed should not


More information about the gmp-commit mailing list