[Gmp-commit] /var/hg/gmp: 3 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Sat Oct 29 21:57:16 CEST 2011


details:   /var/hg/gmp/rev/f6aeeec5055e
changeset: 14394:f6aeeec5055e
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Oct 29 21:46:29 2011 +0200
description:
Complete rewrite.

details:   /var/hg/gmp/rev/f9a5c0d9986a
changeset: 14395:f9a5c0d9986a
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Oct 29 21:46:37 2011 +0200
description:
New file.

details:   /var/hg/gmp/rev/38ae645d060e
changeset: 14396:38ae645d060e
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Oct 29 21:46:43 2011 +0200
description:
*** empty log message ***

diffstat:

 ChangeLog               |    7 +
 mpn/s390_64/lshift.asm  |  185 ++++++++++++++++++++++++++++++++-------------
 mpn/s390_64/lshiftc.asm |  196 ++++++++++++++++++++++++++++++++++++++++++++++++
 mpn/s390_64/rshift.asm  |  182 +++++++++++++++++++++++++++++++++-----------
 4 files changed, 469 insertions(+), 101 deletions(-)

diffs (truncated from 656 to 300 lines):

diff -r b31a653eb776 -r 38ae645d060e ChangeLog
--- a/ChangeLog	Fri Oct 28 00:13:42 2011 +0200
+++ b/ChangeLog	Sat Oct 29 21:46:43 2011 +0200
@@ -1,3 +1,10 @@
+2011-10-29  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/s390_64/lshift.asm: Complete rewrite.
+	* mpn/s390_64/rshift.asm: Likewise.
+
+	* mpn/s390_64/lshiftc.asm: New file.
+
 2011-10-28  Torbjorn Granlund  <tege at gmplib.org>
 
 	* mpn/s390_32/esame/aors_n.asm: New file, with rewritten add/sub code.
diff -r b31a653eb776 -r 38ae645d060e mpn/s390_64/lshift.asm
--- a/mpn/s390_64/lshift.asm	Fri Oct 28 00:13:42 2011 +0200
+++ b/mpn/s390_64/lshift.asm	Sat Oct 29 21:46:43 2011 +0200
@@ -1,4 +1,4 @@
-dnl  S/390-64 mpn_lshift
+dnl  S/390-64 mpn_lshift.
 
 dnl  Copyright 2011 Free Software Foundation, Inc.
 
@@ -20,14 +20,19 @@
 include(`../config.m4')
 
 C            cycles/limb
-C z900		 7.25
-C z990		 4.25
+C z900		 ?
+C z990           3
 C z9		 ?
 C z10		 ?
 C z196		 ?
 
-C FIXME
-C  * Streamline feed-in code.
+C NOTES
+C  * This uses discrete loads and stores in a software pipeline.  Using lmg and
+C    stmg is not faster.
+C  * One could assume more pipelining could approach 2.5 c/l, but we have not
+C    found any 8-way loop that runs better than the current 4-way loop.
+C  * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
+C    similarly to the x86_64 sqr_basecase feed-in.
 
 C INPUT PARAMETERS
 define(`rp',	`%r2')
@@ -39,70 +44,142 @@
 
 ASM_START()
 PROLOGUE(mpn_lshift)
-	cghi	n, 1
-	jne	L(gt1)
-	lcgr	%r4, cnt
-	lg	%r1, 0(up)
+	cghi	n, 3
+	jh	L(gt1)
+
+	stmg	%r6, %r7, 48(%r15)
+	larl	%r1, L(tab)-4
+	lcgr	tnc, cnt
+	sllg	n, n, 2
+	b	0(n,%r1)
+L(tab):	j	L(n1)
+	j	L(n2)
+	j	L(n3)
+
+L(n1):	lg	%r1, 0(up)
 	sllg	%r0, %r1, 0(cnt)
 	stg	%r0, 0(rp)
-	srlg	%r2, %r1, 0(%r4)
+	srlg	%r2, %r1, 0(tnc)
+	lg	%r6, 48(%r15)		C restoring r7 not needed
 	br	%r14
 
-L(gt1):	stmg	%r6, %r9, 48(%r15)
+L(n2):	lg	%r1, 8(up)
+	srlg	%r4, %r1, 0(tnc)
+	sllg	%r0, %r1, 0(cnt)
+	j	L(cj)
+
+L(n3):	lg	%r1, 16(up)
+	srlg	%r4, %r1, 0(tnc)
+	sllg	%r0, %r1, 0(cnt)
+	lg	%r1, 8(up)
+	srlg	%r7, %r1, 0(tnc)
+	ogr	%r7, %r0
+	sllg	%r0, %r1, 0(cnt)
+	stg	%r7, 16(rp)
+L(cj):	lg	%r1, 0(up)
+	srlg	%r7, %r1, 0(tnc)
+	ogr	%r7, %r0
+	sllg	%r0, %r1, 0(cnt)
+	stg	%r7, 8(rp)
+	stg	%r0, 0(rp)
+	lgr	%r2, %r4
+	lmg	%r6, %r7, 48(%r15)
+	br	%r14
+
+L(gt1):	stmg	%r6, %r13, 48(%r15)
+	lcgr	tnc, cnt		C tnc = -cnt
 
 	sllg	%r1, n, 3
-	aghi	up, -8
+	srlg	%r0, n, 2		C loop count
 
-	lg	%r9, 0(%r1,up)
-
-	srlg	%r8, n, 2		C loop count
-	lcgr	tnc, cnt
+	agr	up, %r1			C point up at end of U
+	agr	rp, %r1			C point rp at end of R
+	aghi	up, -56
+	aghi	rp, -40
 
 	lghi	%r7, 3
-	sllg	%r0, %r9, 0(cnt)
-
-	ngr	%r7, n			C n mod 4
+	ngr	%r7, n
 	je	L(b0)
-	aghi	%r8, 1
 	cghi	%r7, 2
 	jl	L(b1)
 	je	L(b2)
 
-L(b3):	aghi	%r1, -16
-	j	L(m3)
-L(b0):	aghi	%r1, -24
-	j	L(m0)
-L(b1):	aghi	%r1, -32
-	aghi	%r8, -1
-	j	L(top)
-L(b2):	aghi	%r1, -8
-	j	L(m2)
+L(b3):	lg	%r7, 48(up)
+	srlg	%r9, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 40(up)
+	lg	%r7, 32(up)
+	srlg	%r4, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	la	rp, 16(rp)
+	j	L(lm3)
 
-L(top):	lg	%r7, 24(%r1,up)
-	srlg	%r4, %r7, 0(tnc)
-	ogr	%r4, %r0
-	sllg	%r0, %r7, 0(cnt)
-	stg	%r4, 24(%r1,rp)
-L(m0):	lg	%r7, 16(%r1,up)
-	srlg	%r4, %r7, 0(tnc)
-	ogr	%r4, %r0
-	sllg	%r0, %r7, 0(cnt)
-	stg	%r4, 16(%r1,rp)
-L(m3):	lg	%r7, 8(%r1,up)
-	srlg	%r4, %r7, 0(tnc)
-	ogr	%r4, %r0
-	sllg	%r0, %r7, 0(cnt)
-	stg	%r4, 8(%r1,rp)
-L(m2):	lg	%r7, 0(%r1,up)
-	srlg	%r4, %r7, 0(tnc)
-	ogr	%r4, %r0
-	sllg	%r0, %r7, 0(cnt)
-	stg	%r4, 0(%r1,rp)
-	aghi	%r1, -32
-	brctg	%r8, L(top)
+L(b2):	lg	%r8, 48(up)
+	lg	%r7, 40(up)
+	srlg	%r9, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	la	rp, 24(rp)
+	la	up, 8(up)
+	j	L(lm2)
 
-L(end):	stg	%r0, 24(%r1,rp)
-	srlg	%r2, %r9, 0(tnc)
-	lmg	%r6, %r9, 48(%r15)
+L(b1):	lg	%r7, 48(up)
+	srlg	%r9, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 40(up)
+	lg	%r7, 32(up)
+	srlg	%r4, %r8, 0(tnc)
+	sllg	%r10, %r8, 0(cnt)
+	ogr	%r11, %r4
+	la	rp, 32(rp)
+	la	up, 16(up)
+	j	L(lm1)
+
+L(b0):	lg	%r8, 48(up)
+	lg	%r7, 40(up)
+	srlg	%r9, %r8, 0(tnc)
+	sllg	%r10, %r8, 0(cnt)
+	la	rp, 40(rp)
+	la	up, 24(up)
+	j	L(lm0)
+
+C	ALIGN(16)
+L(top):	srlg	%r4, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	stg	%r10, 24(rp)
+L(lm3):	stg	%r11, 16(rp)
+L(lm2):	srlg	%r12, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 24(up)
+	lg	%r7, 16(up)
+	ogr	%r13, %r12
+	srlg	%r4, %r8, 0(tnc)
+	sllg	%r10, %r8, 0(cnt)
+	ogr	%r11, %r4
+	stg	%r13, 8(rp)
+L(lm1):	stg	%r11, 0(rp)
+L(lm0):	srlg	%r12, %r7, 0(tnc)
+	aghi	rp, -32
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 8(up)
+	lg	%r7, 0(up)
+	aghi	up, -32
+	ogr	%r10, %r12
+	brctg	%r0, L(top)
+
+L(end):	srlg	%r4, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	stg	%r10, 24(rp)
+	stg	%r11, 16(rp)
+	srlg	%r12, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	ogr	%r13, %r12
+	stg	%r13, 8(rp)
+	stg	%r11, 0(rp)
+	lgr	%r2, %r9
+
+	lmg	%r6, %r13, 48(%r15)
 	br	%r14
 EPILOGUE()
diff -r b31a653eb776 -r 38ae645d060e mpn/s390_64/lshiftc.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/s390_64/lshiftc.asm	Sat Oct 29 21:46:43 2011 +0200
@@ -0,0 +1,196 @@
+dnl  S/390-64 mpn_lshiftc.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 ?
+C z990           3.5
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C NOTES
+C  * See notes in lshift.asm.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`cnt',	`%r5')
+
+define(`tnc',	`%r6')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	cghi	n, 3
+	jh	L(gt1)
+
+	stmg	%r6, %r8, 48(%r15)
+	larl	%r1, L(tab)-4
+	lcgr	tnc, cnt
+	sllg	n, n, 2
+	lghi	%r8, -1
+	b	0(n,%r1)
+L(tab):	j	L(n1)
+	j	L(n2)
+	j	L(n3)
+
+L(n1):	lg	%r1, 0(up)
+	sllg	%r0, %r1, 0(cnt)
+	xgr	%r0, %r8


More information about the gmp-commit mailing list