[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Oct 29 21:57:16 CEST 2011
details: /var/hg/gmp/rev/f6aeeec5055e
changeset: 14394:f6aeeec5055e
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Oct 29 21:46:29 2011 +0200
description:
Complete rewrite.
details: /var/hg/gmp/rev/f9a5c0d9986a
changeset: 14395:f9a5c0d9986a
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Oct 29 21:46:37 2011 +0200
description:
New file.
details: /var/hg/gmp/rev/38ae645d060e
changeset: 14396:38ae645d060e
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Oct 29 21:46:43 2011 +0200
description:
*** empty log message ***
diffstat:
ChangeLog | 7 +
mpn/s390_64/lshift.asm | 185 ++++++++++++++++++++++++++++++++-------------
mpn/s390_64/lshiftc.asm | 196 ++++++++++++++++++++++++++++++++++++++++++++++++
mpn/s390_64/rshift.asm | 182 +++++++++++++++++++++++++++++++++-----------
4 files changed, 469 insertions(+), 101 deletions(-)
diffs (truncated from 656 to 300 lines):
diff -r b31a653eb776 -r 38ae645d060e ChangeLog
--- a/ChangeLog Fri Oct 28 00:13:42 2011 +0200
+++ b/ChangeLog Sat Oct 29 21:46:43 2011 +0200
@@ -1,3 +1,10 @@
+2011-10-29 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/s390_64/lshift.asm: Complete rewrite.
+ * mpn/s390_64/rshift.asm: Likewise.
+
+ * mpn/s390_64/lshiftc.asm: New file.
+
2011-10-28 Torbjorn Granlund <tege at gmplib.org>
* mpn/s390_32/esame/aors_n.asm: New file, with rewritten add/sub code.
diff -r b31a653eb776 -r 38ae645d060e mpn/s390_64/lshift.asm
--- a/mpn/s390_64/lshift.asm Fri Oct 28 00:13:42 2011 +0200
+++ b/mpn/s390_64/lshift.asm Sat Oct 29 21:46:43 2011 +0200
@@ -1,4 +1,4 @@
-dnl S/390-64 mpn_lshift
+dnl S/390-64 mpn_lshift.
dnl Copyright 2011 Free Software Foundation, Inc.
@@ -20,14 +20,19 @@
include(`../config.m4')
C cycles/limb
-C z900 7.25
-C z990 4.25
+C z900 ?
+C z990 3
C z9 ?
C z10 ?
C z196 ?
-C FIXME
-C * Streamline feed-in code.
+C NOTES
+C * This uses discrete loads and stores in a software pipeline. Using lmg and
+C stmg is not faster.
+C * One could assume more pipelining could approach 2.5 c/l, but we have not
+C found any 8-way loop that runs better than the current 4-way loop.
+C * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
+C similarly to the x86_64 sqr_basecase feed-in.
C INPUT PARAMETERS
define(`rp', `%r2')
@@ -39,70 +44,142 @@
ASM_START()
PROLOGUE(mpn_lshift)
- cghi n, 1
- jne L(gt1)
- lcgr %r4, cnt
- lg %r1, 0(up)
+ cghi n, 3
+ jh L(gt1)
+
+ stmg %r6, %r7, 48(%r15)
+ larl %r1, L(tab)-4
+ lcgr tnc, cnt
+ sllg n, n, 2
+ b 0(n,%r1)
+L(tab): j L(n1)
+ j L(n2)
+ j L(n3)
+
+L(n1): lg %r1, 0(up)
sllg %r0, %r1, 0(cnt)
stg %r0, 0(rp)
- srlg %r2, %r1, 0(%r4)
+ srlg %r2, %r1, 0(tnc)
+ lg %r6, 48(%r15) C restoring r7 not needed
br %r14
-L(gt1): stmg %r6, %r9, 48(%r15)
+L(n2): lg %r1, 8(up)
+ srlg %r4, %r1, 0(tnc)
+ sllg %r0, %r1, 0(cnt)
+ j L(cj)
+
+L(n3): lg %r1, 16(up)
+ srlg %r4, %r1, 0(tnc)
+ sllg %r0, %r1, 0(cnt)
+ lg %r1, 8(up)
+ srlg %r7, %r1, 0(tnc)
+ ogr %r7, %r0
+ sllg %r0, %r1, 0(cnt)
+ stg %r7, 16(rp)
+L(cj): lg %r1, 0(up)
+ srlg %r7, %r1, 0(tnc)
+ ogr %r7, %r0
+ sllg %r0, %r1, 0(cnt)
+ stg %r7, 8(rp)
+ stg %r0, 0(rp)
+ lgr %r2, %r4
+ lmg %r6, %r7, 48(%r15)
+ br %r14
+
+L(gt1): stmg %r6, %r13, 48(%r15)
+ lcgr tnc, cnt C tnc = -cnt
sllg %r1, n, 3
- aghi up, -8
+ srlg %r0, n, 2 C loop count
- lg %r9, 0(%r1,up)
-
- srlg %r8, n, 2 C loop count
- lcgr tnc, cnt
+ agr up, %r1 C point up at end of U
+ agr rp, %r1 C point rp at end of R
+ aghi up, -56
+ aghi rp, -40
lghi %r7, 3
- sllg %r0, %r9, 0(cnt)
-
- ngr %r7, n C n mod 4
+ ngr %r7, n
je L(b0)
- aghi %r8, 1
cghi %r7, 2
jl L(b1)
je L(b2)
-L(b3): aghi %r1, -16
- j L(m3)
-L(b0): aghi %r1, -24
- j L(m0)
-L(b1): aghi %r1, -32
- aghi %r8, -1
- j L(top)
-L(b2): aghi %r1, -8
- j L(m2)
+L(b3): lg %r7, 48(up)
+ srlg %r9, %r7, 0(tnc)
+ sllg %r11, %r7, 0(cnt)
+ lg %r8, 40(up)
+ lg %r7, 32(up)
+ srlg %r4, %r8, 0(tnc)
+ sllg %r13, %r8, 0(cnt)
+ ogr %r11, %r4
+ la rp, 16(rp)
+ j L(lm3)
-L(top): lg %r7, 24(%r1,up)
- srlg %r4, %r7, 0(tnc)
- ogr %r4, %r0
- sllg %r0, %r7, 0(cnt)
- stg %r4, 24(%r1,rp)
-L(m0): lg %r7, 16(%r1,up)
- srlg %r4, %r7, 0(tnc)
- ogr %r4, %r0
- sllg %r0, %r7, 0(cnt)
- stg %r4, 16(%r1,rp)
-L(m3): lg %r7, 8(%r1,up)
- srlg %r4, %r7, 0(tnc)
- ogr %r4, %r0
- sllg %r0, %r7, 0(cnt)
- stg %r4, 8(%r1,rp)
-L(m2): lg %r7, 0(%r1,up)
- srlg %r4, %r7, 0(tnc)
- ogr %r4, %r0
- sllg %r0, %r7, 0(cnt)
- stg %r4, 0(%r1,rp)
- aghi %r1, -32
- brctg %r8, L(top)
+L(b2): lg %r8, 48(up)
+ lg %r7, 40(up)
+ srlg %r9, %r8, 0(tnc)
+ sllg %r13, %r8, 0(cnt)
+ la rp, 24(rp)
+ la up, 8(up)
+ j L(lm2)
-L(end): stg %r0, 24(%r1,rp)
- srlg %r2, %r9, 0(tnc)
- lmg %r6, %r9, 48(%r15)
+L(b1): lg %r7, 48(up)
+ srlg %r9, %r7, 0(tnc)
+ sllg %r11, %r7, 0(cnt)
+ lg %r8, 40(up)
+ lg %r7, 32(up)
+ srlg %r4, %r8, 0(tnc)
+ sllg %r10, %r8, 0(cnt)
+ ogr %r11, %r4
+ la rp, 32(rp)
+ la up, 16(up)
+ j L(lm1)
+
+L(b0): lg %r8, 48(up)
+ lg %r7, 40(up)
+ srlg %r9, %r8, 0(tnc)
+ sllg %r10, %r8, 0(cnt)
+ la rp, 40(rp)
+ la up, 24(up)
+ j L(lm0)
+
+C ALIGN(16)
+L(top): srlg %r4, %r8, 0(tnc)
+ sllg %r13, %r8, 0(cnt)
+ ogr %r11, %r4
+ stg %r10, 24(rp)
+L(lm3): stg %r11, 16(rp)
+L(lm2): srlg %r12, %r7, 0(tnc)
+ sllg %r11, %r7, 0(cnt)
+ lg %r8, 24(up)
+ lg %r7, 16(up)
+ ogr %r13, %r12
+ srlg %r4, %r8, 0(tnc)
+ sllg %r10, %r8, 0(cnt)
+ ogr %r11, %r4
+ stg %r13, 8(rp)
+L(lm1): stg %r11, 0(rp)
+L(lm0): srlg %r12, %r7, 0(tnc)
+ aghi rp, -32
+ sllg %r11, %r7, 0(cnt)
+ lg %r8, 8(up)
+ lg %r7, 0(up)
+ aghi up, -32
+ ogr %r10, %r12
+ brctg %r0, L(top)
+
+L(end): srlg %r4, %r8, 0(tnc)
+ sllg %r13, %r8, 0(cnt)
+ ogr %r11, %r4
+ stg %r10, 24(rp)
+ stg %r11, 16(rp)
+ srlg %r12, %r7, 0(tnc)
+ sllg %r11, %r7, 0(cnt)
+ ogr %r13, %r12
+ stg %r13, 8(rp)
+ stg %r11, 0(rp)
+ lgr %r2, %r9
+
+ lmg %r6, %r13, 48(%r15)
br %r14
EPILOGUE()
diff -r b31a653eb776 -r 38ae645d060e mpn/s390_64/lshiftc.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/s390_64/lshiftc.asm Sat Oct 29 21:46:43 2011 +0200
@@ -0,0 +1,196 @@
+dnl S/390-64 mpn_lshiftc.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 ?
+C z990 3.5
+C z9 ?
+C z10 ?
+C z196 ?
+
+C NOTES
+C * See notes in lshift.asm.
+
+C INPUT PARAMETERS
+define(`rp', `%r2')
+define(`up', `%r3')
+define(`n', `%r4')
+define(`cnt', `%r5')
+
+define(`tnc', `%r6')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+ cghi n, 3
+ jh L(gt1)
+
+ stmg %r6, %r8, 48(%r15)
+ larl %r1, L(tab)-4
+ lcgr tnc, cnt
+ sllg n, n, 2
+ lghi %r8, -1
+ b 0(n,%r1)
+L(tab): j L(n1)
+ j L(n2)
+ j L(n3)
+
+L(n1): lg %r1, 0(up)
+ sllg %r0, %r1, 0(cnt)
+ xgr %r0, %r8
More information about the gmp-commit
mailing list