[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Aug 5 16:59:38 CEST 2023
details: /var/hg/gmp/rev/f31b8c135ee3
changeset: 18422:f31b8c135ee3
user: Torbjorn Granlund <tg at gmplib.org>
date: Sat Aug 05 16:56:43 2023 +0200
description:
Improve z13 asm support.
details: /var/hg/gmp/rev/98c2e3541dfb
changeset: 18423:98c2e3541dfb
user: Torbjorn Granlund <tg at gmplib.org>
date: Sat Aug 05 16:59:34 2023 +0200
description:
Remove z13 sqr_basecase.c, replaced by asm.
diffstat:
mpn/s390_64/z13/addmul_1.asm | 56 ++--
mpn/s390_64/z13/lshift.asm | 124 +++++++++++++
mpn/s390_64/z13/lshiftc.asm | 128 +++++++++++++
mpn/s390_64/z13/mul_1.asm | 55 ++--
mpn/s390_64/z13/mul_basecase.asm | 179 +++++++++++-------
mpn/s390_64/z13/rshift.asm | 120 ++++++++++++
mpn/s390_64/z13/sqr_basecase.asm | 369 +++++++++++++++++++++++++++++++++++++++
mpn/s390_64/z13/sqr_basecase.c | 82 --------
mpn/s390_64/z13/submul_1.asm | 52 ++--
9 files changed, 931 insertions(+), 234 deletions(-)
diffs (truncated from 1543 to 300 lines):
diff -r 372acfd0c33e -r 98c2e3541dfb mpn/s390_64/z13/addmul_1.asm
--- a/mpn/s390_64/z13/addmul_1.asm Thu Aug 03 16:18:17 2023 +0200
+++ b/mpn/s390_64/z13/addmul_1.asm Sat Aug 05 16:59:34 2023 +0200
@@ -44,7 +44,7 @@
C z12 ?
C z13 ?
C z14 ?
-C z15 2.55
+C z15 2.5
define(`rp', `%r2')
@@ -67,16 +67,16 @@
ASM_START()
PROLOGUE(mpn_addmul_1c)
- stmg %r6, %r13, 48(%r15)
+ stmg %r6, %r10, 48(%r15)
j L(ent)
EPILOGUE()
PROLOGUE(mpn_addmul_1)
- stmg %r6, %r13, 48(%r15)
+ stmg %r6, %r10, 48(%r15)
lghi %r6, 0
L(ent): vzero %v0
vzero %v2
- srlg %r11, an, 2
+ srlg %r10, an, 2
tmll an, 1
je L(bx0)
@@ -86,16 +86,17 @@
jne L(b11)
L(b01): lghi idx, -24
- lg %r13, 0(ap)
- mlgr %r12, b0
- algr %r13, %r6
- lghi %r6, 0
- alcgr %r12, %r6
- vlvgg %v4, %r13, 1
+ lgr %r0, %r6
+ lg %r7, 0(ap)
+ mlgr %r6, b0
+ algr %r7, %r0
+ lghi %r0, 0
+ alcgr %r6, %r0
+ vlvgg %v4, %r7, 1
vaq %v2, %v2, %v4
vsteg %v2, 0(rp), 1
vmrhg %v2, %v2, %v2
- cgije %r11, 0, L(1)
+ cgije %r10, 0, L(1)
j L(cj0)
L(b11): lghi idx, -8
@@ -114,7 +115,6 @@
jne L(b10)
L(b00): lghi idx, -32
- lgr %r12, %r6
L(cj0): lg %r1, 32(idx, ap)
lg %r9, 40(idx, ap)
mlgr %r0, b0
@@ -122,20 +122,20 @@
vler %v1, 32(idx, rp), 3
vpdi %v1, %v1, %v1, 4
vlvgp %v6, %r0, %r1
- vlvgp %v7, %r9, %r12
+ vlvgp %v7, %r9, %r6
j L(mid)
L(b10): lghi idx, -16
lgr %r8, %r6
-L(cj1): lg %r7, 16(idx, ap)
- lg %r13, 24(idx, ap)
+L(cj1): lg %r1, 16(idx, ap)
+ lg %r7, 24(idx, ap)
+ mlgr %r0, b0
mlgr %r6, b0
- mlgr %r12, b0
vler %v1, 16(idx, rp), 3
vpdi %v1, %v1, %v1, 4
- vlvgp %v6, %r6, %r7
- vlvgp %v7, %r13, %r8
- cgije %r11, 0, L(end)
+ vlvgp %v6, %r0, %r1
+ vlvgp %v7, %r7, %r8
+ cgije %r10, 0, L(end)
L(top): lg %r1, 32(idx, ap)
lg %r9, 40(idx, ap)
@@ -150,11 +150,11 @@
vpdi %v1, %v1, %v1, 4
vster %v3, 16(idx, rp), 3
vlvgp %v6, %r0, %r1
- vlvgp %v7, %r9, %r12
-L(mid): lg %r7, 48(idx, ap)
- lg %r13, 56(idx, ap)
+ vlvgp %v7, %r9, %r6
+L(mid): lg %r1, 48(idx, ap)
+ lg %r7, 56(idx, ap)
+ mlgr %r0, b0
mlgr %r6, b0
- mlgr %r12, b0
vacq %v5, %v6, %v1, %v0
vacccq %v0, %v6, %v1, %v0
vacq %v3, %v5, %v7, %v2
@@ -163,10 +163,10 @@
vler %v1, 48(idx, rp), 3
vpdi %v1, %v1, %v1, 4
vster %v3, 32(idx, rp), 3
- vlvgp %v6, %r6, %r7
- vlvgp %v7, %r13, %r8
+ vlvgp %v6, %r0, %r1
+ vlvgp %v7, %r7, %r8
la idx, 32(idx)
- brctg %r11, L(top)
+ brctg %r10, L(top)
L(end): vacq %v5, %v6, %v1, %v0
vacccq %v0, %v6, %v1, %v0
@@ -177,7 +177,7 @@
vag %v2, %v0, %v2
L(1): vlgvg %r2, %v2, 1
- algr %r2, %r12
- lmg %r6, %r13, 48(%r15)
+ algr %r2, %r6
+ lmg %r6, %r10, 48(%r15)
br %r14
EPILOGUE()
diff -r 372acfd0c33e -r 98c2e3541dfb mpn/s390_64/z13/lshift.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/s390_64/z13/lshift.asm Sat Aug 05 16:59:34 2023 +0200
@@ -0,0 +1,124 @@
+dnl S/390-64 mpn_lshift.
+
+dnl Copyright 2023 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 7
+C z990 3
+C z9 ?
+C z10 6
+C z196 ?
+
+C NOTES
+C * This uses discrete loads and stores in a software pipeline. Using lmg and
+C stmg is not faster.
+C * One could assume more pipelining could approach 2.5 c/l, but we have not
+C found any 8-way loop that runs better than the current 4-way loop.
+C * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
+C similarly to the x86_64 sqr_basecase feed-in.
+
+define(`rp', `%r2')
+define(`ap', `%r3')
+define(`n', `%r4')
+define(`cnt', `%r5')
+
+define(`tnc', `%r1')
+
+ASM_START()
+PROLOGUE(mpn_lshift)           C shift the n limbs at ap left by cnt bits into rp, working from the top limb down
+ sllg %r1, n, 3                C r1 = 8*n, the operand size in bytes
+ lay ap, -40(%r1, ap)          C bias ap so that 32(ap) addresses the top source limb
+ lay rp, -32(%r1, rp)          C bias rp so that 24(rp) addresses the top result limb
+
+ lghi tnc, 64
+ slgr tnc, cnt                 C tnc = 64 - cnt, the complementary right-shift count
+
+ lg %r0, 32(ap)                C r0 = top source limb, kept until the end for the return value
+
+ tmll n, 1                     C test the low bit of n
+ je L(bx0)                     C n even: go straight to the two-limb vector code
+L(bx1):
+ clgijne n, 1, L(gt1)          C n odd and not 1: peel off the top limb first
+
+L(1): sllg %r5, %r0, 0(cnt)    C n = 1: result limb = r0 << cnt (overwrites cnt, no longer needed)
+ stg %r5, 24(rp)
+ srlg %r2, %r0, 0(tnc)         C return r0 >> tnc, the bits shifted out at the top
+ br %r14
+
+L(gt1): stmg %r6, %r7, 48(%r15)        C save r6-r7 at 48(%r15); they are scratch for the peeled limb
+ lg %r6, 24(ap)                C r6 = source limb just below the top one
+ srlg %r6, %r6, 0(tnc)         C its high bits, moved to the low end
+ sllg %r7, %r0, 0(cnt)         C top limb shifted left
+ ogr %r6, %r7                  C combine into the top result limb
+ stg %r6, 24(rp)
+ lay ap, -8(ap)                C step both pointers down one limb; an even number of
+ lay rp, -8(rp)                C limbs remains and 32(ap) is again the highest of them
+ lmg %r6, %r7, 48(%r15)        C restore r6-r7, then fall through to L(bx0)
+
+L(bx0): tmll n, 2              C test bit 1 of n to pick the loop entry point
+ srlg n, n, 2                  C n = n / 4 = loop trip count; jne below still uses the condition code set by tmll
+ jne L(bx10)                   C bit 1 set: two limbs to handle outside full iterations
+L(bx00):vleg %v0, 32(ap), 0    C element 0 of v0 = highest remaining source limb
+ la ap, 16(ap)                 C offset both pointers by 16 bytes because the loop
+ la rp, 16(rp)                 C is entered at L(mid), which uses displacement 0
+ j L(mid)
+
+L(bx10):vleg %v1, 32(ap), 0    C element 0 of v1 = highest remaining source limb
+ clgije n, 0, L(end)           C no full iteration to run: only the wind-down stores
+
+L(top): vl %v0, 16(ap), 3      C v0 = the next two lower source limbs (16 bytes)
+ vpdi %v2, %v0, %v1, 4         C v2 = { v0 element 1, v1 element 0 }: the limbs supplying the left-shifted parts
+ veslg %v4, %v2, 0(cnt)        C each doubleword << cnt
+ vesrlg %v6, %v0, 0(tnc)       C each doubleword of v0 >> tnc: bits carried up from the limb below
+ vo %v6, %v4, %v6              C merge the two parts
+ vst %v6, 16(rp), 3            C store two result limbs
+L(mid): vl %v1, 0(ap), 3       C second half of the iteration: same steps with v0 and v1 swapped
+ vpdi %v3, %v1, %v0, 4         C v3 = { v1 element 1, v0 element 0 }
+ veslg %v5, %v3, 0(cnt)
+ vesrlg %v7, %v1, 0(tnc)
+ vo %v7, %v5, %v7
+ vst %v7, 0(rp), 3             C store two more result limbs
+ lay ap, -32(ap)               C four limbs (32 bytes) handled per iteration,
+ lay rp, -32(rp)               C moving towards lower addresses
+ brctg n, L(top)               C decrement n and loop while it is non-zero
+
+L(end): vzero %v0              C wind-down: element 0 of v0 stays zero so nothing is shifted into the lower stored limb
+ vleg %v0, 24(ap), 1           C element 1 of v0 = the single source limb at 24(ap), the last one not yet loaded
+ vpdi %v2, %v0, %v1, 4         C v2 = { that limb, v1 element 0 }
+ veslg %v4, %v2, 0(cnt)
+ vesrlg %v6, %v0, 0(tnc)       C = { 0, limb >> tnc }
+ vo %v6, %v4, %v6
+ vst %v6, 16(rp), 3            C store the final two result limbs
+
+ srlg %r2, %r0, 0(tnc)         C return value: bits shifted out of the original top limb
+ br %r14
+EPILOGUE()
+ .section .note.GNU-stack
diff -r 372acfd0c33e -r 98c2e3541dfb mpn/s390_64/z13/lshiftc.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/s390_64/z13/lshiftc.asm Sat Aug 05 16:59:34 2023 +0200
@@ -0,0 +1,128 @@
+dnl S/390-64 mpn_lshiftc.
+
+dnl Copyright 2023 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 7
+C z990 3
More information about the gmp-commit
mailing list