[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Jul 29 01:04:30 CEST 2023
details: /var/hg/gmp/rev/2dd38d2e9c75
changeset: 18410:2dd38d2e9c75
user: Torbjorn Granlund <tg at gmplib.org>
date: Sat Jul 29 00:52:40 2023 +0200
description:
Rewrite z13 mul_1.asm, addmul_1.asm, submul_1.asm.
details: /var/hg/gmp/rev/2eaf0c2f3a2b
changeset: 18411:2eaf0c2f3a2b
user: Torbjorn Granlund <tg at gmplib.org>
date: Sat Jul 29 01:03:36 2023 +0200
description:
Move popcount and hamdist back from z14 to z13 after needed edits.
details: /var/hg/gmp/rev/e8890259c68a
changeset: 18412:e8890259c68a
user: Torbjorn Granlund <tg at gmplib.org>
date: Sat Jul 29 01:04:10 2023 +0200
description:
Remove z13 addmul_2 now that addmul_1 is more than twice as fast.
diffstat:
mpn/s390_64/z13/addmul_1.asm | 176 +++++++++++++++++++++++++++---------------
mpn/s390_64/z13/addmul_2.asm | 132 --------------------------------
mpn/s390_64/z13/hamdist.asm | 76 ++++++++++++++++++
mpn/s390_64/z13/mul_1.asm | 142 +++++++++++++++++++++-------------
mpn/s390_64/z13/popcount.asm | 69 ++++++++++++++++
mpn/s390_64/z13/submul_1.asm | 151 +++++++++++++++++++++++++-----------
mpn/s390_64/z14/hamdist.asm | 76 ------------------
mpn/s390_64/z14/popcount.asm | 69 ----------------
8 files changed, 448 insertions(+), 443 deletions(-)
diffs (truncated from 989 to 300 lines):
diff -r 66e10f88f532 -r e8890259c68a mpn/s390_64/z13/addmul_1.asm
--- a/mpn/s390_64/z13/addmul_1.asm Thu Jul 27 21:31:49 2023 +0200
+++ b/mpn/s390_64/z13/addmul_1.asm Sat Jul 29 01:04:10 2023 +0200
@@ -1,4 +1,5 @@
dnl S/390-64 mpn_addmul_1 and mpn_addmul_1c.
+dnl Based on C code contributed by Marius Hillenbrand.
dnl Copyright 2023 Free Software Foundation, Inc.
@@ -43,85 +44,130 @@
C z12 ?
C z13 ?
C z14 ?
-C z15 3.9
+C z15 2.55
define(`rp', `%r2')
-define(`up', `%r3')
-define(`un', `%r4')
-define(`v0', `%r5')
+define(`ap', `%r3')
+define(`an', `%r4')
+define(`b0', `%r5')
define(`cy', `%r6')
-define(`idx', `%r8')
+define(`idx', `%r4')
ASM_START()
PROLOGUE(mpn_addmul_1c)
- stmg %r6, %r9, 48(%r15)
- tmll un, 1
- srlg un, un, 1
- je L(cev)
-
-L(cod): lg %r9, 0(up)
- mlgr %r8, v0 C W1 W0
- alg %r6, 0(rp) C W0
- lghi %r7, 0
- alcgr %r8, %r7 C W1
- algr %r6, %r9 C W0
- alcgr %r8, %r7 C W1
- stg %r6, 0(rp)
- lgr %r6, %r8
- clgije un, 0, L(1)
- lghi idx, 8
- j L(lst)
-L(cev): lghi idx, 0
- j L(lst)
+ stmg %r6, %r13, 48(%r15)
+ j L(ent)
EPILOGUE()
PROLOGUE(mpn_addmul_1)
- stmg %r6, %r9, 48(%r15)
- tmll un, 1
- srlg un, un, 1
- je L(evn)
+ stmg %r6, %r13, 48(%r15)
+ lghi %r6, 0
+L(ent): vzero %v0
+ vzero %v2
+ srlg %r11, an, 2
+
+ tmll an, 1
+ je L(bx0)
+L(bx1): tmll an, 2
+ jne L(b11)
+
+L(b01): lghi idx, -24
+ vleg %v2, 0(rp), 1
+ lg %r13, 0(ap)
+ vzero %v4
+ mlgr %r12, b0
+ algr %r13, %r6
+ lghi %r6, 0
+ alcgr %r12, %r6
+ vlvgg %v4, %r13, 1
+ vaq %v2, %v2, %v4
+ vsteg %v2, 0(rp), 1
+ vmrhg %v2, %v2, %v2
+ cgije %r11, 0, L(1)
+ j L(cj0)
-L(odd): lg %r7, 0(up)
- mlgr %r6, v0 C W1 W0
- lghi %r9, 0
- alg %r7, 0(rp)
- alcgr %r6, %r9
- stg %r7, 0(rp)
- clgije un, 0, L(1)
- lghi idx, 8
- j L(lst)
-L(evn): lghi %r6, 0
- lghi idx, 0
+L(b11): lghi idx, -8
+ vleg %v2, 0(rp), 1
+ lg %r9, 0(ap)
+ vzero %v4
+ mlgr %r8, b0
+ algr %r9, %r6
+ lghi %r6, 0
+ alcgr %r8, %r6
+ vlvgg %v4, %r9, 1
+ vaq %v2, %v2, %v4
+ vsteg %v2, 0(rp), 1
+ vmrhg %v2, %v2, %v2
+ j L(cj1)
+
+L(bx0): tmll an, 2
+ jne L(b10)
+L(b00): lghi idx, -32
+ lgr %r12, %r6
+L(cj0): lg %r1, 32(idx, ap)
+ lg %r9, 40(idx, ap)
+ mlgr %r0, b0
+ mlgr %r8, b0
+ vlvgp %v6, %r0, %r1
+ vlvgp %v7, %r9, %r12
+ j L(mid)
-L(lst): vzero %v29
- vzero %v30
-L(top): lgr %r9, %r6
- lg %r1, 0(idx, up)
- lg %r7, 8(idx, up)
- mlgr %r0, v0 C W1 W0
- mlgr %r6, v0 C W2 W1
- vlvgp %v23, %r0, %r1 C W1 W0
- vlvgp %v21, %r7, %r9 C W1 W0
- vacq %v24, %v23, %v21, %v29 C
- vacccq %v29, %v23, %v21, %v29 C carry critical path 1
- vl %v16, 0(idx, rp), 3
- vpdi %v16, %v16, %v16, 4
- vacq %v20, %v24, %v16, %v30 C
- vacccq %v30, %v24, %v16, %v30 C carry critical path 2
- vpdi %v20, %v20, %v20, 4
- vst %v20, 0(idx, rp), 3
- la idx, 16(idx)
- brctg un, L(top)
+L(b10): lghi idx, -16
+ lgr %r8, %r6
+L(cj1): lg %r7, 16(idx, ap)
+ lg %r13, 24(idx, ap)
+ mlgr %r6, b0
+ mlgr %r12, b0
+ vlvgp %v6, %r6, %r7
+ vlvgp %v7, %r13, %r8
+ cgije %r11, 0, L(end)
-L(end): vag %v29, %v29, %v30
- vlgvg %r2, %v29, 1
- algr %r2, %r6
- lmg %r6, %r9, 48(%r15)
- br %r14
-L(1): lgr %r2, %r6
- lmg %r6, %r9, 48(%r15)
+L(top): lg %r1, 32(idx, ap)
+ lg %r9, 40(idx, ap)
+ mlgr %r0, b0
+ mlgr %r8, b0
+ vl %v1, 16(idx, rp), 3
+ vpdi %v1, %v1, %v1, 4
+ vacq %v5, %v6, %v1, %v0
+ vacccq %v0, %v6, %v1, %v0
+ vacq %v3, %v5, %v7, %v2
+ vacccq %v2, %v5, %v7, %v2
+ vpdi %v3, %v3, %v3, 4
+ vst %v3, 16(idx, rp), 3
+ vlvgp %v6, %r0, %r1
+ vlvgp %v7, %r9, %r12
+L(mid): lg %r7, 48(idx, ap)
+ lg %r13, 56(idx, ap)
+ mlgr %r6, b0
+ mlgr %r12, b0
+ vl %v4, 32(idx, rp), 3
+ vpdi %v4, %v4, %v4, 4
+ vacq %v5, %v6, %v4, %v0
+ vacccq %v0, %v6, %v4, %v0
+ vacq %v1, %v5, %v7, %v2
+ vacccq %v2, %v5, %v7, %v2
+ vpdi %v1, %v1, %v1, 4
+ vst %v1, 32(idx, rp), 3
+ vlvgp %v6, %r6, %r7
+ vlvgp %v7, %r13, %r8
+ la idx, 32(idx)
+ brctg %r11, L(top)
+
+L(end): vl %v1, 16(idx, rp), 3
+ vpdi %v1, %v1, %v1, 4
+ vacq %v5, %v6, %v1, %v0
+ vacccq %v0, %v6, %v1, %v0
+ vacq %v3, %v5, %v7, %v2
+ vacccq %v2, %v5, %v7, %v2
+ vpdi %v3, %v3, %v3, 4
+ vst %v3, 16(idx, rp), 3
+
+ vag %v2, %v0, %v2
+L(1): vlgvg %r2, %v2, 1
+ algr %r2, %r12
+ lmg %r6, %r13, 48(%r15)
br %r14
EPILOGUE()
diff -r 66e10f88f532 -r e8890259c68a mpn/s390_64/z13/addmul_2.asm
--- a/mpn/s390_64/z13/addmul_2.asm Thu Jul 27 21:31:49 2023 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,132 +0,0 @@
-dnl S/390-64 mpn_addmul_2
-
-dnl Copyright 2023 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-dnl TODO
-dnl * Schedule vlvgp away from mlgr; that saves 20% of the run time.
-dnl * Perhaps use vp[0]/vp[1] in innerloop instead preloading v0/v1.
-
-C cycles/limb
-C z900 -
-C z990 -
-C z9 -
-C z10 -
-C z196 -
-C z12 ?
-C z13 ?
-C z14 ?
-C z15 3.1
-
-
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`un', `%r4')
-define(`vp', `%r5')
-
-define(`idx', `%r12')
-define(`v0', `%r11')
-define(`v1', `%r5')
-
-ASM_START()
-PROLOGUE(mpn_addmul_2)
- stmg %r6, %r12, 48(%r15)
-
- vzero %v27
- vzero %v28
- vzero %v29
- vzero %v30
- lghi %r10, 0
- lg v0, 0(vp)
- lg v1, 8(vp)
- tmll un, 1
- srlg un, un, 1
- je L(evn)
-
-L(odd): lg %r7, 0(up)
- mlgr %r6, v0 C W2 W1
- alg %r7, 0(rp)
- alcgr %r6, %r10
- stg %r7, 0(rp)
- lghi idx, 8
-dnl clgije un, 0, L(end)
- j L(top)
-
-L(evn): lghi %r6, 0
- lghi idx, 0
- lghi %r1, 0
- j L(ent)
-
-L(top): lg %r1, -8(idx, up)
-L(ent): lg %r9, 0(idx, up)
- mlgr %r0, v1 C W2 W1
- mlgr %r8, v1 C W3 W2
- vlvgp %v22, %r0, %r1 C W2 W1
More information about the gmp-commit
mailing list