[Gmp-commit] /var/hg/gmp: 3 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Sat Jul 29 01:04:30 CEST 2023


details:   /var/hg/gmp/rev/2dd38d2e9c75
changeset: 18410:2dd38d2e9c75
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Jul 29 00:52:40 2023 +0200
description:
Rewrite z13 mul_1.asm, addmul_1.asm, submul_1.asm.

details:   /var/hg/gmp/rev/2eaf0c2f3a2b
changeset: 18411:2eaf0c2f3a2b
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Jul 29 01:03:36 2023 +0200
description:
Move popcount and hamdist back from z14 to z13 after needed edits.

details:   /var/hg/gmp/rev/e8890259c68a
changeset: 18412:e8890259c68a
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Jul 29 01:04:10 2023 +0200
description:
Remove z13 addmul_2 now that addmul_1 is more than twice as fast.

diffstat:

 mpn/s390_64/z13/addmul_1.asm |  176 +++++++++++++++++++++++++++---------------
 mpn/s390_64/z13/addmul_2.asm |  132 --------------------------------
 mpn/s390_64/z13/hamdist.asm  |   76 ++++++++++++++++++
 mpn/s390_64/z13/mul_1.asm    |  142 +++++++++++++++++++++-------------
 mpn/s390_64/z13/popcount.asm |   69 ++++++++++++++++
 mpn/s390_64/z13/submul_1.asm |  151 +++++++++++++++++++++++++-----------
 mpn/s390_64/z14/hamdist.asm  |   76 ------------------
 mpn/s390_64/z14/popcount.asm |   69 ----------------
 8 files changed, 448 insertions(+), 443 deletions(-)

diffs (truncated from 989 to 300 lines):

diff -r 66e10f88f532 -r e8890259c68a mpn/s390_64/z13/addmul_1.asm
--- a/mpn/s390_64/z13/addmul_1.asm	Thu Jul 27 21:31:49 2023 +0200
+++ b/mpn/s390_64/z13/addmul_1.asm	Sat Jul 29 01:04:10 2023 +0200
@@ -1,4 +1,5 @@
 dnl  S/390-64 mpn_addmul_1 and mpn_addmul_1c.
+dnl  Based on C code contributed by Marius Hillenbrand.
 
 dnl  Copyright 2023 Free Software Foundation, Inc.
 
@@ -43,85 +44,130 @@
 C z12		 ?
 C z13		 ?
 C z14		 ?
-C z15		 3.9
+C z15		 2.55
 
 
 define(`rp',	`%r2')
-define(`up',	`%r3')
-define(`un',	`%r4')
-define(`v0',	`%r5')
+define(`ap',	`%r3')
+define(`an',	`%r4')
+define(`b0',	`%r5')
 define(`cy',	`%r6')
 
-define(`idx',	`%r8')
+define(`idx',	`%r4')
 
 ASM_START()
 
 PROLOGUE(mpn_addmul_1c)
-	stmg	%r6, %r9, 48(%r15)
-	tmll	un, 1
-	srlg	un, un, 1
-	je	L(cev)
-
-L(cod):	lg	%r9, 0(up)
-	mlgr	%r8, v0			C W1 W0
-	alg	%r6, 0(rp)		C W0
-	lghi	%r7, 0
-	alcgr	%r8, %r7		C W1
-	algr	%r6, %r9		C W0
-	alcgr	%r8, %r7		C W1
-	stg	%r6, 0(rp)
-	lgr	%r6, %r8
-	clgije	un, 0, L(1)
-	lghi	idx, 8
-	j	L(lst)
-L(cev):	lghi	idx, 0
-	j	L(lst)
+	stmg	%r6, %r13, 48(%r15)
+	j	L(ent)
 EPILOGUE()
 
 PROLOGUE(mpn_addmul_1)
-	stmg	%r6, %r9, 48(%r15)
-	tmll	un, 1
-	srlg	un, un, 1
-	je	L(evn)
+	stmg	%r6, %r13, 48(%r15)
+	lghi	%r6, 0
+L(ent):	vzero	%v0
+	vzero	%v2
+	srlg	%r11, an, 2
+
+	tmll	an, 1
+	je	L(bx0)
+L(bx1):	tmll	an, 2
+	jne	L(b11)
+
+L(b01):	lghi	idx, -24
+	vleg	%v2, 0(rp), 1
+	lg	%r13, 0(ap)
+	vzero	%v4
+	mlgr	%r12, b0
+	algr	%r13, %r6
+	lghi	%r6, 0
+	alcgr	%r12, %r6
+	vlvgg	%v4, %r13, 1
+	vaq	%v2, %v2, %v4
+	vsteg	%v2, 0(rp), 1
+	vmrhg	%v2, %v2, %v2
+	cgije	%r11, 0, L(1)
+	j	L(cj0)
 
-L(odd):	lg	%r7, 0(up)
-	mlgr	%r6, v0			C W1 W0
-	lghi	%r9, 0
-	alg	%r7, 0(rp)
-	alcgr	%r6, %r9
-	stg	%r7, 0(rp)
-	clgije	un, 0, L(1)
-	lghi	idx, 8
-	j	L(lst)
-L(evn):	lghi	%r6, 0
-	lghi	idx, 0
+L(b11):	lghi	idx, -8
+	vleg	%v2, 0(rp), 1
+	lg	%r9, 0(ap)
+	vzero	%v4
+	mlgr	%r8, b0
+	algr	%r9, %r6
+	lghi	%r6, 0
+	alcgr	%r8, %r6
+	vlvgg	%v4, %r9, 1
+	vaq	%v2, %v2, %v4
+	vsteg	%v2, 0(rp), 1
+	vmrhg	%v2, %v2, %v2
+	j	L(cj1)
+
+L(bx0):	tmll	an, 2
+	jne	L(b10)
+L(b00):	lghi	idx, -32
+	lgr	%r12, %r6
+L(cj0):	lg	%r1, 32(idx, ap)
+	lg	%r9, 40(idx, ap)
+	mlgr	%r0, b0
+	mlgr	%r8, b0
+	vlvgp	%v6, %r0, %r1
+	vlvgp	%v7, %r9, %r12
+	j	L(mid)
 
-L(lst):	vzero	%v29
-	vzero	%v30
-L(top):	lgr	%r9, %r6
-	lg	%r1, 0(idx, up)
-	lg	%r7, 8(idx, up)
-	mlgr	%r0, v0			C W1 W0
-	mlgr	%r6, v0			C W2 W1
-	vlvgp	%v23, %r0, %r1		C W1 W0
-	vlvgp	%v21, %r7, %r9		C W1 W0
-	vacq	%v24, %v23, %v21, %v29	C
-	vacccq	%v29, %v23, %v21, %v29	C	carry critical path 1
-	vl	%v16, 0(idx, rp), 3
-	vpdi	%v16, %v16, %v16, 4
-	vacq	%v20, %v24, %v16, %v30	C
-	vacccq	%v30, %v24, %v16, %v30	C	carry critical path 2
-	vpdi	%v20, %v20, %v20, 4
-	vst	%v20, 0(idx, rp), 3
-	la	idx, 16(idx)
-	brctg	un, L(top)
+L(b10):	lghi	idx, -16
+	lgr	%r8, %r6
+L(cj1):	lg	%r7, 16(idx, ap)
+	lg	%r13, 24(idx, ap)
+	mlgr	%r6, b0
+	mlgr	%r12, b0
+	vlvgp	%v6, %r6, %r7
+	vlvgp	%v7, %r13, %r8
+	cgije	%r11, 0, L(end)
 
-L(end):	vag	%v29, %v29, %v30
-	vlgvg	%r2, %v29, 1
-	algr	%r2, %r6
-	lmg	%r6, %r9, 48(%r15)
-	br	%r14
-L(1):	lgr	%r2, %r6
-	lmg	%r6, %r9, 48(%r15)
+L(top):	lg	%r1, 32(idx, ap)
+	lg	%r9, 40(idx, ap)
+	mlgr	%r0, b0
+	mlgr	%r8, b0
+	vl	%v1, 16(idx, rp), 3
+	vpdi	%v1, %v1, %v1, 4
+	vacq	%v5, %v6, %v1, %v0
+	vacccq	%v0, %v6, %v1, %v0
+	vacq	%v3, %v5, %v7, %v2
+	vacccq	%v2, %v5, %v7, %v2
+	vpdi	%v3, %v3, %v3, 4
+	vst	%v3, 16(idx, rp), 3
+	vlvgp	%v6, %r0, %r1
+	vlvgp	%v7, %r9, %r12
+L(mid):	lg	%r7, 48(idx, ap)
+	lg	%r13, 56(idx, ap)
+	mlgr	%r6, b0
+	mlgr	%r12, b0
+	vl	%v4, 32(idx, rp), 3
+	vpdi	%v4, %v4, %v4, 4
+	vacq	%v5, %v6, %v4, %v0
+	vacccq	%v0, %v6, %v4, %v0
+	vacq	%v1, %v5, %v7, %v2
+	vacccq	%v2, %v5, %v7, %v2
+	vpdi	%v1, %v1, %v1, 4
+	vst	%v1, 32(idx, rp), 3
+	vlvgp	%v6, %r6, %r7
+	vlvgp	%v7, %r13, %r8
+	la	idx, 32(idx)
+	brctg	%r11, L(top)
+
+L(end):	vl	%v1, 16(idx, rp), 3
+	vpdi	%v1, %v1, %v1, 4
+	vacq	%v5, %v6, %v1, %v0
+	vacccq	%v0, %v6, %v1, %v0
+	vacq	%v3, %v5, %v7, %v2
+	vacccq	%v2, %v5, %v7, %v2
+	vpdi	%v3, %v3, %v3, 4
+	vst	%v3, 16(idx, rp), 3
+
+	vag	%v2, %v0, %v2
+L(1):	vlgvg	%r2, %v2, 1
+	algr	%r2, %r12
+	lmg	%r6, %r13, 48(%r15)
 	br	%r14
 EPILOGUE()
diff -r 66e10f88f532 -r e8890259c68a mpn/s390_64/z13/addmul_2.asm
--- a/mpn/s390_64/z13/addmul_2.asm	Thu Jul 27 21:31:49 2023 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,132 +0,0 @@
-dnl  S/390-64 mpn_addmul_2
-
-dnl  Copyright 2023 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-dnl TODO
-dnl * Schedule vlvgp away from mlgr; that saves 20% of the run time.
-dnl * Perhaps use vp[0]/vp[1] in innerloop instead preloading v0/v1.
-
-C            cycles/limb
-C z900		 -
-C z990		 -
-C z9		 -
-C z10		 -
-C z196		 -
-C z12		 ?
-C z13		 ?
-C z14		 ?
-C z15		 3.1
-
-
-define(`rp',	`%r2')
-define(`up',	`%r3')
-define(`un',	`%r4')
-define(`vp',	`%r5')
-
-define(`idx',	`%r12')
-define(`v0',	`%r11')
-define(`v1',	`%r5')
-
-ASM_START()
-PROLOGUE(mpn_addmul_2)
-	stmg	%r6, %r12, 48(%r15)
-
-	vzero	%v27
-	vzero	%v28
-	vzero	%v29
-	vzero	%v30
-	lghi	%r10, 0
-	lg	v0, 0(vp)
-	lg	v1, 8(vp)
-	tmll	un, 1
-	srlg	un, un, 1
-	je	L(evn)
-
-L(odd):	lg	%r7, 0(up)
-	mlgr	%r6, v0			C W2 W1
-	alg	%r7, 0(rp)
-	alcgr	%r6, %r10
-	stg	%r7, 0(rp)
-	lghi	idx, 8
-dnl	clgije	un, 0, L(end)
-	j	L(top)
-
-L(evn):	lghi	%r6, 0
-	lghi	idx, 0
-	lghi	%r1, 0
-	j	L(ent)
-
-L(top):	lg	%r1, -8(idx, up)
-L(ent):	lg	%r9, 0(idx, up)
-	mlgr	%r0, v1			C W2 W1
-	mlgr	%r8, v1			C W3 W2
-	vlvgp	%v22, %r0, %r1		C W2 W1


More information about the gmp-commit mailing list