[PATCH 3/4] Provide addmul_1 for IBM z13 and newer (s390_64/z13)
Marius Hillenbrand
mhillen at linux.ibm.com
Wed Feb 17 10:19:59 UTC 2021
Add an implementation based on 64x64-bit multiplications (mlgr) and 128-bit
additions in vector registers (vacq/vacccq). Unroll by two and maintain three
parallel carry chains to reduce dependencies within each iteration (one carry
limb in a general-purpose register plus two carry bits in vector registers).
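
For reference, the operation implemented here is equivalent to the plain-C
sketch below. The scalar loop, the name ref_addmul_1, and the use of unsigned
__int128 are illustrative only and not part of the patch; the assembly instead
unrolls by two and spreads the additions across the carry chains described
above.

/* Reference sketch only (not part of the patch): mpn_addmul_1 computes
   rp[i] += s1p[i] * s2 for i = 0..n-1 and returns the final carry limb.
   unsigned __int128 stands in for the 64x64->128-bit product that mlgr
   produces in an even/odd register pair.  */
#include <stdint.h>

typedef uint64_t limb_t;   /* 64-bit limb, matching mp_limb_t on s390_64 */

static limb_t
ref_addmul_1 (limb_t *rp, const limb_t *s1p, long n, limb_t s2)
{
  limb_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) s1p[i] * s2;  /* mlgr */
      t += rp[i];    /* add the existing result limb ...                */
      t += carry;    /* ... and the carry from the previous limb        */
      rp[i] = (limb_t) t;
      carry = (limb_t) (t >> 64);  /* high half carries into next limb  */
    }
  return carry;
}

The assembly processes two limbs per iteration, keeps the high product limb
as the carry limb, and lets the two 128-bit additions run in independent
vector carry chains so the per-iteration additions do not serialize.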
---
mpn/s390_64/z13/addmul_1.asm | 95 ++++++++++++++++++++++++++++++++++++
1 file changed, 95 insertions(+)
create mode 100644 mpn/s390_64/z13/addmul_1.asm
diff --git a/mpn/s390_64/z13/addmul_1.asm b/mpn/s390_64/z13/addmul_1.asm
new file mode 100644
index 000000000..bcedfa75c
--- /dev/null
+++ b/mpn/s390_64/z13/addmul_1.asm
@@ -0,0 +1,95 @@
+dnl S/390-64 mpn_addmul_1
+
+dnl Copyright 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C INPUT PARAMETERS
+define(`rp', `%r2')
+define(`s1p', `%r3')
+define(`n', `%r4')
+define(`s2', `%r5')
+
+define(`carry', `%r8')
+define(`idx', `%r9')
+define(`carry_vec1', `%v6')
+define(`carry_vec2', `%v20')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ .align 16
+ stmg %r8,%r11,64(%r15)
+ lghi %r1,0
+ lghi carry,0
+ vzero carry_vec1
+ vzero carry_vec2
+ tmll n,1
+ je L(even)
+
+ lg %r11,0(s1p)
+ mlgr %r10,s2
+ lgr carry,%r10
+ lg %r10,0(rp)
+ algr %r11,%r10
+ alcgr carry,%r1
+ lghi %r1,1
+ stg %r11,0(rp)
+ clgrjhe %r1,n,L(out)
+
+L(even):
+ sllg idx,%r1,3
+ sllg n,n,3 C Note that mp_size_t n will always be small enough so that n<<3 cannot overflow
+
+L(loop):
+ vl %v22,0(idx,rp)
+ lg %r11,0(idx,s1p)
+ lg %r1,8(idx,s1p)
+ mlgr %r10,s2
+ mlgr %r0,s2
+ vpdi %v21,%v22,%v22,4
+ vlvgp %v7,%r10,%r11
+ vlvgp %v4,%r1,carry
+ vacq %v3,%v7,%v21,carry_vec1
+ vacq %v5,%v3,%v4,carry_vec2
+ vacccq carry_vec1,%v7,%v21,carry_vec1
+ vacccq carry_vec2,%v3,%v4,carry_vec2
+ vpdi %v22,%v5,%v5,4
+ vst %v22,0(idx,rp)
+ lgr carry,%r0
+ aghi idx,16
+ clgrjl idx,n,L(loop)
+
+L(out):
+ vag carry_vec1,carry_vec1,carry_vec2
+ vlgvg %r2,carry_vec1,1
+ agr %r2,carry
+ lmg %r8,%r11,64(%r15)
+ br %r14
+EPILOGUE()
--
2.26.2