[PATCH 4/4] Provide submul_1 for IBM z13 and newer (s390_64/z13)
Marius Hillenbrand
mhillen at linux.ibm.com
Wed Feb 17 10:20:00 UTC 2021
Add implementation based on 64x64 multiplications (mlgr) and 128-bit
subtractions in vector registers (vsbiq/vsbcbiq). Unroll loop by 2 and
use three borrow chains to reduce dependencies within each iteration
(borrow limb and two borrow bits in vector registers).
---
mpn/s390_64/z13/submul_1.asm | 115 +++++++++++++++++++++++++++++++++++
1 file changed, 115 insertions(+)
create mode 100644 mpn/s390_64/z13/submul_1.asm
diff --git a/mpn/s390_64/z13/submul_1.asm b/mpn/s390_64/z13/submul_1.asm
new file mode 100644
index 000000000..d7761f0e3
--- /dev/null
+++ b/mpn/s390_64/z13/submul_1.asm
@@ -0,0 +1,115 @@
+dnl S/390-64 mpn_submul_1
+
+dnl Copyright 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C INPUT PARAMETERS
+define(`rp', `%r2')  C limb pointer that is both read and written
+define(`s1p', `%r3')  C source limb pointer, only read
+define(`n', `%r4')  C limb count on entry; rescaled to a byte length before the loop
+define(`s2', `%r5')  C multiplier limb, second operand of every mlgr
+
+define(`borrow', `%r0')  C scalar borrow limb carried from one step to the next
+define(`p0_low', `%r9')  C pair r8:r9 is the operand and result of the first mlgr
+define(`p0_high', `%r8')  C high half of the first product
+define(`p1_low', `%r11')  C pair r10:r11 is the operand and result of the second mlgr
+define(`p1_high', `%r10')  C high half of the second product; r10 also holds the initial limb index
+define(`idx', `%r1')  C byte offset applied to both rp and s1p
+
+define(`rp_vec', `%v20')  C two limbs loaded from rp, treated as one 128-bit value
+define(`p0_vec', `%v6')  C first 128-bit subtrahend (p0_high:p0_low)
+define(`p1_vec', `%v22')  C second 128-bit subtrahend (p1_low:borrow)
+define(`borrow_vec1', `%v23')  C borrow indication of the first vector subtraction
+define(`borrow_vec2', `%v4')  C borrow indication of the second vector subtraction
+
+
+ASM_START()
+PROLOGUE(mpn_submul_1)  C rp[0..n-1] -= s1p[0..n-1] * s2; the final borrow limb is left in %r2
+ .align 16
+ stmg %r8,%r11,64(%r15)  C save r8-r11; the lmg before the return restores them
+
+ lghi %r10,0  C limb index the loop starts at (stays 0 unless the odd-n path runs)
+ lghi borrow,0  C no borrow limb pending yet
+ vrepig borrow_vec1,1  C indication 1 means no borrow (the done path computes 2 - b1 - b2)
+ vlr borrow_vec2,borrow_vec1  C second chain also starts with no borrow
+
+ tmll n,1  C test the low bit of n
+ je L(even)  C even n: go straight to the two-limb loop
+
+ lg p0_low,0(s1p)  C odd n: peel off limb 0 with scalar code
+ lg %r11,0(rp)  C r11 is a plain temporary here, not yet p1_low
+ mlgr p0_high,s2  C p0_high:p0_low = s1p[0] * s2
+ lghi %r10,1  C one limb consumed, so the loop starts at limb 1
+
+ slgr %r11,p0_low  C rp[0] minus the low product limb; the condition code records the borrow
+ locghi borrow,1,0xC  C borrow = 1 if that subtraction borrowed (mask 0xC selects cc 0 or 1)
+ agr borrow,p0_high  C add the high product limb to form the outgoing borrow limb
+
+ stg %r11,0(rp)  C write back the updated limb 0
+
+ clgrjhe %r10,n,L(done)  C 1 >= n (unsigned): nothing left for the loop
+
+L(even):
+ sllg idx,%r10,3  C starting limb index -> byte offset
+ sllg n,n,3  C limb count -> byte length, compared against idx at the loop end
+
+L(loop):  C two limbs per pass; bottom-tested, so n = 0 is not handled - presumably callers pass n >= 1, TODO confirm
+ lg p0_low,0(idx,s1p)  C s1p limb i
+ lg p1_low,8(idx,s1p)  C s1p limb i+1
+ vl rp_vec,0(idx,rp)  C rp limbs i and i+1; limb i lands in the leftmost doubleword
+ mlgr p0_high,s2  C p0_high:p0_low = s1p[i] * s2
+ mlgr p1_high,s2  C p1_high:p1_low = s1p[i+1] * s2
+
+ vpdi rp_vec,rp_vec,rp_vec,4  C swap doublewords so limb i+1 becomes the high half
+ vlvgp p0_vec,p0_high,p0_low  C full 128-bit product of limb i
+ vlvgp p1_vec,p1_low,borrow  C low product of limb i+1 above the incoming borrow limb
+
+ vsbiq %v5,rp_vec,p0_vec,borrow_vec1  C v5 = rp_vec - p0_vec, taking the chain 1 indication as borrow-in
+ vsbiq %v3,%v5,p1_vec,borrow_vec2  C v3 = v5 - p1_vec, taking the chain 2 indication as borrow-in
+ vsbcbiq borrow_vec1,rp_vec,p0_vec,borrow_vec1  C new chain 1 indication; overwrites its own input, so it must follow the first vsbiq
+ vsbcbiq borrow_vec2,%v5,p1_vec,borrow_vec2  C new chain 2 indication, same operands as the second vsbiq
+
+ vpdi %v7,%v3,%v3,4  C swap doublewords back into memory limb order
+ vst %v7,0(idx,rp)  C store the two result limbs
+ lgr borrow,p1_high  C high product of limb i+1 becomes the borrow limb for the next pass
+
+ aghi idx,16  C advance by two 8-byte limbs
+ clgrjl idx,n,L(loop)  C repeat while idx < n (unsigned)
+
+L(done):
+ vlgvg %r3,borrow_vec1,1  C r3 = chain 1 indication; this overwrites s1p, which is no longer read
+ vlgvg %r2,borrow_vec2,1  C r2 = chain 2 indication; this overwrites rp, which is no longer read
+ aghi borrow,2  C start of result = borrow + (1 - r3) + (1 - r2)
+ lmg %r8,%r11,64(%r15)  C restore the registers saved on entry
+ sgr borrow,%r3  C subtract the chain 1 indication
+ sgrk %r2,borrow,%r2  C r2 = borrow - chain 2 indication, the value left for the caller
+ br %r14  C return to the address in r14
+EPILOGUE()
--
2.26.2
More information about the gmp-devel
mailing list