[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
Sun Aug 13 00:52:17 CEST 2023
details: /var/hg/gmp/rev/8c579e44ac7d
changeset: 18425:8c579e44ac7d
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Aug 13 00:38:18 2023 +0200
description:
Remove C low-level s390 files.
details: /var/hg/gmp/rev/30989fa5c4f5
changeset: 18426:30989fa5c4f5
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Aug 13 00:51:13 2023 +0200
description:
Clean up comments.
details: /var/hg/gmp/rev/e6a74b5299fe
changeset: 18427:e6a74b5299fe
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Aug 13 00:52:13 2023 +0200
description:
Lots of new z13 and z15 asm code.
diffstat:
 mpn/s390_64/z13/addmul_1.asm      |    6 +-
 mpn/s390_64/z13/addmul_1.c        |  358 ----------------------------
 mpn/s390_64/z13/aormul_2.c        |  476 --------------------------------------
 mpn/s390_64/z13/com.asm           |  103 ++++++++
 mpn/s390_64/z13/common-vec.h      |  175 -------------
 mpn/s390_64/z13/gcd_11.asm        |   54 ++++
 mpn/s390_64/z13/gcd_22.asm        |  117 +++++++++
 mpn/s390_64/z13/logops_n.asm      |  113 +++++++++
 mpn/s390_64/z13/lshift.asm        |   22 +-
 mpn/s390_64/z13/lshiftc.asm       |   22 +-
 mpn/s390_64/z13/mul_1.asm         |    6 +-
 mpn/s390_64/z13/mul_1.c           |   31 --
 mpn/s390_64/z13/mul_basecase.c    |  124 ---------
 mpn/s390_64/z13/rshift.asm        |   22 +-
 mpn/s390_64/z13/sec_tabselect.asm |  147 +++++++++++
 mpn/s390_64/z13/submul_1.asm      |    6 +-
 mpn/s390_64/z15/add_n_sub_n.asm   |  112 ++++++++
 mpn/s390_64/z15/aors_n.asm        |  121 +++++++++
 mpn/s390_64/z15/aorsorrlsh1_n.asm |   43 +++
 mpn/s390_64/z15/aorsorrlsh2_n.asm |   43 +++
 mpn/s390_64/z15/aorsorrlshC_n.asm |  160 ++++++++++++
 mpn/s390_64/z15/cnd_aors_n.asm    |  137 ++++++++++
 mpn/s390_64/z15/rsh1aors_n.asm    |  221 +++++++++++++++++
 23 files changed, 1401 insertions(+), 1218 deletions(-)
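
For context on the diff below: mpn_addmul_1 computes {rp, n} += {s1p, n} * s2limb
and returns the carry out of the top limb. What follows is a minimal portable
sketch of those semantics only, not the removed C file (which is heavily tuned);
it assumes 64-bit limbs and compiler support for unsigned __int128.

/* Sketch only: reference semantics of addmul_1.  The product plus the old
   limb plus the carry cannot overflow 128 bits, so one __int128 suffices.  */
typedef unsigned long long limb_t;

static limb_t
ref_addmul_1 (limb_t *rp, const limb_t *s1p, long n, limb_t v)
{
  limb_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) s1p[i] * v + rp[i] + carry;
      rp[i] = (limb_t) t;           /* low 64 bits back into rp */
      carry = (limb_t) (t >> 64);   /* high 64 bits carry onward */
    }
  return carry;                     /* carry out of the top limb */
}
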
diffs (truncated from 2765 to 300 lines):
diff -r 387869cc4a31 -r e6a74b5299fe mpn/s390_64/z13/addmul_1.asm
--- a/mpn/s390_64/z13/addmul_1.asm Sat Aug 05 17:04:40 2023 +0200
+++ b/mpn/s390_64/z13/addmul_1.asm Sun Aug 13 00:52:13 2023 +0200
@@ -31,17 +31,13 @@
include(`../config.m4')
-dnl TODO
-dnl * Schedule vlvgp away from mlgr; that saves 20% of the run time.
-dnl * Perhaps use vp[0]/vp[1] in innerloop instead preloading v0/v1.
-
C cycles/limb
C z900 -
C z990 -
C z9 -
C z10 -
C z196 -
-C z12 ?
+C z12 -
C z13 ?
C z14 ?
C z15 2.5
diff -r 387869cc4a31 -r e6a74b5299fe mpn/s390_64/z13/addmul_1.c
--- a/mpn/s390_64/z13/addmul_1.c Sat Aug 05 17:04:40 2023 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,358 +0,0 @@
-/* Addmul_1 / mul_1 for IBM z13 and later
- Contributed by Marius Hillenbrand
-
-Copyright 2021 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp-impl.h"
-#include "s390_64/z13/common-vec.h"
-
-#undef FUNCNAME
-
-#ifdef DO_INLINE
-# ifdef OPERATION_addmul_1
-# define ADD
-# define FUNCNAME inline_addmul_1
-# elif defined(OPERATION_mul_1)
-# define FUNCNAME inline_mul_1
-# endif
-
-#else
-# ifdef OPERATION_addmul_1
-# define ADD
-# define FUNCNAME mpn_addmul_1
-# elif defined(OPERATION_mul_1)
-# define FUNCNAME mpn_mul_1
-# endif
-#endif
-
-#ifdef DO_INLINE
-static inline mp_limb_t
-FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
- __attribute__ ((always_inline));
-
-static inline
-#endif
-mp_limb_t
-FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
-{
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n));
-
- /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in
- VRs (using each VR as a single 128-bit accumulator).
- The inner loop is unrolled to four limbs, with two blocks of four
- multiplications each. Since the MLGR operation operates on even/odd GPR
- pairs, pin the products appropriately. */
-
- /* products as GPR pairs */
- register mp_limb_t p0_high asm("r0");
- register mp_limb_t p0_low asm("r1");
-
- register mp_limb_t p1_high asm("r8");
- register mp_limb_t p1_low asm("r9");
-
- register mp_limb_t p2_high asm("r6");
- register mp_limb_t p2_low asm("r7");
-
- register mp_limb_t p3_high asm("r10");
- register mp_limb_t p3_low asm("r11");
-
- /* carry flag for 128-bit add in VR for first carry chain */
- vec_t carry_vec0 = { .dw = vec_splat_u64 (0) };
- mp_limb_t carry_limb = 0;
-
-#ifdef ADD
- /* 2nd carry flag for 2nd carry chain with addmul */
- vec_t carry_vec1 = { .dw = vec_splat_u64 (0) };
- vec_t sum0;
- vec_t rp0_addend, rp1_addend;
- rp0_addend.dw = vec_splat_u64 (0);
- rp1_addend.dw = vec_splat_u64 (0);
-#endif
- vec_t sum1;
-
- vec_t carry_prod = { .dw = vec_splat_u64 (0) };
-
- /* The scalar multiplications compete with pointer and index increments for
- * issue ports. Thus, increment the loop index in the middle of the loop so
- * that the operations for the next iteration's multiplications can be
- * loaded in time (looks horrible, yet helps performance) and make sure we
- * use addressing with base reg + index reg + immediate displacement
- * (so that only the single index needs incrementing, instead of multiple
- * pointers). */
-#undef LOOP_ADVANCE
-#undef IDX_OFFSET
-
-#define LOOP_ADVANCE 4 * sizeof (mp_limb_t)
-#define IDX_OFFSET (LOOP_ADVANCE)
- register ssize_t idx = 0 - IDX_OFFSET;
-
- /*
- * branch-on-count implicitly hint to the branch prediction as taken, while
- * compare-and-branch hints as not taken. currently, using branch-on-count
- * has a performance advantage, but it is not clear that it is generally the
- * better choice (e.g., branch-on-count requires decrementing the separate
- * counter). so, allow switching the loop condition to enable either
- * category of branch instructions:
- * - idx is less than an upper bound, for compare-and-branch
- * - iteration counter greater than zero, for branch-on-count
- */
-#define BRCTG
-#ifdef BRCTG
- ssize_t iterations = (size_t)n / 4;
-#else
- ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET;
-#endif
-
- /* products will be transferred into VRs before adding up.
- * see main loop below for comments on accumulation scheme. */
- vec_t product0, product1, product2;
-
- product0.dw = vec_splat_u64 (0);
-
- switch ((size_t)n % 4)
- {
- case 0:
- break;
-
- case 1:
- idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET;
-
- p3_low = s1p[0];
- s390_umul_ppmm (p3_high, p3_low, s2limb);
-
-#ifdef ADD
- rp0_addend.dw[1] = rp[0];
- product0.dw[1] = p3_low;
-
- sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
- carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);
-
- rp[0] = sum0.dw[1];
-#else
- rp[0] = p3_low;
-#endif
-
- carry_limb = p3_high;
- break;
-
- case 2:
- p0_low = s1p[0];
- p3_low = s1p[1];
- idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET;
-
- s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb);
-
- carry_prod.dw[0] = p3_low;
-
- product0.dw = vec_load_2di_as_pair (p0_high, p0_low);
-
- carry_limb = p3_high;
-
-#ifdef ADD
- rp0_addend = vec_load_elements_reversed (rp, 0);
- sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
- carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);
-
- sum1.sw = vec_add_u128 (sum0.sw, product0.sw);
- carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw);
-#else
- sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw);
- carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw);
-#endif
-
- vec_store_elements_reversed (rp, 0, sum1);
-
- break;
-
- case 3:
- idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET;
-
- p0_low = s1p[0];
- s390_umul_ppmm (p0_high, p0_low, s2limb);
-
-#ifdef ADD
- rp0_addend.dw[1] = rp[0];
- product0.dw[1] = p0_low;
-
- sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
- carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);
-
- rp[0] = sum0.dw[1];
-#else
- rp[0] = p0_low;
-#endif
- carry_limb = p0_high;
-
- p1_low = s1p[1];
- p3_low = s1p[2];
-
- s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb);
-
- carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb);
- product1.dw = vec_load_2di_as_pair (p1_high, p1_low);
- carry_limb = p3_high;
-
-#ifdef ADD
- rp0_addend = vec_load_elements_reversed (rp, 8);
- sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
- carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);
-
- sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw);
- carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw);
-#else
- sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
- carry_vec0.sw
- = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
-#endif
- vec_store_elements_reversed (rp, 8, sum1);
- break;
- }
-
-#ifdef BRCTG
- for (; iterations > 0; iterations--)
- {
-#else
- while (idx < idx_bound)
- {
-#endif
- vec_t overlap_addend0;
- vec_t overlap_addend1;
-
- /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the
- * result in a GPR pair. One of the factors is taken from the GPR pair
- * and overwritten.
- * To reuse factors, it turned out cheaper to load limbs multiple times
- * than copying GPR contents. Enforce that and the use of addressing by
- * base + index gpr + immediate displacement via inline asm.
- */
- ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET);
- ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET);
- ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET);
- ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET);
-
- /*
- * accumulate products as follows (for addmul):
- * | rp[i+3] | rp[i+2] | rp[i+1] | rp[i] |
- * p0_high | p0_low |
- * p1_high | p1_low | carry-limb in
- * p2_high | p2_low |
- * c-limb out <- p3_high | p3_low |
- * | < 128-bit VR > < 128-bit VR >
- *
- * < rp1_addend > < rp0_addend >
- * carry-chain 0 <- + <- + <- carry_vec0[127]
- * < product1 > < product0 >
- * carry-chain 1 <- + <- + <- carry_vec1[127]
- * < overlap_addend1 > < overlap_addend0 >
- *