[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
Sun Aug 13 00:52:17 CEST 2023
details: /var/hg/gmp/rev/8c579e44ac7d
changeset: 18425:8c579e44ac7d
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Aug 13 00:38:18 2023 +0200
description:
Remove C low-level s390 files.
details: /var/hg/gmp/rev/30989fa5c4f5
changeset: 18426:30989fa5c4f5
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Aug 13 00:51:13 2023 +0200
description:
Clean up comments.
details: /var/hg/gmp/rev/e6a74b5299fe
changeset: 18427:e6a74b5299fe
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Aug 13 00:52:13 2023 +0200
description:
Lots of new z13 and z15 asm code.
diffstat:
 mpn/s390_64/z13/addmul_1.asm      |    6 +-
 mpn/s390_64/z13/addmul_1.c        |  358 ----------------------------
 mpn/s390_64/z13/aormul_2.c        |  476 --------------------------------------
 mpn/s390_64/z13/com.asm           |  103 ++++++++
 mpn/s390_64/z13/common-vec.h      |  175 -------------
 mpn/s390_64/z13/gcd_11.asm        |   54 ++++
 mpn/s390_64/z13/gcd_22.asm        |  117 +++++++++
 mpn/s390_64/z13/logops_n.asm      |  113 +++++++++
 mpn/s390_64/z13/lshift.asm        |   22 +-
 mpn/s390_64/z13/lshiftc.asm       |   22 +-
 mpn/s390_64/z13/mul_1.asm         |    6 +-
 mpn/s390_64/z13/mul_1.c           |   31 --
 mpn/s390_64/z13/mul_basecase.c    |  124 ---------
 mpn/s390_64/z13/rshift.asm        |   22 +-
 mpn/s390_64/z13/sec_tabselect.asm |  147 +++++++++++
 mpn/s390_64/z13/submul_1.asm      |    6 +-
 mpn/s390_64/z15/add_n_sub_n.asm   |  112 ++++++++
 mpn/s390_64/z15/aors_n.asm        |  121 +++++++++
 mpn/s390_64/z15/aorsorrlsh1_n.asm |   43 +++
 mpn/s390_64/z15/aorsorrlsh2_n.asm |   43 +++
 mpn/s390_64/z15/aorsorrlshC_n.asm |  160 ++++++++++++
 mpn/s390_64/z15/cnd_aors_n.asm    |  137 ++++++++++
 mpn/s390_64/z15/rsh1aors_n.asm    |  221 +++++++++++++++++
 23 files changed, 1401 insertions(+), 1218 deletions(-)
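
For context on the diff below: mpn_addmul_1 computes {rp, n} += {s1p, n} * s2limb
and returns the carry out of the top limb. What follows is a minimal portable
sketch of those semantics only, not the removed C file (which is heavily tuned);
it assumes 64-bit limbs and compiler support for unsigned __int128.

/* Sketch only: reference semantics of addmul_1.  The product plus the old
   limb plus the carry cannot overflow 128 bits, so one __int128 suffices.  */
typedef unsigned long long limb_t;

static limb_t
ref_addmul_1 (limb_t *rp, const limb_t *s1p, long n, limb_t v)
{
  limb_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) s1p[i] * v + rp[i] + carry;
      rp[i] = (limb_t) t;           /* low 64 bits back into rp */
      carry = (limb_t) (t >> 64);   /* high 64 bits carry onward */
    }
  return carry;                     /* carry out of the top limb */
}
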
diffs (truncated from 2765 to 300 lines):
diff -r 387869cc4a31 -r e6a74b5299fe mpn/s390_64/z13/addmul_1.asm
--- a/mpn/s390_64/z13/addmul_1.asm Sat Aug 05 17:04:40 2023 +0200
+++ b/mpn/s390_64/z13/addmul_1.asm Sun Aug 13 00:52:13 2023 +0200
@@ -31,17 +31,13 @@
include(`../config.m4')
-dnl TODO
-dnl * Schedule vlvgp away from mlgr; that saves 20% of the run time.
-dnl * Perhaps use vp[0]/vp[1] in innerloop instead preloading v0/v1.
-
C cycles/limb
C z900 -
C z990 -
C z9 -
C z10 -
C z196 -
-C z12 ?
+C z12 -
C z13 ?
C z14 ?
C z15 2.5
diff -r 387869cc4a31 -r e6a74b5299fe mpn/s390_64/z13/addmul_1.c
--- a/mpn/s390_64/z13/addmul_1.c Sat Aug 05 17:04:40 2023 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,358 +0,0 @@
-/* Addmul_1 / mul_1 for IBM z13 and later
- Contributed by Marius Hillenbrand
-
-Copyright 2021 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp-impl.h"
-#include "s390_64/z13/common-vec.h"
-
-#undef FUNCNAME
-
-#ifdef DO_INLINE
-# ifdef OPERATION_addmul_1
-# define ADD
-# define FUNCNAME inline_addmul_1
-# elif defined(OPERATION_mul_1)
-# define FUNCNAME inline_mul_1
-# endif
-
-#else
-# ifdef OPERATION_addmul_1
-# define ADD
-# define FUNCNAME mpn_addmul_1
-# elif defined(OPERATION_mul_1)
-# define FUNCNAME mpn_mul_1
-# endif
-#endif
-
-#ifdef DO_INLINE
-static inline mp_limb_t
-FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
- __attribute__ ((always_inline));
-
-static inline
-#endif
-mp_limb_t
-FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
-{
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n));
-
- /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in
- VRs (using each VR as a single 128-bit accumulator).
- The inner loop is unrolled to four limbs, with two blocks of four
- multiplications each. Since the MLGR operation operates on even/odd GPR
- pairs, pin the products appropriately. */
-
- /* products as GPR pairs */
- register mp_limb_t p0_high asm("r0");
- register mp_limb_t p0_low asm("r1");
-
- register mp_limb_t p1_high asm("r8");
- register mp_limb_t p1_low asm("r9");
-
- register mp_limb_t p2_high asm("r6");
- register mp_limb_t p2_low asm("r7");
-
- register mp_limb_t p3_high asm("r10");
- register mp_limb_t p3_low asm("r11");
-
- /* carry flag for 128-bit add in VR for first carry chain */
- vec_t carry_vec0 = { .dw = vec_splat_u64 (0) };
- mp_limb_t carry_limb = 0;
-
-#ifdef ADD
- /* 2nd carry flag for 2nd carry chain with addmul */
- vec_t carry_vec1 = { .dw = vec_splat_u64 (0) };
- vec_t sum0;
- vec_t rp0_addend, rp1_addend;
- rp0_addend.dw = vec_splat_u64 (0);
- rp1_addend.dw = vec_splat_u64 (0);
-#endif
- vec_t sum1;
-
- vec_t carry_prod = { .dw = vec_splat_u64 (0) };
-
- /* The scalar multiplications compete with pointer and index increments for
- * issue ports. Thus, increment the loop index in the middle of the loop so
- * that the operations for the next iteration's multiplications can be
- * loaded in time (looks horrible, yet helps performance) and make sure we
- * use addressing with base reg + index reg + immediate displacement
- * (so that only the single index needs incrementing, instead of multiple
- * pointers). */
-#undef LOOP_ADVANCE
-#undef IDX_OFFSET
-
-#define LOOP_ADVANCE 4 * sizeof (mp_limb_t)
-#define IDX_OFFSET (LOOP_ADVANCE)
- register ssize_t idx = 0 - IDX_OFFSET;
-
- /*
- * branch-on-count implicitly hint to the branch prediction as taken, while
- * compare-and-branch hints as not taken. currently, using branch-on-count
- * has a performance advantage, but it is not clear that it is generally the
- * better choice (e.g., branch-on-count requires decrementing the separate
- * counter). so, allow switching the loop condition to enable either
- * category of branch instructions:
- * - idx is less than an upper bound, for compare-and-branch
- * - iteration counter greater than zero, for branch-on-count
- */
-#define BRCTG
-#ifdef BRCTG
- ssize_t iterations = (size_t)n / 4;
-#else
- ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET;
-#endif
-
- /* products will be transferred into VRs before adding up.
- * see main loop below for comments on accumulation scheme. */
- vec_t product0, product1, product2;
-
- product0.dw = vec_splat_u64 (0);
-
- switch ((size_t)n % 4)
- {
- case 0:
- break;
-
- case 1:
- idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET;
-
- p3_low = s1p[0];
- s390_umul_ppmm (p3_high, p3_low, s2limb);
-
-#ifdef ADD
- rp0_addend.dw[1] = rp[0];
- product0.dw[1] = p3_low;
-
- sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
- carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);
-
- rp[0] = sum0.dw[1];
-#else
- rp[0] = p3_low;
-#endif
-
- carry_limb = p3_high;
- break;
-
- case 2:
- p0_low = s1p[0];
- p3_low = s1p[1];
- idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET;
-
- s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb);
-
- carry_prod.dw[0] = p3_low;
-
- product0.dw = vec_load_2di_as_pair (p0_high, p0_low);
-
- carry_limb = p3_high;
-
-#ifdef ADD
- rp0_addend = vec_load_elements_reversed (rp, 0);
- sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
- carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);
-
- sum1.sw = vec_add_u128 (sum0.sw, product0.sw);
- carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw);
-#else
- sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw);
- carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw);
-#endif
-
- vec_store_elements_reversed (rp, 0, sum1);
-
- break;
-
- case 3:
- idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET;
-
- p0_low = s1p[0];
- s390_umul_ppmm (p0_high, p0_low, s2limb);
-
-#ifdef ADD
- rp0_addend.dw[1] = rp[0];
- product0.dw[1] = p0_low;
-
- sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
- carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);
-
- rp[0] = sum0.dw[1];
-#else
- rp[0] = p0_low;
-#endif
- carry_limb = p0_high;
-
- p1_low = s1p[1];
- p3_low = s1p[2];
-
- s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb);
-
- carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb);
- product1.dw = vec_load_2di_as_pair (p1_high, p1_low);
- carry_limb = p3_high;
-
-#ifdef ADD
- rp0_addend = vec_load_elements_reversed (rp, 8);
- sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
- carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);
-
- sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw);
- carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw);
-#else
- sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
- carry_vec0.sw
- = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
-#endif
- vec_store_elements_reversed (rp, 8, sum1);
- break;
- }
-
-#ifdef BRCTG
- for (; iterations > 0; iterations--)
- {
-#else
- while (idx < idx_bound)
- {
-#endif
- vec_t overlap_addend0;
- vec_t overlap_addend1;
-
- /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the
- * result in a GPR pair. One of the factors is taken from the GPR pair
- * and overwritten.
- * To reuse factors, it turned out cheaper to load limbs multiple times
- * than copying GPR contents. Enforce that and the use of addressing by
- * base + index gpr + immediate displacement via inline asm.
- */
- ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET);
- ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET);
- ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET);
- ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET);
-
- /*
- * accumulate products as follows (for addmul):
- * | rp[i+3] | rp[i+2] | rp[i+1] | rp[i] |
- * p0_high | p0_low |
- * p1_high | p1_low | carry-limb in
- * p2_high | p2_low |
- * c-limb out <- p3_high | p3_low |
- * | < 128-bit VR > < 128-bit VR >
- *
- * < rp1_addend > < rp0_addend >
- * carry-chain 0 <- + <- + <- carry_vec0[127]
- * < product1 > < product0 >
- * carry-chain 1 <- + <- + <- carry_vec1[127]
- * < overlap_addend1 > < overlap_addend0 >
- *