[PATCH 3/4] s390_64: Add mul_basecase for IBM z13 and later

Thu Aug 5 07:03:53 UTC 2021

Employ the new (add)mul_1 and (add)mul_2 to improve performance for
mul_basecase. This implementation inlines the code for the (add)mul
functions.
---
 mpn/s390_64/z13/mul_basecase.c | 124 +++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 mpn/s390_64/z13/mul_basecase.c

diff --git a/mpn/s390_64/z13/mul_basecase.c b/mpn/s390_64/z13/mul_basecase.c
new file mode 100644
index 000000000..4f0b79e0c
--- /dev/null
+++ b/mpn/s390_64/z13/mul_basecase.c
@@ -0,0 +1,124 @@
+/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two
+   natural numbers of length un and vn.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdlib.h>
+
+#include "gmp-impl.h"
+
+/* Note: we explicitly inline all mul and addmul routines here to reduce the
+ * number of branches in prologues of unrolled functions. That comes at the
+ * cost of duplicating common loop bodies in object code. */
+#define DO_INLINE
+
+/* Tweak loop conditions in the addmul subroutines to enable use of
+ * branch-relative-on-count (BRCTG) instructions, which currently results in
+ * better performance. */
+#define BRCTG /* TODO: needs evaluation & comparison */
+
+#include "s390_64/z13/common-vec.h"
+
+/* Each file is included once per OPERATION_* selector.  NOTE(review): the
+ * passes presumably define the inline_* helpers used below -- confirm. */
+#define OPERATION_mul_1
+#include "s390_64/z13/addmul_1.c"
+#undef OPERATION_mul_1
+
+#define OPERATION_addmul_1
+#include "s390_64/z13/addmul_1.c"
+#undef OPERATION_addmul_1
+
+#define OPERATION_mul_2
+#include "s390_64/z13/aormul_2.c"
+#undef OPERATION_mul_2
+
+#define OPERATION_addmul_2
+#include "s390_64/z13/aormul_2.c"
+#undef OPERATION_addmul_2
+
+/* Multiply {up,un} by {vp,vn}, storing the result in the un + vn limbs at rp.
+   Requires un >= vn >= 1; rp must not overlap either source operand.  */
+void
+mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp,
+                  mp_size_t vn)
+{
+  ASSERT (un >= vn);
+  ASSERT (vn >= 1);
+  ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un));
+  ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn));
+
+  /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch
+   * for un%4 and inline specific variants.  BRANCH_FOR_MOD expands to the
+   * same statements for every N; N only names the residue handled by the
+   * enclosing case.  No trailing semicolon: each use is one statement. */
+#define BRANCH_FOR_MOD(N)                                                     \
+  do                                                                          \
+    {                                                                         \
+      if (vn >= 2)                                                            \
+        {                                                                     \
+          rp[un + 1] = inline_mul_2 (rp, up, un, vp);                         \
+          rp += 2, vp += 2, vn -= 2;                                          \
+        }                                                                     \
+      else                                                                    \
+        {                                                                     \
+          rp[un] = inline_mul_1 (rp, up, un, vp[0]);                          \
+          return;                                                             \
+        }                                                                     \
+      while (vn >= 2)                                                         \
+        {                                                                     \
+          rp[un + 1] = inline_addmul_2 (rp, up, un, vp);                      \
+          rp += 2, vp += 2, vn -= 2;                                          \
+        }                                                                     \
+      /* vn is now 0 or 1: at most one limb of vp remains.  */                \
+      if (vn >= 1)                                                            \
+        rp[un] = inline_addmul_1 (rp, up, un, vp[0]);                         \
+    }                                                                         \
+  while (0)
+
+  switch (((size_t)un) % 4)
+    {
+    case 0:
+      BRANCH_FOR_MOD (0);
+      break;
+    case 1:
+      BRANCH_FOR_MOD (1);
+      break;
+    case 2:
+      BRANCH_FOR_MOD (2);
+      break;
+    case 3:
+      BRANCH_FOR_MOD (3);
+      break;
+    }
+#undef BRANCH_FOR_MOD
+}
-- 
2.26.2