[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Jul 29 15:38:46 CEST 2023
details: /var/hg/gmp/rev/fdfe3f1baa1c
changeset: 18413:fdfe3f1baa1c
user: Torbjorn Granlund <tg at gmplib.org>
date: Sat Jul 29 15:07:12 2023 +0200
description:
Fix typos.
details: /var/hg/gmp/rev/ff1f39e6eba5
changeset: 18414:ff1f39e6eba5
user: Torbjorn Granlund <tg at gmplib.org>
date: Sat Jul 29 15:07:46 2023 +0200
description:
Rewrite z13 mul_basecase, using new addmul_1.
details: /var/hg/gmp/rev/bc0ec1699e58
changeset: 18415:bc0ec1699e58
user: Torbjorn Granlund <tg at gmplib.org>
date: Sat Jul 29 15:38:41 2023 +0200
description:
Release updates.
diffstat:
AUTHORS | 2 +
ChangeLog | 10 +
Makefile.am | 9 +-
NEWS | 15 +-
gmp-h.in | 4 +-
mpn/s390_64/z13/mul_basecase.asm | 391 +++++++++++++++++++-------------------
mpn/s390_64/z13/submul_1.asm | 7 +-
7 files changed, 236 insertions(+), 202 deletions(-)
diffs (truncated from 571 to 300 lines):
diff -r e8890259c68a -r bc0ec1699e58 AUTHORS
--- a/AUTHORS Sat Jul 29 01:04:10 2023 +0200
+++ b/AUTHORS Sat Jul 29 15:38:41 2023 +0200
@@ -106,3 +106,5 @@
mpn/powerpc64/vmx/popcount.asm.
Seth Troisi mpz/nextprime.c general speed-up and prevprime.
+
+Marius Hillenbrand mpn/s390_64/z13/*.c (later used as basis for asm code)
diff -r e8890259c68a -r bc0ec1699e58 ChangeLog
--- a/ChangeLog Sat Jul 29 01:04:10 2023 +0200
+++ b/ChangeLog Sat Jul 29 15:38:41 2023 +0200
@@ -1,3 +1,13 @@
+2023-07-29 Torbjörn Granlund <tg at gmplib.org>
+
+ * Version 6.3.0 released.
+
+ * mpn/s390_64/z13/mul_basecase.asm: Rewrite, using new addmul_1.asm.
+
+ * mpn/s390_64/z13/mul_1.asm: Rewrite.
+ * mpn/s390_64/z13/addmul_1.asm: Likewise.
+ * mpn/s390_64/z13/submul_1.asm: Likewise.
+
2023-07-27 Niels Möller <nisse at lysator.liu.se>
Update DIV_QR_1N_PI1_METHOD to use method 3 or 4.
diff -r e8890259c68a -r bc0ec1699e58 Makefile.am
--- a/Makefile.am Sat Jul 29 01:04:10 2023 +0200
+++ b/Makefile.am Sat Jul 29 15:38:41 2023 +0200
@@ -91,6 +91,7 @@
# 6.1.2 13:2:3 9:2:5 -
# 6.2.0 14:0:4 10:0:6 -
# 6.2.1 14:1:4 10:1:6 -
+# 6.3.0 15:0:5 11:0:7 -
#
# Starting at 3:0:0 is a slight abuse of the versioning system, but it
# ensures we're past soname libgmp.so.2, which was used on Debian GNU/Linux
@@ -103,13 +104,13 @@
# it's still good to get the shared library filename (like
# libgmpxx.so.3.0.4) incrementing, to make it clear which GMP it's from.
-LIBGMP_LT_CURRENT = 14
+LIBGMP_LT_CURRENT = 15
LIBGMP_LT_REVISION = 0
-LIBGMP_LT_AGE = 4
+LIBGMP_LT_AGE = 5
-LIBGMPXX_LT_CURRENT = 10
+LIBGMPXX_LT_CURRENT = 11
LIBGMPXX_LT_REVISION = 0
-LIBGMPXX_LT_AGE = 6
+LIBGMPXX_LT_AGE = 7
SUBDIRS = tests mpn mpz mpq mpf printf scanf rand cxx demos tune doc
diff -r e8890259c68a -r bc0ec1699e58 NEWS
--- a/NEWS Sat Jul 29 01:04:10 2023 +0200
+++ b/NEWS Sat Jul 29 15:38:41 2023 +0200
@@ -1,9 +1,9 @@
-Copyright 1996, 1999-2016, 2018-2020 Free Software Foundation, Inc.
+Copyright 1996, 1999-2016, 2018-2023 Free Software Foundation, Inc.
Verbatim copying and distribution of this entire article is permitted in any
medium, provided this notice is preserved.
-Changes between GMP version 6.2.* and *.*.*
+Changes between GMP version 6.2.* and 6.3.*.
BUGS FIXED
* A possible overflow of type int is avoided for mpz_cmp on huge operands.
@@ -20,6 +20,12 @@
usage. These types have been present in gmp.h at least since
GMP-4.0, but previously not advertised to users.
+ * Support for 64-bit Arm under macOS.
+
+ * Support for the loongarch64 CPU family.
+
+ * Support for building with LTO, link-time optimisations.
+
SPEEDUPS
* New special code for base = 2 in mpz_powm reduces the average time
for the functions that test primality.
@@ -29,6 +35,11 @@
* Speedup for multiplications (some sizes only) thanks to new
internal functions to compute small negacyclic products.
+ * Special assembly code for IBM z13 and later "mainframe" CPUs, resulting in
+ a huge speedup.
+
+ * Improved assembly for several 64-bit x86 CPUs, Risc-V, 64-bit Arm.
+
Changes between GMP version 6.1.* and 6.2.0
BUGS FIXED
diff -r e8890259c68a -r bc0ec1699e58 gmp-h.in
--- a/gmp-h.in Sat Jul 29 01:04:10 2023 +0200
+++ b/gmp-h.in Sat Jul 29 15:38:41 2023 +0200
@@ -2336,8 +2336,8 @@
/* Major version number is the value of __GNU_MP__ too, above. */
#define __GNU_MP_VERSION 6
-#define __GNU_MP_VERSION_MINOR 2
-#define __GNU_MP_VERSION_PATCHLEVEL 99
+#define __GNU_MP_VERSION_MINOR 3
+#define __GNU_MP_VERSION_PATCHLEVEL 0
#define __GNU_MP_RELEASE (__GNU_MP_VERSION * 10000 + __GNU_MP_VERSION_MINOR * 100 + __GNU_MP_VERSION_PATCHLEVEL)
#define __GMP_H__
diff -r e8890259c68a -r bc0ec1699e58 mpn/s390_64/z13/mul_basecase.asm
--- a/mpn/s390_64/z13/mul_basecase.asm Sat Jul 29 01:04:10 2023 +0200
+++ b/mpn/s390_64/z13/mul_basecase.asm Sat Jul 29 15:38:41 2023 +0200
@@ -33,191 +33,207 @@
C INPUT PARAMETERS
define(`rp', `%r2')
-define(`up', `%r3')
-define(`un', `%r4')
-define(`vp', `%r5')
-define(`vn_arg',`%r6')
-
-define(`vn', `%r13')
+define(`ap', `%r3')
+define(`an', `%r4') C 32
+define(`bp', `%r5') C 40
+define(`bn', `%r6') C 48
define(`idx', `%r14')
-define(`v0', `%r11')
-define(`v1', `%r12')
+define(`b0', `%r10')
+
+dnl live in addmul_1:
+dnl r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14
+dnl xx xx rp ap an bp xx xx xx xx b0 i xx xx idx
+dnl stack: bn
+
+dnl TODO
+dnl * Have mul_1 start without initial (an mod 4) separation, instead handle
+dnl after loop. Then fall into 4 separate addmul_1 loops.
+dnl * Streamline handling of bn, an, %r11 to reduce the # of memops.
define(`MUL_1',`
pushdef(`L',
defn(`L')$1`'_m1)
+ vzero %v2
+ srlg %r11, %r0, 2
- vzero %v29
- lghi %r9, 0
- tmll un, 1
- srlg un, un, 1
- je L(evn)
-L(odd): lg %r7, 0(up)
- mlgr %r6, v0 C W1 W0
- stg %r7, 0(rp)
- lghi idx, 8
- clgije un, 0, L(end) C FIXME: Done, return!
- j L(top)
-L(evn): lghi %r6, 0
- lghi idx, 0
+ tmll %r0, 1
+ je L(bx0)
+L(bx1): tmll %r0, 2
+ jne L(b11)
+
+L(b01): lghi idx, -24
+ lg %r13, 0(ap)
+ mlgr %r12, b0
+ stg %r13, 0(rp)
+ cgijne %r11, 0, L(cj0)
+
+L(1): stg %r12, 8(rp)
+ lmg %r6, %r14, 48(%r15)
+ br %r14
+
+L(b11): lghi idx, -8
+ lg %r9, 0(ap)
+ mlgr %r8, b0
+ stg %r9, 0(rp)
+ j L(cj1)
+
+L(bx0): tmll %r0, 2
+ jne L(b10)
+L(b00): lghi idx, -32
+ lghi %r12, 0
+L(cj0): lg %r1, 32(idx, ap)
+ lg %r9, 40(idx, ap)
+ mlgr %r0, b0
+ mlgr %r8, b0
+ vlvgp %v6, %r0, %r1
+ vlvgp %v7, %r9, %r12
+ j L(mid)
-L(top): lgr %r9, %r6
- lg %r1, 0(idx, up)
- lg %r7, 8(idx, up)
- mlgr %r0, v0 C W1 W0
- mlgr %r6, v0 C W2 W1
- vlvgp %v23, %r0, %r1 C W1 W0
- vlvgp %v21, %r7, %r9 C W1 W0
- vacq %v20, %v23, %v21, %v29 C
- vacccq %v29, %v23, %v21, %v29 C carry critical path 3
- vpdi %v20, %v20, %v20, 4
- vst %v20, 0(idx, rp), 3
- la idx, 16(idx)
- brctg un, L(top)
+L(b10): lghi idx, -16
+ lghi %r8, 0
+L(cj1): lg %r7, 16(idx, ap)
+ lg %r13, 24(idx, ap)
+ mlgr %r6, b0
+ mlgr %r12, b0
+ vlvgp %v6, %r6, %r7
+ vlvgp %v7, %r13, %r8
+ cgije %r11, 0, L(end)
-L(end): vlgvg %r7, %v29, 1
- algr %r6, %r7
- stg %r6, 0(idx, rp)
+L(top): lg %r1, 32(idx, ap)
+ lg %r9, 40(idx, ap)
+ mlgr %r0, b0
+ mlgr %r8, b0
+ vacq %v3, %v6, %v7, %v2
+ vacccq %v2, %v6, %v7, %v2
+ vpdi %v3, %v3, %v3, 4
+ vst %v3, 16(idx, rp), 3
+ vlvgp %v6, %r0, %r1
+ vlvgp %v7, %r9, %r12
+L(mid): lg %r7, 48(idx, ap)
+ lg %r13, 56(idx, ap)
+ mlgr %r6, b0
+ mlgr %r12, b0
+ vacq %v1, %v6, %v7, %v2
+ vacccq %v2, %v6, %v7, %v2
+ vpdi %v1, %v1, %v1, 4
+ vst %v1, 32(idx, rp), 3
+ vlvgp %v6, %r6, %r7
+ vlvgp %v7, %r13, %r8
+ la idx, 32(idx)
+ brctg %r11, L(top)
+
+L(end): vacq %v3, %v6, %v7, %v2
+ vacccq %v2, %v6, %v7, %v2
+ vpdi %v3, %v3, %v3, 4
+ vst %v3, 16(idx, rp), 3
+
+ vlgvg %r0, %v2, 1
+ algr %r0, %r12
+ stg %r0, 32(idx, rp)
popdef(`L')
')
-define(`MUL_2',`
-pushdef(`L',
-defn(`L')$1`'_m2)
- vzero %v27
- vzero %v28
- vzero %v29
- vzero %v30
- lghi %r10, 0
- lg v0, 0(vp)
- lg v1, 8(vp)
- tmll un, 1
- srlg un, un, 1
- je L(evn)
-
-L(odd): lg %r7, 0(up)
- mlgr %r6, v0 C W2 W1
- lg %r1, 0(up)
- stg %r7, 0(rp)
- lghi idx, 8
-dnl clgije un, 0, L(end)
- j L(top)
-
-L(evn): lghi %r6, 0
- lghi idx, 0
- lghi %r1, 0
-
-L(top): lg %r9, 0(idx, up)
- mlgr %r0, v1 C W2 W1
- mlgr %r8, v1 C W3 W2
- vlvgp %v22, %r0, %r1 C W2 W1
- vlvgp %v23, %r9, %r6 C W2 W1
- lg %r1, 0(idx, up)
- lg %r7, 8(idx, up)
- mlgr %r0, v0 C W2 W1
- mlgr %r6, v0 C W3 W2
- vlvgp %v20, %r0, %r1 C W2 W1
- vlvgp %v21, %r7, %r10 C W2 W1
- vacq %v24, %v22, %v23, %v27 C
- vacccq %v27, %v22, %v23, %v27 C carry critical path 1
- vacq %v23, %v24, %v20, %v28 C
- vacccq %v28, %v24, %v20, %v28 C carry critical path 2
- vacq %v20, %v23, %v21, %v29 C
- vacccq %v29, %v23, %v21, %v29 C carry critical path 3
- vpdi %v20, %v20, %v20, 4
- lg %r1, 8(idx, up)
- vst %v20, 0(idx, rp), 3
More information about the gmp-commit
mailing list