[Gmp-commit] /var/hg/gmp: 3 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Sat Jul 29 15:38:46 CEST 2023


details:   /var/hg/gmp/rev/fdfe3f1baa1c
changeset: 18413:fdfe3f1baa1c
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Jul 29 15:07:12 2023 +0200
description:
Fix typos.

details:   /var/hg/gmp/rev/ff1f39e6eba5
changeset: 18414:ff1f39e6eba5
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Jul 29 15:07:46 2023 +0200
description:
Rewrite z13 mul_basecase, using new addmul_1.

details:   /var/hg/gmp/rev/bc0ec1699e58
changeset: 18415:bc0ec1699e58
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sat Jul 29 15:38:41 2023 +0200
description:
Release updates.

diffstat:

 AUTHORS                          |    2 +
 ChangeLog                        |   10 +
 Makefile.am                      |    9 +-
 NEWS                             |   15 +-
 gmp-h.in                         |    4 +-
 mpn/s390_64/z13/mul_basecase.asm |  391 +++++++++++++++++++-------------------
 mpn/s390_64/z13/submul_1.asm     |    7 +-
 7 files changed, 236 insertions(+), 202 deletions(-)

diffs (truncated from 571 to 300 lines):

diff -r e8890259c68a -r bc0ec1699e58 AUTHORS
--- a/AUTHORS	Sat Jul 29 01:04:10 2023 +0200
+++ b/AUTHORS	Sat Jul 29 15:38:41 2023 +0200
@@ -106,3 +106,5 @@
 			mpn/powerpc64/vmx/popcount.asm.
 
 Seth Troisi		mpz/nextprime.c general speed-up and prevprime.
+
+Marius Hillenbrand	mpn/s390_64/z13/*.c (later used as basis for asm code)
diff -r e8890259c68a -r bc0ec1699e58 ChangeLog
--- a/ChangeLog	Sat Jul 29 01:04:10 2023 +0200
+++ b/ChangeLog	Sat Jul 29 15:38:41 2023 +0200
@@ -1,3 +1,13 @@
+2023-07-29  Torbjörn Granlund  <tg at gmplib.org>
+
+	* Version 6.3.0 released.
+
+	* mpn/s390_64/z13/mul_basecase.asm: Rewrite, using new addmul_1.asm.
+
+	* mpn/s390_64/z13/mul_1.asm: Rewrite.
+	* mpn/s390_64/z13/addmul_1.asm: Likewise.
+	* mpn/s390_64/z13/submul_1.asm: Likewise.
+
 2023-07-27  Niels Möller  <nisse at lysator.liu.se>
 
 	Update DIV_QR_1N_PI1_METHOD to use method 3 or 4.
diff -r e8890259c68a -r bc0ec1699e58 Makefile.am
--- a/Makefile.am	Sat Jul 29 01:04:10 2023 +0200
+++ b/Makefile.am	Sat Jul 29 15:38:41 2023 +0200
@@ -91,6 +91,7 @@
 #        6.1.2   13:2:3    9:2:5     -
 #        6.2.0   14:0:4   10:0:6     -
 #        6.2.1   14:1:4   10:1:6     -
+#        6.3.0   15:0:5   11:0:7     -
 #
 # Starting at 3:0:0 is a slight abuse of the versioning system, but it
 # ensures we're past soname libgmp.so.2, which was used on Debian GNU/Linux
@@ -103,13 +104,13 @@
 # it's still good to get the shared library filename (like
 # libgmpxx.so.3.0.4) incrementing, to make it clear which GMP it's from.
 
-LIBGMP_LT_CURRENT    = 14
+LIBGMP_LT_CURRENT    = 15
 LIBGMP_LT_REVISION   = 0
-LIBGMP_LT_AGE        = 4
+LIBGMP_LT_AGE        = 5
 
-LIBGMPXX_LT_CURRENT  = 10
+LIBGMPXX_LT_CURRENT  = 11
 LIBGMPXX_LT_REVISION = 0
-LIBGMPXX_LT_AGE      = 6
+LIBGMPXX_LT_AGE      = 7
 
 
 SUBDIRS = tests mpn mpz mpq mpf printf scanf rand cxx demos tune doc
diff -r e8890259c68a -r bc0ec1699e58 NEWS
--- a/NEWS	Sat Jul 29 01:04:10 2023 +0200
+++ b/NEWS	Sat Jul 29 15:38:41 2023 +0200
@@ -1,9 +1,9 @@
-Copyright 1996, 1999-2016, 2018-2020 Free Software Foundation, Inc.
+Copyright 1996, 1999-2016, 2018-2023 Free Software Foundation, Inc.
 
 Verbatim copying and distribution of this entire article is permitted in any
 medium, provided this notice is preserved.
 
-Changes between GMP version 6.2.* and *.*.*
+Changes between GMP version 6.2.* and 6.3.*.
 
   BUGS FIXED
   * A possible overflow of type int is avoided for mpz_cmp on huge operands.
@@ -20,6 +20,12 @@
     usage. These types have been present in gmp.h at least since
     GMP-4.0, but previously not advertised to users.
 
+  * Support for 64-bit Arm under macOS.
+
+  * Support for the loongarch64 CPU family.
+
+  * Support for building with LTO, link-time optimisations.
+
   SPEEDUPS
   * New special code for base = 2 in mpz_powm reduces the average time
     for the functions that test primality.
@@ -29,6 +35,11 @@
   * Speedup for multiplications (some sizes only) thanks to new
     internal functions to compute small negacyclic products.
 
+  * Special assembly code for IBM z13 and later "mainframe" CPUs, resulting in
+    a huge speedup.
+
+  * Improved assembly for several 64-bit x86 CPUs, Risc-V, 64-bit Arm.
+
 Changes between GMP version 6.1.* and 6.2.0
 
   BUGS FIXED
diff -r e8890259c68a -r bc0ec1699e58 gmp-h.in
--- a/gmp-h.in	Sat Jul 29 01:04:10 2023 +0200
+++ b/gmp-h.in	Sat Jul 29 15:38:41 2023 +0200
@@ -2336,8 +2336,8 @@
 
 /* Major version number is the value of __GNU_MP__ too, above. */
 #define __GNU_MP_VERSION            6
-#define __GNU_MP_VERSION_MINOR      2
-#define __GNU_MP_VERSION_PATCHLEVEL 99
+#define __GNU_MP_VERSION_MINOR      3
+#define __GNU_MP_VERSION_PATCHLEVEL 0
 #define __GNU_MP_RELEASE (__GNU_MP_VERSION * 10000 + __GNU_MP_VERSION_MINOR * 100 + __GNU_MP_VERSION_PATCHLEVEL)
 
 #define __GMP_H__
diff -r e8890259c68a -r bc0ec1699e58 mpn/s390_64/z13/mul_basecase.asm
--- a/mpn/s390_64/z13/mul_basecase.asm	Sat Jul 29 01:04:10 2023 +0200
+++ b/mpn/s390_64/z13/mul_basecase.asm	Sat Jul 29 15:38:41 2023 +0200
@@ -33,191 +33,207 @@
 
 C INPUT PARAMETERS
 define(`rp',	`%r2')
-define(`up',	`%r3')
-define(`un',	`%r4')
-define(`vp',	`%r5')
-define(`vn_arg',`%r6')
-
-define(`vn',    `%r13')
+define(`ap',	`%r3')
+define(`an',	`%r4')	C 32
+define(`bp',	`%r5')	C 40
+define(`bn',	`%r6')	C 48
 
 define(`idx',	`%r14')
-define(`v0',	`%r11')
-define(`v1',	`%r12')
+define(`b0',	`%r10')
+
+dnl live in addmul_1:
+dnl r0  r1  r2  r3  r4  r5  r6  r7  r8  r9 r10 r11 r12 r13 r14
+dnl xx  xx  rp  ap  an  bp  xx  xx  xx  xx  b0  i   xx  xx idx
+dnl stack: bn
+
+dnl TODO
+dnl  * Have mul_1 start without initial (an mod 4) separation, instead handle
+dnl    after loop.  Then fall into 4 separate addmul_1 loops.
+dnl  * Streamline handling of bn, an, %r11 to reduce the # of memops.
 
 define(`MUL_1',`
 pushdef(`L',
 defn(`L')$1`'_m1)
+	vzero	%v2
+	srlg	%r11, %r0, 2
 
-	vzero	%v29
-	lghi	%r9, 0
-	tmll	un, 1
-	srlg	un, un, 1
-	je	L(evn)
-L(odd):	lg	%r7, 0(up)
-	mlgr	%r6, v0			C W1 W0
-	stg	%r7, 0(rp)
-	lghi	idx, 8
-	clgije	un, 0, L(end)		C FIXME: Done, return!
-	j	L(top)
-L(evn):	lghi	%r6, 0
-	lghi	idx, 0
+	tmll	%r0, 1
+	je	L(bx0)
+L(bx1):	tmll	%r0, 2
+	jne	L(b11)
+
+L(b01):	lghi	idx, -24
+	lg	%r13, 0(ap)
+	mlgr	%r12, b0
+	stg	%r13, 0(rp)
+	cgijne	%r11, 0, L(cj0)
+
+L(1):	stg	%r12, 8(rp)
+	lmg	%r6, %r14, 48(%r15)
+	br	%r14
+
+L(b11):	lghi	idx, -8
+	lg	%r9, 0(ap)
+	mlgr	%r8, b0
+	stg	%r9, 0(rp)
+	j	L(cj1)
+
+L(bx0):	tmll	%r0, 2
+	jne	L(b10)
+L(b00):	lghi	idx, -32
+	lghi	%r12, 0
+L(cj0):	lg	%r1, 32(idx, ap)
+	lg	%r9, 40(idx, ap)
+	mlgr	%r0, b0
+	mlgr	%r8, b0
+	vlvgp	%v6, %r0, %r1
+	vlvgp	%v7, %r9, %r12
+	j	L(mid)
 
-L(top):	lgr	%r9, %r6
-	lg	%r1, 0(idx, up)
-	lg	%r7, 8(idx, up)
-	mlgr	%r0, v0			C W1 W0
-	mlgr	%r6, v0			C W2 W1
-	vlvgp	%v23, %r0, %r1		C W1 W0
-	vlvgp	%v21, %r7, %r9		C W1 W0
-	vacq	%v20, %v23, %v21, %v29	C
-	vacccq	%v29, %v23, %v21, %v29	C	carry critical path 3
-	vpdi	%v20, %v20, %v20, 4
-	vst	%v20, 0(idx, rp), 3
-	la	idx, 16(idx)
-	brctg	un, L(top)
+L(b10):	lghi	idx, -16
+	lghi	%r8, 0
+L(cj1):	lg	%r7, 16(idx, ap)
+	lg	%r13, 24(idx, ap)
+	mlgr	%r6, b0
+	mlgr	%r12, b0
+	vlvgp	%v6, %r6, %r7
+	vlvgp	%v7, %r13, %r8
+	cgije	%r11, 0, L(end)
 
-L(end):	vlgvg	%r7, %v29, 1
-	algr	%r6, %r7
-	stg	%r6, 0(idx, rp)
+L(top):	lg	%r1, 32(idx, ap)
+	lg	%r9, 40(idx, ap)
+	mlgr	%r0, b0
+	mlgr	%r8, b0
+	vacq	%v3, %v6, %v7, %v2
+	vacccq	%v2, %v6, %v7, %v2
+	vpdi	%v3, %v3, %v3, 4
+	vst	%v3, 16(idx, rp), 3
+	vlvgp	%v6, %r0, %r1
+	vlvgp	%v7, %r9, %r12
+L(mid):	lg	%r7, 48(idx, ap)
+	lg	%r13, 56(idx, ap)
+	mlgr	%r6, b0
+	mlgr	%r12, b0
+	vacq	%v1, %v6, %v7, %v2
+	vacccq	%v2, %v6, %v7, %v2
+	vpdi	%v1, %v1, %v1, 4
+	vst	%v1, 32(idx, rp), 3
+	vlvgp	%v6, %r6, %r7
+	vlvgp	%v7, %r13, %r8
+	la	idx, 32(idx)
+	brctg	%r11, L(top)
+
+L(end):	vacq	%v3, %v6, %v7, %v2
+	vacccq	%v2, %v6, %v7, %v2
+	vpdi	%v3, %v3, %v3, 4
+	vst	%v3, 16(idx, rp), 3
+
+	vlgvg	%r0, %v2, 1
+	algr	%r0, %r12
+	stg	%r0, 32(idx, rp)
 popdef(`L')
 ')
 
-define(`MUL_2',`
-pushdef(`L',
-defn(`L')$1`'_m2)
-	vzero	%v27
-	vzero	%v28
-	vzero	%v29
-	vzero	%v30
-	lghi	%r10, 0
-	lg	v0, 0(vp)
-	lg	v1, 8(vp)
-	tmll	un, 1
-	srlg	un, un, 1
-	je	L(evn)
-
-L(odd):	lg	%r7, 0(up)
-	mlgr	%r6, v0			C W2 W1
-	lg	%r1, 0(up)
-	stg	%r7, 0(rp)
-	lghi	idx, 8
-dnl	clgije	un, 0, L(end)
-	j	L(top)
-
-L(evn):	lghi	%r6, 0
-	lghi	idx, 0
-	lghi	%r1, 0
-
-L(top):	lg	%r9, 0(idx, up)
-	mlgr	%r0, v1			C W2 W1
-	mlgr	%r8, v1			C W3 W2
-	vlvgp	%v22, %r0, %r1		C W2 W1
-	vlvgp	%v23, %r9, %r6		C W2 W1
-	lg	%r1, 0(idx, up)
-	lg	%r7, 8(idx, up)
-	mlgr	%r0, v0			C W2 W1
-	mlgr	%r6, v0			C W3 W2
-	vlvgp	%v20, %r0, %r1		C W2 W1
-	vlvgp	%v21, %r7, %r10		C W2 W1
-	vacq	%v24, %v22, %v23, %v27	C
-	vacccq	%v27, %v22, %v23, %v27	C	carry critical path 1
-	vacq	%v23, %v24, %v20, %v28	C
-	vacccq	%v28, %v24, %v20, %v28	C	carry critical path 2
-	vacq	%v20, %v23, %v21, %v29	C
-	vacccq	%v29, %v23, %v21, %v29	C	carry critical path 3
-	vpdi	%v20, %v20, %v20, 4
-	lg	%r1, 8(idx, up)
-	vst	%v20, 0(idx, rp), 3


More information about the gmp-commit mailing list