[Gmp-commit] /var/hg/gmp: 10 new changesets

Sun Sep 15 23:39:11 CEST 2013

details:   /var/hg/gmp/rev/08f8e88f4ae1
changeset: 15986:08f8e88f4ae1
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 17:20:43 2013 +0200
description:
Rewrite for a slight speed-up for small and large operands.

details:   /var/hg/gmp/rev/065727b4471e
changeset: 15987:065727b4471e
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 23:09:28 2013 +0200
description:
Replace mul_1 code.

details:   /var/hg/gmp/rev/1186865d021c
changeset: 15988:1186865d021c
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 23:11:24 2013 +0200
description:
Complement c/l table.

details:   /var/hg/gmp/rev/9ab0a10854a4
changeset: 15989:9ab0a10854a4
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 23:12:08 2013 +0200
description:
Complement c/l table.

details:   /var/hg/gmp/rev/a1c7092df8ac
changeset: 15990:a1c7092df8ac
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 23:12:31 2013 +0200
description:
Correct c/l table.

details:   /var/hg/gmp/rev/6713d83a375e
changeset: 15991:6713d83a375e
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 23:12:58 2013 +0200
description:
Use R8 for bit testing.

details:   /var/hg/gmp/rev/115a99d93773
changeset: 15992:115a99d93773
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 23:15:18 2013 +0200
description:
Edit NEWS items

details:   /var/hg/gmp/rev/8617df92ca04
changeset: 15993:8617df92ca04
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 23:15:57 2013 +0200
description:
Modernise list of CPUs with asm support.

details:   /var/hg/gmp/rev/92ed543aaeed
changeset: 15994:92ed543aaeed
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 23:16:51 2013 +0200
description:
Fix a comment.

details:   /var/hg/gmp/rev/c86c76910610
changeset: 15995:c86c76910610
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Sep 15 23:39:03 2013 +0200
description:
ChangeLog

diffstat:

 ChangeLog                            |   10 +
 NEWS                                 |   12 +-
 doc/gmp.texi                         |   32 +---
 gmp-impl.h                           |    3 +-
 mpn/x86_64/coreihwl/mul_basecase.asm |  256 +++++++++++++++++-----------------
 mpn/x86_64/coreisbr/aorsmul_1.asm    |  214 +++++++++++++++-------------
 mpn/x86_64/divrem_2.asm              |   14 +-
 mpn/x86_64/fastsse/copyi-palignr.asm |    2 +-
 mpn/x86_64/sqr_diag_addlsh1.asm      |    4 +-
 mpn/x86_64/tabselect.asm             |    4 +-
 10 files changed, 288 insertions(+), 263 deletions(-)

diffs (truncated from 821 to 300 lines):

diff -r 436888a19cec -r c86c76910610 ChangeLog

--- a/ChangeLog	Fri Sep 13 22:06:55 2013 +0200
+++ b/ChangeLog	Sun Sep 15 23:39:03 2013 +0200
@@ -1,5 +1,15 @@
+2013-09-15  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86_64/tabselect.asm: Use R8 for bit testing.
+
+	* mpn/x86_64/coreihwl/mul_basecase.asm: Replace mul_1 code.
+
+	* mpn/x86_64/coreisbr/aorsmul_1.asm: Rewrite.
+
 2013-09-12  Torbjorn Granlund  <tege at gmplib.org>
 
+	* mpn/ia64/gcd_1.asm: Use dep for combining table base and low bits.
+
 	* mpn/x86_64/fastsse/com-palignr.asm: Implement temp fix to properly
 	handle overlap.
 
diff -r 436888a19cec -r c86c76910610 NEWS
--- a/NEWS	Fri Sep 13 22:06:55 2013 +0200
+++ b/NEWS	Sun Sep 15 23:39:03 2013 +0200
@@ -16,7 +16,11 @@
   * Major speedup for ARM, in particular ARM Cortex-A15, thanks to improved
     assembly.
 
-  * Major speedup for SPARC T4/T5 and speedup also for T3.
+  * Major speedup for SPARC T4/T5 and speedup also for T3, thanks to much new
+    assembly.
+
+  * Speedup for Intel Sandy Bridge, Ivy Bridge, Haswell, thanks to rewritten
+    and vastly expanded assembly support.
 
   FEATURES
   * Support for new Intel and AMD CPUs.
@@ -26,9 +30,9 @@
   * New functions mpn_cnd_add_n and mpn_cnd_sub_n. Side-channel silent
     conditional addition and subtraction.
 
-  * Better support for applications which use the mpz_t type, but
-    nevertheless need to call some of the lower-level mpn functions.
-    See the documentation for mpz_limbs_read and related functions.
+  * Better support for applications which use the mpz_t type, but nevertheless
+    need to call some of the lower-level mpn functions.  See the documentation
+    for mpz_limbs_read and related functions.
 
   MISC
   * None.
diff -r 436888a19cec -r c86c76910610 doc/gmp.texi
--- a/doc/gmp.texi	Fri Sep 13 22:06:55 2013 +0200
+++ b/doc/gmp.texi	Sun Sep 15 23:39:03 2013 +0200
@@ -473,29 +473,17 @@
 
 There is assembly code for these CPUs:
 @cindex CPU types
-ARM,
+ARM Cortex-A9, Cortex-A15, and generic ARM,
 DEC Alpha 21064, 21164, and 21264,
-AMD 29000,
-AMD K6, K6-2, Athlon, and Athlon64,
-Hitachi SuperH and SH-2,
-HPPA 1.0, 1.1 and 2.0,
-Intel Pentium, Pentium Pro/II/III, Pentium 4, generic x86,
-Intel IA-64, i960,
-Motorola MC68000, MC68020, MC88100, and MC88110,
-Motorola/IBM PowerPC 32 and 64,
-National NS32000,
-IBM POWER,
-MIPS R3000, R4000,
-SPARCv7, SuperSPARC, generic SPARCv8, UltraSPARC,
-DEC VAX,
-and
-Zilog Z8000.
-Some optimizations also for
-Cray vector systems,
-Clipper,
-IBM ROMP (RT),
-and
-Pyramid AP/XP.
+AMD K8 and K10 (sold under many brands, e.g. Athlon64, Phenom, Opteron),
+Bulldozer, and Bobcat,
+Intel Pentium, Pentium Pro/II/III, Pentium 4, Core2, Nehalem, Sandy Bridge, Haswell, generic x86,
+Intel IA-64,
+Motorola/IBM PowerPC 32 and 64 such as POWER970, POWER5, POWER6, and POWER7,
+MIPS 32-bit and 64-bit,
+SPARC 32-bit and 64-bit with special support for all UltraSPARC models.
+There is also assembly code for many obsolete CPUs.
+
 
 @cindex Home page
 @cindex Web page
diff -r 436888a19cec -r c86c76910610 gmp-impl.h
--- a/gmp-impl.h	Fri Sep 13 22:06:55 2013 +0200
+++ b/gmp-impl.h	Sun Sep 15 23:39:03 2013 +0200
@@ -4587,7 +4587,8 @@
 
 #if WANT_FAT_BINARY && (HAVE_HOST_CPU_FAMILY_x86 || HAVE_HOST_CPU_FAMILY_x86_64)
 /* NOTE: The function pointers in this struct are also in CPUVEC_FUNCS_LIST
-   in mpn/x86/x86-defs.m4.  Be sure to update that when changing here.  */
+   in mpn/x86/x86-defs.m4 and mpn/x86_64/x86_64-defs.m4.  Be sure to update
+   those when changing here.  */
 struct cpuvec_t {
   DECL_add_n           ((*add_n));
   DECL_addlsh1_n       ((*addlsh1_n));
diff -r 436888a19cec -r c86c76910610 mpn/x86_64/coreihwl/mul_basecase.asm
--- a/mpn/x86_64/coreihwl/mul_basecase.asm	Fri Sep 13 22:06:55 2013 +0200
+++ b/mpn/x86_64/coreihwl/mul_basecase.asm	Sun Sep 15 23:39:03 2013 +0200
@@ -23,38 +23,30 @@
 include(`../config.m4')
 
 C cycles/limb	mul_1		mul_2		mul_3		addmul_2
-C AMD K8,K9	 ?		n/a		 -		n/a
-C AMD K10	 ?		n/a		 -		n/a
-C AMD bull	 ?		n/a		 -		n/a
-C AMD pile	 ?		n/a		 -		n/a
-C AMD steam	 ?		 ?		 ?		 ?
-C AMD bobcat	 ?		n/a		 -		n/a
-C AMD jaguar	 ?		 ?		 ?		 ?
-C Intel P4	 ?		n/a		 -		n/a
-C Intel core	 ?		n/a		 -		n/a
-C Intel NHM	 ?		n/a		 -		n/a
-C Intel SBR	 ?		n/a		 -		n/a
-C Intel IBR	 ?		n/a		 -		n/a
-C Intel HWL	 2.45		 1.86		 -		 2.15
-C Intel BWL	 ?		 ?		 ?		 ?
-C Intel atom	 ?		n/a		 -		n/a
-C VIA nano	 ?		n/a		 -		n/a
+C AMD K8,K9	n/a		n/a		 -		n/a
+C AMD K10	n/a		n/a		 -		n/a
+C AMD bull	n/a		n/a		 -		n/a
+C AMD pile	n/a		n/a		 -		n/a
+C AMD steam	 ?		 ?		 -		 ?
+C AMD bobcat	n/a		n/a		 -		n/a
+C AMD jaguar	 ?		 ?		 -		 ?
+C Intel P4	n/a		n/a		 -		n/a
+C Intel core	n/a		n/a		 -		n/a
+C Intel NHM	n/a		n/a		 -		n/a
+C Intel SBR	n/a		n/a		 -		n/a
+C Intel IBR	n/a		n/a		 -		n/a
+C Intel HWL	 1.77		 1.86		 -		 2.15
+C Intel BWL	 ?		 ?		 -		 ?
+C Intel atom	n/a		n/a		 -		n/a
+C VIA nano	n/a		n/a		 -		n/a
 
 C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
 
 C TODO
-C  * Merge Haswell-specific mul_1, then, if new code does not use indexing,
-C    clean up pointer updates.  Current Haswell mul_1.asm uses an unfortunate
-C    number of regs, thus awkward to use here.
 C  * Adjoin a mul_3.
 C  * Further micro-optimise.
 
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-
 define(`rp',      `%rdi')
 define(`up',      `%rsi')
 define(`un_param',`%rdx')
@@ -81,108 +73,115 @@
 IFDOS(`	mov	56(%rsp), %r8d	')
 	push	%rbx
 	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
 	mov	un_param, un		C free up rdx
 	neg	un
 
-	mov	(up), %rax		C shared for mul_1 and mul_2
-	lea	(up,un_param,8), up	C point at operand end
-	lea	(rp,un_param,8), rp	C point at rp[un-1]
-
-	mov	(vp), v0		C shared for mul_1 and mul_2
-	mul	v0			C shared for mul_1 and mul_2
+	mov	un_param, n		C FIXME: share
+	sar	$2, n			C FIXME: share
 
 	test	$1, R8(vn)
 	jz	L(do_mul_2)
 
+define(`w4',	`%r9')
+define(`w5',	`%r14')
+
+	mov	(vp), %rdx
+
 L(do_mul_1):
 	test	$1, R8(un)
 	jnz	L(m1x1)
 
-L(m1x0):mov	%rax, w0		C un = 2, 4, 6, 8, ...
-	mov	%rdx, w1
-	mov	8(up,un,8), %rax
-	test	$2, R8(un)
+L(m1x0):test	$2, R8(un)
 	jnz	L(m110)
 
-L(m100):lea	2(un), n		C un = 4, 8, 12, ...
+L(m100):
+	mulx(	(up), w5, w2)
+	mulx(	8,(up), w1, w3)
+	lea	-24(rp), rp
 	jmp	L(m1l0)
 
-L(m110):lea	(un), n			C un = 2, 6, 10, ...
+L(m110):
+	mulx(	(up), w3, w4)
+	mulx(	8,(up), w1, w5)
+	lea	-8(rp), rp
+	test	n, n
+	jz	L(cj2)
+	mulx(	16,(up), w0, w2)
+	lea	16(up), up
 	jmp	L(m1l2)
 
-L(m1x1):mov	%rax, w1		C un = 1, 3, 5, 7, ...
-	mov	%rdx, w0
-	test	$2, R8(un)
+L(m1x1):test	$2, R8(un)
 	jz	L(m111)
 
-L(m101):lea	3(un), n		C un = 1, 5, 9, ...
+L(m101):
+	mulx(	(up), w4, w5)
+	lea	-16(rp), rp
 	test	n, n
-	js	L(m1l1)
-	mov	%rax, -8(rp)
-	mov	%rdx, (rp)
-	pop	%rbp
-	pop	%rbx
-	FUNC_EXIT()
-	ret
+	jz	L(cj1)
+	mulx(	8,(up), w0, w2)
+	lea	8(up), up
+	jmp	L(m1l1)
 
-L(m111):lea	1(un), n		C un = 3, 7, 11, ...
-	mov	8(up,un,8), %rax
+L(m111):
+	mulx(	(up), w2, w3)
+	mulx(	8,(up), w0, w4)
+	mulx(	16,(up), w1, w5)
+	lea	24(up), up
+	test	n, n
+	jnz	L(gt3)
+	add	w0, w3
+	jmp	L(cj3)
+L(gt3):	add	w0, w3
 	jmp	L(m1l3)
 
-	ALIGN(16)		C FIXME?
-L(m1tp):mov	%rdx, w0
-	add	%rax, w1
-L(m1l1):mov	-16(up,n,8), %rax
-	adc	$0, w0
-	mul	v0
-	add	%rax, w0
-	mov	w1, -24(rp,n,8)
-	mov	-8(up,n,8), %rax
-	mov	%rdx, w1
-	adc	$0, w1
-L(m1l0):mul	v0
-	mov	w0, -16(rp,n,8)
-	add	%rax, w1
-	mov	%rdx, w0
-	mov	(up,n,8), %rax
-	adc	$0, w0
-L(m1l3):mul	v0
-	mov	w1, -8(rp,n,8)
-	mov	%rdx, w1
-	add	%rax, w0
-	mov	8(up,n,8), %rax
-	adc	$0, w1
-L(m1l2):mul	v0
-	mov	w0, (rp,n,8)
-	add	$4, n
-	jnc	L(m1tp)
+	ALIGN(32)
+L(m1tp):lea	32(rp), rp
+L(m1l3):mov	w2, (rp)
+	mulx(	(up), w0, w2)
+L(m1l2):mov	w3, 8(rp)
+	adc	w1, w4
+L(m1l1):adc	w0, w5
+	mov	w4, 16(rp)
+	mulx(	8,(up), w1, w3)
+L(m1l0):mov	w5, 24(rp)
+	mulx(	16,(up), w0, w4)
+	adc	w1, w2
+	mulx(	24,(up), w1, w5)
+	adc	w0, w3
+	lea	32(up), up
+	dec	n
+	jnz	L(m1tp)