[Gmp-commit] /var/hg/gmp: 2 new changesets

mercurial at gmplib.org
Tue Jun 27 16:53:31 UTC 2017


details:   /var/hg/gmp/rev/ed86ced358a6
changeset: 17458:ed86ced358a6
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Tue Jun 27 18:48:05 2017 +0200
description:
Expand to use 4 addmul_1 loops.
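
For the C-level picture behind "4 addmul_1 loops": the asm below splits the
outer squaring loop into four feed-in paths (L(f0)..L(f3)) and four
specialised addmul_1-style inner loops, selected by the operand size mod 4.
As a purely illustrative sketch, and not the scheme used in the committed
code, a basecase squaring can be expressed with GMP's public mpn functions
roughly as follows (ref_sqr_basecase and the tp scratch area are made-up
names for illustration):

#include <gmp.h>

/* Hypothetical reference, not the committed asm's algorithm: a plain
   basecase squaring built on GMP's documented mpn_mul_1/mpn_addmul_1.
   rp must have room for 2*n limbs, tp is 2*n limbs of scratch, n >= 2. */
static void
ref_sqr_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
                  mp_limb_t *tp)
{
  mp_size_t i;

  /* Off-diagonal triangle sum_{i<j} up[i]*up[j]*B^(i+j) into rp[1..2n-2]. */
  rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
  for (i = 1; i < n - 1; i++)
    rp[n + i] = mpn_addmul_1 (rp + 2 * i + 1, up + i + 1, n - 1 - i, up[i]);

  /* Double it: each cross product up[i]*up[j] occurs twice in the square. */
  rp[2 * n - 1] = mpn_lshift (rp + 1, rp + 1, 2 * n - 2, 1);

  /* Add the diagonal squares up[i]^2 at weight B^(2i). */
  for (i = 0; i < n; i++)
    tp[2 * i + 1] = mpn_mul_1 (tp + 2 * i, up + i, 1, up[i]);
  rp[0] = tp[0];
  mpn_add_n (rp + 1, rp + 1, tp + 1, 2 * n - 1);   /* final carry is 0 */
}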

details:   /var/hg/gmp/rev/3921c37733fb
changeset: 17459:3921c37733fb
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Tue Jun 27 18:52:54 2017 +0200
description:
ChangeLog

diffstat:

 ChangeLog                       |   28 +++-
 mpn/x86_64/zen/sqr_basecase.asm |  281 ++++++++++++++++++++++++++-------------
 2 files changed, 215 insertions(+), 94 deletions(-)

diffs (truncated from 417 to 300 lines):

diff -r b046f121b5da -r 3921c37733fb ChangeLog
--- a/ChangeLog	Tue Jun 27 10:30:36 2017 +0200
+++ b/ChangeLog	Tue Jun 27 18:52:54 2017 +0200
@@ -1,3 +1,29 @@
+2017-06-27  Torbjörn Granlund  <tg at gmplib.org>
+
+	* mpn/x86_64/zen/sqr_basecase.asm: Expand to use 4 addmul_1 loops.
+
+	* mpn/x86_64/x86_64-defs.m4 (sarx): New macro.
+
+2017-06-26  Torbjörn Granlund  <tg at gmplib.org>
+
+	* mpn/x86_64/coreibwl/sqr_basecase.asm: Rewrite to do 2x and limb
+	squaring in main loop.
+
+2017-06-20  Torbjörn Granlund  <tg at gmplib.org>
+
+	* mpn/x86_64/atom/cnd_add_n.asm: New grabber file.
+	* mpn/x86_64/atom/cnd_sub_n.asm: Likewise.
+
+	* mpn/x86_64/coreihwl/aorrlsh_n.asm: New grabber file.
+
+2017-06-16  Torbjörn Granlund  <tg at gmplib.org>
+
+	* mpn/x86_64/zen/mul_basecase.asm: Do overlapped software pipelining.
+
+	* mpn/x86_64/silvermont/mul_basecase.asm: New grabber file.
+	* mpn/x86_64/silvermont/sqr_basecase.asm: Likewise.
+	* mpn/x86_64/silvermont/mullo_basecase.asm: Likewise.
+
 2017-06-14  Torbjörn Granlund  <tg at gmplib.org>
 
 	* mpn/x86_64/zen/mullo_basecase.asm: New file.
@@ -395,7 +421,7 @@
 	instead of mpn_divexact_1.
 
 	* gen-bases.c (binvert): New function, computing modular inverse and
-p	low zero count.
+	low zero count.
 	(header): Print MP_BASES_BIG_BASE_CTZ_10 and
 	MP_BASES_BIG_BASE_BINVERTED_10.
 
diff -r b046f121b5da -r 3921c37733fb mpn/x86_64/zen/sqr_basecase.asm
--- a/mpn/x86_64/zen/sqr_basecase.asm	Tue Jun 27 10:30:36 2017 +0200
+++ b/mpn/x86_64/zen/sqr_basecase.asm	Tue Jun 27 18:52:54 2017 +0200
@@ -31,19 +31,21 @@
 include(`../config.m4')
 
 C TODO
-C  * Polish.
-C  * Micro-schedule.
-C  * Perform CSE of corner code as indicated by FIXME comments.
-C  * Do overlapped software pipelining.
-C  * Consider shallower sw pipelining of mul_1/addmul_1 loops, allowing 4
-C    instead of 8 product registers.  Keep 4x unrolling or go to 2x.  This
-C    would allow leaner feed-in as the size congruence classes (mod 2) would
-C    share the same feed-in, except the final branch.
-C  * Expand inner loops 4x in the outer loop, to both save some (poorly branch
-C    predicted) bookkeeping, and to allow some overlapped sw pipelining.
-C  * It is tempting to use 32-bit loop counts, but it is tricky as we keep all
-C    counts negative, and 32-bit ops zero extend.  It would work if we first
-C    offset ptrs by 2^64-2^32...
+C  * Do overlapped software pipelining.  This should close the remaining gap to
+C    mul_basecase.
+C
+C  * Update un just once in the outer loop.
+C
+C  * Perhaps keep un and n pre-multiplied by 8, thus suppressing ",8" from
+C    loads and stores.  At least in some cases, the non-scaled form is faster.
+C
+C  * Optimise xit3 code, e.g., using shrx and sarx like in the main loop.
+C
+C  * The mul_1 feed-in code has gotten little attention and could probably be
+C    improved.  Perhaps even expand it to 4 separate loops to allow straight
+C    fall-through into the 4 addmul_1 loops.
+C
+C  * Clean up ad-hoc scratch register usage in the addmul_1 feed-in code blocks.
 
 define(`rp',      `%rdi')
 define(`up',      `%rsi')
@@ -181,7 +183,6 @@
 L(n4):	mov	%r11, 8(rp)
 	adc	%r10, %r13
 	adc	%r12, %rbx
-	adc	$0, %rax
 	jmp	L(m)
 
 L(mx0):	test	$2, R8(un)
@@ -199,7 +200,7 @@
 	add	%r15, %r9
 	jmp	L(mlo0)
 
-	ALIGN(64)
+	ALIGN(16)
 L(mtop):jrcxz	L(mend)
 	adc	%r8, %r11
 	mov	%r9, (rp,n,8)
@@ -226,85 +227,53 @@
 	mov	%rbx, 24(rp)
 	mov	%rax, 32(rp)
 
-	lea	2(un), un		C FIXME: Incorporate above
-
-L(outer):
-	mov	-8(up,un,8), %rdx	C up[0]
-	lea	3(un), n
-	and	$-4, n
+	lea	2(un), un
 
-	mov	-16(up,un,8), %r9	C up[-1]
-	sar	$63, %r9
-	and	%rdx, %r9		C "ci" in C code
-	add	32(rp,un,8), %r9
-	mulx(	%rdx, %rax, %r15)	C up[0]^2
-	mov	(up,un,8), %r8		C up[1]
-	adc	$0, %r15
-	add	%rax, %r9
-	adc	$0, %r15		C "cin" in C code
-	mov	%r9, 32(rp,un,8)
-	lea	8(rp), rp
-
-	mov	-16(up,un,8), %r10	C up[-1]
-	shr	$63, %r10
-	lea	(%r10,%rdx,2), %rdx	C "u0" arg in C code
-
+	mov	$63, R32(%r15)			C keep at 63 for shrx/sarx.
 	test	$1, R8(un)
 	jz	L(x0)
 L(x1):	test	$2, R8(un)
-	jz	L(b3)
-
-L(b1):	mulx(	%r8, %rbx, %rax)
-	add	%r15, %rbx
-	adc	$0, %rax
-	.byte	0xc4,0x62,0xb3,0xf6,0x44,0xee,0x08	C mulx 8(up,un,8), %r9, %r8
-	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xee,0x10	C mulx 16(up,un,8), %r11, %r10
-	jmp	L(lo1)
-
-L(b0):	mulx(	%r8, %r9, %r8)
-	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xee,0x08	C mulx 8(up,un,8), %r11, %r10
-	.byte	0xc4,0x62,0x93,0xf6,0x64,0xee,0x10	C mulx 16(up,un,8), %r13, %r12
-	add	%r15, %r9
-	jmp	L(lo0)
+	jz	L(f3)
+	jmp	L(f1)
+L(x0):	test	$2, R8(un)
+	jz	L(f0)
+C	jmp	L(f2)
 
-L(x0):	test	$2, R8(un)
-	jz	L(b0)
+L(f2):	mov	-8(up,un,8), %rdx		C up[0]
+	lea	2(un), n
+	lea	8(rp), rp
+	.byte	0xc4,0x62,0x82,0xf7,0x5c,0xee,0xf0	C sarx %r15, -16(up,un,8), %r11
+	.byte	0xc4,0x62,0x83,0xf7,0x6c,0xee,0xf0	C shrx %r15, -16(up,un,8), %r13
+	and	%rdx, %r11			C "ci" in C code
+	mulx(	%rdx, %rax, %r10)		C up[0]^2
+	lea	(%r13,%rdx,2), %rdx		C "u0" arg in C code
+	add	%rax, %r11
 
-L(b2):	mulx(	%r8, %r13, %r12)
+	.byte	0xc4,0x62,0x93,0xf6,0x24,0xee		C mulx (up,un,8), %r13, %r12
 	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x08	C mulx 8(up,un,8), %rbx, %rax
-	add	%r15, %r13
-	adc	%r12, %rbx
-	adc	$0, %rax
-	.byte	0xc4,0x62,0xb3,0xf6,0x44,0xee,0x10	C mulx 16(up,un,8), %r9, %r8
-	jmp	L(lo2)
-
-L(b3):	mulx(	%r8, %r11, %r10)
-	.byte	0xc4,0x62,0x93,0xf6,0x64,0xee,0x08	C mulx 8(up,un,8), %r13, %r12
-	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10	C mulx 16(up,un,8), %rbx, %rax
-	add	%r15, %r11
 	adc	%r10, %r13
 	adc	%r12, %rbx
 	adc	$0, %rax
-	jrcxz	L(xit3)
-	jmp	L(lo3)
+	jmp	L(b2)
 
-	ALIGN(64)
-L(top):	add	%r9, (rp,n,8)
-L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	ALIGN(16)
+L(top2):add	%r9, (rp,n,8)
+L(b2):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
 	adc	%r11, 8(rp,n,8)
-L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
 	adc	%r13, 16(rp,n,8)
-L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
 	adc	%rbx, 24(rp,n,8)
 	adc	%rax, %r9
-L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
 	adc	%r8, %r11
 	adc	%r10, %r13
 	adc	%r12, %rbx
 	adc	$0, %rax
 	add	$4, n
-	jnz	L(top)
+	jnz	L(top2)
 
+	inc	un
 	add	%r9, (rp)
 	adc	%r11, 8(rp)
 	adc	%r13, 16(rp)
@@ -312,31 +281,157 @@
 	adc	$0, %rax
 	mov	%rax, 32(rp)
 
+L(f1):	mov	-8(up,un,8), %rdx		C up[0]
+	lea	1(un), n
+	lea	8(rp), rp
+	.byte	0xc4,0x62,0x82,0xf7,0x6c,0xee,0xf0	C sarx	%r15, -16(up,un,8), %r13
+	.byte	0xc4,0xe2,0x83,0xf7,0x5c,0xee,0xf0	C shrx	%r15, -16(up,un,8), %rbx
+	and	%rdx, %r13			C "ci" in C code
+	mulx(	%rdx, %rax, %r12)		C up[0]^2
+	lea	(%rbx,%rdx,2), %rdx		C "u0" arg in C code
+	add	%rax, %r13
+
+	.byte	0xc4,0xe2,0xe3,0xf6,0x04,0xee		C mulx (up,un,8), %rbx, %rax
+	adc	%r12, %rbx
+	adc	$0, %rax
+	.byte	0xc4,0x62,0xb3,0xf6,0x44,0xee,0x08	C mulx 8(up,un,8), %r9, %r8
+	jmp	L(b1)
+
+	ALIGN(16)
+L(top1):add	%r9, (rp,n,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+L(b1):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(top1)
+
 	inc	un
-	jmp	L(outer)
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
 
-L(xit3):add	%r11, 8(rp)
+L(f0):	mov	-8(up,un,8), %rdx		C up[0]
+	lea	(un), n
+	lea	8(rp), rp
+	.byte	0xc4,0xe2,0x82,0xf7,0x5c,0xee,0xf0	C sarx	%r15, -16(up,un,8), %rbx
+	.byte	0xc4,0x62,0x83,0xf7,0x4c,0xee,0xf0	C shrx	%r15, -16(up,un,8), %r9
+	and	%rdx, %rbx			C "ci" in C code
+	mulx(	%rdx, %r10, %rax)		C up[0]^2
+	lea	(%r9,%rdx,2), %rdx		C "u0" arg in C code
+	add	%r10, %rbx
+	adc	$0, %rax			C "cin" in C code
+
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,un,8), %r9, %r8
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xee,0x08	C mulx 8(up,un,8), %r11, %r10
+	jmp	L(b0)
+
+	ALIGN(16)
+L(top0):add	%r9, (rp,n,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+L(b0):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(top0)
+
+	inc	un
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+
+L(f3):	mov	-8(up,un,8), %rdx		C up[0]
+	lea	3(un), n
+	lea	8(rp), rp
+	.byte	0xc4,0x62,0x82,0xf7,0x4c,0xee,0xf0	C sarx %r15, -16(up,un,8), %r9
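
Several feed-in blocks in the hunks above carry comments like '"ci" in C
code' and '"u0" arg in C code'.  A hedged paraphrase of what the
sarx/shrx/and/lea sequence computes (not GMP source; feed_in is a made-up
helper name, up uses the asm's local naming with up[0] the current diagonal
limb, and 64-bit limbs are assumed):

#include <gmp.h>

static void
feed_in (const mp_limb_t *up, mp_limb_t *ci, mp_limb_t *u0)
{
  mp_limb_t prev = up[-1];                     /* limb below the diagonal */
  mp_limb_t mask = -(mp_limb_t) (prev >> 63);  /* sarx $63: 0 or all ones */

  *ci = up[0] & mask;                 /* the "ci" correction term */
  *u0 = (up[0] << 1) | (prev >> 63);  /* shrx $63 + lea: doubled limb with
                                         the neighbour's top bit shifted in;
                                         the addmul_1 loops then multiply the
                                         remaining limbs by this "u0" */
}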

