[Gmp-commit] /var/hg/gmp: Rewrite to do 2x and limb squaring in main loop.
mercurial at gmplib.org
Thu Apr 27 16:20:44 UTC 2017
details: /var/hg/gmp/rev/6e4e07f8ac81
changeset: 17369:6e4e07f8ac81
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Apr 27 18:20:35 2017 +0200
description:
Rewrite to do 2x and limb squaring in main loop.
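For readers skimming the diff, a minimal plain-C sketch of what "2x and limb squaring in main loop" means may help. This is not GMP's internal code: the limb type, the add_128 helper and the function name are made up for illustration, and the GCC/Clang unsigned __int128 extension is assumed (fine on 64-bit hosts). Each outer pass now accumulates the diagonal square up[i]^2 together with the doubled cross products 2*up[i]*up[j], instead of leaving the doubling and the diagonal squares to a separate sqr_diag_addlsh1 pass. Here the doubling is expressed by simply adding each product twice; a sketch of the limb-wise doubled multiplier the asm actually uses follows after the diff.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;              /* one 64-bit limb, illustration only */

/* Add the 128-bit value v into rp[] at limb position pos, propagating the
   carry upwards as far as needed. */
static void
add_128 (limb *rp, size_t pos, unsigned __int128 v)
{
  limb add[2] = { (limb) v, (limb) (v >> 64) };
  limb cy = 0;
  for (int k = 0; k < 2; k++)
    {
      unsigned __int128 s = (unsigned __int128) rp[pos + k] + add[k] + cy;
      rp[pos + k] = (limb) s;
      cy = (limb) (s >> 64);
    }
  for (pos += 2; cy != 0; pos++)
    {
      rp[pos] += cy;
      cy = rp[pos] < cy;
    }
}

/* rp[0..2n-1] = up[0..n-1]^2, schoolbook, n >= 1 */
static void
sqr_basecase_sketch (limb *rp, const limb *up, size_t n)
{
  for (size_t i = 0; i < 2 * n; i++)
    rp[i] = 0;

  for (size_t i = 0; i < n; i++)
    {
      /* diagonal limb square, folded into this pass of the main loop */
      add_128 (rp, 2 * i, (unsigned __int128) up[i] * up[i]);

      /* doubled cross products; adding the product twice here avoids the
         129-bit overflow of 2*up[i]*up[j] */
      for (size_t j = i + 1; j < n; j++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * up[j];
          add_128 (rp, i + j, p);
          add_128 (rp, i + j, p);
        }
    }
}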
diffstat:
mpn/x86_64/zen/sqr_basecase.asm | 294 ++++++++++++++++++++++-----------------
1 files changed, 168 insertions(+), 126 deletions(-)
diffs (truncated from 415 to 300 lines):
diff -r 36b4b377a950 -r 6e4e07f8ac81 mpn/x86_64/zen/sqr_basecase.asm
--- a/mpn/x86_64/zen/sqr_basecase.asm Tue Apr 25 22:23:51 2017 +0200
+++ b/mpn/x86_64/zen/sqr_basecase.asm Thu Apr 27 18:20:35 2017 +0200
@@ -1,6 +1,6 @@
dnl AMD64 mpn_sqr_basecase optimised for AMD Zen.
-dnl Copyright 2012, 2013, 2015, 2017 Free Software Foundation, Inc.
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -31,22 +31,27 @@
include(`../config.m4')
C TODO
-C * Try 2x unrolling instead of current 4x, at least for mul_1. Else consider
-C shallower sw pipelining of mul_1/addmul_1 loops, allowing 4 instead of 8
-C product registers.
-C * Replace sqr_diag_addlsh1 code with zen optimised code.
+C * Polish.
+C * Micro-schedule.
C * Do overlapped software pipelining.
-C * Re-allocate to benefit more from 32-bit encoding (register rbp is free).
-C * Polish.
+C * Consider shallower sw pipelining of mul_1/addmul_1 loops, allowing 4
+C instead of 8 product registers. Keep 4x unrolling or go to 2x. This
+C would allow leaner feed-in as the size congruence classes (mod 2) would
+C share the same feed-in, except the final branch.
+C * Expand inner loops 4x in the outer loop, to both save some (poorly branch
+C predicted) bookkeeping, and to allow some overlapped sw pipelining.
+C * It is tempting to use 32-bit loop counts, but it is tricky as we keep all
+C counts negative, and 32-bit ops zero extend. It would work if we first
+C offset ptrs by 2^64-2^32...
define(`rp', `%rdi')
define(`up', `%rsi')
define(`un_param',`%rdx')
-define(`un', `%r14')
+define(`un', `%rbp')
define(`n', `%rcx')
-C these are used just for the small op code
+C these are used just for the small op code
define(`w0', `%r8')
define(`w1', `%r9')
define(`w2', `%r10')
@@ -62,7 +67,7 @@
PROLOGUE(mpn_sqr_basecase)
FUNC_ENTRY(3)
- cmp $2, un_param
+ cmp $2, R32(un_param)
jae L(gt1)
mov (up), %rdx
@@ -93,7 +98,7 @@
FUNC_EXIT()
ret
-L(gt2): cmp $4, un_param
+L(gt2): cmp $4, R32(un_param)
jae L(gt3)
push %rbx
@@ -133,57 +138,69 @@
FUNC_EXIT()
ret
-L(gt3):
- push %r15
- push %r14
+L(gt3): push %r15
+C push %r14
push %r13
push %r12
push %rbp
push %rbx
- mov un_param, un
+ mov R32(un_param), R32(un)
mov (up), %rdx C up[0]
mov 8(up), %r9 C up[1]
+ mulx( %rdx, %rax, %r15) C up[0]^2
+ mov %rax, (rp)
+ shl %rdx
+
lea (up,un,8), up
lea -32(rp,un,8), rp
neg un
- lea 1(un), n
+ lea 4(un), n
+ and $-4, n
- bt $0, R32(n)
- jnc L(mx0)
-L(mx1): bt $1, R32(n)
- jnc L(mb3)
+ test $1, R8(un)
+ jnz L(mx0)
+L(mx1): test $2, R8(un)
+ jz L(mb3)
L(mb1): mulx( %r9, %rbx, %rax)
- add $1, n C clear cy as side-effect
- .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
- .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ `mulx' 16(up,un,8), %r9, %r8
+ `mulx' 24(up,un,8), %r11, %r10
+ add %r15, %rbx
jmp L(mlo1)
L(mb3): mulx( %r9, %r11, %r10)
- .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12
- .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax
- sub $-3, n C clear cy as side-effect
- jz L(mwd3)
- test R32(%rdx), R32(%rdx) C clear cy
+ `mulx' 16(up,un,8), %r13, %r12
+ `mulx' 24(up,un,8), %rbx, %rax
+ add %r15, %r11
+ jrcxz L(n4)
jmp L(mlo3)
+L(n4): mov %r11, 8(rp)
+ adc %r10, %r13
+ mov %r13, 16(rp) C FIXME: suppress
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %rbx, 24(rp) C FIXME: suppress
+ jmp L(m)
-L(mx0): bt $1, R32(n)
- jnc L(mb0)
+L(mx0): test $2, R8(un)
+ jnz L(mb0)
L(mb2): mulx( %r9, %r13, %r12)
- .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax
- add $2, n C clear cy as side-effect
- .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ `mulx' 16(up,un,8), %rbx, %rax
+ `mulx' 24(up,un,8), %r9, %r8
+ add %r15, %r13
jmp L(mlo2)
L(mb0): mulx( %r9, %r9, %r8)
- .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
- .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ `mulx' 16(up,un,8), %r11, %r10
+ `mulx' 24(up,un,8), %r13, %r12
+ add %r15, %r9
jmp L(mlo0)
+ ALIGN(64)
L(mtop):jrcxz L(mend)
adc %r8, %r11
mov %r9, (rp,n,8)
@@ -202,7 +219,7 @@
L(mend):mov %r9, (rp)
adc %r8, %r11
-L(mwd3):mov %r11, 8(rp)
+ mov %r11, 8(rp)
adc %r10, %r13
mov %r13, 16(rp)
adc %r12, %rbx
@@ -210,54 +227,69 @@
mov %rbx, 24(rp)
mov %rax, 32(rp)
- lea 2(un), %r15
+ lea 2(un), un C FIXME: Incorporate above
L(outer):
- mov -8(up,%r15,8), %rdx C v0 = up[0]
- mov %r15, n
+ mov -8(up,un,8), %rdx C up[0]
+ lea 3(un), n
+ and $-4, n
+
+ mov -16(up,un,8), %r9 C up[-1]
+ sar $63, %r9
+ and %rdx, %r9 C "ci" in C code
+ add 32(rp,un,8), %r9
+ mulx( %rdx, %rax, %r15) C up[0]^2
+ mov (up,un,8), %r8 C up[1]
+ adc $0, %r15
+ add %rax, %r9
+ adc $0, %r15 C "cin" in C code
+ mov %r9, 32(rp,un,8)
lea 8(rp), rp
- mov (up,%r15,8), %r8 C v0 = up[1]
- bt $0, R32(n)
- jnc L(x0)
-L(x1): bt $1, R32(n)
- jnc L(b3)
+ mov -16(up,un,8), %r10 C up[-1]
+ shr $63, %r10
+ lea (%r10,%rdx,2), %rdx C "u0" arg in C code
+
+ test $1, R8(un)
+ jz L(x0)
+L(x1): test $2, R8(un)
+ jz L(b3)
L(b1): mulx( %r8, %rbx, %rax)
- add $1, n C clear cy as side-effect
- .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
- .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ add %r15, %rbx
+ adc $0, %rax
+ `mulx' 8(up,un,8), %r9, %r8
+ `mulx' 16(up,un,8), %r11, %r10
jmp L(lo1)
L(b0): mulx( %r8, %r9, %r8)
- .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
- .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
- xor R32(%rax), R32(%rax)
+ `mulx' 8(up,un,8), %r11, %r10
+ `mulx' 16(up,un,8), %r13, %r12
+ add %r15, %r9
jmp L(lo0)
-L(b3): mulx( %r8, %r11, %r10)
- .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12
- .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax
- add %r10, %r13
+L(x0): test $2, R8(un)
+ jz L(b0)
+
+L(b2): mulx( %r8, %r13, %r12)
+ `mulx' 8(up,un,8), %rbx, %rax
+ add %r15, %r13
adc %r12, %rbx
adc $0, %rax
- sub $-3, n C clear cy as side-effect
- jz L(wd3)
- test R32(%rdx), R32(%rdx) C clear cy
+ `mulx' 16(up,un,8), %r9, %r8
+ jmp L(lo2)
+
+L(b3): mulx( %r8, %r11, %r10)
+ `mulx' 8(up,un,8), %r13, %r12
+ `mulx' 16(up,un,8), %rbx, %rax
+ add %r15, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jrcxz L(xit3)
jmp L(lo3)
-L(x0): bt $1, R32(n)
- jnc L(b0)
-
-L(b2): mulx( %r8, %r13, %r12)
- .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax
- add %r12, %rbx
- adc $0, %rax
- lea 2(n), n
- jrcxz L(xit2)
- .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
- jmp L(lo2)
-
+ ALIGN(64)
L(top): add %r9, (rp,n,8)
L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
adc %r11, 8(rp,n,8)
@@ -270,76 +302,86 @@
adc %r8, %r11
adc %r10, %r13
adc %r12, %rbx
- adc $0, %rax C rax = carry limb
+ adc $0, %rax
add $4, n
- js L(top)
+ jnz L(top)
add %r9, (rp)
-L(wd3): adc %r11, 8(rp)
-L(wd2): adc %r13, 16(rp)
-L(wd1): adc %rbx, 24(rp)
- adc $0, %rax
- mov %rax, 32(rp)
-
- add $1, %r15
- jmp L(outer)
-
-L(xit2):add %r13, 16(rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
adc %rbx, 24(rp)
adc $0, %rax
mov %rax, 32(rp)
- mov -16(up), %rdx
- lea 8(rp), rp
- mov -8(up), %r8
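The outer-loop feed-in above (the lines commented "ci" and "u0") forms the doubled multiplier one limb at a time rather than adding each product twice. A hedged C model of that trick, reusing the limb type and add_128 helper from the sketch near the top of this message (again illustrative names, not GMP's actual code): the multiplier for outer index i is u0 = (up[i] << 1) | (up[i-1] >> 63). The top bit of up[i-1] carried into u0 multiplies only the limbs above up[i], so the missing bit*up[i] contribution is added as the ci fix-up together with the diagonal square up[i]^2 at position 2*i.

/* Same result as sqr_basecase_sketch above, but expressing the 2x the way
   the Zen code does: a 64-bit doubled multiplier per outer limb, with a
   fix-up term at the diagonal position. */
static void
sqr_basecase_doubling_sketch (limb *rp, const limb *up, size_t n)
{
  for (size_t i = 0; i < 2 * n; i++)
    rp[i] = 0;

  for (size_t i = 0; i < n; i++)
    {
      limb prev = (i == 0) ? 0 : up[i - 1];
      limb u0 = (up[i] << 1) | (prev >> 63); /* limb i of 2*U       ("u0") */
      limb ci = (prev >> 63) ? up[i] : 0;    /* doubling fix-up     ("ci") */

      /* diagonal: up[i]^2 plus the fix-up, at position 2*i */
      add_128 (rp, 2 * i, (unsigned __int128) up[i] * up[i] + ci);

      /* the doubled limb times the higher limbs, at positions i+j */
      for (size_t j = i + 1; j < n; j++)
        add_128 (rp, i + j, (unsigned __int128) u0 * up[j]);
    }
}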