[Gmp-commit] /var/hg/gmp: Rewrite to do 2x and limb squaring in main loop.
mercurial at gmplib.org
Mon Jun 26 21:44:17 UTC 2017
details: /var/hg/gmp/rev/78e40fad7642
changeset: 17456:78e40fad7642
user: Torbjorn Granlund <tg at gmplib.org>
date: Mon Jun 26 23:44:10 2017 +0200
description:
Rewrite to do 2x and limb squaring in main loop.
diffstat:
mpn/x86_64/coreibwl/sqr_basecase.asm | 423 +++++++++++++++++-----------------
1 files changed, 210 insertions(+), 213 deletions(-)
diffs (truncated from 657 to 300 lines):
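The rewrite folds both squaring-specific steps into the main addmul loops: doubling the off-diagonal multiplier (the "2x") and adding the diagonal limb squares, apparently replacing the separate sqr_diag_addlsh1 pass mentioned in the old TODO list. As a rough C model of the resulting structure (an illustrative sketch with 64-bit limbs, not GMP's actual code; the real loops below are 8-way unrolled and use mulx/adox/adcx):

  #include <stdint.h>
  #include <stddef.h>

  typedef uint64_t limb;
  typedef unsigned __int128 dlimb;

  /* Schoolbook squaring with the doubling and the diagonal squares
     folded into the main loop.  2*up[i] may not fit in a limb, so the
     shifted-out bit h is deferred: it becomes a +h on the next row's
     multiplier (which is even, so this cannot overflow) and a masked
     correction ci added under the next diagonal square.  */
  static void
  sqr_model (limb *rp, const limb *up, size_t n)
  {
    limb h = 0;                      /* bit shifted out of 2*up[i-1] */

    for (size_t i = 0; i < 2 * n; i++)
      rp[i] = 0;

    for (size_t i = 0; i < n; i++)
      {
        limb ci = (0 - h) & up[i];   /* deferred h*up[i] term ("ci") */
        limb m = (up[i] << 1) | h;   /* doubled multiplier for this row */
        dlimb t = (dlimb) up[i] * up[i] + ci + rp[2 * i];
        limb cy;

        rp[2 * i] = (limb) t;        /* diagonal square at position 2i */
        cy = (limb) (t >> 64);

        for (size_t j = i + 1; j < n; j++)  /* doubled row at 2i+1 ... */
          {
            t = (dlimb) m * up[j] + rp[i + j] + cy;
            rp[i + j] = (limb) t;
            cy = (limb) (t >> 64);
          }
        rp[i + n] += cy;
        h = up[i] >> 63;
      }
  }
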
diff -r ef66a6a32972 -r 78e40fad7642 mpn/x86_64/coreibwl/sqr_basecase.asm
--- a/mpn/x86_64/coreibwl/sqr_basecase.asm Tue Jun 20 15:55:33 2017 +0200
+++ b/mpn/x86_64/coreibwl/sqr_basecase.asm Mon Jun 26 23:44:10 2017 +0200
@@ -1,6 +1,6 @@
dnl AMD64 mpn_sqr_basecase optimised for Intel Broadwell.
-dnl Copyright 2015 Free Software Foundation, Inc.
+dnl Copyright 2015, 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -61,12 +61,11 @@
C hardly allow correct branch prediction. On 2nd thought, we now might make
C each of the 8 loop branches be poorly predicted since they will be
C executed fewer times for each time. With just one addmul_1 loop, the loop
-C count will change only once each 8th time!
-C * Replace sqr_diag_addlsh1 code (from haswell) with adx-aware code. We have
-C 3 variants below, but the haswell code turns out to be fastest.
+C count will change only once each 8th time.
C * Do overlapped software pipelining.
-C * When changing this, make sure the code which falls into the inner loops
-C does not execute too many no-ops (for both PIC and non-PIC).
+C * Perhaps load in shrx/sarx, eliminating separate load insn.
+C * Schedule add+stored in small n code.
+C * Try swapping adox and adcx insn, making mulx have more time to run.
define(`rp', `%rdi')
define(`up', `%rsi')
@@ -163,12 +162,8 @@
L(gt3): push %rbx
- push rp
- push up
- push un_param
-
lea -3(un_param), R32(un_save)
- lea 5(un_param), n
+ lea 5(un_param), R32(n)
mov R32(un_param), R32(%rax)
and $-8, R32(un_save)
shr $3, R32(n) C count for mul_1 loop
@@ -186,45 +181,75 @@
jmp *(%r10,%rax,8)
')
-L(mf0): mulx( 8,(up), w2, w3)
+L(mf0): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
lea 64(up), up
-C lea (rp), rp
+ add w1, w2
jmp L(mb0)
-L(mf3): mulx( 8,(up), w0, w1)
+L(mf3): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mov w2, (rp)
+ mulx( 8,(up), w0, w1)
lea 24(up), up
lea 24(rp), rp
+ add w3, w0
jmp L(mb3)
-L(mf4): mulx( 8,(up), w2, w3)
+L(mf4): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
lea 32(up), up
lea 32(rp), rp
+ add w1, w2
jmp L(mb4)
-L(mf5): mulx( 8,(up), w0, w1)
+L(mf5): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
lea 40(up), up
lea 40(rp), rp
+ add w3, w0
jmp L(mb5)
-L(mf6): mulx( 8,(up), w2, w3)
+L(mf6): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
lea 48(up), up
lea 48(rp), rp
+ add w1, w2
jmp L(mb6)
-L(mf7): mulx( 8,(up), w0, w1)
+L(mf7): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
lea 56(up), up
lea 56(rp), rp
+ add w3, w0
jmp L(mb7)
-L(mf1): mulx( 8,(up), w0, w1)
+L(mf1): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
lea 8(up), up
lea 8(rp), rp
+ add w3, w0
jmp L(mb1)
-L(mf2): mulx( 8,(up), w2, w3)
+L(mf2): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
lea 16(up), up
lea 16(rp), rp
dec R32(n)
+ add w1, w2
mulx( (up), w0, w1)
ALIGN(16)
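
The eight entry points L(mf0)..L(mf7) above exist because the feed-in is 8-way unrolled and dispatched through the jump table on the size mod 8, so the first, partial trip enters the unrolled body in the middle. (The new code additionally interleaves the up[0]^2 product and the doubling into each entry.) In C the dispatch skeleton corresponds to a computed goto in the style of Duff's device; a sketch of a plain mul_1 under the same 64-bit-limb assumptions as above (requires n > 0):

  /* 8-way unrolled mul_1 entered mid-body via switch dispatch,
     modelling the L(mf*)/L(mb*) jump-table structure.  */
  static limb
  mul_1_model (limb *rp, const limb *up, size_t n, limb v)
  {
    dlimb t;
    limb cy = 0;
    size_t i = 0, trips = (n + 7) / 8;

  #define STEP t = (dlimb) up[i] * v + cy; \
               rp[i] = (limb) t; cy = (limb) (t >> 64); i++
    switch (n % 8)
      {
      case 0: do { STEP;
      case 7:      STEP;
      case 6:      STEP;
      case 5:      STEP;
      case 4:      STEP;
      case 3:      STEP;
      case 2:      STEP;
      case 1:      STEP;
              } while (--trips != 0);
      }
  #undef STEP
    return cy;
  }
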
@@ -233,8 +258,8 @@
L(mb1): mulx( 8,(up), w2, w3)
adc w1, w2
lea 64(up), up
- mov w0, (rp)
-L(mb0): mov w2, 8(rp)
+L(mb0): mov w0, (rp)
+ mov w2, 8(rp)
mulx( -48,(up), w0, w1)
lea 64(rp), rp
adc w3, w0
@@ -259,29 +284,35 @@
L(end): mov w2, -8(rp)
adc w3, w0
- mov w0, (rp)
- adc %rcx, w1
- mov w1, 8(rp)
+C mov w0, (rp)
+C adc %rcx, w1
+C mov w1, 8(rp)
lea L(atab)(%rip), %r10
ifdef(`PIC',
` movslq (%r10,%rax,4), %r11
lea (%r11, %r10), %r11
- jmp *%r11
',`
- jmp *(%r10,%rax,8)
+ mov (%r10,%rax,8), %r11
')
+ mov $63, R32(%rax)
+ jmp *%r11
L(ed0): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
- mov w0, (rp)
+L(f7): mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
-L(f7): lea -64(up,un_save,8), up
- or R32(un_save), R32(n)
- mov 8(up), u0
- mulx( 16,(up), w0, w1)
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
lea -56(rp,un_save,8), rp
+ mov (up), w1 C up[-1]
+ mov 8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
jmp L(b7)
ALIGN(16)
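
With %rax holding 63, the L(f7) feed-in above (and the analogous L(f0)/L(f1)/L(f2) blocks further down) computes the doubled multiplier and the correction term that the sqr_model sketch calls m and ci: shrx extracts the bit that doubling up[-1] shifts out, sarx turns that same bit into a 0/all-ones mask, and the lea forms 2*up[0] plus the carried-in bit. A line-by-line C rendering (illustrative only, assuming the typedefs from the first sketch; prev and cur stand for the asm's up[-1] and up[0]):

  /* One feed-in step of the 2x row setup.  */
  static void
  feed_in_model (limb prev, limb cur, limb *ci, limb *m)
  {
    limb hi = prev >> 63;                       /* shrx $63: bit out of 2*prev */
    limb mask = (limb) ((int64_t) prev >> 63);  /* sarx $63: 0 or all-ones */
    *ci = mask & cur;                           /* and: "ci" in C code */
    *m = (cur << 1) + hi;                       /* lea (w0,u0,2): "u0" arg */
  }
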
@@ -292,9 +323,9 @@
mulx( 8,(up), w2, w3)
adox( (rp), w0)
lea 8(n), R32(n)
- mov w0, (rp)
+L(b0): mov w0, (rp)
adcx( w1, w2)
-L(b0): mulx( 16,(up), w0, w1)
+ mulx( 16,(up), w0, w1)
adcx( w3, w0)
adox( 8,(rp), w2)
mov w2, 8(rp)
@@ -325,14 +356,22 @@
L(ed1): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
- mov w0, (rp)
+L(f0): mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
-L(f0): lea -64(up,un_save,8), up
- or R32(un_save), R32(n)
- mov (up), u0
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -8(up), w3 C up[-1]
+ mov (up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
mulx( 8,(up), w2, w3)
- lea -56(rp,un_save,8), rp
+ adox( (rp), w0)
jmp L(b0)
ALIGN(16)
@@ -376,15 +415,25 @@
L(ed2): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
- mov w0, (rp)
+L(f1): mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
-L(f1): lea (up,un_save,8), up
- or R32(un_save), R32(n)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
lea 8(un_save), un_save
- mov -8(up), u0
+ lea -56(rp,un_save,8), rp
+ mov -16(up), w1 C up[-1]
+ mov -8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2) C FIXME: crossjump?
mulx( (up), w0, w1)
- lea -56(rp,un_save,8), rp
+ adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
jmp L(b1)
ALIGN(16)
@@ -418,7 +467,7 @@
adox( 40,(rp), w2)
adcx( w3, w0)
mov w2, 40(rp)
- adox( 48,(rp), w0)
+L(b2): adox( 48,(rp), w0)
mulx( -8,(up), w2, w3)
mov w0, 48(rp)
lea 64(rp), rp
@@ -428,17 +477,22 @@
L(ed3): adox( (rp), w0)
adox( %rcx, w1) C relies on rcx = 0
- mov w0, (rp)
+L(f2): mov w0, (rp)
adc %rcx, w1 C relies on rcx = 0
mov w1, 8(rp)
-L(f2): lea (up,un_save,8), up
+ lea (up,un_save,8), up
or R32(un_save), R32(n)
- jz L(corner2)
- mov -16(up), u0
- mulx( -8,(up), w2, w3)
- lea 8(rp,un_save,8), rp
- mulx( (up), w0, w1)
- jmp L(tp2)
+ jz L(cor3)
+ lea -56(rp,un_save,8), rp
+ mov -24(up), w3 C up[-1]
+ mov -16(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ jmp L(b2)
ALIGN(16)
L(tp3): adox( -8,(rp), w2)
@@ -467,11 +521,11 @@
adcx( w1, w2)
adox( 32,(rp), w0)
mov w0, 32(rp)
- mulx( -16,(up), w0, w1)
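
For reference, the sqr_model sketch can be checked against a plain schoolbook multiply (again purely illustrative, assuming the definitions from the first sketch; the diff above is truncated):

  #include <stdio.h>
  #include <string.h>

  /* Reference schoolbook multiply; rp[] receives 2n limbs.  */
  static void
  mul_ref (limb *rp, const limb *up, const limb *vp, size_t n)
  {
    memset (rp, 0, 2 * n * sizeof (limb));
    for (size_t i = 0; i < n; i++)
      {
        limb cy = 0;
        for (size_t j = 0; j < n; j++)
          {
            dlimb t = (dlimb) up[i] * vp[j] + rp[i + j] + cy;
            rp[i + j] = (limb) t;
            cy = (limb) (t >> 64);
          }
        rp[i + n] = cy;
      }
  }

  int
  main (void)
  {
    limb u[4] = { ~(limb) 0, 0x0123456789abcdef, (limb) 1 << 63, 42 };
    limb r1[8], r2[8];

    sqr_model (r1, u, 4);
    mul_ref (r2, u, u, 4);
    puts (memcmp (r1, r2, sizeof r1) == 0 ? "ok" : "MISMATCH");
    return 0;
  }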