div_qr_1 interface

Tue Oct 22 14:27:43 CEST 2013

It turned out the code was a bit slower on k8.

This patch changes that.  With it applied, things take 11 c/l on both
pipelines.  This is also a 2 c/l improvement for piledriver.

I have not tested that this is correct.  If you like the patch, please
consider putting the result in the k8 subdir.

diff -r e9a5ec7f4003 mpn/x86_64/k10/div_qr_1n_pi1.asm

--- a/mpn/x86_64/k10/div_qr_1n_pi1.asm	Tue Oct 22 10:16:16 2013 +0200
+++ b/mpn/x86_64/k10/div_qr_1n_pi1.asm	Tue Oct 22 14:19:48 2013 +0200
@@ -117,15 +117,16 @@
 	dec	UN
 	mov	U1, %rax
 	jz	L(final)
+	mov	$0, R32(Q1)
 	
 	ALIGN(16)
 
-	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
-	C At entry, %rax holds an extra copy of U1, and carry holds an extra copy of U2.
+	C Loop is 28 instructions, 30 K8/K10 decoder slots, should run in 10
+	C cycles.  At entry, %rax holds an extra copy of U1, and carry holds
+	C an extra copy of U2.
 L(loop):
 	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
 	C Remains to add in B (U1 + c)
-	mov	$0, Q1
 	cmovc	DINV, Q1
 	mov	U2, Q2
 	neg	Q2
@@ -147,13 +148,14 @@
 	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
 	adc	U1, Q1
 	mov	-8(UP, UN, 8), U0
-	adc	Q2,8(QP, UN, 8)
+	adc	Q2, 8(QP, UN, 8)
 	jc	L(q_incr)
 L(q_incr_done):
 	add	%rax, U0
 	mov	T, %rax
 	adc	%rdx, %rax
 	mov	Q1, (QP, UN, 8)
+	mov	$0, R32(Q1)
 	sbb 	U2, U2
 	dec	UN
 	mov	%rax, U1 

-- 
Torbjörn