div_qr_1 interface
Torbjorn Granlund
tg at gmplib.org
Tue Oct 22 14:27:43 CEST 2013
It turned out the code was a bit slower on k8.
This patch changes that. With it applied, things take 11 c/l on both
pipelines. This is also a 2 c/l improvement for piledriver.
I have not tested that this is correct. If you like the patch, please
consider putting the result in the k8 subdir.
diff -r e9a5ec7f4003 mpn/x86_64/k10/div_qr_1n_pi1.asm
--- a/mpn/x86_64/k10/div_qr_1n_pi1.asm Tue Oct 22 10:16:16 2013 +0200
+++ b/mpn/x86_64/k10/div_qr_1n_pi1.asm Tue Oct 22 14:19:48 2013 +0200
@@ -117,15 +117,16 @@
dec UN
mov U1, %rax
jz L(final)
+ mov $0, R32(Q1)
ALIGN(16)
- C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
- C At entry, %rax holds an extra copy of U1, and carry holds an extra copy of U2.
+ C Loop is 28 instructions, 30 K8/K10 decoder slots, should run in 10
+ C cycles. At entry, %rax holds an extra copy of U1, and carry holds
+ C an extra copy of U2.
L(loop):
C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
C Remains to add in B (U1 + c)
- mov $0, Q1
cmovc DINV, Q1
mov U2, Q2
neg Q2
@@ -147,13 +148,14 @@
C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
adc U1, Q1
mov -8(UP, UN, 8), U0
- adc Q2,8(QP, UN, 8)
+ adc Q2, 8(QP, UN, 8)
jc L(q_incr)
L(q_incr_done):
add %rax, U0
mov T, %rax
adc %rdx, %rax
mov Q1, (QP, UN, 8)
+ mov $0, R32(Q1)
sbb U2, U2
dec UN
mov %rax, U1
--
Torbjörn
More information about the gmp-devel
mailing list