[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
Thu Mar 10 14:08:10 CET 2011
details: /var/hg/gmp/rev/47068e673ecd
changeset: 14025:47068e673ecd
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 10 13:59:44 2011 +0100
description:
Move new aorrlsh_n.asm to new k8 dir. Revert mpn/x86_64/aorrlsh_n.asm.
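(For context: aorrlsh_n.asm implements the internal fused primitives
mpn_addlsh_n and mpn_rsblsh_n, i.e. {rp,n} = {up,n} + ({vp,n} << cnt)
and ({vp,n} << cnt) - {up,n}, returning the carry/borrow limb.  A rough
C model of that semantics follows -- a sketch only, assuming no nails
and 0 < cnt < GMP_NUMB_BITS; the name ref_addlsh_n is made up here and
is not GMP code.)

  #include <gmp.h>

  /* Reference model: add {vp,n} shifted left by cnt bits to {up,n},
     write {rp,n}, return the carry-out limb.  */
  static mp_limb_t
  ref_addlsh_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                mp_size_t n, unsigned int cnt)
  {
    mp_limb_t save = 0, cy = 0;
    mp_size_t i;
    for (i = 0; i < n; i++)
      {
        mp_limb_t sh = (vp[i] << cnt) | save;   /* shifted limb        */
        mp_limb_t s, r;
        save = vp[i] >> (GMP_NUMB_BITS - cnt);  /* bits for limb i+1   */
        s = up[i] + sh;
        r = s + cy;
        cy = (s < up[i]) + (r < s);             /* stays 0 or 1        */
        rp[i] = r;
      }
    return save + cy;    /* bits shifted out at the top, plus carry */
  }

mpn_rsblsh_n is the same loop with the addition replaced by a
subtraction of {up,n}, which is why one source file (with ADCSBB
expanding to adc or sbb) provides both entry points.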
details: /var/hg/gmp/rev/63b28e8d6496
changeset: 14026:63b28e8d6496
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 10 14:01:28 2011 +0100
description:
Setup path for new k8 directory.
details: /var/hg/gmp/rev/2c2dbed32630
changeset: 14027:2c2dbed32630
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 10 14:06:14 2011 +0100
description:
Suppress wind-down rp updates.
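(The "wind-down rp updates" being suppressed are the pointer bumps in
mul_basecase.asm's loop tails.  In the hunks below, the lea that
advanced rp is deleted and its displacement folded into the final
stores, while the outer-loop entry leas absorb the difference.  A
hypothetical C sketch of the transformation, using 32-bit limbs and
made-up names lo/hi:)

  #include <stdint.h>

  /* Before: bump rp, then store through the updated pointer; the
     lea 16(rp), rp costs an instruction every outer iteration.  */
  static void
  wind_down_before (uint32_t *rp, uint32_t lo, uint32_t hi)
  {
    rp += 4;         /* lea 16(rp), rp   */
    rp[0] += lo;     /* add %ebx, (rp)   */
    rp[1] = hi;      /* mov %eax, 4(rp)  */
  }

  /* After: keep rp fixed and widen the displacements; the next
     outer iteration's entry lea makes up for the missing +16.  */
  static void
  wind_down_after (uint32_t *rp, uint32_t lo, uint32_t hi)
  {
    rp[4] += lo;     /* add %ebx, 16(rp) */
    rp[5] = hi;      /* mov %eax, 20(rp) */
  }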
details: /var/hg/gmp/rev/4b0c70b6cf9c
changeset: 14028:4b0c70b6cf9c
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 10 14:07:13 2011 +0100
description:
Use 'n' instead of 'r11' directly.
details: /var/hg/gmp/rev/c6568fc594aa
changeset: 14029:c6568fc594aa
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 10 14:07:49 2011 +0100
description:
Add some more table entries (blank for now).
diffstat:
 configure.in                       |    3 +-
 mpn/x86/aorsmul_1.asm              |    3 +-
 mpn/x86/atom/sse2/mul_basecase.asm |   42 ++---
 mpn/x86_64/aorrlsh_n.asm           |  237 +++++++++++++++---------------------
 mpn/x86_64/aorsmul_1.asm           |    2 +-
 mpn/x86_64/k8/aorrlsh_n.asm        |  200 +++++++++++++++++++++++++++++++
 6 files changed, 322 insertions(+), 165 deletions(-)
diffs (truncated from 695 to 300 lines):
diff -r c55f4c6e9cb1 -r c6568fc594aa configure.in
--- a/configure.in Thu Mar 10 11:16:38 2011 +0100
+++ b/configure.in Thu Mar 10 14:07:49 2011 +0100
@@ -1504,9 +1504,10 @@
x86_64)
;;
k10 | bobcat | bulldozer)
- path_64="x86_64/k10 $path_64"
+ path_64="x86_64/k10 x86_64/k8 $path_64"
;;
athlon64 | k8)
+ path_64="x86_64/k8 $path_64"
;;
pentium4)
path_64="x86_64/pentium4 $path_64"
diff -r c55f4c6e9cb1 -r c6568fc594aa mpn/x86/aorsmul_1.asm
--- a/mpn/x86/aorsmul_1.asm Thu Mar 10 11:16:38 2011 +0100
+++ b/mpn/x86/aorsmul_1.asm Thu Mar 10 14:07:49 2011 +0100
@@ -21,7 +21,6 @@
include(`../config.m4')
-
C cycles/limb
C P5 14.75
C P6 model 0-8,10-12 7.5
@@ -32,9 +31,11 @@
C P4 model 2 (Northwood) 24.0
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
+C Intel Atom
C AMD K6 12.5
C AMD K7 5.25
C AMD K8
+C AMD K10
ifdef(`OPERATION_addmul_1',`
diff -r c55f4c6e9cb1 -r c6568fc594aa mpn/x86/atom/sse2/mul_basecase.asm
--- a/mpn/x86/atom/sse2/mul_basecase.asm Thu Mar 10 11:16:38 2011 +0100
+++ b/mpn/x86/atom/sse2/mul_basecase.asm Thu Mar 10 14:07:49 2011 +0100
@@ -100,14 +100,14 @@
decl vn
jz L(done)
- lea 8(rp), rp
+ lea -8(rp), rp
L(ol3): mov 28(%esp), un
neg un
lea 4(vp), vp
movd (vp), %mm7 C read next V limb
mov 24(%esp), up
- lea (rp,un,4), rp
+ lea 16(rp,un,4), rp
movd (up), %mm0
pmuludq %mm7, %mm0
@@ -159,14 +159,13 @@
adc un, %edx C un is zero here
add %eax, 12(rp)
movd %mm0, %ebx
- lea 16(rp), rp
psrlq $32, %mm0
adc %edx, %ebx
movd %mm0, %eax
adc un, %eax
- add %ebx, (rp)
+ add %ebx, 16(rp)
adc un, %eax
- mov %eax, 4(rp)
+ mov %eax, 20(rp)
decl vn
jnz L(ol3)
@@ -204,24 +203,23 @@
decl vn
jz L(done)
- lea 12(rp), rp
+ lea -4(rp), rp
L(ol0): mov 28(%esp), un
neg un
lea 4(vp), vp
movd (vp), %mm7 C read next V limb
mov 24(%esp), up
- lea 4(rp,un,4), rp
+ lea 20(rp,un,4), rp
movd (up), %mm1
pmuludq %mm7, %mm1
sar $2, un
- xor %edx, %edx
movd 4(up), %mm0
lea -4(up), up
movd %mm1, %eax
pmuludq %mm7, %mm0
-
+ xor %edx, %edx C zero edx and CF
jmp L(a0)
L(la0): movd 4(up), %mm1
@@ -264,14 +262,13 @@
adc un, %edx C un is zero here
add %eax, 12(rp)
movd %mm0, %ebx
- lea 16(rp), rp
psrlq $32, %mm0
adc %edx, %ebx
movd %mm0, %eax
adc un, %eax
- add %ebx, (rp)
+ add %ebx, 16(rp)
adc un, %eax
- mov %eax, 4(rp)
+ mov %eax, 20(rp)
decl vn
jnz L(ol0)
@@ -309,13 +306,14 @@
decl vn
jz L(done)
+ lea -16(rp), rp
L(ol1): mov 28(%esp), un
neg un
lea 4(vp), vp
movd (vp), %mm7 C read next V limb
mov 24(%esp), up
- lea 8(rp,un,4), rp
+ lea 24(rp,un,4), rp
movd (up), %mm0
pmuludq %mm7, %mm0
@@ -364,17 +362,16 @@
inc un
jnz L(la1)
- adc un, %edx C un is zero here
+ adc un, %edx C un is zero here
add %eax, 12(rp)
movd %mm0, %ebx
- lea 16(rp), rp
psrlq $32, %mm0
adc %edx, %ebx
movd %mm0, %eax
adc un, %eax
- add %ebx, (rp)
+ add %ebx, 16(rp)
adc un, %eax
- mov %eax, 4(rp)
+ mov %eax, 20(rp)
decl vn
jnz L(ol1)
@@ -412,14 +409,14 @@
decl vn
jz L(done)
- lea 4(rp), rp
+ lea -12(rp), rp
L(ol2): mov 28(%esp), un
neg un
lea 4(vp), vp
movd (vp), %mm7 C read next V limb
mov 24(%esp), up
- lea -4(rp,un,4), rp
+ lea 12(rp,un,4), rp
movd (up), %mm1
pmuludq %mm7, %mm1
@@ -467,17 +464,16 @@
inc un
jnz L(la2)
- adc un, %edx C un is zero here
+ adc un, %edx C un is zero here
add %eax, 12(rp)
movd %mm0, %ebx
- lea 16(rp), rp
psrlq $32, %mm0
adc %edx, %ebx
movd %mm0, %eax
adc un, %eax
- add %ebx, (rp)
+ add %ebx, 16(rp)
adc un, %eax
- mov %eax, 4(rp)
+ mov %eax, 20(rp)
decl vn
jnz L(ol2)
diff -r c55f4c6e9cb1 -r c6568fc594aa mpn/x86_64/aorrlsh_n.asm
--- a/mpn/x86_64/aorrlsh_n.asm Thu Mar 10 11:16:38 2011 +0100
+++ b/mpn/x86_64/aorrlsh_n.asm Thu Mar 10 14:07:49 2011 +0100
@@ -19,36 +19,37 @@
include(`../config.m4')
+
C cycles/limb
-C AMD K8,K9 2.87 < 3.85 for lshift + add_n
-C AMD K10 2.75 < 3.85 for lshift + add_n
-C Intel P4 22 > 7.33 for lshift + add_n
-C Intel core2 4.1 > 3.27 for lshift + add_n
-C Intel NHM 4.4 > 3.75 for lshift + add_n
-C Intel SBR 3.17 < 3.46 for lshift + add_n
-C Intel atom ? ? 8.75 for lshift + add_n
+C AMD K8,K9 3.1 < 3.85 for lshift + add_n
+C AMD K10 3.1 < 3.85 for lshift + add_n
+C Intel P4 14.6 > 7.33 for lshift + add_n
+C Intel core2 3.87 > 3.27 for lshift + add_n
+C Intel NHM 4 > 3.75 for lshift + add_n
+C Intel SBR (5.8) > 3.46 for lshift + add_n
+C Intel atom (7.75) < 8.75 for lshift + add_n
C VIA nano 4.7 < 6.25 for lshift + add_n
-C TODO
-C * Can we propagate carry into rdx instead of using a special carry register?
-C That could save enough insns to get to 10 cycles/iteration.
+C This was written quickly and not optimized at all. Surely one could get
+C closer to 3 c/l or perhaps even under 3 c/l. Ideas:
+C 1) Use indexing to save the 3 LEA
+C 2) Write reasonable feed-in code
+C 3) Be more clever about register usage
+C 4) Unroll more, handling CL negation, carry save/restore cost much now
+C 5) Reschedule
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp_param', `%rdx')
-define(`n_param', `%rcx')
-define(`cnt', `%r8')
-
-define(`vp', `%r12')
-define(`n', `%rbp')
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
ifdef(`OPERATION_addlsh_n',`
- define(ADDSUB, `add')
define(ADCSBB, `adc')
define(func, mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
- define(ADDSUB, `sub')
define(ADCSBB, `sbb')
define(func, mpn_rsblsh_n)
')
@@ -56,145 +57,103 @@
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
ASM_START()
- TEXT
- ALIGN(16)
+ TEXT
+ ALIGN(16)
PROLOGUE(func)
push %r12
+ push %r13
+ push %r14
push %rbp
push %rbx
- mov (vp_param), %rax C load first V limb early
+ mov n, %rax
+ xor R32(%rbx), R32(%rbx) C clear carry save register
+ mov R32(%r8), R32(%rcx) C shift count
+ xor R32(%rbp), R32(%rbp) C limb carry
- mov $0, R32(n)
- sub n_param, n
+ mov R32(%rax), R32(%r11)
+ and $3, R32(%r11)
+ je L(4)
+ sub $1, R32(%r11)
- lea -16(up,n_param,8), up
- lea -16(rp,n_param,8), rp
- lea 16(vp_param,n_param,8), vp
+L(012): mov (vp), %r8
+ mov %r8, %r12
+ shl R8(%rcx), %r8
+ or %rbp, %r8
+ neg R8(%rcx)
+ mov %r12, %rbp
+ shr R8(%rcx), %rbp
+ neg R8(%rcx)
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ sbb R32(%rbx), R32(%rbx)
+ lea 8(up), up
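(The L(012) feed-in above shows the per-limb shift chaining used by the
new k8 loop: shift the limb left by cnt, OR in the bits saved from the
previous limb, and recover the new high bits by shifting right with the
negated count -- x86-64 masks shift counts mod 64, so -cnt acts as
64-cnt.  A C sketch of one step, with made-up names; 'save' plays the
role of %rbp:)

  #include <stdint.h>

  /* One limb of the left-shifted stream, assuming 0 < cnt < 64.  */
  static inline uint64_t
  lsh_step (uint64_t v, unsigned int cnt, uint64_t *save)
  {
    uint64_t w = (v << cnt) | *save;  /* shl R8(%rcx), %r8; or %rbp, %r8 */
    *save = v >> (64 - cnt);          /* shr with the negated count      */
    return w;                         /* then ADCSBB'ed against (up)     */
  }

The add/sbb pair on R32(%rbx) around the ADCSBB restores and re-saves
the carry flag across the flag-clobbering shift arithmetic; that
save/restore cost is what idea 4 in the file's comment block refers to.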