[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Mon Mar 7 17:11:25 CET 2011
details: /var/hg/gmp/rev/3c333e20baec
changeset: 14006:3c333e20baec
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Mar 07 17:05:37 2011 +0100
description:
Tweak rp loop updates.
details: /var/hg/gmp/rev/23aac43a7ddc
changeset: 14007:23aac43a7ddc
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Mar 07 17:11:10 2011 +0100
description:
*** empty log message ***
diffstat:
ChangeLog | 5 +
mpn/x86/atom/sse2/mul_basecase.asm | 315 ++++++++++++++++--------------------
2 files changed, 144 insertions(+), 176 deletions(-)
diffs (truncated from 497 to 300 lines):
diff -r ff46de76bb88 -r 23aac43a7ddc ChangeLog
--- a/ChangeLog Mon Mar 07 00:23:06 2011 +0100
+++ b/ChangeLog Mon Mar 07 17:11:10 2011 +0100
@@ -1,3 +1,8 @@
+2011-03-07 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86/atom/sse2/mul_basecase.asm: Replace addmul_1 loops.
+ Tweak outer loop rp updates.
+
2011-03-06 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86/atom/sse2/sqr_basecase.asm: New file.
diff -r ff46de76bb88 -r 23aac43a7ddc mpn/x86/atom/sse2/mul_basecase.asm
--- a/mpn/x86/atom/sse2/mul_basecase.asm Mon Mar 07 00:23:06 2011 +0100
+++ b/mpn/x86/atom/sse2/mul_basecase.asm Mon Mar 07 17:11:10 2011 +0100
@@ -26,14 +26,11 @@
C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
C 4 large loops into one; we could use it for the outer loop branch.
C * Optimise code outside of inner loops.
-C * Play with rp and up offsets to save a bunch of lea insns.
C * Write combined addmul_1 feed-in and wind-down code, and use when iterating
C each outer loop. ("Overlapping software pipelining")
C * Postpone push of ebx until we know vn > 1. Perhaps use caller-saves regs
C for inlined mul_1, allowing us to postpone all pushes.
-C * Perhaps write special code for un < M, for some small M.
-C * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
-C with even less pipelined code.
+C * Perhaps write special code for vn <= un < M, for some small M.
C void mpn_mul_basecase (mp_ptr wp,
C mp_srcptr xp, mp_size_t xn,
@@ -103,81 +100,73 @@
decl vn
jz L(done)
+ lea 8(rp), rp
-L(ol3): lea 4(vp), vp
+L(ol3): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
movd (vp), %mm7 C read next V limb
- mov 20(%esp), rp
mov 24(%esp), up
- lea 4(rp), rp
- mov rp, 20(%esp)
- mov 28(%esp), un
+ lea (rp,un,4), rp
- movd (up), %mm1
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ sar $2, un
+ movd 4(up), %mm1
+ movd %mm0, %ebx
pmuludq %mm7, %mm1
- shr $2, un C FIXME: move out
- lea 4(up), up
- lea -12(rp), rp
- movd %mm1, %ebx
- inc un
- movd (up), %mm0
+ lea -8(up), up
xor %edx, %edx C zero edx and CF
jmp L(a3)
-L(la3): adc $0, %edx
- add %ebx, 12(rp)
- movd %mm0, %eax
+L(la3): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
pmuludq %mm7, %mm1
lea 16(rp), rp
psrlq $32, %mm0
- adc %edx, %eax
+ adc %edx, %ebx
movd %mm0, %edx
- movd %mm1, %ebx
+ movd %mm1, %eax
movd 8(up), %mm0
pmuludq %mm7, %mm0
adc $0, %edx
- add %eax, (rp)
+ add %ebx, (rp)
psrlq $32, %mm1
- adc %edx, %ebx
+ adc %edx, %eax
movd %mm1, %edx
- movd %mm0, %eax
+ movd %mm0, %ebx
movd 12(up), %mm1
pmuludq %mm7, %mm1
adc $0, %edx
- add %ebx, 4(rp)
- psrlq $32, %mm0
- adc %edx, %eax
+ add %eax, 4(rp)
+L(a3): psrlq $32, %mm0
+ adc %edx, %ebx
movd %mm0, %edx
- movd %mm1, %ebx
+ movd %mm1, %eax
lea 16(up), up
movd (up), %mm0
adc $0, %edx
- add %eax, 8(rp)
-L(a3): psrlq $32, %mm1
- adc %edx, %ebx
+ add %ebx, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
movd %mm1, %edx
pmuludq %mm7, %mm0
- dec un
- movd 4(up), %mm1
+ inc un
jnz L(la3)
adc un, %edx C un is zero here
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
+ add %eax, 12(rp)
+ movd %mm0, %ebx
lea 16(rp), rp
psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- adc un, %edx
- add %eax, (rp)
- psrlq $32, %mm1
adc %edx, %ebx
- movd %mm1, %eax
+ movd %mm0, %eax
adc un, %eax
- add %ebx, 4(rp)
+ add %ebx, (rp)
adc un, %eax
- mov %eax, 8(rp)
+ mov %eax, 4(rp)
decl vn
jnz L(ol3)
@@ -215,81 +204,74 @@
decl vn
jz L(done)
+ lea 12(rp), rp
-L(ol0): lea 4(vp), vp
+L(ol0): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
movd (vp), %mm7 C read next V limb
- mov 20(%esp), rp
mov 24(%esp), up
- lea 4(rp), rp
- mov rp, 20(%esp)
- mov 28(%esp), un
+ lea 4(rp,un,4), rp
- movd (up), %mm0
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ xor %edx, %edx
+ movd 4(up), %mm0
+ lea -4(up), up
+ movd %mm1, %eax
pmuludq %mm7, %mm0
- shr $2, un C FIXME: move out
- movd 4(up), %mm1
- lea -8(up), up
- lea -8(rp), rp
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- xor %edx, %edx C zero edx and CF
+
jmp L(a0)
-L(la0): adc $0, %edx
- add %ebx, 12(rp)
- movd %mm0, %eax
+L(la0): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
pmuludq %mm7, %mm1
lea 16(rp), rp
psrlq $32, %mm0
- adc %edx, %eax
+ adc %edx, %ebx
movd %mm0, %edx
- movd %mm1, %ebx
+ movd %mm1, %eax
movd 8(up), %mm0
pmuludq %mm7, %mm0
adc $0, %edx
- add %eax, (rp)
- psrlq $32, %mm1
- adc %edx, %ebx
+ add %ebx, (rp)
+L(a0): psrlq $32, %mm1
+ adc %edx, %eax
movd %mm1, %edx
- movd %mm0, %eax
+ movd %mm0, %ebx
movd 12(up), %mm1
pmuludq %mm7, %mm1
adc $0, %edx
- add %ebx, 4(rp)
-L(a0): psrlq $32, %mm0
- adc %edx, %eax
+ add %eax, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %ebx
movd %mm0, %edx
- movd %mm1, %ebx
+ movd %mm1, %eax
lea 16(up), up
movd (up), %mm0
adc $0, %edx
- add %eax, 8(rp)
+ add %ebx, 8(rp)
psrlq $32, %mm1
- adc %edx, %ebx
+ adc %edx, %eax
movd %mm1, %edx
pmuludq %mm7, %mm0
- dec un
- movd 4(up), %mm1
+ inc un
jnz L(la0)
adc un, %edx C un is zero here
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
+ add %eax, 12(rp)
+ movd %mm0, %ebx
lea 16(rp), rp
psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- adc un, %edx
- add %eax, (rp)
- psrlq $32, %mm1
adc %edx, %ebx
- movd %mm1, %eax
+ movd %mm0, %eax
adc un, %eax
- add %ebx, 4(rp)
+ add %ebx, (rp)
adc un, %eax
- mov %eax, 8(rp)
+ mov %eax, 4(rp)
decl vn
jnz L(ol0)
@@ -328,81 +310,71 @@
decl vn
jz L(done)
-L(ol1): lea 4(vp), vp
+L(ol1): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
movd (vp), %mm7 C read next V limb
- mov 20(%esp), rp
mov 24(%esp), up
- lea 4(rp), rp
- mov rp, 20(%esp)
- mov 28(%esp), un
+ lea 8(rp,un,4), rp
- movd (up), %mm1
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ sar $2, un
+ movd %mm0, %ebx
+ movd 4(up), %mm1
pmuludq %mm7, %mm1
- shr $2, un C FIXME: move out
-
- lea -4(rp), rp
- movd %mm1, %ebx
- movd 4(up), %mm0
- lea -4(up), up
- pmuludq %mm7, %mm0
xor %edx, %edx C zero edx and CF
+ inc un
jmp L(a1)
-L(la1): adc $0, %edx
- add %ebx, 12(rp)
- movd %mm0, %eax
More information about the gmp-commit
mailing list