[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Sun Feb 27 00:29:53 CET 2011
details: /var/hg/gmp/rev/2d6213f99747
changeset: 13915:2d6213f99747
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Feb 27 00:29:34 2011 +0100
description:
New file.

details: /var/hg/gmp/rev/98add4b37ac1
changeset: 13916:98add4b37ac1
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Feb 27 00:29:38 2011 +0100
description:
*** empty log message ***
diffstat:
ChangeLog | 14 +-
mpn/x86/atom/sse2/mul_basecase.asm | 530 +++++++++++++++++++++++++++++++++++++
2 files changed, 538 insertions(+), 6 deletions(-)
diffs (truncated from 560 to 300 lines):
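
For readers outside the GMP tree: mpn_mul_basecase is a GMP-internal entry point
(declared in gmp-impl.h, not in the installed gmp.h), and the new Atom/SSE2 file
below is an x86 implementation of it. A minimal call sketch, assuming GMP's
documented contract that {rp, un+vn} = {up, un} * {vp, vn} with un >= vn >= 1
and rp not overlapping either operand:

  #include "gmp-impl.h"  /* internal header; not part of the installed API */

  void
  example (void)
  {
    mp_limb_t up[3] = { 1, 2, 3 };       /* U = 3*B^2 + 2*B + 1, B = 2^32 here */
    mp_limb_t vp[2] = { 4, 5 };          /* V = 5*B + 4 */
    mp_limb_t rp[3 + 2];                 /* result needs un + vn limbs */

    /* Preconditions: un >= vn >= 1; rp disjoint from up and vp.  */
    mpn_mul_basecase (rp, up, 3, vp, 2); /* {rp,5} = U * V */
  }

(The parameter names wp/xp/xn/yp/yn in the prototype comment below correspond
to rp/up/un/vp/vn in the register defines.)
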
diff -r 91344b91e869 -r 98add4b37ac1 ChangeLog
--- a/ChangeLog Sat Feb 26 17:11:37 2011 +0100
+++ b/ChangeLog Sun Feb 27 00:29:38 2011 +0100
@@ -1,14 +1,16 @@
+2011-02-27 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86/atom/sse2/mul_basecase.asm: New file.
+
+2011-02-26 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * mpn/x86/atom/sse2/aorsmul_1.asm: Optimise non-loop code.
+
2011-02-26 Torbjorn Granlund <tege at gmplib.org>
* mpn/powerpc64/mode64/aorsmul_1.asm: Add MULFUNC_PROLOGUE.
* mpn/m68k/mc68020/aorsmul_1.asm: Likewise.
-2011-02-26 Marco Bodrato <bodrato at mail.dm.unipi.it>
-
- * mpn/x86/atom/sse2/aorsmul_1.asm: Optimise non-loop code.
-
-2011-02-26 Torbjorn Granlund <tege at gmplib.org>
-
* mpn/powerpc64/mode64/aorsmul_1.asm: Add missing MULFUNC_PROLOGUE.
* mpn/m68k/mc68020/aorsmul_1.asm: Likewise.
diff -r 91344b91e869 -r 98add4b37ac1 mpn/x86/atom/sse2/mul_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/atom/sse2/mul_basecase.asm Sun Feb 27 00:29:38 2011 +0100
@@ -0,0 +1,530 @@
+dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
+dnl a third limb vector.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+dnl
+dnl Copyright 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C 4 large loops into one; we could use it for the outer loop branch.
+C * Optimise code outside of inner loops.
+C * Play with rp and up offsets to save a bunch of lea insns.
+C * Write combined addmul_1 feed-in and wind-down code, and use when iterating
+C   each outer loop. ("Overlapping software pipelining")
+C * Postpone push of ebx until we know vn > 1.
+C * Perhaps write special code for un < M, for some small M.
+C * Replace addmul_1 loop by less pipelined loop. This could save perhaps 25%
+C of the code size.
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xn,
+C mp_srcptr yp, mp_size_t yn);
+C
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`un', `%ecx')
+define(`vp', `%ebp')
+define(`vn', `36(%esp)')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ mov 28(%esp), un
+ mov 32(%esp), vp
+
+ movd (up), %mm0
+ movd (vp), %mm7
+ pmuludq %mm7, %mm0
+ pxor %mm6, %mm6
+
+ mov un, %eax
+ and $3, %eax
+ jz L(of0)
+ cmp $2, %eax
+ jc L(of1)
+ jz L(of2)
+
+C ================================================================
+ jmp L(m3)
+ ALIGN(16)
+L(lm3): movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(m3): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 8(rp)
+ lea 16(up), up
+ ja L(lm3)
+
+ psrlq $32, %mm6
+ movd %mm6, 12(rp)
+
+ decl vn
+ jz L(done)
+
+L(ol3): lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ lea 4(rp), rp
+ mov rp, 20(%esp)
+ mov 28(%esp), un
+
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ shr $2, un C FIXME: move out
+ lea 4(up), up
+ lea -12(rp), rp
+ movd %mm1, %ebx
+ inc un
+ movd (up), %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a3)
+
+L(la3): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+L(a3): psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ dec un
+ movd 4(up), %mm1
+ jnz L(la3)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ decl vn
+ jnz L(ol3)
+ jmp L(done)
+
+C ================================================================
+ ALIGN(16)
+L(lm0): movd (up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+L(of0): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 12(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 12(rp)
+ lea 16(up), up
+ ja L(lm0)
+
+ psrlq $32, %mm6
+ movd %mm6, 16(rp)
+
+ decl vn
+ jz L(done)
+
+L(ol0): lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ lea 4(rp), rp
+ mov rp, 20(%esp)
+ mov 28(%esp), un
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ shr $2, un C FIXME: move out
+ movd 4(up), %mm1
+ lea -8(up), up
+ lea -8(rp), rp
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ xor %edx, %edx C zero edx and CF
+ jmp L(a0)
+
+L(la0): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+L(a0): psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ dec un
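
(The diff is truncated here, as noted above.) For orientation, the file follows
the classical schoolbook method: a plain mul_1 pass over U with the first V limb
(the L(lm3)/L(lm0)-style loops, accumulating 64-bit pmuludq products in %mm6 and
propagating carries with psrlq $32), then one addmul_1 pass per remaining V
limb, each shifted one limb up in rp (the L(ol*)/L(la*) loops). A portable C
model of that structure, a sketch only and not GMP's code, assuming 32-bit limbs
as on x86:

  #include <stdint.h>

  typedef uint32_t limb;   /* 32-bit limb, matching this x86 code */

  /* rp[0..n-1] = up[0..n-1] * v, returning the carry-out limb.
     The 64-bit multiply-accumulate mirrors pmuludq + paddq in the asm,
     and the shift mirrors psrlq $32.  */
  static limb
  mul_1 (limb *rp, const limb *up, long n, limb v)
  {
    uint64_t acc = 0;
    for (long i = 0; i < n; i++)
      {
        acc += (uint64_t) up[i] * v;
        rp[i] = (limb) acc;
        acc >>= 32;
      }
    return (limb) acc;
  }

  /* rp[0..n-1] += up[0..n-1] * v, returning the carry-out limb.  */
  static limb
  addmul_1 (limb *rp, const limb *up, long n, limb v)
  {
    uint64_t acc = 0;
    for (long i = 0; i < n; i++)
      {
        acc += (uint64_t) up[i] * v + rp[i];
        rp[i] = (limb) acc;
        acc >>= 32;
      }
    return (limb) acc;
  }

  /* {rp, un+vn} = {up, un} * {vp, vn}; requires un >= vn >= 1.  */
  void
  mul_basecase_model (limb *rp, const limb *up, long un,
                      const limb *vp, long vn)
  {
    rp[un] = mul_1 (rp, up, un, vp[0]);             /* first V limb */
    for (long j = 1; j < vn; j++)                   /* outer loop */
      rp[un + j] = addmul_1 (rp + j, up, un, vp[j]);
  }

The asm additionally unrolls the inner loops four ways and dispatches on
un mod 4 (the L(of0)/L(of1)/L(of2)/L(m3) entry points), which this model
deliberately ignores.
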