[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
Sun Mar 6 23:57:06 CET 2011
details: /var/hg/gmp/rev/a0ab9c0716ac
changeset: 14001:a0ab9c0716ac
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Mar 06 23:54:27 2011 +0100
description:
New file.

details: /var/hg/gmp/rev/737f00928f9a
changeset: 14002:737f00928f9a
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Mar 06 23:54:40 2011 +0100
description:
*** empty log message ***

details: /var/hg/gmp/rev/3b8678f9aa52
changeset: 14003:3b8678f9aa52
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Mar 06 23:56:48 2011 +0100
description:
Edit TODO list.
diffstat:
ChangeLog | 4 +
mpn/x86/atom/sse2/mul_basecase.asm | 5 +-
mpn/x86/atom/sse2/sqr_basecase.asm | 610 +++++++++++++++++++++++++++++++++++++
3 files changed, 616 insertions(+), 3 deletions(-)
diffs (truncated from 640 to 300 lines):
diff -r f6e322dd8330 -r 3b8678f9aa52 ChangeLog
--- a/ChangeLog Sat Mar 05 23:16:16 2011 +0100
+++ b/ChangeLog Sun Mar 06 23:56:48 2011 +0100
@@ -1,3 +1,7 @@
+2011-03-06 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86/atom/sse2/sqr_basecase.asm: New file.
+
2011-03-05 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/bdiv_dbm1c.asm: Write proper feed-in code.
diff -r f6e322dd8330 -r 3b8678f9aa52 mpn/x86/atom/sse2/mul_basecase.asm
--- a/mpn/x86/atom/sse2/mul_basecase.asm Sat Mar 05 23:16:16 2011 +0100
+++ b/mpn/x86/atom/sse2/mul_basecase.asm Sun Mar 06 23:56:48 2011 +0100
@@ -32,9 +32,8 @@
C * Postpone push of ebx until we know vn > 1. Perhaps use caller-saves regs
C for inlined mul_1, allowing us to postpone all pushes.
C * Perhaps write special code for un < M, for some small M.
-C * Replace addmul_1 loop by less pipelined loop. This could save perhaps 25%
-C of the code size.
-C * Replace inlined addmul_1 with smaller code from aorsmul_1.asm.
+C * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
+C with even less pipelined code.
C void mpn_mul_basecase (mp_ptr wp,
C mp_srcptr xp, mp_size_t xn,
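
(The TODO entry above is about how the inlined addmul_1 is coded, not what it computes. For orientation, the operation itself is just rp[] += up[] * v with a carry chain; a minimal plain-C model follows. The 32-bit limb type and the helper name addmul_1_ref are assumptions made for this sketch, not GMP code; the real assembly's size comes from software-pipelining this loop for the in-order Atom core, not from the arithmetic itself.)

#include <stdint.h>

typedef uint32_t limb_t;    /* 32-bit limbs, matching the x86 target */

/* rp[0..n-1] += up[0..n-1] * v; returns the carry-out limb. */
static limb_t addmul_1_ref(limb_t *rp, const limb_t *up, long n, limb_t v)
{
    uint64_t cy = 0;
    for (long i = 0; i < n; i++) {
        uint64_t t = (uint64_t)up[i] * v + rp[i] + cy;
        rp[i] = (limb_t)t;
        cy = t >> 32;
    }
    return (limb_t)cy;
}
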
diff -r f6e322dd8330 -r 3b8678f9aa52 mpn/x86/atom/sse2/sqr_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/atom/sse2/sqr_basecase.asm Sun Mar 06 23:56:48 2011 +0100
@@ -0,0 +1,610 @@
+dnl x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+dnl
+dnl Copyright 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C 4 large loops into one; we could use it for the outer loop branch.
+C * Optimise code outside of inner loops.
+C * Combine rp and up updates in outer loop to save a bunch of lea insns.
+C * Write combined addmul_1 feed-in and wind-down code, and use when iterating
+C each outer loop. ("Overlapping software pipelining")
+C * Postpone push of ebx until we know n > 1. Perhaps use caller-saves regs
+C for inlined mul_1, allowing us to postpone all pushes.
+C * Perhaps write special code for n < M, for some small M.
+C * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
+C with even less pipelined code.
+C * Fix function header code.
+C * We run the outer loop too long, until we perform a 1-limb by 1-limb
+C multiply. The main problem with this is that the decreasing inner loop
+C trip counts will cause poor exit branch prediction; this hurts short loops
+C VERY much.
+
+C void mpn_sqr_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xn);
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n', `%ecx')
+
+define(`un', `%ebp')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ mov 28(%esp), n
+
+ lea 4(rp), rp C write triangular product starting at rp[1]
+ lea -1(n), %eax
+ neg n
+ movd (up), %mm7
+ movd 4(up), %mm0
+ lea 4(up), up
+ pmuludq %mm7, %mm0
+ pxor %mm6, %mm6
+ lea 1(n), un C decr ABSOLUTE value
+ lea 1(n), n C decr ABSOLUTE value
+
+ and $3, %eax
+ jz L(of0)
+ cmp $2, %eax
+ jc L(of1)
+ jz L(of2)
+
+C ================================================================
+ jmp L(m3)
+ ALIGN(16)
+L(lm3): movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(m3): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, 8(rp)
+ lea 16(up), up
+ js L(lm3)
+
+ psrlq $32, %mm6
+ movd %mm6, 12(rp)
+
+ inc n
+C jz L(done)
+ lea -12(up), up
+ lea 4(rp), rp
+ jmp L(ol2)
+
+C ================================================================
+L(of0): test n, n
+ jz L(one)
+ jmp L(xx0)
+ ALIGN(16)
+L(lm0): movd (up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+L(xx0): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 12(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, 12(rp)
+ lea 16(up), up
+ js L(lm0)
+
+ psrlq $32, %mm6
+ movd %mm6, 16(rp)
+
+ inc n
+C jz L(done)
+ lea -8(up), up
+ lea 8(rp), rp
+ jmp L(ol3)
+
+C ================================================================
+ ALIGN(16)
+L(lm1): movd -12(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -12(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of1): paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, (rp)
+ lea 16(up), up
+ js L(lm1)
+
+ psrlq $32, %mm6
+ movd %mm6, 4(rp)
+
+ inc n
+ jz L(done)
+ lea -20(up), up
+ lea -4(rp), rp
+ jmp L(ol0)
+
+C ================================================================
+ ALIGN(16)
+L(lm2): movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of2): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, 4(rp)
+ lea 16(up), up
+ js L(lm2)
+
+ psrlq $32, %mm6
+ movd %mm6, 8(rp)
+
+ inc n
+C jz L(done)
+ lea -16(up), up
+C lea (rp), rp
+C jmp L(ol1)
+
+C ================================================================
+
+L(ol1): lea 4(up,n,4), up
+ movd (up), %mm7 C read next U invariant limb
+ lea 8(rp,n,4), rp
+ mov n, un
+
+ movd 4(up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ movd %mm1, %ebx
+ inc un
+ jz L(eq1)
+
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a1)
+L(eq1):
+ psrlq $32, %mm1
+ movd %mm1, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+ jmp L(cj1)
+
+L(la1): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+L(a1): psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
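
The remaining lines of the new file are truncated above. As the comment near the function entry notes, the routine writes the triangular (off-diagonal) product starting at rp[1]; it appears to follow the usual basecase squaring scheme, in which that product is then doubled and the diagonal squares are added in. A rough, self-contained C sketch of that overall scheme follows; it is illustrative only. The 32-bit limb type, the name sqr_basecase_ref and the self-test in main() are assumptions made for this example, not part of the commit.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

typedef uint32_t limb_t;               /* 32-bit limbs, as on 32-bit x86 */

/* rp[0..2n-1] = up[0..n-1] squared, basecase (schoolbook) style. */
static void sqr_basecase_ref(limb_t *rp, const limb_t *up, int n)
{
    memset(rp, 0, 2 * n * sizeof(limb_t));

    /* 1. Triangular product: rp[1..2n-2] += up[i]*up[j] for all i < j.
          Each row i is an addmul_1-style inner loop, as in the assembly. */
    for (int i = 0; i + 1 < n; i++) {
        uint64_t cy = 0;
        for (int j = i + 1; j < n; j++) {
            uint64_t t = (uint64_t)up[i] * up[j] + rp[i + j] + cy;
            rp[i + j] = (limb_t)t;
            cy = t >> 32;
        }
        rp[i + n] = (limb_t)cy;        /* this limb has not been written yet */
    }

    /* 2. Double it: every cross term up[i]*up[j] occurs twice in the square. */
    limb_t bit = 0;
    for (int i = 0; i < 2 * n; i++) {
        limb_t top = rp[i] >> 31;
        rp[i] = (rp[i] << 1) | bit;
        bit = top;
    }

    /* 3. Add the diagonal squares up[i]^2 at limb positions 2i and 2i+1. */
    uint64_t c = 0;
    for (int i = 0; i < n; i++) {
        uint64_t sq = (uint64_t)up[i] * up[i];
        uint64_t t = (uint64_t)rp[2 * i] + (limb_t)sq + c;
        rp[2 * i] = (limb_t)t;
        t = (uint64_t)rp[2 * i + 1] + (limb_t)(sq >> 32) + (t >> 32);
        rp[2 * i + 1] = (limb_t)t;
        c = t >> 32;
    }
}

int main(void)
{
    /* (2^64 - 1)^2 = 2^128 - 2^65 + 1, i.e. limbs {1, 0, 0xfffffffe, 0xffffffff}. */
    limb_t up[2] = { 0xffffffff, 0xffffffff };
    limb_t rp[4];
    sqr_basecase_ref(rp, up, 2);
    printf("%08x %08x %08x %08x\n", rp[3], rp[2], rp[1], rp[0]);
    return 0;
}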