[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Thu Aug 1 23:32:42 CEST 2013
details: /var/hg/gmp/rev/bfae4d76805c
changeset: 15900:bfae4d76805c
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Aug 01 20:23:31 2013 +0200
description:
Provide sandybridge mul_basecase.
details: /var/hg/gmp/rev/97be345d591d
changeset: 15901:97be345d591d
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Aug 01 23:24:16 2013 +0200
description:
Spacing.
details: /var/hg/gmp/rev/aa3ce4b670dd
changeset: 15902:aa3ce4b670dd
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Aug 01 23:26:23 2013 +0200
description:
Provide bulldozer mul_basecase.
details: /var/hg/gmp/rev/0f9d72423df7
changeset: 15903:0f9d72423df7
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Aug 01 23:32:13 2013 +0200
description:
Check in intended version of last file.
details: /var/hg/gmp/rev/57e195e2d75f
changeset: 15904:57e195e2d75f
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Aug 01 23:32:33 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 4 +
mpn/x86_64/bd1/mul_basecase.asm | 399 +++++++++++++++++++++++++++++++++++
mpn/x86_64/coreisbr/mul_basecase.asm | 391 ++++++++++++++++++++++++++++++++++
3 files changed, 794 insertions(+), 0 deletions(-)
diffs (truncated from 811 to 300 lines):
diff -r c61b482e52c1 -r 57e195e2d75f ChangeLog
--- a/ChangeLog Thu Aug 01 01:56:41 2013 +0200
+++ b/ChangeLog Thu Aug 01 23:32:33 2013 +0200
@@ -1,5 +1,9 @@
2013-08-01 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/x86_64/bd1/mul_basecase.asm: New file.
+
+ * mpn/x86_64/coreisbr/mul_basecase.asm: New file.
+
* mpn/x86_64/coreihwl/aorsmul_1.asm: New file.
2013-07-31 Torbjorn Granlund <tege at gmplib.org>
diff -r c61b482e52c1 -r 57e195e2d75f mpn/x86_64/bd1/mul_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bd1/mul_basecase.asm Thu Aug 01 23:32:33 2013 +0200
@@ -0,0 +1,399 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull ~4.8 ~4.55 - ~4.3
+C AMD pile ~4.6 ~4.55 - ~4.55
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Merge bull-specific mul_1, if it is not slower in the TOOM22 range.
+C Alternatively, we could tweak the present code (which was loopmixed for a
+C different CPU).
+C * Merge faster mul_2. Current fastest mul_2 code is non-indexed, causing
+C some structure headaches.
+C * Further micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+
+define(`un', `%rbx')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ push %rbx
+ push %rbp
+ mov un_param, un C free up rdx
+ neg un
+
+ mov (up), %rax C shared for mul_1 and mul_2
+ lea (up,un_param,8), up C point at operand end
+ lea (rp,un_param,8), rp C point at rp[un-1]
+
+ mov (vp), v0 C shared for mul_1 and mul_2
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn)
+ jz L(do_mul_2)
+
+L(do_mul_1):
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ...
+ mov %rdx, w1
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jnz L(m110)
+
+L(m100):lea 2(un), n C un = 4, 8, 12, ...
+ jmp L(m1l0)
+
+L(m110):lea (un), n C un = 2, 6, 10, ...
+ jmp L(m1l2)
+
+L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
+ mov %rdx, w0
+ test $2, R8(un)
+ jz L(m111)
+
+L(m101):lea 3(un), n C un = 1, 5, 9, ...
+ test n, n
+ js L(m1l1)
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ pop %rbp
+ pop %rbx
+ ret
+
+L(m111):lea 1(un), n C un = 3, 7, 11, ...
+ mov 8(up,un,8), %rax
+ jmp L(m1l3)
+
+ ALIGN(16)
+L(m1tp):mov %rdx, w0
+ add %rax, w1
+L(m1l1):mov -16(up,n,8), %rax
+ adc $0, w0
+ mul v0
+ add %rax, w0
+ mov w1, -24(rp,n,8)
+ mov -8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(m1l0):mul v0
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov %rdx, w0
+ mov (up,n,8), %rax
+ adc $0, w0
+L(m1l3):mul v0
+ mov w1, -8(rp,n,8)
+ mov %rdx, w1
+ add %rax, w0
+ mov 8(up,n,8), %rax
+ adc $0, w1
+L(m1l2):mul v0
+ mov w0, (rp,n,8)
+ add $4, n
+ jnc L(m1tp)
+
+L(m1ed):add %rax, w1
+ adc $0, %rdx
+ mov w1, I(-8(rp),-24(rp,n,8))
+ mov %rdx, I((rp),-16(rp,n,8))
+
+ dec R32(vn)
+ jz L(ret2)
+
+ lea 8(vp), vp
+ lea 8(rp), rp
+ push %r12
+ push %r13
+ push %r14
+ jmp L(do_addmul)
+
+L(do_mul_2):
+define(`v1', `%r14')
+ push %r12
+ push %r13
+ push %r14
+
+ mov 8(vp), v1
+
+ test $1, R8(un)
+ jnz L(m2b1)
+
+L(m2b0):lea (un), n
+ mov %rax, w2 C 0
+ mov (up,un,8), %rax
+ mov %rdx, w1 C 1
+ mul v1
+ mov %rax, w0 C 1
+ mov w2, (rp,un,8) C 0
+ mov 8(up,un,8), %rax
+ mov %rdx, w2 C 2
+ jmp L(m2l0)
+
+L(m2b1):lea 1(un), n
+ mov %rax, w0 C 1
+ mov %rdx, w3 C 2
+ mov (up,un,8), %rax
+ mul v1
+ mov w0, (rp,un,8) C 1
+ mov %rdx, w0 C 3
+ mov %rax, w2 C 0
+ mov 8(up,un,8), %rax
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):add %rax, w2 C 0
+ mov (up,n,8), %rax
+ adc $0, w0 C 1
+L(m2l1):mul v0
+ add %rax, w2 C 0
+ mov (up,n,8), %rax
+ mov %rdx, w1 C 1
+ adc $0, w1 C 1
+ mul v1
+ add w3, w2 C 0
+ adc $0, w1 C 1
+ add %rax, w0 C 1
+ mov w2, (rp,n,8) C 0
+ mov 8(up,n,8), %rax
+ mov %rdx, w2 C 2
+ adc $0, w2 C 2
+L(m2l0):mul v0
+ add %rax, w0 C 1
+ mov %rdx, w3 C 2
+ adc $0, w3 C 2
+ add w1, w0 C 1
+ adc $0, w3 C 2
+ mov 8(up,n,8), %rax
+ mul v1
+ add $2, n
+ mov w0, -8(rp,n,8) C 1
+ mov %rdx, w0 C 3
+ jnc L(m2tp)
+
+L(m2ed):add %rax, w2
+ adc $0, %rdx
+ add w3, w2
+ adc $0, %rdx
+ mov w2, I((rp),(rp,n,8))
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ add $-2, R32(vn)
+ jz L(ret5)
+
+ lea 16(vp), vp
+ lea 16(rp), rp
+
+
+L(do_addmul):
+ push %r15
+ push vn C save vn in new stack slot
+define(`vn', `(%rsp)')
+define(`X0', `%r14')
+define(`X1', `%r15')
+define(`v1', `%r8')
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up,un,8), %rax
+ mul v0
+
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): mov %rax, X1
+ mov (up,un,8), %rax
+ mov %rdx, X0
+ mul v1
+ test $2, R8(un)
+ jnz L(b10)
+
+L(b00): lea (un), n C un = 4, 8, 12, ...
+ mov (rp,un,8), w3
+ mov %rax, w0
+ mov 8(up,un,8), %rax
+ mov %rdx, w1
+ jmp L(lo0)
+
+L(b10): lea 2(un), n C un = 2, 6, 10, ...
+ mov (rp,un,8), w1
+ mov %rdx, w3
+ mov %rax, w2
+ mov 8(up,un,8), %rax
+ jmp L(lo2)
+
More information about the gmp-commit
mailing list