[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Sep 18 00:47:29 CEST 2013
details: /var/hg/gmp/rev/7ace27253c1e
changeset: 15997:7ace27253c1e
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:35:04 2013 +0200
description:
Provide SBR and HWL mullo_basecase.
details: /var/hg/gmp/rev/f1003a731f99
changeset: 15998:f1003a731f99
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:37:50 2013 +0200
description:
Cleanup, streamline.
details: /var/hg/gmp/rev/53a7c1b2f7ba
changeset: 15999:53a7c1b2f7ba
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:44:34 2013 +0200
description:
Provide mul_basecase and sqr_basecase for Conroe, Wolfdale, Nehalem, Westmere.
details: /var/hg/gmp/rev/a0045d16ef28
changeset: 16000:a0045d16ef28
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:45:47 2013 +0200
description:
Cosmetic fix to mulx byte output.
details: /var/hg/gmp/rev/eb6a8d6a0c8f
changeset: 16001:eb6a8d6a0c8f
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:46:07 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 12 +
mpn/x86_64/core2/mul_basecase.asm | 962 ++++++++++++++++++++++++++++++++
mpn/x86_64/core2/sqr_basecase.asm | 971 +++++++++++++++++++++++++++++++++
mpn/x86_64/coreihwl/mullo_basecase.asm | 416 ++++++++++++++
mpn/x86_64/coreisbr/mul_basecase.asm | 150 ++--
mpn/x86_64/coreisbr/mullo_basecase.asm | 360 ++++++++++++
mpn/x86_64/coreisbr/sqr_basecase.asm | 138 ++--
mpn/x86_64/x86_64-defs.m4 | 2 +-
8 files changed, 2868 insertions(+), 143 deletions(-)
diffs (truncated from 3164 to 300 lines):
diff -r 52121fb11f27 -r eb6a8d6a0c8f ChangeLog
--- a/ChangeLog Mon Sep 16 19:57:46 2013 +0200
+++ b/ChangeLog Wed Sep 18 00:46:07 2013 +0200
@@ -1,3 +1,15 @@
+2013-09-18 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/core2/mul_basecase.asm: New file.
+ * mpn/x86_64/core2/sqr_basecase.asm: New file.
+
+ * mpn/x86_64/coreihwl/mullo_basecase.asm: New file.
+ * mpn/x86_64/coreisbr/mullo_basecase.asm: New file.
+
+2013-09-16 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/fastsse/copyi-palignr.asm: Preserve xmm6-xmm8 under DOS.
+
2013-09-15 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/tabselect.asm: Use R8 for bit testing.
diff -r 52121fb11f27 -r eb6a8d6a0c8f mpn/x86_64/core2/mul_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/core2/mul_basecase.asm Wed Sep 18 00:46:07 2013 +0200
@@ -0,0 +1,962 @@
+dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
+dnl It also seems good for Conroe/Wolfdale.
+
+dnl Copyright 2008, 2011, 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core 4.0 4.0 - 4.18-4.25
+C Intel NHM 3.75 3.8 - 4.06-4.2
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C Code structure:
+C
+C
+C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4)
+C | | | |
+C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) |
+C | / | / | / | /
+C | / | / | / | /
+C | / | / | / | /
+C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_
+C _____ _____ _____ _____
+C / \ / \ / \ / \
+C \|/ | \|/ | \|/ | \|/ |
+C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) |
+C \ /|\ \ /|\ \ /|\ \ /|\
+C \_____/ \_____/ \_____/ \_____/
+
+C TODO
+C * Tune. None done so far.
+C * Currently 2687 bytes, making it smaller would be nice.
+C * Implement some basecases, say for un < 4.
+C * Try zeroing with xor in m2 loops.
+C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C between loop header and wind-down code.
+C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use late loop index variable as zero, $2 to use an
+C explicit $0.
+define(`Z',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx') C FIXME reallocate vp to rcx but watch performance!
+define(`vn_param', `%r8')
+
+define(`un', `%r9')
+define(`vn', `(%rsp)')
+
+define(`v0', `%r10')
+define(`v1', `%r11')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r12')
+define(`i', `%r13')
+define(`vp', `%r14')
+
+define(`X0', `%r8')
+define(`X1', `%r15')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ mov (up), %rax C shared for mul_1 and mul_2
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov (vp_param), v0 C shared for mul_1 and mul_2
+
+ xor un, un
+ sub un_param, un C un = -un_param
+
+ lea (up,un_param,8), up
+ lea (rp,un_param,8), rp
+
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn_param)
+ jz L(m2)
+
+ lea 8(vp_param), vp C FIXME: delay until known needed
+
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):test $2, R8(un)
+ jnz L(m1s2)
+
+L(m1s0):
+ lea (un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ lea L(do_am0)(%rip), %rbp
+ jmp L(m1e0)
+
+L(m1s2):
+ lea 2(un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ mul v0
+ lea L(do_am2)(%rip), %rbp
+ test i, i
+ jnz L(m1e2)
+ add %rax, w0
+ adc $0, %rdx
+ mov w0, I(-8(rp),8(rp,un,8))
+ mov %rdx, I((rp),16(rp,un,8))
+ jmp L(ret2)
+
+L(m1x1):test $2, R8(un)
+ jz L(m1s3)
+
+L(m1s1):
+ lea 1(un), i
+ mov %rax, (rp,un,8)
+ test i, i
+ jz L(1)
+ mov 8(up,un,8), %rax
+ mov %rdx, w1 C FIXME: Use lea?
+ lea L(do_am1)(%rip), %rbp
+ jmp L(m1e1)
+L(1): mov %rdx, I((rp),8(rp,un,8))
+ jmp L(ret2)
+
+L(m1s3):
+ lea -1(un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w1 C FIXME: Use lea?
+ lea L(do_am3)(%rip), %rbp
+ jmp L(m1e3)
+
+ ALIGNx
+L(m1top):
+ mul v0
+ mov w1, -16(rp,i,8)
+L(m1e2):xor R32(w1), R32(w1)
+ add %rax, w0
+ mov (up,i,8), %rax
+ adc %rdx, w1
+ mov w0, -8(rp,i,8)
+L(m1e1):xor R32(w0), R32(w0)
+ mul v0
+ add %rax, w1
+ mov 8(up,i,8), %rax
+ adc %rdx, w0
+ mov w1, (rp,i,8)
+L(m1e0):xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w0
+ mov 16(up,i,8), %rax
+ adc %rdx, w1
+ mov w0, 8(rp,i,8)
+L(m1e3):xor R32(w0), R32(w0)
+ mul v0
+ add %rax, w1
+ mov 24(up,i,8), %rax
+ adc %rdx, w0
+ add $4, i
+ js L(m1top)
+
+ mul v0
+ mov w1, I(-16(rp),-16(rp,i,8))
+ add %rax, w0
+ adc $0, %rdx
+ mov w0, I(-8(rp),-8(rp,i,8))
+ mov %rdx, I((rp),(rp,i,8))
+
+ dec vn_param
+ jz L(ret2)
+ lea -8(rp), rp
+ jmp *%rbp
+
+L(m2):
+ mov 8(vp_param), v1
+ lea 16(vp_param), vp C FIXME: delay until known needed
+
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(un)
+ jnz L(b10)
+
+L(b00): lea (un), i
+ mov %rax, (rp,un,8)
+ mov %rdx, w1 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov $0, R32(w2)
+ jmp L(m2e0)
+
+L(b10): lea -2(un), i
+ mov %rax, w2 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov %rdx, w3 C FIXME: Use lea?
+ mov $0, R32(w0)
+ jmp L(m2e2)
+
+L(bx1): test $2, R8(un)
+ jz L(b11)
+
+L(b01): lea 1(un), i
+ mov %rax, (rp,un,8)
+ mov (up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ mov $0, R32(w1)
+ jmp L(m2e1)
+
+L(b11): lea -1(un), i
+ mov %rax, w1 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov %rdx, w2 C FIXME: Use lea?
+ mov $0, R32(w3)
+ jmp L(m2e3)
+
+ ALIGNx
+L(m2top0):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
More information about the gmp-commit
mailing list