[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Sep 21 00:49:59 CEST 2013
details: /var/hg/gmp/rev/e9cf757fd930
changeset: 16007:e9cf757fd930
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Sep 20 23:47:22 2013 +0200
description:
Provide Sandy/Ivy bridge redc_1.
details: /var/hg/gmp/rev/3a69c285a68c
changeset: 16008:3a69c285a68c
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Sep 20 23:57:52 2013 +0200
description:
Add "Contributed ..." lines.
details: /var/hg/gmp/rev/1cdfc1d5d6df
changeset: 16009:1cdfc1d5d6df
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Sep 21 00:49:50 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 13 +
mpn/x86_64/core2/mul_basecase.asm | 8 +-
mpn/x86_64/core2/sqr_basecase.asm | 2 +
mpn/x86_64/coreihwl/mul_1.asm | 2 +
mpn/x86_64/coreisbr/aors_n.asm | 2 +
mpn/x86_64/coreisbr/aorsmul_1.asm | 2 +
mpn/x86_64/coreisbr/mul_1.asm | 2 +
mpn/x86_64/coreisbr/redc_1.asm | 531 ++++++++++++++++++++++++++++++++++++++
mpn/x86_64/fastavx/copyd.asm | 2 +
mpn/x86_64/fastavx/copyi.asm | 2 +
mpn/x86_64/fastsse/copyi.asm | 2 +
mpn/x86_64/k8/redc_1.asm | 2 +
mpn/x86_64/sqr_diag_addlsh1.asm | 2 +
13 files changed, 569 insertions(+), 3 deletions(-)
diffs (truncated from 692 to 300 lines):
diff -r 927fe93f3210 -r 1cdfc1d5d6df ChangeLog
--- a/ChangeLog Fri Sep 20 14:31:22 2013 +0200
+++ b/ChangeLog Sat Sep 21 00:49:50 2013 +0200
@@ -1,3 +1,16 @@
+2013-09-20 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/k8/redc_1.asm: Complete rewrite.
+
+ * mpn/x86_64/coreisbr/mullo_basecase.asm: Postpone pushes, short-
+ circuit a branch.
+
+ * mpn/x86_64/core2/mullo_basecase.asm: New file.
+
+2013-09-19 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/fastsse/copyi-palignr.asm: Allocate more stack under DOS.
+
2013-09-18 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/core2/mul_basecase.asm: New file.
diff -r 927fe93f3210 -r 1cdfc1d5d6df mpn/x86_64/core2/mul_basecase.asm
--- a/mpn/x86_64/core2/mul_basecase.asm Fri Sep 20 14:31:22 2013 +0200
+++ b/mpn/x86_64/core2/mul_basecase.asm Sat Sep 21 00:49:50 2013 +0200
@@ -1,6 +1,8 @@
dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
dnl It also seems good for Conroe/Wolfdale.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
dnl Copyright 2008, 2011, 2012, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -47,9 +49,9 @@
C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4)
C | | | |
C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) |
-C | / | / | / | /
-C | / | / | / | /
-C | / | / | / | /
+C | / | / | / | /
+C | / | / | / | /
+C | / | / | / | /
C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_
C _____ _____ _____ _____
C / \ / \ / \ / \
diff -r 927fe93f3210 -r 1cdfc1d5d6df mpn/x86_64/core2/sqr_basecase.asm
--- a/mpn/x86_64/core2/sqr_basecase.asm Fri Sep 20 14:31:22 2013 +0200
+++ b/mpn/x86_64/core2/sqr_basecase.asm Sat Sep 21 00:49:50 2013 +0200
@@ -1,6 +1,8 @@
dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
dnl It also seems good for Conroe/Wolfdale.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
dnl Copyright 2008, 2011, 2012, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
diff -r 927fe93f3210 -r 1cdfc1d5d6df mpn/x86_64/coreihwl/mul_1.asm
--- a/mpn/x86_64/coreihwl/mul_1.asm Fri Sep 20 14:31:22 2013 +0200
+++ b/mpn/x86_64/coreihwl/mul_1.asm Sat Sep 21 00:49:50 2013 +0200
@@ -1,5 +1,7 @@
dnl AMD64 mpn_mul_1 using mulx optimised for Intel Haswell.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
dnl Copyright 2012, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
diff -r 927fe93f3210 -r 1cdfc1d5d6df mpn/x86_64/coreisbr/aors_n.asm
--- a/mpn/x86_64/coreisbr/aors_n.asm Fri Sep 20 14:31:22 2013 +0200
+++ b/mpn/x86_64/coreisbr/aors_n.asm Sat Sep 21 00:49:50 2013 +0200
@@ -1,6 +1,8 @@
dnl AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and
dnl Haswell.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011, 2012, 2013 Free
dnl Software Foundation, Inc.
diff -r 927fe93f3210 -r 1cdfc1d5d6df mpn/x86_64/coreisbr/aorsmul_1.asm
--- a/mpn/x86_64/coreisbr/aorsmul_1.asm Fri Sep 20 14:31:22 2013 +0200
+++ b/mpn/x86_64/coreisbr/aorsmul_1.asm Sat Sep 21 00:49:50 2013 +0200
@@ -1,5 +1,7 @@
dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
dnl Foundation, Inc.
diff -r 927fe93f3210 -r 1cdfc1d5d6df mpn/x86_64/coreisbr/mul_1.asm
--- a/mpn/x86_64/coreisbr/mul_1.asm Fri Sep 20 14:31:22 2013 +0200
+++ b/mpn/x86_64/coreisbr/mul_1.asm Sat Sep 21 00:49:50 2013 +0200
@@ -1,5 +1,7 @@
dnl X86-64 mpn_mul_1 optimised for Intel Sandy Bridge.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
dnl Foundation, Inc.
diff -r 927fe93f3210 -r 1cdfc1d5d6df mpn/x86_64/coreisbr/redc_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreisbr/redc_1.asm Sat Sep 21 00:49:50 2013 +0200
@@ -0,0 +1,531 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR 3.24
+C Intel IBR 3.04
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea 8(mp_param,n,8), mp
+ lea 8(up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 1(n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov -8(up,n,8), %r10
+ mov %rdx, %r11
+ add %rax, %r10
+ mov (mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ mov %rdx, %r9
+ mov (up,n,8), %rbx
+ add %rax, %rbx
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbx
+ mov %rbx, -8(up,i,8) C next low remainder limb
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): mul q0
+ mov -16(up,i,8), %r10
+ add %r11, %rbp
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbp, -24(up,i,8)
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %r10
+ mov %rdx, %r9
+ mov -8(up,i,8), %rbp
+ adc $0, %r11
+ mov %r10, -16(up,i,8)
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbp
+ mov %rbp, -8(up,i,8)
+ adc $0, %r9
+L(e1): mov %rdx, %r11
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,i,8), %rbp
+ add %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (up,i,8)
+ adc $0, %r11
+ add %rax, %rbp
+ adc $0, %r9
+ mov 16(mp,i,8), %rax
+ add $4, i
+ jnc L(tp1)
+
+L(ed1): mul q0
+ mov I(-16(up),-16(up,i,8)), %r10
+ add %r11, %rbp
+ adc $0, %r9
+ mov %rbp, I(-24(up),-24(up,i,8))
+ add %rax, %r10
+ adc $0, %rdx
+ add %r9, %r10
+ adc $0, %rdx
+ mov %r10, I(-16(up),-16(up,i,8))
+ mov %rdx, -8(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 3(n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov -8(up,n,8), %r10
+ mov %rdx, %r11
+ add %rax, %r10
+ mov (mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ mov (up,n,8), %rbx
+ mov %rdx, %r9
More information about the gmp-commit mailing list