[Gmp-commit] /var/hg/gmp: 7 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Aug 3 00:39:04 CEST 2013
details: /var/hg/gmp/rev/f1551bf500b7
changeset: 15906:f1551bf500b7
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Aug 02 18:31:20 2013 +0200
description:
Complete rewrite of sandybridge addmul_2.
details: /var/hg/gmp/rev/4c12b13e8e37
changeset: 15907:4c12b13e8e37
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Aug 02 18:39:55 2013 +0200
description:
Fix typo.
details: /var/hg/gmp/rev/8757d877662f
changeset: 15908:8757d877662f
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Aug 03 00:30:40 2013 +0200
description:
Provide sandybridge mul_2.
details: /var/hg/gmp/rev/9ac67df52775
changeset: 15909:9ac67df52775
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Aug 03 00:35:16 2013 +0200
description:
Support DOS64.
details: /var/hg/gmp/rev/2482bb627173
changeset: 15910:2482bb627173
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Aug 03 00:35:58 2013 +0200
description:
Spacing.
details: /var/hg/gmp/rev/efb65b3b6a6c
changeset: 15911:efb65b3b6a6c
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Aug 03 00:38:35 2013 +0200
description:
Save some O(n) and O(1) cycles.
details: /var/hg/gmp/rev/d62fd347f7c7
changeset: 15912:d62fd347f7c7
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Aug 03 00:39:01 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 10 +
mpn/x86_64/bd1/mul_basecase.asm | 1 +
mpn/x86_64/coreisbr/addmul_2.asm | 296 +++++++++++++++++-----------------
mpn/x86_64/coreisbr/mul_2.asm | 153 ++++++++++++++++++
mpn/x86_64/coreisbr/mul_basecase.asm | 29 +--
5 files changed, 328 insertions(+), 161 deletions(-)
diffs (truncated from 628 to 300 lines):
diff -r 43339e712783 -r d62fd347f7c7 ChangeLog
--- a/ChangeLog Fri Aug 02 13:23:26 2013 +0200
+++ b/ChangeLog Sat Aug 03 00:39:01 2013 +0200
@@ -1,3 +1,13 @@
+2013-08-03 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/coreisbr/mul_basecase.asm: Save some O(n) and O(1) cycles.
+
+ * mpn/x86_64/coreisbr/mul_2.asm: New file.
+
+2013-08-02 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/coreisbr/addmul_2.asm: Complete rewrite.
+
2013-08-01 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/bd1/mul_basecase.asm: New file.
diff -r 43339e712783 -r d62fd347f7c7 mpn/x86_64/bd1/mul_basecase.asm
--- a/mpn/x86_64/bd1/mul_basecase.asm Fri Aug 02 13:23:26 2013 +0200
+++ b/mpn/x86_64/bd1/mul_basecase.asm Sat Aug 03 00:39:01 2013 +0200
@@ -3,6 +3,7 @@
dnl Contributed to the GNU project by Torbjörn Granlund.
dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
diff -r 43339e712783 -r d62fd347f7c7 mpn/x86_64/coreisbr/addmul_2.asm
--- a/mpn/x86_64/coreisbr/addmul_2.asm Fri Aug 02 13:23:26 2013 +0200
+++ b/mpn/x86_64/coreisbr/addmul_2.asm Sat Aug 03 00:39:01 2013 +0200
@@ -1,6 +1,9 @@
-dnl X86-64 mpn_addmul_2 optimised for Intel Sandy Bridge.
+dnl AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge.
-dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -19,188 +22,193 @@
include(`../config.m4')
-C cycles/limb
+C cycles/limb best
C AMD K8,K9
-C AMD K10 4.07
-C AMD bd1
-C AMD bobcat 5.25
-C Intel P4 16.1
-C Intel core2
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
C Intel NHM
-C Intel SBR 3.2
+C Intel SBR 2.93 this
+C Intel IBR 2.66 this
+C Intel HWL 2.5 2.0
+C Intel BWL
C Intel atom
-C VIA nano 5.23
+C VIA nano
C This code is the result of running a code generation and optimisation tool
C suite written by David Harvey and Torbjorn Granlund.
-C TODO
-C * Tune feed-in and wind-down code.
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param',`%rdx')
-define(`vp', `%rcx')
-define(`v0', `%r12')
-define(`v1', `%r13')
-define(`n', `%r11')
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rbx')
+define(`v1', `%rbp')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+define(`X0', `%r12')
+define(`X1', `%r13')
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
ASM_START()
TEXT
- ALIGN(16)
+ ALIGN(32)
PROLOGUE(mpn_addmul_2)
FUNC_ENTRY(4)
push %rbx
+ push %rbp
push %r12
push %r13
- push %r14
+
+ mov (vp), v0
+ mov 8(vp), v1
mov (up), %rax
mov n_param, n
- mov 0(vp), v0
- mov 8(vp), v1
- shr $2, n
- and $3, R32(n_param)
- jz L(b0)
- cmp $2, R32(n_param)
- jb L(b1)
- jz L(b2)
+ neg n
-L(b3): mov (rp), %r10
- mov $0, R32(%rcx)
+ lea (up,n_param,8), up
+ lea 8(rp,n_param,8), rp
mul v0
- add %rax, %r10
- mov %rdx, %r14
- adc $0, %r14
- lea -16(rp), rp
- lea -16(up), up
- mov $0, R32(%r9)
- mov $0, R32(%rbx)
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): mov -8(rp,n,8), X0
+ mov %rdx, w1
+ add %rax, X0
+ adc $0, w1
+ mov (up,n,8), %rax
+ xor w0, w0
+ xor w3, w3
+ test $2, R8(n)
+ jnz L(b10)
+
+L(b00): nop C this nop makes the loop go faster on SBR!
+ mul v1
+ mov (rp,n,8), X1
+ jmp L(lo0)
+
+L(b10): lea -2(n), n
+ jmp L(lo2)
+
+L(bx1): mov -8(rp,n,8), X1
+ mov %rdx, w3
+ add %rax, X1
+ adc $0, w3
+ mov (up,n,8), %rax
+ xor w1, w1
+ xor w2, w2
+ test $2, R8(n)
+ jz L(b11)
+
+L(b01): mov (rp,n,8), X0
inc n
- jmp L(L3)
+ jmp L(lo1)
-L(b0): mov (rp), %r8
- mul v0
- add %rax, %r8
- mov %rdx, %r9
- adc $0, %r9
- mov $0, R32(%rbx)
- lea -8(rp), rp
- lea -8(up), up
- jmp L(L0)
-
-L(b1): mov (rp), %r10
- mov $0, R32(%rcx)
- mul v0
- add %rax, %r10
- mov %rdx, %r14
- adc $0, %r14
- mov %r10, 0(rp)
- jmp L(L1)
-
-L(b2): mov (rp), %r8
- mul v0
- add %rax, %r8
- mov $0, R32(%rbx)
- mov %rdx, %r9
- adc $0, %r9
- lea -24(rp), rp
- lea -24(up), up
- inc n
- jmp L(L2)
+L(b11): dec n
+ jmp L(lo3)
ALIGN(32)
-L(top): mov %r10, 32(rp)
- adc %rbx, %r14 C 10
- lea 32(rp), rp
-L(L1): mov 0(up), %rax
- adc $0, R32(%rcx)
+L(top):
+L(lo1): mul v1
+ mov %rdx, w0 C 1
+ add %rax, X0 C 0
+ adc $0, w0 C 1
+ add w1, X1 C 3
+ adc $0, w3 C 0
+ add w2, X0 C 0
+ adc $0, w0 C 1
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, X0 C 0
+ mov %rdx, w1 C 1
+ adc $0, w1 C 1
+ mov (up,n,8), %rax
mul v1
- mov $0, R32(%rbx)
- mov 8(rp), %r8
- add %rax, %r8
- mov %rdx, %r9
- mov 8(up), %rax
- adc $0, %r9
+ mov X1, -16(rp,n,8) C 3
+ mov (rp,n,8), X1 C 1
+ add w3, X0 C 0
+ adc $0, w1 C 1
+L(lo0): mov %rdx, w2 C 2
+ mov X0, -8(rp,n,8) C 0
+ add %rax, X1 C 1
+ adc $0, w2 C 2
+ mov 8(up,n,8), %rax
+ add w0, X1 C 1
+ adc $0, w2 C 2
mul v0
- add %rax, %r8
- adc %rdx, %r9
- adc $0, R32(%rbx)
- add %r14, %r8 C 0 12
- adc %rcx, %r9 C 1
-L(L0): mov 8(up), %rax
- adc $0, R32(%rbx)
- mov 16(rp), %r10
- mul v1
- add %rax, %r10
- mov %rdx, %r14
- mov 16(up), %rax
- mov $0, R32(%rcx)
- adc $0, %r14
+ add %rax, X1 C 1
+ mov %rdx, w3 C 2
+ adc $0, w3 C 2
+ mov 8(up,n,8), %rax
+L(lo3): mul v1
+ add w1, X1 C 1
+ mov 8(rp,n,8), X0 C 2
+ adc $0, w3 C 2
+ mov %rdx, w0 C 3
+ add %rax, X0 C 2
+ adc $0, w0 C 3
+ mov 16(up,n,8), %rax
mul v0
- add %rax, %r10
- adc %rdx, %r14
- adc $0, R32(%rcx)
- mov %r8, 8(rp)
-L(L3): mov 24(rp), %r8
- mov 16(up), %rax
- mul v1
- add %r9, %r10 C 3
- adc %rbx, %r14 C 4
- adc $0, R32(%rcx)
- add %rax, %r8
- mov %rdx, %r9
- adc $0, %r9
- mov 24(up), %rax
+ add w2, X0 C 2
+ mov X1, (rp,n,8) C 1
More information about the gmp-commit
mailing list