[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sun Apr 23 22:22:09 UTC 2017
details: /var/hg/gmp/rev/7bd9f4521ecf
changeset: 17363:7bd9f4521ecf
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Apr 20 03:00:05 2017 +0200
description:
Add more c/l numbers.
details: /var/hg/gmp/rev/9bcf001debdc
changeset: 17364:9bcf001debdc
user: Torbjorn Granlund <tg at gmplib.org>
date: Thu Apr 20 03:08:59 2017 +0200
description:
Add more c/l numbers.
details: /var/hg/gmp/rev/b8d7c87ee026
changeset: 17365:b8d7c87ee026
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Apr 23 21:01:06 2017 +0200
description:
Rewrite feed-in code and add mul_1c entry point.
details: /var/hg/gmp/rev/daaf1eaf2767
changeset: 17366:daaf1eaf2767
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Apr 23 21:39:07 2017 +0200
description:
Replace "bt" by "test".
diffstat:
mpn/x86_64/aorsmul_1.asm | 39 ++++----
mpn/x86_64/coreisbr/mul_1.asm | 174 ++++++++++++++++++++++++-----------------
mpn/x86_64/mod_34lsub1.asm | 10 ++
3 files changed, 132 insertions(+), 91 deletions(-)
diffs (truncated from 303 to 300 lines):
diff -r 021277dcb21f -r daaf1eaf2767 mpn/x86_64/aorsmul_1.asm
--- a/mpn/x86_64/aorsmul_1.asm Tue Apr 18 23:47:55 2017 +0200
+++ b/mpn/x86_64/aorsmul_1.asm Sun Apr 23 21:39:07 2017 +0200
@@ -31,25 +31,26 @@
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.52
-C AMD K10 2.51
-C AMD bull 4.43
-C AMD pile 5.03 5.63
-C AMD steam
-C AMD excavator
-C AMD bobcat 6.20
-C AMD jaguar 5.57 6.56
-C Intel P4 14.9 17.1
-C Intel core2 5.15
-C Intel NHM 4.93
-C Intel SBR 3.95
-C Intel IBR 3.75
-C Intel HWL 3.62
-C Intel BWL 2.53
-C Intel SKL 2.53
-C Intel atom 21.3
-C Intel SLM 9.0
-C VIA nano 5.0
+C AMD K8,K9 2.52
+C AMD K10 2.51
+C AMD bd1 4.43
+C AMD bd2 5.03 5.63
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen ?
+C AMD bobcat 6.20
+C AMD jaguar 5.57 6.56
+C Intel P4 14.9 17.1
+C Intel core2 5.15
+C Intel NHM 4.93
+C Intel SBR 3.95
+C Intel IBR 3.75
+C Intel HWL 3.62
+C Intel BWL 2.53
+C Intel SKL 2.53
+C Intel atom 21.3
+C Intel SLM 9.0
+C VIA nano 5.0
C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
diff -r 021277dcb21f -r daaf1eaf2767 mpn/x86_64/coreisbr/mul_1.asm
--- a/mpn/x86_64/coreisbr/mul_1.asm Tue Apr 18 23:47:55 2017 +0200
+++ b/mpn/x86_64/coreisbr/mul_1.asm Sun Apr 23 21:39:07 2017 +0200
@@ -2,7 +2,8 @@
dnl Contributed to the GNU project by Torbjörn Granlund.
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -56,114 +57,143 @@
C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-C TODO
-C * The loop is great, but the prologue code was quickly written. Tune it!
-C * Add mul_1c entry point.
-C * We could preserve one less register under DOS64 calling conventions, using
-C r10 instead of rsi.
-
define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
+define(`up_param',`%rsi') C rdx
define(`n_param', `%rdx') C r8
define(`v0', `%rcx') C r9
+define(`cin', `%r8') C stack
-define(`n', `%r11')
+define(`up', `%rsi') C same as up_param
+define(`n', `%r9')
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
-IFDOS(` define(`up', ``%rsi'')') dnl
-IFDOS(` define(`rp', ``%rcx'')') dnl
-IFDOS(` define(`v0', ``%r9'')') dnl
-IFDOS(` define(`r9', ``rdi'')') dnl
-IFDOS(` define(`n_param',``%r8'')') dnl
-IFDOS(` define(`n', ``%r8'')') dnl
-IFDOS(` define(`r8', ``r11'')') dnl
+IFDOS(` define(`rp', `%rcx')')
+IFDOS(` define(`up_param',`%rdx')')
+IFDOS(` define(`n_param', `%r8')')
+IFDOS(` define(`v0', `%r9')')
+IFDOS(` define(`cin', `48(%rsp)')')
+
+IFDOS(` define(`up', `%rsi')')
+IFDOS(` define(`n', `%r8')')
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_1)
-
-IFDOS(``push %rsi '')
-IFDOS(``push %rdi '')
-IFDOS(``mov %rdx, %rsi '')
-
- mov (up), %rax
- mov R32(`n_param'), R32(%r10)
-IFSTD(` mov n_param, n ')
-
- lea (up,n_param,8), up
+IFDOS(` push %rsi ')
+ mov (up_param), %rax
+IFSTD(` mov n_param, n ')
+ lea (up_param,n_param,8), up
lea -8(rp,n_param,8), rp
neg n
mul v0
- and $3, R32(%r10)
- jz L(b0)
- cmp $2, R32(%r10)
- jb L(b1)
- jz L(b2)
-L(b3): add $-1, n
- mov %rax, %r9
- mov %rdx, %r8
- mov 16(up,n,8), %rax
+ test $1, R8(n)
+ jz L(x0)
+L(x1): mov %rax, %r11
+ mov %rdx, %r10
+ test $2, R8(n)
+ jnz L(01)
+
+L(11): mov 8(up,n,8), %rax
+ dec n
jmp L(L3)
-L(b1): mov %rax, %r9
- mov %rdx, %r8
- add $1, n
- jnc L(L1)
+L(01): inc n
+ jnz L(L1)
mov %rax, (rp)
mov %rdx, %rax
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
+IFDOS(` pop %rsi ')
ret
-L(b2): add $-2, n
- mov %rax, %r8
- mov %rdx, %r9
- mov 24(up,n,8), %rax
+L(x0): mov %rax, %r10
+ mov %rdx, %r11
+ mov 8(up,n,8), %rax
+ test $2, R8(n)
+ jz L(L0)
+
+L(10): add $-2, n
jmp L(L2)
-L(b0): mov %rax, %r8
- mov %rdx, %r9
- mov 8(up,n,8), %rax
- jmp L(L0)
-
ALIGN(8)
-L(top): mov %rdx, %r8
- add %rax, %r9
+L(top): mov %rdx, %r10
+ add %rax, %r11
L(L1): mov 0(up,n,8), %rax
- adc $0, %r8
+ adc $0, %r10
mul v0
- add %rax, %r8
- mov %r9, 0(rp,n,8)
+ add %rax, %r10
+ mov %r11, 0(rp,n,8)
mov 8(up,n,8), %rax
- mov %rdx, %r9
- adc $0, %r9
+ mov %rdx, %r11
+L(L0c): adc $0, %r11
L(L0): mul v0
- mov %r8, 8(rp,n,8)
- add %rax, %r9
- mov %rdx, %r8
- mov 16(up,n,8), %rax
- adc $0, %r8
+ mov %r10, 8(rp,n,8)
+ add %rax, %r11
+ mov %rdx, %r10
+L(L3c): mov 16(up,n,8), %rax
+ adc $0, %r10
L(L3): mul v0
- mov %r9, 16(rp,n,8)
- mov %rdx, %r9
- add %rax, %r8
- mov 24(up,n,8), %rax
- adc $0, %r9
+ mov %r11, 16(rp,n,8)
+ mov %rdx, %r11
+ add %rax, %r10
+L(L2c): mov 24(up,n,8), %rax
+ adc $0, %r11
L(L2): mul v0
- mov %r8, 24(rp,n,8)
+ mov %r10, 24(rp,n,8)
add $4, n
jnc L(top)
-L(end): add %rax, %r9
+L(end): add %rax, %r11
mov %rdx, %rax
adc $0, %rax
- mov %r9, (rp)
+ mov %r11, (rp)
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
+IFDOS(` pop %rsi ')
ret
EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(` push %rsi ')
+ mov (up_param), %rax
+IFSTD(` mov n_param, n ')
+ lea (up_param,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ neg n
+ mul v0
+
+ test $1, R8(n)
+ jz L(x0c)
+L(x1c): mov %rax, %r11
+ mov %rdx, %r10
+ test $2, R8(n)
+ jnz L(01c)
+
+L(11c): add cin, %r11
+ dec n
+ jmp L(L3c)
+
+L(01c): add cin, %r11
+ inc n
+ jnz L(L1)
+ mov %r11, (rp)
+ mov %rdx, %rax
+ adc $0, %rax
+IFDOS(` pop %rsi ')
+ ret
+
+L(x0c): mov %rax, %r10
+ mov %rdx, %r11
+ test $2, R8(n)
+ jz L(00c)
+
+L(10c): add $-2, n
+ add cin, %r10
+ jmp L(L2c)
+
+L(00c): add cin, %r10
+ mov 8(up,n,8), %rax
+ jmp L(L0c)
+EPILOGUE()
diff -r 021277dcb21f -r daaf1eaf2767 mpn/x86_64/mod_34lsub1.asm
--- a/mpn/x86_64/mod_34lsub1.asm Tue Apr 18 23:47:55 2017 +0200
+++ b/mpn/x86_64/mod_34lsub1.asm Sun Apr 23 21:39:07 2017 +0200
@@ -36,12 +36,22 @@
C AMD K8,K9 0.67 0.583 is possible with zero-reg instead of $0, 4-way
C AMD K10 0.67 this seems hard to beat
C AMD bd1 1
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 0.62
C AMD bobcat 1.07
+C AMD jaguar 1
C Intel P4 7.35 terrible, use old code
C Intel core2 1.25 1+epsilon with huge unrolling
C Intel NHM 1.15 this seems hard to beat
C Intel SBR 0.93
+C Intel IBR 0.93
+C Intel HWL 0.82
+C Intel BWL 0.64
+C Intel SKL 0.60
C Intel atom 2.5
+C Intel SLM 1.59
More information about the gmp-commit
mailing list