[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sun Feb 13 00:37:50 CET 2011
details: /var/hg/gmp/rev/3cb34e9e9bf2
changeset: 13847:3cb34e9e9bf2
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Feb 12 19:11:54 2011 +0100
description:
Add more c/l numbers.
details: /var/hg/gmp/rev/091da29d9813
changeset: 13848:091da29d9813
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Feb 12 19:12:51 2011 +0100
description:
Add comment.
details: /var/hg/gmp/rev/7521314d019c
changeset: 13849:7521314d019c
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Feb 12 19:15:27 2011 +0100
description:
Minor tweaks, update c/l numbers.
details: /var/hg/gmp/rev/fad5399da6b8
changeset: 13850:fad5399da6b8
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Feb 13 00:11:22 2011 +0100
description:
New file for Atom/64.
details: /var/hg/gmp/rev/87361d967b0f
changeset: 13851:87361d967b0f
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Feb 13 00:12:12 2011 +0100
description:
*** empty log message ***
diffstat:
ChangeLog | 8 +-
mpn/x86_64/aorrlshC_n.asm | 3 +-
mpn/x86_64/aorrlsh_n.asm | 121 +++++++++++++--------------
mpn/x86_64/aors_n.asm | 3 +-
mpn/x86_64/atom/aorrlsh2_n.asm | 174 ++++++++++++++++++++++++++++++++++++++++
mpn/x86_64/core2/aors_n.asm | 3 +-
mpn/x86_64/core2/rsh1aors_n.asm | 3 +-
mpn/x86_64/core2/sublshC_n.asm | 3 +-
mpn/x86_64/divrem_2.asm | 2 +-
mpn/x86_64/rsh1aors_n.asm | 1 +
10 files changed, 252 insertions(+), 69 deletions(-)
diffs (truncated from 489 to 300 lines):
diff -r ebcc7e666700 -r 87361d967b0f ChangeLog
--- a/ChangeLog Sat Feb 12 16:08:29 2011 +0100
+++ b/ChangeLog Sun Feb 13 00:12:12 2011 +0100
@@ -1,5 +1,11 @@
+2011-02-13 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/atom/aorrlsh2_n.asm: New file.
+
2011-02-12 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/x86_64/aorrlsh_n.asm: Minor tweaks, update c/l numbers.
+
* mpn/x86_64/atom/sublsh1_n.asm: New file.
* mpn/x86_64/atom/aorrlsh1_n.asm: New file.
@@ -55,7 +61,7 @@
* tests/mpn/t-toom8h.c: No tests below MPN_TOOM8H_MIN.
* mpz/lucnum_ui.c: Use mpn_addlsh2_n.
-
+
2011-02-04 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/atom/rsh1aors_n.asm: Add a MULFUNC_PROLOGUE.
diff -r ebcc7e666700 -r 87361d967b0f mpn/x86_64/aorrlshC_n.asm
--- a/mpn/x86_64/aorrlshC_n.asm Sat Feb 12 16:08:29 2011 +0100
+++ b/mpn/x86_64/aorrlshC_n.asm Sun Feb 13 00:12:12 2011 +0100
@@ -24,7 +24,8 @@
C AMD K10 2
C Intel P4 ?
C Intel core2 3
-C Intel corei 2.75
+C Intel NHM 2.75
+C Intel SBR 2.55
C Intel atom ?
C VIA nano ?
diff -r ebcc7e666700 -r 87361d967b0f mpn/x86_64/aorrlsh_n.asm
--- a/mpn/x86_64/aorrlsh_n.asm Sat Feb 12 16:08:29 2011 +0100
+++ b/mpn/x86_64/aorrlsh_n.asm Sun Feb 13 00:12:12 2011 +0100
@@ -2,7 +2,7 @@
dnl ("rsb" means reversed subtract, name mandated by mpn_sublsh1_n which
dnl subtracts the shifted operand from the unshifted operand.)
-dnl Copyright 2006, 2010 Free Software Foundation, Inc.
+dnl Copyright 2006, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -23,13 +23,14 @@
C cycles/limb
-C AMD K8,K9 3.25 (mpn_lshift + mpn_add_n costs 3.85 c/l)
-C AMD K10 3.25 (mpn_lshift + mpn_add_n costs 3.85 c/l)
-C Intel P4 15 (mpn_lshift + mpn_add_n costs 7.33 c/l)
-C Intel core2 4 (mpn_lshift + mpn_add_n costs 3.27 c/l)
-C Intel corei 4 (mpn_lshift + mpn_add_n costs 3.75 c/l)
-C Intel atom ?
-C VIA nano 4.7 (mpn_lshift + mpn_add_n costs 6.25 c/l)
+C AMD K8,K9 3.1 < 3.85 for lshift + add_n, using mul might reach 2.83
+C AMD K10 3.1 < 3.85 for lshift + add_n, using mul might reach 2.83
+C Intel P4 14.6 > 7.33 for lshift + add_n
+C Intel core2 3.87 > 3.27 for lshift + add_n
+C Intel NHM 4 > 3.75 for lshift + add_n
+C Intel SBR (5.8) > 3.46 for lshift + add_n
+C Intel atom (7.75) < 8.75 for lshift + add_n
+C VIA nano 4.7 < 6.25 for lshift + add_n
C This was written quickly and not optimized at all. Surely one could get
C closer to 3 c/l or perhaps even under 3 c/l. Ideas:
@@ -48,11 +49,11 @@
define(`cnt', `%r8')
ifdef(`OPERATION_addlsh_n',`
- define(ADDSUBC, `adc')
+ define(ADCSBB, `adc')
define(func, mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
- define(ADDSUBC, `sbb')
+ define(ADCSBB, `sbb')
define(func, mpn_rsblsh_n)
')
@@ -62,48 +63,45 @@
TEXT
ALIGN(16)
PROLOGUE(func)
-
push %r12
push %r13
push %r14
- push %r15
+ push %rbp
push %rbx
mov n, %rax
- xor %ebx, %ebx C clear carry save register
- mov %r8d, %ecx C shift count
- xor %r15d, %r15d C limb carry
+ xor R32(%rbx), R32(%rbx) C clear carry save register
+ mov R32(%r8), R32(%rcx) C shift count
+ xor R32(%rbp), R32(%rbp) C limb carry
- mov %eax, %r11d
- and $3, %r11d
+ mov R32(%rax), R32(%r11)
+ and $3, R32(%r11)
je L(4)
- sub $1, %r11d
+ sub $1, R32(%r11)
-L(oopette):
- mov 0(vp), %r8
+L(012): mov (vp), %r8
mov %r8, %r12
- shl %cl, %r8
- or %r15, %r8
- neg %cl
- mov %r12, %r15
- shr %cl, %r15
- neg %cl
- add %ebx, %ebx
- ADDSUBC 0(up), %r8
- mov %r8, 0(rp)
- sbb %ebx, %ebx
+ shl R8(%rcx), %r8
+ or %rbp, %r8
+ neg R8(%rcx)
+ mov %r12, %rbp
+ shr R8(%rcx), %rbp
+ neg R8(%rcx)
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ sbb R32(%rbx), R32(%rbx)
lea 8(up), up
lea 8(vp), vp
lea 8(rp), rp
- sub $1, %r11d
- jnc L(oopette)
+ sub $1, R32(%r11)
+ jnc L(012)
-L(4):
- sub $4, %rax
+L(4): sub $4, %rax
jc L(end)
-L(oop):
- mov 0(vp), %r8
+ ALIGN(16)
+L(top): mov (vp), %r8
mov %r8, %r12
mov 8(vp), %r9
mov %r9, %r13
@@ -111,55 +109,54 @@
mov %r10, %r14
mov 24(vp), %r11
- shl %cl, %r8
- shl %cl, %r9
- shl %cl, %r10
- or %r15, %r8
- mov %r11, %r15
- shl %cl, %r11
+ shl R8(%rcx), %r8
+ shl R8(%rcx), %r9
+ shl R8(%rcx), %r10
+ or %rbp, %r8
+ mov %r11, %rbp
+ shl R8(%rcx), %r11
- neg %cl
+ neg R8(%rcx)
- shr %cl, %r12
- shr %cl, %r13
- shr %cl, %r14
- shr %cl, %r15 C used next loop
+ shr R8(%rcx), %r12
+ shr R8(%rcx), %r13
+ shr R8(%rcx), %r14
+ shr R8(%rcx), %rbp C used next iteration
or %r12, %r9
or %r13, %r10
or %r14, %r11
- neg %cl
+ neg R8(%rcx)
- add %ebx, %ebx C restore carry flag
+ add R32(%rbx), R32(%rbx) C restore carry flag
- ADDSUBC 0(up), %r8
- ADDSUBC 8(up), %r9
- ADDSUBC 16(up), %r10
- ADDSUBC 24(up), %r11
+ ADCSBB (up), %r8
+ ADCSBB 8(up), %r9
+ ADCSBB 16(up), %r10
+ ADCSBB 24(up), %r11
- mov %r8, 0(rp)
+ mov %r8, (rp)
mov %r9, 8(rp)
mov %r10, 16(rp)
mov %r11, 24(rp)
- sbb %ebx, %ebx C save carry flag
+ sbb R32(%rbx), R32(%rbx) C save carry flag
lea 32(up), up
lea 32(vp), vp
lea 32(rp), rp
sub $4, %rax
- jnc L(oop)
-L(end):
- add %ebx, %ebx
- ADDSUBC $0, %r15
- mov %r15, %rax
+ jnc L(top)
+
+L(end): add R32(%rbx), R32(%rbx)
+ ADCSBB $0, %rbp
+ mov %rbp, %rax
pop %rbx
- pop %r15
+ pop %rbp
pop %r14
pop %r13
pop %r12
-
ret
EPILOGUE()
diff -r ebcc7e666700 -r 87361d967b0f mpn/x86_64/aors_n.asm
--- a/mpn/x86_64/aors_n.asm Sat Feb 12 16:08:29 2011 +0100
+++ b/mpn/x86_64/aors_n.asm Sun Feb 13 00:12:12 2011 +0100
@@ -25,7 +25,8 @@
C AMD K10 1.5
C Intel P4 ?
C Intel core2 4.9
-C Intel corei ?
+C Intel NHM 5.5
+C Intel SBR 1.59
C Intel atom 4
C VIA nano 3.25
diff -r ebcc7e666700 -r 87361d967b0f mpn/x86_64/atom/aorrlsh2_n.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/atom/aorrlsh2_n.asm Sun Feb 13 00:12:12 2011 +0100
@@ -0,0 +1,174 @@
+dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+dnl Optimised for Intel Atom.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5.75
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+define(`LSH', 2)
+define(`RSH', 62)
+define(M, eval(m4_lshift(1,LSH)))
+
+ifdef(`OPERATION_addlsh2_n', `
More information about the gmp-commit
mailing list