[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Thu Mar 3 00:09:46 CET 2011
details: /var/hg/gmp/rev/1397a21714ec
changeset: 13982:1397a21714ec
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 03 00:09:20 2011 +0100
description:
New file.
details: /var/hg/gmp/rev/7ab84ee0842e
changeset: 13983:7ab84ee0842e
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 03 00:09:43 2011 +0100
description:
Comment changes.
diffstat:
ChangeLog | 4 +
mpn/x86_64/coreinhm/aorrlsh_n.asm | 177 ++++++++++++++++++++++++++++++++++++++
mpn/x86_64/coreisbr/aorrlsh_n.asm | 7 +-
3 files changed, 185 insertions(+), 3 deletions(-)
diffs (215 lines):
diff -r 03b5f89c6861 -r 7ab84ee0842e ChangeLog
--- a/ChangeLog Wed Mar 02 20:47:20 2011 +0100
+++ b/ChangeLog Thu Mar 03 00:09:43 2011 +0100
@@ -1,3 +1,7 @@
+2011-03-03 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/coreinhm/aorrlsh_n.asm: New file.
+
2011-03-02 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/coreisbr/aorrlsh_n.asm: New file.
diff -r 03b5f89c6861 -r 7ab84ee0842e mpn/x86_64/coreinhm/aorrlsh_n.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreinhm/aorrlsh_n.asm Thu Mar 03 00:09:43 2011 +0100
@@ -0,0 +1,177 @@
+dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+dnl Optimised for Nehalem.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 4.75
+C Intel P4 ?
+C Intel core2 2.8-3
+C Intel NHM 2.8
+C Intel SBR 3.55
+C Intel atom ?
+C VIA nano ?
+
+C The inner-loop probably runs close to optimally on Nehalem. The rest of the
+C code is quite crude, and could perhaps be made both smaller and faster.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+define(`cy', `%r9') C for _nc variant
+
+ifdef(`OPERATION_addlsh_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(IFRSB, )
+ define(func_n, mpn_addlsh_n)
+ define(func_nc, mpn_addlsh_nc)')
+ifdef(`OPERATION_rsblsh_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(IFRSB, `$1')
+ define(func_n, mpn_rsblsh_n)
+ define(func_nc, mpn_rsblsh_nc)')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n mpn_rsblsh_nc)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func_n)
+ push %rbx
+ xor R32(%rbx), R32(%rbx) C clear CF save register
+L(ent): push %rbp
+ mov R32(n), R32(%rbp)
+ mov n, %rax
+
+ mov R32(cnt), R32(%rcx)
+ neg R32(%rcx)
+
+ lea -8(up,%rax,8), up
+ lea -8(vp,%rax,8), vp
+ lea -40(rp,%rax,8), rp
+ neg %rax
+
+ and $3, R32(%rbp)
+ jz L(b0)
+ cmp $2, R32(%rbp)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): xor R32(%r9), R32(%r9)
+ mov 8(vp,%rax,8), %r10
+ mov 16(vp,%rax,8), %r11
+ shrd %cl, %r10, %r9
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(up,%rax,8), %r9
+ mov 24(vp,%rax,8), %r8
+ ADCSBB 16(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $3, %rax
+ jmp L(lo3)
+
+L(b0): mov 8(vp,%rax,8), %r9
+ xor R32(%r8), R32(%r8)
+ shrd %cl, %r9, %r8
+ mov 16(vp,%rax,8), %r10
+ mov 24(vp,%rax,8), %r11
+ shrd %cl, %r10, %r9
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(up,%rax,8), %r8
+ mov %r8, 40(rp,%rax,8) C offset 40
+ ADCSBB 16(up,%rax,8), %r9
+ mov 32(vp,%rax,8), %r8
+ ADCSBB 24(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $4, %rax
+ jmp L(lo0)
+
+L(b1): mov 8(vp,%rax,8), %r8
+ add $1, %rax
+ jz L(1)
+ mov 8(vp,%rax,8), %r9
+ xor R32(%rbp), R32(%rbp)
+ jmp L(lo1)
+L(1): xor R32(%r11), R32(%r11)
+ jmp L(wd1)
+
+L(b2): xor %r10, %r10
+ mov 8(vp,%rax,8), %r11
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ mov 16(vp,%rax,8), %r8
+ ADCSBB 8(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $2, %rax
+ jz L(end)
+
+ ALIGN(16)
+L(top): mov 8(vp,%rax,8), %r9
+ mov %r11, %rbp
+L(lo2): mov %r10, 24(rp,%rax,8) C offset 24
+L(lo1): shrd %cl, %r8, %rbp
+ shrd %cl, %r9, %r8
+ mov 16(vp,%rax,8), %r10
+ mov 24(vp,%rax,8), %r11
+ shrd %cl, %r10, %r9
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up,%rax,8), %rbp
+ ADCSBB 8(up,%rax,8), %r8
+ mov %r8, 40(rp,%rax,8) C offset 40
+ ADCSBB 16(up,%rax,8), %r9
+ mov 32(vp,%rax,8), %r8
+ ADCSBB 24(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $4, %rax
+ mov %rbp, (rp,%rax,8) C offset 32
+L(lo0):
+L(lo3): mov %r9, 16(rp,%rax,8) C offset 48
+ jnz L(top)
+
+L(end): mov %r10, 24(rp,%rax,8)
+L(wd1): shrd %cl, %r8, %r11
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up,%rax,8), %r11
+ mov %r11, 32(rp,%rax,8) C offset 32
+ adc R32(%rax), R32(%rax) C rax is zero after loop
+ shr R8(%rcx), %r8
+ ADDSUB %r8, %rax
+IFRSB( neg %rax)
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()
+PROLOGUE(func_nc)
+ push %rbx
+ neg cy
+ sbb R32(%rbx), R32(%rbx) C initialise CF save register
+ jmp L(ent)
+EPILOGUE()
diff -r 03b5f89c6861 -r 7ab84ee0842e mpn/x86_64/coreisbr/aorrlsh_n.asm
--- a/mpn/x86_64/coreisbr/aorrlsh_n.asm Wed Mar 02 20:47:20 2011 +0100
+++ b/mpn/x86_64/coreisbr/aorrlsh_n.asm Thu Mar 03 00:09:43 2011 +0100
@@ -1,5 +1,6 @@
dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+dnl Optimised for Sandy Bridge.
dnl Contributed to the GNU project by Torbjorn Granlund.
@@ -32,9 +33,9 @@
C Intel atom ?
C VIA nano ?
-C This code probably runs close to optimally on Sandy Bridge, and reasonably
-C well on Core 2, but it runs poorly on all other processors, including
-C Nehalem.
+C The inner-loop probably runs close to optimally on Sandy Bridge using 4-way
+C unrolling. The rest of the code is quite crude, and could perhaps be made
+C both smaller and faster.
C INPUT PARAMETERS
define(`rp', `%rdi')
More information about the gmp-commit
mailing list