[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Feb 12 16:08:33 CET 2011
details: /var/hg/gmp/rev/c29c5c4bf094
changeset: 13845:c29c5c4bf094
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Feb 12 16:07:14 2011 +0100
description:
New file for Atom/64.
details: /var/hg/gmp/rev/ebcc7e666700
changeset: 13846:ebcc7e666700
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Feb 12 16:08:29 2011 +0100
description:
Add misc comments.
diffstat:
ChangeLog | 2 +
mpn/x86_64/atom/aorrlsh1_n.asm | 9 +-
mpn/x86_64/atom/sublsh1_n.asm | 219 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 226 insertions(+), 4 deletions(-)
diffs (273 lines):
diff -r 0474659eb4b7 -r ebcc7e666700 ChangeLog
--- a/ChangeLog Sat Feb 12 11:39:09 2011 +0100
+++ b/ChangeLog Sat Feb 12 16:08:29 2011 +0100
@@ -1,5 +1,7 @@
2011-02-12 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/x86_64/atom/sublsh1_n.asm: New file.
+
* mpn/x86_64/atom/aorrlsh1_n.asm: New file.
2011-02-11 Torbjorn Granlund <tege at gmplib.org>
diff -r 0474659eb4b7 -r ebcc7e666700 mpn/x86_64/atom/aorrlsh1_n.asm
--- a/mpn/x86_64/atom/aorrlsh1_n.asm Sat Feb 12 11:39:09 2011 +0100
+++ b/mpn/x86_64/atom/aorrlsh1_n.asm Sat Feb 12 16:08:29 2011 +0100
@@ -1,5 +1,6 @@
dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+dnl Optimised for Intel Atom.
dnl Copyright 2011 Free Software Foundation, Inc.
@@ -21,7 +22,8 @@
include(`../config.m4')
C TODO
-C * This code is slightly large at 441 bytes.
+C * This code is slightly large at 433 bytes.
+C * sublsh1_n.asm and this file use the same basic pattern.
C cycles/limb
C AMD K8,K9 ?
@@ -30,7 +32,7 @@
C Intel core2 ?
C Intel NHM ?
C Intel SBR ?
-C Intel atom 4.875
+C Intel atom 4.875 (4.75 is probably possible)
C VIA nano ?
C INPUT PARAMETERS
@@ -192,8 +194,7 @@
L(x): sub $8, n
jge L(top)
-L(end):
- pop %rbx
+L(end): pop %rbx
pop %r14
pop %r13
pop %r12
diff -r 0474659eb4b7 -r ebcc7e666700 mpn/x86_64/atom/sublsh1_n.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/atom/sublsh1_n.asm Sat Feb 12 16:08:29 2011 +0100
@@ -0,0 +1,219 @@
+dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1) optimised for Intel Atom.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * This code is slightly large at 501 bytes.
+C * aorrlsh1_n.asm and this file use the same basic pattern.
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5 (4.875 is probably possible)
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sublsh1_n)
+ push %rbp
+ push %r15
+ xor R32(%rbp), R32(%rbp)
+L(ent): mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jz L(b0)
+ cmp $2, R32(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov (vp), %r8
+ add %r8, %r8
+ lea 8(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 8(up), up
+ lea 8(rp), rp
+ jmp L(b0)
+
+L(b2): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ lea 16(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ mov 8(up), %r15
+ sbb %r9, %r15
+ mov %r15, 8(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 16(up), up
+ lea 16(rp), rp
+ jmp L(b0)
+
+L(b3): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ lea 24(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ mov 8(up), %r15
+ sbb %r9, %r15
+ mov %r15, 8(rp)
+ mov 16(up), %r15
+ sbb %r10, %r15
+ mov %r15, 16(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 24(up), up
+ lea 24(rp), rp
+
+L(b0): test $4, R8(n)
+ jz L(skp)
+ add R32(%rax), R32(%rax) C restore scy
+ mov (vp), %r8
+ adc %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ mov 24(vp), %r11
+ adc %r11, %r11
+ lea 32(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ mov 8(up), %r15
+ sbb %r9, %r15
+ mov %r15, 8(rp)
+ mov 16(up), %r15
+ sbb %r10, %r15
+ mov %r15, 16(rp)
+ mov 24(up), %r15
+ sbb %r11, %r15
+ mov %r15, 24(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ sbb R32(%rbp), R32(%rbp) C save acy
+
+L(skp): cmp $8, n
+ jl L(rtn)
+
+ push %r12
+ push %r13
+ push %r14
+ push %rbx
+ lea -64(rp), rp
+ jmp L(x)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+ add %eax, %eax
+ lea 64(vp), vp
+ adc %r8, %r8
+ mov -56(vp), %r9
+ adc %r9, %r9
+ mov -48(vp), %r10
+ adc %r10, %r10
+ mov -40(vp), %r11
+ adc %r11, %r11
+ mov -32(vp), %r12
+ adc %r12, %r12
+ mov -24(vp), %r13
+ adc %r13, %r13
+ mov -16(vp), %r14
+ adc %r14, %r14
+ mov -8(vp), %r15
+ adc %r15, %r15
+ sbb %eax, %eax
+ add %ebp, %ebp
+ mov (up), %rbp
+ lea 64(rp), rp
+ mov 8(up), %rbx
+ sbb %r8, %rbp
+ mov 32(up), %r8
+ mov %rbp, (rp)
+ sbb %r9, %rbx
+ mov 16(up), %rbp
+ mov %rbx, 8(rp)
+ sbb %r10, %rbp
+ mov 24(up), %rbx
+ mov %rbp, 16(rp)
+ sbb %r11, %rbx
+ mov %rbx, 24(rp)
+ sbb %r12, %r8
+ mov 40(up), %r9
+ mov %r8, 32(rp)
+ sbb %r13, %r9
+ mov 48(up), %rbp
+ mov %r9, 40(rp)
+ sbb %r14, %rbp
+ mov 56(up), %rbx
+ mov %rbp, 48(rp)
+ sbb %r15, %rbx
+ lea 64(up), up
+ mov %rbx, 56(rp)
+ sbb %ebp, %ebp
+L(x): sub $8, n
+ jge L(top)
+
+L(end): pop %rbx
+ pop %r14
+ pop %r13
+ pop %r12
+L(rtn):
+ add R32(%rbp), R32(%rax)
+ neg R32(%rax)
+
+ pop %r15
+ pop %rbp
+ ret
+EPILOGUE()
+PROLOGUE(mpn_sublsh1_nc)
+ push %rbp
+ push %r15
+ neg %r8 C set CF
+ sbb R32(%rbp), R32(%rbp) C save acy
+ jmp L(ent)
+EPILOGUE()
More information about the gmp-commit
mailing list