[Gmp-commit] /var/hg/gmp: New file.
mercurial at gmplib.org
mercurial at gmplib.org
Wed Mar 2 20:47:25 CET 2011
details: /var/hg/gmp/rev/03b5f89c6861
changeset: 13981:03b5f89c6861
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Mar 02 20:47:20 2011 +0100
description:
New file.
diffstat:
ChangeLog | 4 +
mpn/x86_64/coreisbr/aorrlsh_n.asm | 192 ++++++++++++++++++++++++++++++++++++++
2 files changed, 196 insertions(+), 0 deletions(-)
diffs (207 lines):
diff -r a91d9f531583 -r 03b5f89c6861 ChangeLog
--- a/ChangeLog Wed Mar 02 10:33:29 2011 +0100
+++ b/ChangeLog Wed Mar 02 20:47:20 2011 +0100
@@ -1,3 +1,7 @@
+2011-03-02 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/coreisbr/aorrlsh_n.asm: New file.
+
2011-03-01 Niels Möller <nisse at lysator.liu.se>
* mpn/x86_64/mod_1_1.asm (mpn_mod_1_1p_cps): Eliminated a neg and
diff -r a91d9f531583 -r 03b5f89c6861 mpn/x86_64/coreisbr/aorrlsh_n.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreisbr/aorrlsh_n.asm Wed Mar 02 20:47:20 2011 +0100
@@ -0,0 +1,192 @@
+dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 5.25
+C Intel P4 ?
+C Intel core2 3.1
+C Intel NHM 3.95
+C Intel SBR 2.75
+C Intel atom ?
+C VIA nano ?
+
+C This code probably runs close to optimally on Sandy Bridge, and reasonably
+C well on Core 2, but it runs poorly on all other processors, including
+C Nehalem.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+define(`cy', `%r9') C for _nc variant
+
+ifdef(`OPERATION_addlsh_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(IFRSB, )
+ define(func_n, mpn_addlsh_n)
+ define(func_nc, mpn_addlsh_nc)')
+ifdef(`OPERATION_rsblsh_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(IFRSB, `$1')
+ define(func_n, mpn_rsblsh_n)
+ define(func_nc, mpn_rsblsh_nc)')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n mpn_rsblsh_nc)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func_n)
+ push %rbx
+ xor R32(%rbx), R32(%rbx) C clear CF save register
+L(ent): push %rbp
+ mov R32(n), R32(%rbp)
+ mov n, %rax
+ mov R32(cnt), R32(%rcx)
+ neg R32(%rcx)
+ and $3, R32(%rbp)
+ jz L(b0)
+ lea -32(vp,%rbp,8), vp
+ lea -32(up,%rbp,8), up
+ lea -32(rp,%rbp,8), rp
+ cmp $2, R32(%rbp)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): xor %r8, %r8
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ shrd R8(%rcx), %r9, %r8
+ shrd R8(%rcx), %r10, %r9
+ mov 24(vp), %r11
+ shrd R8(%rcx), %r11, %r10
+ sub $3, %rax
+ jz L(3)
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ jmp L(lo3)
+L(3): add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ jmp L(wd3)
+
+L(b0): mov (vp), %r8
+ mov 8(vp), %r9
+ xor R32(%rbp), R32(%rbp)
+ jmp L(lo0)
+
+L(b1): xor %r10, %r10
+ mov 24(vp), %r11
+ shrd R8(%rcx), %r11, %r10
+ sub $1, %rax
+ jz L(1)
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ mov (vp), %r8
+ jmp L(lo1)
+L(1): add R32(%rbx), R32(%rbx)
+ ADCSBB 24(up), %r10
+ jmp L(wd1)
+
+L(b2): xor %r9, %r9
+ mov 16(vp), %r10
+ shrd R8(%rcx), %r10, %r9
+ mov 24(vp), %r11
+ shrd R8(%rcx), %r11, %r10
+ sub $2, %rax
+ jz L(2)
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ jmp L(lo2)
+L(2): add R32(%rbx), R32(%rbx)
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ jmp L(wd2)
+
+ ALIGN(32) C 16-byte alignment is not enough!
+L(top): shrd R8(%rcx), %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ mov %rbp, (rp)
+ lea 32(up), up
+L(lo3): mov %r8, 8(rp)
+L(lo2): mov %r9, 16(rp)
+ mov (vp), %r8
+L(lo1): mov %r10, 24(rp)
+ mov 8(vp), %r9
+ mov %r11, %rbp
+ lea 32(rp), rp
+ sbb R32(%rbx), R32(%rbx)
+L(lo0): shrd R8(%rcx), %r8, %rbp
+ mov 16(vp), %r10
+ shrd R8(%rcx), %r9, %r8
+ shrd R8(%rcx), %r10, %r9
+ mov 24(vp), %r11
+ sub $4, %rax
+ jg L(top)
+
+ shrd R8(%rcx), %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ mov %rbp, (rp)
+L(wd3): mov %r8, 8(rp)
+L(wd2): mov %r9, 16(rp)
+L(wd1): mov %r10, 24(rp)
+ adc R32(%rax), R32(%rax) C rax is zero after loop
+ shr R8(%rcx), %r11
+ ADDSUB %r11, %rax
+IFRSB( neg %rax)
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()
+PROLOGUE(func_nc)
+ push %rbx
+ neg cy
+ sbb R32(%rbx), R32(%rbx) C initialise CF save register
+ jmp L(ent)
+EPILOGUE()
More information about the gmp-commit
mailing list