[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Sat Feb 19 14:50:08 CET 2011
details: /var/hg/gmp/rev/aecb2b334ec5
changeset: 13861:aecb2b334ec5
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Feb 19 14:46:53 2011 +0100
description:
New Atom/64 shifting files.
details: /var/hg/gmp/rev/3368bb65af64
changeset: 13862:3368bb65af64
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Feb 19 14:48:40 2011 +0100
description:
New Atom/64 file.
diffstat:
ChangeLog                      |    8 +
mpn/x86_64/atom/lshift.asm     |  104 +++++++++++++++++
mpn/x86_64/atom/lshiftc.asm    |  108 +++++++++++++++++
mpn/x86_64/atom/rsh1aors_n.asm |  248 ++++++++++++++++++++++++++++++++++++++++-
mpn/x86_64/atom/rshift.asm     |  102 ++++++++++++++++
5 files changed, 569 insertions(+), 1 deletions(-)
diffs (truncated from 597 to 300 lines):
diff -r 22231ea25f79 -r 3368bb65af64 ChangeLog
--- a/ChangeLog Thu Feb 17 11:09:18 2011 +0100
+++ b/ChangeLog Sat Feb 19 14:48:40 2011 +0100
@@ -1,3 +1,11 @@
+2011-02-19 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/atom/rsh1aors_n.asm: New file.
+
+ * mpn/x86_64/atom/lshift.asm: New file.
+ * mpn/x86_64/atom/rshift.asm: New file.
+ * mpn/x86_64/atom/lshiftc.asm: New file.
+
2011-02-17 Marco Bodrato <bodrato at mail.dm.unipi.it>
* mpn/x86/atom/aorsmul_1.asm: Small improvements for small sizes.
diff -r 22231ea25f79 -r 3368bb65af64 mpn/x86_64/atom/lshift.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/atom/lshift.asm Sat Feb 19 14:48:40 2011 +0100
@@ -0,0 +1,104 @@
+dnl AMD64 mpn_lshift -- mpn left shift, optimised for Atom.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 4.5
+C VIA nano ?
+
+C TODO
+C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times
+C larger.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshift)
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+ shr R32(n)
+ mov (up), %rax
+ jnc L(evn)
+
+ mov %rax, %r11
+ shl R8(%rcx), %r11
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ test n, n
+ jnz L(gt1)
+ mov %r11, (rp)
+ ret
+
+L(gt1): mov -8(up), %r8
+ mov %r8, %r10
+ shr R8(%rcx), %r8
+ jmp L(lo1)
+
+L(evn): mov %rax, %r10
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ mov -8(up), %r9
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ neg R8(%rcx)
+ dec n
+ lea 8(rp), rp
+ lea -8(up), up
+ jz L(end)
+
+ ALIGN(8)
+L(top): shl R8(%rcx), %r10
+ or %r10, %r9
+ shl R8(%rcx), %r11
+ neg R8(%rcx)
+ mov -8(up), %r8
+ mov %r8, %r10
+ mov %r9, -8(rp)
+ shr R8(%rcx), %r8
+ lea -16(rp), rp
+L(lo1): mov -16(up), %r9
+ or %r11, %r8
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ lea -16(up), up
+ neg R8(%rcx)
+ mov %r8, (rp)
+ dec n
+ jg L(top)
+
+L(end): shl R8(%rcx), %r10
+ or %r10, %r9
+ shl R8(%rcx), %r11
+ mov %r9, -8(rp)
+ mov %r11, -16(rp)
+ ret
+EPILOGUE()
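
The file above implements GMP's standard mpn_lshift contract: shift the n-limb operand {up,n} left by cnt bits (1 <= cnt <= 63), store the low n limbs at {rp,n}, and return the bits shifted out of the most significant limb. It walks from the high limb down (note the initial lea -8(up,n,8)), which is why GMP permits the destination to overlap the source when rp >= up. The repeated neg R8(%rcx) toggles the count register between cnt and 64-cnt, exploiting x86's masking of 64-bit shift counts to their low 6 bits, so no second count register is needed. A minimal C sketch of the same semantics follows; it is illustrative only, assumes 64-bit limbs, and uses plain uint64_t where GMP would use mp_limb_t. It does not mirror the asm's 2-way unrolled structure.

#include <stdint.h>
#include <stddef.h>

/* Shift {up,n} left by cnt bits (1 <= cnt <= 63), store the low n
   limbs at {rp,n}, and return the bits shifted out at the top.
   Processes limbs from most significant downward, so overlapping
   operands with rp >= up are fine.  */
static uint64_t
ref_lshift (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;         /* complementary shift count */
  uint64_t high = up[n - 1];
  uint64_t retval = high >> tnc;   /* bits shifted out at the top */

  for (size_t i = n - 1; i > 0; i--)
    {
      uint64_t low = up[i - 1];
      rp[i] = (high << cnt) | (low >> tnc);
      high = low;
    }
  rp[0] = high << cnt;
  return retval;
}
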
diff -r 22231ea25f79 -r 3368bb65af64 mpn/x86_64/atom/lshiftc.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/atom/lshiftc.asm Sat Feb 19 14:48:40 2011 +0100
@@ -0,0 +1,108 @@
+dnl AMD64 mpn_lshiftc -- mpn left shift with complement, optimised for Atom.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5
+C VIA nano ?
+
+C TODO
+C * Consider using 4-way unrolling. We reach 4.5 c/l, but the code is 2.5
+C times larger.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+ shr R32(n)
+ mov (up), %rax
+ jnc L(evn)
+
+ mov %rax, %r11
+ shl R8(%rcx), %r11
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ test n, n
+ jnz L(gt1)
+ not %r11
+ mov %r11, (rp)
+ ret
+
+L(gt1): mov -8(up), %r8
+ mov %r8, %r10
+ shr R8(%rcx), %r8
+ jmp L(lo1)
+
+L(evn): mov %rax, %r10
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ mov -8(up), %r9
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ neg R8(%rcx)
+ lea 8(rp), rp
+ lea -8(up), up
+ jmp L(lo0)
+
+C ALIGN(16)
+L(top): shl R8(%rcx), %r10
+ or %r10, %r9
+ shl R8(%rcx), %r11
+ not %r9
+ neg R8(%rcx)
+ mov -8(up), %r8
+ lea -16(rp), rp
+ mov %r8, %r10
+ shr R8(%rcx), %r8
+ mov %r9, 8(rp)
+L(lo1): or %r11, %r8
+ mov -16(up), %r9
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ lea -16(up), up
+ neg R8(%rcx)
+ not %r8
+ mov %r8, (rp)
+L(lo0): dec n
+ jg L(top)
+
+L(end): shl R8(%rcx), %r10
+ or %r10, %r9
+ not %r9
+ shl R8(%rcx), %r11
+ not %r11
+ mov %r9, -8(rp)
+ mov %r11, -16(rp)
+ ret
+EPILOGUE()
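
mpn_lshiftc is mpn_lshift with a twist: each stored limb is the one's complement of the shifted limb, while the return value (the out-shifted top bits) is left uncomplemented; in the asm, %rax never passes through a not. The extra half cycle/limb over plain lshift on Atom (5 vs 4.5 c/l) presumably pays for those added not instructions. A sketch under the same assumptions as the ref_lshift example above (64-bit limbs, uint64_t standing in for mp_limb_t):

/* Like ref_lshift, but store the one's complement of each result
   limb.  The returned out-shifted bits are NOT complemented.  */
static uint64_t
ref_lshiftc (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;
  uint64_t high = up[n - 1];
  uint64_t retval = high >> tnc;

  for (size_t i = n - 1; i > 0; i--)
    {
      uint64_t low = up[i - 1];
      rp[i] = ~((high << cnt) | (low >> tnc));
      high = low;
    }
  rp[0] = ~(high << cnt);
  return retval;
}
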
diff -r 22231ea25f79 -r 3368bb65af64 mpn/x86_64/atom/rsh1aors_n.asm
--- a/mpn/x86_64/atom/rsh1aors_n.asm Thu Feb 17 11:09:18 2011 +0100
+++ b/mpn/x86_64/atom/rsh1aors_n.asm Sat Feb 19 14:48:40 2011 +0100
@@ -21,5 +21,251 @@
include(`../config.m4')
+C TODO
+C * Schedule loop less. It is now almost surely overscheduled, resulting in
+C large feed-in and wind-down code.
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM	 ?
+C Intel SBR ?
+C Intel atom 5.25
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_rsh1add_n)
+ define(func_nc, mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsh1sub_n)
+ define(func_nc, mpn_rsh1sub_nc)')
+
MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
-include_mpn(`x86_64/pentium4/rsh1aors_n.asm')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), %r15
+ ADDSUB (vp), %r15
+ sbb R32(%rbx), R32(%rbx)
+ xor R32(%rax), R32(%rax)
+ shr %r15
+ adc R32(%rax), R32(%rax) C return value
+
+ mov R32(n), R32(%rbp)
+ and $3, R32(%rbp)
+ jz L(b0)
+ cmp $2, R32(%rbp)
+ jae L(b23)
+
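
The diff is truncated here, but the m4 prologue above already shows the scheme: one source file assembles to either mpn_rsh1add_n or mpn_rsh1sub_n depending on which OPERATION_* symbol is defined, with ADDSUB/ADCSBB expanding to add/adc or sub/sbb. Both compute the n-limb sum or difference of {up,n} and {vp,n} shifted right by one bit, the carry (or borrow) out of the top becoming the new most significant bit, and return the bit shifted out at the bottom. The entry sequence computes that return value: shr %r15 drops bit 0 of the first limb's sum into CF, and adc R32(%rax), R32(%rax) picks it up; the and $3 on n feeds the 4-way unrolled loop. A C sketch of the add flavour (illustrative only; uint64_t stands in for mp_limb_t, and none of the asm's unrolling or scheduling is reflected):

/* {rp,n} = ({up,n} + {vp,n}) >> 1, with the carry out of the n-limb
   addition shifted in at the top.  Returns bit 0 of the sum (the bit
   shifted out at the bottom).  The sub flavour is identical with
   subtraction and borrow in place of addition and carry.  */
static uint64_t
ref_rsh1add (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
             size_t n)
{
  uint64_t prev = up[0] + vp[0];
  uint64_t cy = prev < up[0];      /* carry out of limb 0 */
  uint64_t retval = prev & 1;      /* bit shifted out at the bottom */

  for (size_t i = 1; i < n; i++)
    {
      uint64_t t = up[i] + vp[i] + cy;
      cy = cy ? t <= up[i] : t < up[i];
      rp[i - 1] = (prev >> 1) | (t << 63);
      prev = t;
    }
  rp[n - 1] = (prev >> 1) | (cy << 63);
  return retval;
}
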