[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Tue Sep 24 16:16:38 CEST 2013
details: /var/hg/gmp/rev/b6c39aba9d46
changeset: 16025:b6c39aba9d46
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Sep 24 16:15:05 2013 +0200
description:
Provide Atom redc_1.
details: /var/hg/gmp/rev/fff25440e878
changeset: 16026:fff25440e878
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Sep 24 16:15:39 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 4 +
mpn/x86_64/atom/redc_1.asm | 564 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 568 insertions(+), 0 deletions(-)
diffs (truncated from 579 to 300 lines):
diff -r 0d5699067ff3 -r fff25440e878 ChangeLog
--- a/ChangeLog Tue Sep 24 12:56:48 2013 +0200
+++ b/ChangeLog Tue Sep 24 16:15:39 2013 +0200
@@ -1,3 +1,7 @@
+2013-09-24 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/atom/redc_1.asm: New file.
+
2013-09-23 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/bobcat/redc_1.asm: Make the code for 1 <= n <= 3 work.
diff -r 0d5699067ff3 -r fff25440e878 mpn/x86_64/atom/redc_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/atom/redc_1.asm Tue Sep 24 16:15:39 2013 +0200
@@ -0,0 +1,564 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Atom.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat 5.0
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+C * Make lead-in code for the inner loops be more similar.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+define(`w0', `%rbp')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea (up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 1(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %rbp
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ add (up,n,8), %rbp
+ mov %rax, %rbp
+ adc %r9, %rbx
+ mov 24(mp,n,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 32(mp,n,8), %rax
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+L(e1): add $4, i
+ mov %rdx, %r10
+ js L(tp1)
+
+L(ed1): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 3(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %rbp
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ add (up,n,8), %rbp
+ mov %rax, %rbp
+ mov 24(mp,n,8), %rax
+ adc %r9, %rbx
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %r11
+ mov 32(mp,n,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ jmp L(e3)
+
+ ALIGNx
+L(tp3): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+L(e3): mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+ add $4, i
+ mov %rdx, %r10
+ js L(tp3)
+
+L(ed3): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+ CALL( mpn_add_n)
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0): cmp $-4, R32(n)
+ jz L(n4)
+
More information about the gmp-commit
mailing list