[Gmp-commit] /var/hg/gmp: Provide Nehalem redc_1.
mercurial at gmplib.org
mercurial at gmplib.org
Sat Sep 21 16:03:21 CEST 2013
details: /var/hg/gmp/rev/3f2e154fe2a3
changeset: 16015:3f2e154fe2a3
user: Torbjörn Granlund <tege at gmplib.org>
date: Sat Sep 21 16:02:01 2013 +0200
description:
Provide Nehalem redc_1.
diffstat:
mpn/x86_64/coreinhm/redc_1.asm | 534 +++++++++++++++++++++++++++++++++++++++++
1 files changed, 534 insertions(+), 0 deletions(-)
diffs (truncated from 538 to 300 lines):
diff -r ab4949cf56a0 -r 3f2e154fe2a3 mpn/x86_64/coreinhm/redc_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreinhm/redc_1.asm Sat Sep 21 16:02:01 2013 +0200
@@ -0,0 +1,534 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Nehalem and Westmere.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea (up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 3(n), i
+ mov (mp,n,8), %rax
+ mov (up,n,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r9
+ mul q0
+ mov $0, R32(%r11)
+ mov 8(up,n,8), %rbx
+ add %rax, %rbx
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r11
+ add %r9, %rbx
+ adc $0, %r11
+ mov 16(up,n,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov 24(mp,n,8), %rax
+ adc %rdx, %r9
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+L(e1): add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp1)
+
+L(ed1): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 5(n), i
+ mov (mp,n,8), %rax
+ mov (up,n,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r9
+ mul q0
+ mov 8(up,n,8), %rbx
+ mov $0, R32(%r11)
+ add %rax, %rbx
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r11
+ add %r9, %rbx
+ adc $0, %r11
+ mov 16(up,n,8), %rbp
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+C jmp L(tp3)
+
+ ALIGNx
+L(tp3): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+ add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp3)
+
+L(ed3): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea -8(up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+ CALL( mpn_add_n)
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0):
+L(otp0):lea 2(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov $0, R32(%r11)
+ mov (up,n,8), %r10
+ add %rax, %r10
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r11
+ mov 8(up,n,8), %rbx
+ mul q0
+ add %rax, %rbx
+ mov $0, R32(%r9)
+ mov 16(mp,n,8), %rax
More information about the gmp-commit
mailing list