[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sun Sep 22 23:41:14 CEST 2013
details: /var/hg/gmp/rev/b3274fc82d47
changeset: 16017:b3274fc82d47
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 22 23:15:05 2013 +0200
description:
Provide Conroe/Wolfdale as well as Bobcat redc_1.
details: /var/hg/gmp/rev/8b5156eaf065
changeset: 16018:8b5156eaf065
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 22 23:16:27 2013 +0200
description:
Slightly tweak basecase code.
details: /var/hg/gmp/rev/9d6331ced367
changeset: 16019:9d6331ced367
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 22 23:36:10 2013 +0200
description:
Fix typo in DOS64-specific code.
details: /var/hg/gmp/rev/e9627ec5edb8
changeset: 16020:e9627ec5edb8
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 22 23:40:32 2013 +0200
description:
Remove an invalid comment.
details: /var/hg/gmp/rev/cb51e25676ef
changeset: 16021:cb51e25676ef
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 22 23:40:59 2013 +0200
description:
Spacing cleanup.
diffstat:
mpn/x86_64/bobcat/aorsmul_1.asm | 3 -
mpn/x86_64/bobcat/redc_1.asm | 481 ++++++++++++++++++++++++++++++++++++++++
mpn/x86_64/core2/redc_1.asm | 415 ++++++++++++++++++++++++++++++++++
mpn/x86_64/coreinhm/redc_1.asm | 2 +-
mpn/x86_64/coreisbr/redc_1.asm | 4 +-
5 files changed, 899 insertions(+), 6 deletions(-)
diffs (truncated from 950 to 300 lines):
diff -r afb1ca80f5d9 -r cb51e25676ef mpn/x86_64/bobcat/aorsmul_1.asm
--- a/mpn/x86_64/bobcat/aorsmul_1.asm Sun Sep 22 12:19:20 2013 +0200
+++ b/mpn/x86_64/bobcat/aorsmul_1.asm Sun Sep 22 23:40:59 2013 +0200
@@ -32,9 +32,6 @@
C Intel atom 23
C VIA nano 5.63
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
diff -r afb1ca80f5d9 -r cb51e25676ef mpn/x86_64/bobcat/redc_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bobcat/redc_1.asm Sun Sep 22 23:40:59 2013 +0200
@@ -0,0 +1,481 @@
+dnl X86-64 mpn_redc_1 optimised for AMD bobcat.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat 5.0
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+define(`w0', `%rbp')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea (up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 1(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w1
+ add (up,n,8), w2
+ adc w3, %rbx
+ adc $0, w1
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w1, w2
+ adc $0, w3
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+L(e1): mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp1)
+
+L(ed1): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 3(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w1
+ add (up,n,8), w2
+ adc w3, %rbx
+ adc $0, w1
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w1, w2
+ adc $0, w3
+ imul u0inv, %rbx C next q limb
+ jmp L(e3)
+
+ ALIGNx
+L(tp3): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+L(e3): mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp3)
+
+L(ed3): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+ CALL( mpn_add_n)
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0):
+L(otp0):lea (n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w3
+ add (up,n,8), w0
+ adc w1, %rbx
+ adc $0, w3
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
More information about the gmp-commit
mailing list