[Gmp-commit] /var/hg/gmp: Add bobcat mul_basecase.
mercurial at gmplib.org
Tue Mar 6 23:35:48 CET 2012
details: /var/hg/gmp/rev/2bd67045652a
changeset: 14732:2bd67045652a
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Mar 06 23:35:44 2012 +0100
description:
Add bobcat mul_basecase.
diffstat:
ChangeLog | 4 +
mpn/x86_64/bobcat/mul_basecase.asm | 480 +++++++++++++++++++++++++++++++++++++
2 files changed, 484 insertions(+), 0 deletions(-)
diffs (truncated from 495 to 300 lines):
diff -r efd50d52d4e2 -r 2bd67045652a ChangeLog
--- a/ChangeLog Sun Mar 04 22:05:13 2012 +0100
+++ b/ChangeLog Tue Mar 06 23:35:44 2012 +0100
@@ -1,3 +1,7 @@
+2012-03-06 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/bobcat/mul_basecase.asm: New file.
+
2012-03-04 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/bobcat/mul_1.asm: New file.
diff -r efd50d52d4e2 -r 2bd67045652a mpn/x86_64/bobcat/mul_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bobcat/mul_basecase.asm Tue Mar 06 23:35:44 2012 +0100
@@ -0,0 +1,480 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD bobcat.
+
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.5
+C AMD K10 4.5
+C AMD bd1 4.75
+C AMD bobcat 5
+C Intel P4 17.7
+C Intel core2 5.5
+C Intel NHM 5.43
+C Intel SBR 3.92
+C Intel atom 23
+C VIA nano 5.63
+
+C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
+C multiply insn bandwidth, without any apparent loop branch exit pipeline
+C replays experienced on K8. The structure is unusual: it falls into mul_1
+C in the same way for all counts, then it splits into 4 different
+C wind-down blocks and 4 separate addmul_1 loops.
+C
+C We have not tried using the same addmul_1 loops with a switch into feed-in
+C code, as we do in other basecase implementations. Doing that could save
+C substantial code volume, but would also probably add some overhead.
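
A rough C-level sketch of the structure described above, using the public
mpn_mul_1/mpn_addmul_1 entry points purely for illustration (the committed
asm inlines and 4-way unrolls both loops; the name mul_basecase_ref is just
a label for the sketch):

#include <gmp.h>

/* Sketch only: {rp, un+vn} = {up,un} * {vp,vn}, with un >= vn >= 1.
   One mul_1 row for vp[0], then one addmul_1 row per remaining v limb.  */
static void
mul_basecase_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t un,
                  const mp_limb_t *vp, mp_size_t vn)
{
  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);              /* first row */
  for (mp_size_t i = 1; i < vn; i++)                   /* accumulate the rest */
    rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
}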
+
+C TODO
+C * Tune un < 3 code.
+C * Fix slowdown for un=vn=3 (67->71) compared to default code.
+C * This is 1266 bytes, compared to 1099 bytes for default code. Consider
+C combining addmul loops like that code. Tolerable slowdown?
+C * Lots of space could instead be saved by replacing the "switch" code with a
+C gradual jump out from the mul_1 wind-down code, perhaps with no added overhead.
+C * Is ALIGN(16) really necessary? It adds about 40 bytes of padding.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+C Standard allocations
+define(`un', `%rbx')
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+C Temp macro for allowing control over indexing.
+C Define to return $1 for more conservative ptr handling.
+define(`X',`$2')
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ mov (up), %rax
+ mov (vp), v0
+
+ cmp $2, un_param
+ ja L(ge3)
+ jz L(u2)
+
+ mul v0 C u0 x v0
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ DOS64_EXIT()
+ ret
+
+L(u2): mul v0 C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rax
+ mov %rdx, w0
+ mul v0
+ add %rax, w0
+ mov %rdx, w1
+ adc $0, w1
+ cmp $1, R32(vn)
+ jnz L(u2v2)
+ mov w0, 8(rp)
+ mov w1, 16(rp)
+ DOS64_EXIT()
+ ret
+
+L(u2v2):mov 8(vp), v0
+ mov (up), %rax
+ mul v0
+ add %rax, w0
+ mov w0, 8(rp)
+ mov %rdx, %r8 C CAUTION: r8 realloc
+ adc $0, %r8
+ mov 8(up), %rax
+ mul v0
+ add w1, %r8
+ adc $0, %rdx
+ add %r8, %rax
+ adc $0, %rdx
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ DOS64_EXIT()
+ ret
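
The L(u2) and L(u2v2) paths above hold the whole 2x1 or 2x2 schoolbook
product in registers before storing it. A rough C analogue of the 2x2 case,
assuming 64-bit limbs and a compiler providing unsigned __int128 (the name
mul2x2_ref is just a label for the sketch):

#include <gmp.h>

/* Sketch only: {rp,4} = {up,2} * {vp,2}, schoolbook with 64-bit limbs.  */
static void
mul2x2_ref (mp_limb_t rp[4], const mp_limb_t up[2], const mp_limb_t vp[2])
{
  unsigned __int128 t;
  mp_limb_t cy;

  /* Row 0: rp[0..2] = {up,2} * vp[0].  */
  t = (unsigned __int128) up[0] * vp[0];
  rp[0] = (mp_limb_t) t;
  cy = (mp_limb_t) (t >> 64);
  t = (unsigned __int128) up[1] * vp[0] + cy;
  rp[1] = (mp_limb_t) t;
  rp[2] = (mp_limb_t) (t >> 64);

  /* Row 1: add {up,2} * vp[1] into rp[1..3].  */
  t = (unsigned __int128) up[0] * vp[1] + rp[1];
  rp[1] = (mp_limb_t) t;
  cy = (mp_limb_t) (t >> 64);
  t = (unsigned __int128) up[1] * vp[1] + rp[2] + cy;
  rp[2] = (mp_limb_t) t;
  rp[3] = (mp_limb_t) (t >> 64);
}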
+
+
+L(ge3): push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ lea 8(vp), vp
+
+ lea -24(rp,un_param,8), rp
+ lea -24(up,un_param,8), up
+ xor R32(un), R32(un)
+ mov $2, R32(n)
+ sub un_param,un
+ sub un_param, n
+
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(L3)
+
+ ALIGN(16)
+L(top): mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, (rp,n,8)
+ add w1, w2
+ adc $0, w3
+L(L3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, 8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(top)
+
+ mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+
+C Switch on n into the right addmul_1 loop
+ test n, n
+ jz L(r2)
+ cmp $2, R32(n)
+ ja L(r3)
+ jz L(r0)
+ jmp L(r1)
+
+
+L(r3): mov w2, X(-8(rp,n,8),16(rp))
+ mov w3, X((rp,n,8),24(rp))
+ add $2, un
+
+C outer loop(3)
+L(to3): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov 8(up,un,8), %rax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov un, n
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(al3)
+
+ ALIGN(16)
+L(ta3): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(al3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta3)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, %rax
+ mov %rax, X((rp,n,8),24(rp))
+ jmp L(to3)
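
The four addmul_1 loops, of which L(ta3) above is the first shown, all do the
same inner work: each pass multiplies four up[] limbs by v0 and adds the
products, with carry propagation, into the partial product at rp. A rough C
analogue, again assuming 64-bit limbs and unsigned __int128, with n restricted
to a multiple of 4 for brevity (the four loop variants cover the other
residues of n; addmul_1_x4_ref is just a label for the sketch):

#include <gmp.h>

/* Sketch only: add {up,n} * v0 into {rp,n}, return the carry-out limb.
   Processes four limbs per outer iteration, mirroring the 4-way unroll.  */
static mp_limb_t
addmul_1_x4_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, mp_limb_t v0)
{
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i += 4)
    for (int j = 0; j < 4; j++)
      {
        /* up*v0 + rp + cy fits in 128 bits; the high half is the next carry.  */
        unsigned __int128 t =
          (unsigned __int128) up[i + j] * v0 + rp[i + j] + cy;
        rp[i + j] = (mp_limb_t) t;
        cy = (mp_limb_t) (t >> 64);
      }
  return cy;
}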
+
+
+L(r2): mov X(0(up,n,8),(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),-8(rp))
+ add w3, w0
+ adc $0, w1
+ mov X(8(up,n,8),8(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, X((rp,n,8),(rp))
+ add w1, w2
+ adc $0, w3
+ mov X(16(up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(8(rp,n,8),8(rp))
+ add w3, w0
+ adc $0, w1
+ mov w0, X(16(rp,n,8),16(rp))
+ adc $0, w3
+ mov w1, X(24(rp,n,8),24(rp))
+ inc un
+
+C outer loop(2)
+L(to2): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov 16(up,un,8), %rax
+ lea 8(vp), vp