[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Mon Mar 12 14:02:46 CET 2012
details: /var/hg/gmp/rev/3f44bd313919
changeset: 14748:3f44bd313919
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Mar 12 14:02:17 2012 +0100
description:
Add bobcat sqr_basecase.

details: /var/hg/gmp/rev/e8acb1f4ae01
changeset: 14749:e8acb1f4ae01
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Mar 12 14:02:42 2012 +0100
description:
Minor tuning.
diffstat:
ChangeLog | 5 +
mpn/x86_64/bobcat/mul_basecase.asm | 28 +-
mpn/x86_64/bobcat/sqr_basecase.asm | 555 +++++++++++++++++++++++++++++++++++++
3 files changed, 572 insertions(+), 16 deletions(-)
diffs (truncated from 652 to 300 lines):
diff -r 531457fe1ff8 -r e8acb1f4ae01 ChangeLog
--- a/ChangeLog Sat Mar 10 17:05:58 2012 +0100
+++ b/ChangeLog Mon Mar 12 14:02:42 2012 +0100
@@ -1,3 +1,8 @@
+2012-03-12 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/bobcat/sqr_basecase.asm: New file.
+ * mpn/x86_64/bobcat/mul_basecase.asm: Minor tuning.
+
2012-03-10 Torbjorn Granlund <tege at gmplib.org>
* configure.in (fat_functions): Add addlsh1_n, addlsh2_n, addmul_2,
diff -r 531457fe1ff8 -r e8acb1f4ae01 mpn/x86_64/bobcat/mul_basecase.asm
--- a/mpn/x86_64/bobcat/mul_basecase.asm Sat Mar 10 17:05:58 2012 +0100
+++ b/mpn/x86_64/bobcat/mul_basecase.asm Mon Mar 12 14:02:42 2012 +0100
@@ -34,9 +34,9 @@
C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
C multiply insn bandwidth, without any apparent loop branch exit pipeline
-C replays experienced on K8. The structure is unusual: it falls into mul_1
-C without in the same way for all counts, then it splits into 4 different
-C wind-down blocks and 4 separate addmul_1 loops.
+C replays experienced on K8. The structure is unusual: it falls into mul_1 in
+C the same way for all n, then it splits into 4 different wind-down blocks and
+C 4 separate addmul_1 loops.
C
C We have not tried using the same addmul_1 loops with a switch into feed-in
C code, as we do in other basecase implementations. Doing that could save
@@ -45,11 +45,11 @@
C TODO
C * Tune un < 3 code.
C * Fix slowdown for un=vn=3 (67->71) compared to default code.
-C * This is 1266 bytes, compared to 1099 bytes for default code. Consider
+C * This is 1263 bytes, compared to 1099 bytes for default code. Consider
C combining addmul loops like that code. Tolerable slowdown?
-C * Lots of space could instead be saved by replacing the "switch" code by
-C gradual jump out from mul_1 winddown code, perhaps with no added overhead.
-C * Is ALIGN(16) really necessary? It adds about 40 bytes of padding.
+C * Lots of space could be saved by replacing the "switch" code by gradual
+C jumps out from mul_1 winddown code, perhaps with no added overhead.
+C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
@@ -139,7 +139,7 @@
lea -24(up,un_param,8), up
xor R32(un), R32(un)
mov $2, R32(n)
- sub un_param,un
+ sub un_param, un
sub un_param, n
mul v0
@@ -246,8 +246,7 @@
adc $0, w3
add w2, X(-8(rp,n,8),16(rp))
adc $0, w3
- mov w3, %rax
- mov %rax, X((rp,n,8),24(rp))
+ mov w3, X((rp,n,8),24(rp))
jmp L(to3)
@@ -327,8 +326,7 @@
adc $0, w3
add w2, X(-8(rp,n,8),16(rp))
adc $0, w3
- mov w3, %rax
- mov %rax, X((rp,n,8),24(rp))
+ mov w3, X((rp,n,8),24(rp))
jmp L(to2)
@@ -400,8 +398,7 @@
adc $0, w3
add w2, X(-8(rp,n,8),16(rp))
adc $0, w3
- mov w3, %rax
- mov %rax, X((rp,n,8),24(rp))
+ mov w3, X((rp,n,8),24(rp))
jmp L(to1)
@@ -466,8 +463,7 @@
adc $0, w3
add w2, X(-8(rp,n,8),16(rp))
adc $0, w3
- mov w3, %rax
- mov %rax, X((rp,n,8),24(rp))
+ mov w3, X((rp,n,8),24(rp))
jmp L(to0)
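
The comment block above describes the structure: a schoolbook multiplication assembled from one mul_1 pass followed by addmul_1 passes. The following minimal plain-C sketch of that outline is illustrative only and not part of the commit; it assumes 64-bit limbs, the GCC/Clang unsigned __int128 extension, and simplified helpers (mul_1/addmul_1 here store the final carry into rp[n] rather than returning it as the real mpn_mul_1/mpn_addmul_1 do, and all names are made up for the sketch).

/* Illustrative sketch only -- not part of the committed assembly.
   Assumes 64-bit limbs and unsigned __int128 (GCC/Clang extension). */
#include <stdint.h>

typedef uint64_t limb_t;

/* rp[0..n-1] = up[0..n-1] * v, with the final carry stored into rp[n].
   (The real mpn_mul_1 returns the carry instead of storing it.) */
static void mul_1 (limb_t *rp, const limb_t *up, int n, limb_t v)
{
  unsigned __int128 acc = 0;
  for (int i = 0; i < n; i++)
    {
      acc += (unsigned __int128) up[i] * v;
      rp[i] = (limb_t) acc;
      acc >>= 64;
    }
  rp[n] = (limb_t) acc;
}

/* rp[0..n-1] += up[0..n-1] * v, with the final carry stored into rp[n],
   which must not yet hold live data in this simplified variant. */
static void addmul_1 (limb_t *rp, const limb_t *up, int n, limb_t v)
{
  unsigned __int128 acc = 0;
  for (int i = 0; i < n; i++)
    {
      acc += (unsigned __int128) up[i] * v + rp[i];
      rp[i] = (limb_t) acc;
      acc >>= 64;
    }
  rp[n] = (limb_t) acc;
}

/* rp[0..un+vn-1] = up[0..un-1] * vp[0..vn-1]: one mul_1 pass for vp[0],
   then one addmul_1 pass per remaining limb of vp.  The assembly follows
   the same outline, but reaches its four addmul_1 loops through separate
   wind-down blocks rather than a single generic loop like this one. */
static void mul_basecase_sketch (limb_t *rp, const limb_t *up, int un,
                                 const limb_t *vp, int vn)
{
  mul_1 (rp, up, un, vp[0]);
  for (int j = 1; j < vn; j++)
    addmul_1 (rp + j, up, un, vp[j]);
}
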
diff -r 531457fe1ff8 -r e8acb1f4ae01 mpn/x86_64/bobcat/sqr_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/bobcat/sqr_basecase.asm Mon Mar 12 14:02:42 2012 +0100
@@ -0,0 +1,555 @@
+dnl AMD64 mpn_sqr_basecase optimised for AMD bobcat.
+
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.5
+C AMD K10 4.5
+C AMD bd1 4.75
+C AMD bobcat 5
+C Intel P4 17.7
+C Intel core2 5.5
+C Intel NHM 5.43
+C Intel SBR 3.92
+C Intel atom 23
+C VIA nano 5.63
+
+C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the
+C multiply insn bandwidth, without any apparent loop branch exit pipeline
+C replays experienced on K8. The structure is unusual: it falls into mul_1 in
+C the same way for all n, then it splits into 4 different wind-down blocks and
+C 4 separate addmul_1 loops.
+C
+C We have not tried using the same addmul_1 loops with a switch into feed-in
+C code, as we do in other basecase implementations. Doing that could save
+C substantial code volume, but would also probably add some overhead.
+
+C TODO
+C * Tune un < 4 code.
+C * Perhaps implement a larger final corner (it is now 2 x 1).
+C * Lots of space could be saved by replacing the "switch" code by gradual
+C jumps out from mul_1 winddown code, perhaps with no added overhead.
+C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+C Standard allocations
+define(`un', `%rbx')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+define(`n', `%rbp')
+define(`v0', `%rcx')
+
+C Temp macro for allowing control over indexing.
+C Define to return $1 for more conservative ptr handling.
+define(`X',`$2')
+dnl define(`X',`$1')
+
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_sqr_basecase)
+ DOS64_ENTRY(3)
+
+ mov (up), %rax
+
+ cmp $2, R32(un_param)
+ jae L(ge2)
+
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ DOS64_EXIT()
+ ret
+
+L(ge2): mov (up), v0
+ jnz L(g2)
+
+ mul %rax
+ mov %rax, (rp)
+ mov 8(up), %rax
+ mov %rdx, w0
+ mul v0
+ add %rax, w0
+ mov %rdx, w1
+ adc $0, w1
+ mov 8(up), v0
+ mov (up), %rax
+ mul v0
+ add %rax, w0
+ mov w0, 8(rp)
+ mov %rdx, w0 C CAUTION: r8 realloc
+ adc $0, w0
+ mov 8(up), %rax
+ mul v0
+ add w1, w0
+ adc $0, %rdx
+ add w0, %rax
+ adc $0, %rdx
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ DOS64_EXIT()
+ ret
+
+L(g2): cmp $3, R32(un_param)
+ ja L(g3)
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ mov 8(up), %rax
+ mul %rax
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ mov 16(up), %rax
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov (up), v0
+ mov 8(up), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov 16(up), %rax
+ mul v0
+ xor R32(w2), R32(w2)
+ add %rax, w1
+ adc %rdx, w2
+
+ mov 8(up), v0
+ mov 16(up), %rax
+ mul v0
+ xor R32(w3), R32(w3)
+ add %rax, w2
+ adc %rdx, w3
+ add w0, w0
+ adc w1, w1
+ adc w2, w2
+ adc w3, w3
+ mov $0, R32(v0)
+ adc v0, v0
+ add w0, 8(rp)
+ adc w1, 16(rp)
+ adc w2, 24(rp)
+ adc w3, 32(rp)
+ adc v0, 40(rp)
+ DOS64_EXIT()
+ ret
+
+L(g3): push %rbx
+ push %rbp
+
+ mov 8(up), %rax
+ lea -24(rp,un_param,8), rp
+ lea -24(up,un_param,8), up
+ neg un_param
+ push un_param C for sqr_diag_addlsh1
+ lea (un_param), un
+ lea 3(un_param), n
+
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(L3)
+
+ ALIGN(16)
+L(top): mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, (rp,n,8)
+ add w1, w2
+ adc $0, w3
+L(L3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, 8(rp,n,8)
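
The new file's own comments give the same mul_1/addmul_1 outline as mul_basecase, and the prologue above pushes the operand count for a final sqr_diag_addlsh1-style pass. A rough plain-C sketch of that overall shape, reusing limb_t, mul_1 and addmul_1 from the previous sketch (again illustrative only, not the committed code): compute the off-diagonal products once as a triangle, then double them and add the diagonal squares in one pass.

/* Illustrative sketch only.  rp[0..2n-1] = up[0..n-1]^2 for n >= 1,
   using the limb_t, mul_1 and addmul_1 helpers from the sketch above. */
static void sqr_basecase_sketch (limb_t *rp, const limb_t *up, int n)
{
  /* Off-diagonal products u[i]*u[j] (i < j) form a triangle occupying
     rp[1 .. 2n-2]; rp[0] and rp[2n-1] are not touched by this part. */
  mul_1 (rp + 1, up + 1, n - 1, up[0]);
  for (int i = 1; i < n - 1; i++)
    addmul_1 (rp + 2 * i + 1, up + i + 1, n - 1 - i, up[i]);

  rp[0] = 0;
  rp[2 * n - 1] = 0;

  /* Double the triangle and add the diagonal squares u[i]^2 -- the job
     a sqr_diag_addlsh1 step does in one pass over the result. */
  limb_t cy = 0;
  for (int i = 0; i < n; i++)
    {
      unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
      for (int k = 0; k < 2; k++)       /* low then high half of u[i]^2 */
        {
          int j = 2 * i + k;
          unsigned __int128 t = ((unsigned __int128) rp[j] << 1)
                                + (limb_t) (sq >> (64 * k)) + cy;
          rp[j] = (limb_t) t;
          cy = (limb_t) (t >> 64);
        }
    }
}
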