[Gmp-commit] /home/hgfiles/gmp: Add 64-bit p6 divrem_1.
mercurial at gmplib.org
Wed Mar 3 20:10:57 CET 2010
details: /home/hgfiles/gmp/rev/419f6a4cc606
changeset: 13472:419f6a4cc606
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Mar 03 20:10:08 2010 +0100
description:
Add 64-bit p6 divrem_1.
diffstat:
ChangeLog | 4 +
mpn/x86_64/core2/divrem_1.asm | 284 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 288 insertions(+), 0 deletions(-)
diffs (299 lines):
diff -r dfa7eae3b856 -r 419f6a4cc606 ChangeLog
--- a/ChangeLog Mon Mar 01 17:09:35 2010 +0100
+++ b/ChangeLog Wed Mar 03 20:10:08 2010 +0100
@@ -1,3 +1,7 @@
+2010-03-03 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/core2/divrem_1.asm: New file.
+
2010-02-26 Niels Möller <nisse at lysator.liu.se>
* tune/speed.c (routine): Added udiv_qrnnd_preinv3.
diff -r dfa7eae3b856 -r 419f6a4cc606 mpn/x86_64/core2/divrem_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/core2/divrem_1.asm Wed Mar 03 20:10:08 2010 +0100
@@ -0,0 +1,284 @@
+dnl x86-64 mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 2004, 2005, 2007, 2008, 2009, 2010 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C             norm   unorm    frac
+C K8            14     14      12
+C P4             ?      ?       ?
+C P6 core2      23     23    19.5
+C P6 corei7     19     19      18
+C P6 atom        ?      ?       ?
+
+C TODO
+C * Compute the inverse without relying on the div instruction.
+C Newton's method and mulq, or perhaps the faster fdiv.
+C * Tune prologue.
+C * Optimize for Core 2.
+
+C The code for unnormalized divisors also works for normalized divisors, but
+C for some reason it runs really slowly on K8 in that case.  Use special
+C code until we can address this.  The Intel Atom is also affected, but
+C understandably so (shld is slow there).
+define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',0)
+
+C mp_limb_t
+C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
+C mp_srcptr np, mp_size_t nn, mp_limb_t d)
+
+C mp_limb_t
+C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
+C mp_srcptr np, mp_size_t nn, mp_limb_t d,
+C mp_limb_t dinv, int cnt)
+
+C INPUT PARAMETERS
+define(`qp', `%rdi')
+define(`fn_param', `%rsi')
+define(`up_param', `%rdx')
+define(`un_param', `%rcx')
+define(`d', `%r8')
+define(`dinv', `%r9') C only for mpn_preinv_divrem_1
+C shift passed on stack C only for mpn_preinv_divrem_1
+
+define(`cnt', `%rcx')
+define(`up', `%rsi')
+define(`fn', `%r12')
+define(`un', `%rbx')
+
+
+C rax  rbx  rcx  rdx  rsi  rdi  rbp  r8  r9    r10  r11  r12  r13  r14  r15
+C           cnt            qp        d   dinv
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+ xor R32(%rax), R32(%rax)
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov fn_param, fn
+ mov un_param, un
+ add fn_param, un_param
+ mov up_param, up
+
+ lea -8(qp,un_param,8), qp
+
+ifelse(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1,`
+ test d, d
+ js L(nent)
+')
+ mov 40(%rsp), R8(cnt)
+ shl R8(cnt), d
+ jmp L(uent)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+ xor R32(%rax), R32(%rax)
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov fn_param, fn
+ mov un_param, un
+ add fn_param, un_param
+ mov up_param, up
+ je L(ret)
+
+ lea -8(qp,un_param,8), qp
+ xor R32(%rbp), R32(%rbp)
+
+
+ifelse(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1,`
+ test d, d
+ jns L(unnormalized)
+
+L(normalized):
+ test un, un
+ je L(8) C un == 0
+ mov -8(up,un,8), %rbp
+ dec un
+ mov %rbp, %rax
+ sub d, %rbp
+ cmovb %rax, %rbp
+ sbb R32(%rax), R32(%rax)
+ inc R32(%rax)
+ mov %rax, (qp)
+ lea -8(qp), qp
+L(8):
+ mov d, %rdx
+ mov $-1, %rax
+ not %rdx
+ div d C FREE rax rdx rcx r9 r10 r11
+ mov %rax, dinv
+ mov %rbp, %rax
+ lea (%rbp), %rax C
+ jmp L(nent)
+
+ ALIGN(16)
+L(nloop): C K8-K10 P6-CNR P6-NHM P4
+ mov (up,un,8), %r10 C
+ mul dinv C 0,13 0,20 0,18 0,45
+ add %r10, %rax C 4 8 3 12
+ adc %rbp, %rdx C 5 9 10 13
+ mov %rax, %rbp C 5 9 4 13
+ mov %rdx, %r13 C 6 11 12 23
+ imul d, %rdx C 6 11 11 23
+ sub %rdx, %r10 C 10 16 14 33
+ mov d, %rax C
+ add %r10, %rax C 11 17 15 34
+ cmp %rbp, %r10 C 11 17 15 34
+ cmovb %r10, %rax C 12 18 16 35
+ adc $-1, %r13 C
+ cmp d, %rax C
+ jae L(nfx) C
+L(nok): mov %r13, (qp) C
+ lea 1(%rax), %rbp C
+ sub $8, qp C
+L(nent):dec un C
+ jns L(nloop) C
+
+ xor R32(%rcx), R32(%rcx)
+ jmp L(87)
+
+L(nfx): sub d, %rax
+ inc %r13
+ jmp L(nok)
+')
+
+L(unnormalized):
+ test un, un
+ je L(44)
+ mov -8(up,un,8), %rax
+ cmp d, %rax
+ jae L(44)
+ mov %rbp, (qp)
+ mov %rax, %rbp
+ lea -8(qp), qp
+ je L(ret)
+ dec un
+L(44):
+ bsr d, %rcx
+ not R32(%rcx)
+ sal %cl, d
+ sal %cl, %rbp
+ mov d, %rdx
+ mov $-1, %rax
+ not %rdx
+ div d C FREE rax rdx r9 r10 r11
+ test un, un
+ mov %rax, dinv
+ mov %rbp, %rax
+ je L(87)
+L(uent):
+ mov -8(up,un,8), %rbp
+ shr %cl, %rax
+ shld %cl, %rbp, %rax
+ sub $2, un
+ js L(ulast)
+
+ ALIGN(16)
+L(uloop):
+ lea 1(%rax), %r11
+ mul dinv
+ mov (up,un,8), %r10
+ shld %cl, %r10, %rbp
+ add %rbp, %rax
+ adc %r11, %rdx
+ mov %rax, %r11
+ mov %rdx, %r13
+ imul d, %rdx
+ sub %rdx, %rbp
+ mov d, %rax
+ add %rbp, %rax
+ sub $8, qp
+ cmp %r11, %rbp
+ cmovb %rbp, %rax
+ adc $-1, %r13
+ cmp d, %rax
+ jae L(ufx)
+L(uok):
+ dec un
+ mov %r13, 8(qp)
+ mov %r10, %rbp
+ jns L(uloop)
+L(ulast):
+ lea 1(%rax), %r11
+ sal %cl, %rbp
+ mul dinv
+ add %rbp, %rax
+ adc %r11, %rdx
+ mov %rax, %r11
+ mov %rdx, %r13
+ imul d, %rdx
+ sub %rdx, %rbp
+ mov d, %rax
+ add %rbp, %rax
+ cmp %r11, %rbp
+ cmovb %rbp, %rax
+ adc $-1, %r13
+ cmp d, %rax
+ jae L(93)
+L(69): mov %r13, (qp)
+ sub $8, qp
+ jmp L(87)
+
+L(ufx): sub d, %rax
+ inc %r13
+ jmp L(uok)
+
+L(93): sub d, %rax
+ inc %r13
+ jmp L(69)
+
+L(87): mov d, %rbp
+ neg %rbp
+ jmp L(87b)
+
+ ALIGN(16)
+L(floop): C K8-K10 P6-CNR P6-NHM P4
+ mul dinv C 0,12
+ add %r11, %rdx C 5
+ mov %rax, %r11 C 4
+ mov %rdx, %r13 C 6
+ imul %rbp, %rdx C 6
+ mov d, %rax C
+ add %rdx, %rax C 10
+ cmp %r11, %rdx C 10
+ cmovb %rdx, %rax C 11
+ adc $-1, %r13 C
+ mov %r13, (qp) C
+ sub $8, qp C
+L(87b): lea 1(%rax), %r11 C
+ dec fn C
+ jns L(floop) C
+
+ shr %cl, %rax
+L(ret): pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ ret
+EPILOGUE()
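
For anyone wanting to exercise the new code path, a minimal caller of the
mpn_divrem_1 interface shown in the comments above might look like the sketch
below.  It is only an illustration, assuming gmp.h from a regular GMP
installation, and is not part of the committed patch; the operand values are
arbitrary.  The quotient area must hold fn + nn limbs, and the remainder is
returned.

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  /* Divide the 2-limb number {np, 2} (least significant limb first)
     by the single limb d, developing no fraction limbs (fn = 0).  */
  mp_limb_t np[2] = { 123456789, 42 };
  mp_limb_t qp[2];              /* must hold fn + nn = 0 + 2 limbs */
  mp_limb_t d = 1000003;
  mp_limb_t r = mpn_divrem_1 (qp, 0, np, 2, d);

  gmp_printf ("q = {%Mu, %Mu}, r = %Mu\n", qp[1], qp[0], r);
  return 0;
}

Build with something like `cc test.c -lgmp'.  Whether this core2 assembly or
another x86_64 variant ends up in the library is normally decided by GMP's
configure according to the host CPU.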
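
For readers following the loops themselves, below is a rough, unoptimized C
model of the underlying technique: normalize the divisor, precompute an
approximate reciprocal with a single div instruction (the TODO above is about
replacing that div with Newton's method), and then turn each per-limb division
into two multiplications plus a conditional adjustment.  This is a sketch
under stated assumptions (64-bit limbs and a GCC/Clang-style compiler that
provides unsigned __int128 and __builtin_clzll); it is not the committed code.

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

typedef uint64_t limb_t;                /* assume 64-bit limbs */
typedef unsigned __int128 dlimb_t;      /* double-limb arithmetic */

/* Shift a nonzero divisor left until its most significant bit is set
   ("normalized"), returning the shift count that has to be undone later.  */
static limb_t
normalize_divisor (limb_t d, int *cnt)
{
  int c = __builtin_clzll (d);          /* undefined for d == 0 */
  *cnt = c;
  return d << c;
}

/* dinv = floor((B^2 - 1) / d) - B with B = 2^64, for normalized d.
   This mirrors the div instruction executed before the loops.  */
static limb_t
invert_limb_ref (limb_t d)
{
  return (limb_t) ((((dlimb_t) ~d << 64) | ~(limb_t) 0) / d);
}

/* Divide the two-limb number (nh:nl) by normalized d, with nh < d.
   Returns the quotient limb and stores the remainder in *rem.  This is
   the multiply-and-adjust step done by the mul/imul/cmov sequences.  */
static limb_t
div_step_ref (limb_t *rem, limb_t nh, limb_t nl, limb_t d, limb_t dinv)
{
  dlimb_t p = (dlimb_t) nh * dinv + (((dlimb_t) (nh + 1) << 64) | nl);
  limb_t qh = (limb_t) (p >> 64);       /* quotient estimate */
  limb_t ql = (limb_t) p;
  limb_t r = nl - qh * d;               /* remainder candidate (mod B) */

  if (r > ql)                           /* estimate was one too high */
    {
      qh--;
      r += d;
    }
  if (r >= d)                           /* rare case: one too low */
    {
      qh++;
      r -= d;
    }
  *rem = r;
  return qh;
}

int
main (void)
{
  int cnt;
  limb_t d = normalize_divisor (1000003, &cnt);
  limb_t dinv = invert_limb_ref (d);
  limb_t nh = 42, nl = 123456789;       /* nh must be < d */
  limb_t r;
  limb_t q = div_step_ref (&r, nh, nl, d, dinv);

  /* Cross-check against a direct 128-by-64 division.  */
  dlimb_t n = ((dlimb_t) nh << 64) | nl;
  assert (q == (limb_t) (n / d) && r == (limb_t) (n % d));
  printf ("q = %" PRIu64 ", r = %" PRIu64 "\n", q, r);
  return 0;
}

The two corrections in div_step_ref correspond to the cmp/cmovb/adc $-1
sequence and to the rare jae fix-up paths (L(nfx), L(ufx), L(93)) in the
assembly: the first handles an estimate that is one too high, the second one
that is one too low.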