[Gmp-commit] /var/hg/gmp: Moved x86_64/k10/div_qr_1n_pi1.asm to x86_64/k8. Ap...
mercurial at gmplib.org
mercurial at gmplib.org
Tue Oct 22 16:52:21 CEST 2013
details: /var/hg/gmp/rev/60e7f3167327
changeset: 16066:60e7f3167327
user: Niels Möller <nisse at lysator.liu.se>
date: Tue Oct 22 16:51:52 2013 +0200
description:
Moved x86_64/k10/div_qr_1n_pi1.asm to x86_64/k8. Applied k8 tweak by Torbjörn.
diffstat:
ChangeLog | 7 +-
mpn/x86_64/k10/div_qr_1n_pi1.asm | 218 --------------------------------------
mpn/x86_64/k8/div_qr_1n_pi1.asm | 220 +++++++++++++++++++++++++++++++++++++++
3 files changed, 225 insertions(+), 220 deletions(-)
diffs (truncated from 462 to 300 lines):
diff -r e9a5ec7f4003 -r 60e7f3167327 ChangeLog
--- a/ChangeLog Tue Oct 22 10:16:16 2013 +0200
+++ b/ChangeLog Tue Oct 22 16:51:52 2013 +0200
@@ -1,7 +1,10 @@
2013-10-22 Niels Möller <nisse at lysator.liu.se>
- * mpn/x86_64/k10/div_qr_1n_pi1.asm: New file. Differs from generic
- x86_64 version by using cmov.
+ * mpn/x86_64/k8/div_qr_1n_pi1.asm: Moved the below k10 file here.
+ Applied tweak from Torbjörn to get it to run well on k8.
+
+ * mpn/x86_64/k10/div_qr_1n_pi1.asm: New file (renamed above).
+ Differs from generic x86_64 version by using cmov.
* mpn/x86_64/div_qr_1n_pi1.asm: Reordered arguments to second mul.
Deleted misleading cycle annotations.
diff -r e9a5ec7f4003 -r 60e7f3167327 mpn/x86_64/k10/div_qr_1n_pi1.asm
--- a/mpn/x86_64/k10/div_qr_1n_pi1.asm Tue Oct 22 10:16:16 2013 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,218 +0,0 @@
-dnl x86-64 mpn_div_qr_1n_pi1
-dnl -- Divide an mpn number by a normalized single-limb number,
-dnl using a single-limb inverse.
-
-dnl Contributed to the GNU project by Niels Möller
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C c/l
-C AMD K10 11.25
-
-C INPUT Parameters
-define(`QP', `%rdi')
-define(`UP', `%rsi')
-define(`UN_INPUT', `%rdx')
-define(`U1', `%rcx') C Also in %rax
-define(`D', `%r8')
-define(`DINV', `%r9')
-
-C Invariants
-define(`B2', `%rbp')
-define(`B2md', `%rbx')
-
-C Variables
-define(`UN', `%r8') C Overlaps D input
-define(`T', `%r10')
-define(`U0', `%r11')
-define(`U2', `%r12')
-define(`Q0', `%r13')
-define(`Q1', `%r14')
-define(`Q2', `%r15')
-
-ABI_SUPPORT(STD64)
-
- ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_div_qr_1n_pi1)
- FUNC_ENTRY(6)
-IFDOS(` mov 56(%rsp), %r8 ')
-IFDOS(` mov 64(%rsp), %r9 ')
- dec UN_INPUT
- jnz L(first)
-
- C Just a single 2/1 division.
- C T, U0 are allocated in scratch registers
- lea 1(U1), T
- mov U1, %rax
- mul DINV
- mov (UP), U0
- add U0, %rax
- adc T, %rdx
- imul D, %rdx
- sub %rdx, U0
- cmp U0, %rax
- lea (U0, D), %rax
- cmovnc U0, %rax
- sbb $0, T
- cmp D, %rax
- jc L(single_div_done)
- sub D, %rax
- add $1, T
-L(single_div_done):
- mov T, (QP)
- FUNC_EXIT
- ret
-L(first):
- C FIXME: Could delay some of these until we enter the loop.
- push %r15
- push %r14
- push %r13
- push %r12
- push %rbx
- push %rbp
-
- mov D, B2
- imul DINV, B2
- neg B2
- mov B2, B2md
- sub D, B2md
-
- C D not needed until final reduction
- push D
- mov UN_INPUT, UN C Clobbers D
-
- mov DINV, %rax
- mul U1
- mov %rax, Q0
- add U1, %rdx
- mov %rdx, (QP, UN, 8)
-
- mov B2, %rax
- mul U1
- mov -8(UP, UN, 8), U0
- mov (UP, UN, 8), U1
- add %rax, U0
- adc %rdx, U1
- sbb U2, U2
- dec UN
- mov U1, %rax
- jz L(final)
-
- ALIGN(16)
-
- C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
- C At entry, %rax holds an extra copy of U1, and carry holds an extra copy of U2.
-L(loop):
- C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
- C Remains to add in B (U1 + c)
- mov $0, Q1
- cmovc DINV, Q1
- mov U2, Q2
- neg Q2
- mul DINV
- add %rdx, Q1
- adc $0, Q2
- add Q0, Q1
- mov %rax, Q0
- mov B2, %rax
- lea (B2md, U0), T
- adc $0, Q2
-
- C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
- mul U1
- and B2, U2
- add U2, U0
- cmovnc U0, T
-
- C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
- adc U1, Q1
- mov -8(UP, UN, 8), U0
- adc Q2,8(QP, UN, 8)
- jc L(q_incr)
-L(q_incr_done):
- add %rax, U0
- mov T, %rax
- adc %rdx, %rax
- mov Q1, (QP, UN, 8)
- sbb U2, U2
- dec UN
- mov %rax, U1
- jnz L(loop)
-
-L(final):
- pop D
-
- mov U2, Q1
- and D, U2
- sub U2, %rax
- neg Q1
-
- mov %rax, U1
- sub D, %rax
- cmovc U1, %rax
- sbb $-1, Q1
-
- lea 1(%rax), T
- mul DINV
- add U0, %rax
- adc T, %rdx
- mov %rdx, T
- imul D, %rdx
- sub %rdx, U0
- cmp U0, %rax
- lea (U0, D), %rax
- cmovnc U0, %rax
- sbb $0, T
- cmp D, %rax
- jc L(div_done)
- sub D, %rax
- add $1, T
-L(div_done):
- add T, Q0
- mov Q0, (QP)
- adc Q1, 8(QP)
- jnc L(done)
-L(final_q_incr):
- addq $1, 16(QP)
- lea 8(QP), QP
- jc L(final_q_incr)
-
-L(done):
- pop %rbp
- pop %rbx
- pop %r12
- pop %r13
- pop %r14
- pop %r15
- FUNC_EXIT
- ret
-
-L(q_incr):
- C U1 is not live, so use it for indexing
- lea 16(QP, UN, 8), U1
-L(q_incr_loop):
- addq $1, (U1)
- jnc L(q_incr_done)
- lea 8(U1), U1
- jmp L(q_incr_loop)
-EPILOGUE()
diff -r e9a5ec7f4003 -r 60e7f3167327 mpn/x86_64/k8/div_qr_1n_pi1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/k8/div_qr_1n_pi1.asm Tue Oct 22 16:51:52 2013 +0200
@@ -0,0 +1,220 @@
+dnl x86-64 mpn_div_qr_1n_pi1
+dnl -- Divide an mpn number by a normalized single-limb number,
+dnl using a single-limb inverse.
+
+dnl Contributed to the GNU project by Niels Möller
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C c/l
+C AMD K8,K10 11
+
+C INPUT Parameters
+define(`QP', `%rdi')
+define(`UP', `%rsi')
+define(`UN_INPUT', `%rdx')
+define(`U1', `%rcx') C Also in %rax
+define(`D', `%r8')
+define(`DINV', `%r9')
+
+C Invariants
+define(`B2', `%rbp')
+define(`B2md', `%rbx')
+
+C Variables
+define(`UN', `%r8') C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
+
+ABI_SUPPORT(STD64)
+
+ ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+ FUNC_ENTRY(6)
+IFDOS(` mov 56(%rsp), %r8 ')
More information about the gmp-commit
mailing list