[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Mar 27 03:00:38 CET 2013
details: /var/hg/gmp/rev/de7b192e67c7
changeset: 15648:de7b192e67c7
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Mar 27 02:57:52 2013 +0100
description:
Add BMI submul_1 via aorsmul_1.asm, remove separate addmul_1.asm.
details: /var/hg/gmp/rev/83d1fd014b13
changeset: 15649:83d1fd014b13
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Mar 27 02:58:57 2013 +0100
description:
Add BMI submul_1 via aorsmul_1.asm, remove separate addmul_1.asm.
details: /var/hg/gmp/rev/e5d7b7e1a55d
changeset: 15650:e5d7b7e1a55d
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Mar 27 02:59:42 2013 +0100
description:
Add cycle lines for AMD's BMI processors.
details: /var/hg/gmp/rev/89f4015788ab
changeset: 15651:89f4015788ab
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Mar 27 03:00:32 2013 +0100
description:
ChangeLog
diffstat:
ChangeLog | 5 +
mpn/x86_64/mulx/addmul_1.asm | 138 --------------------------------------
mpn/x86_64/mulx/aorsmul_1.asm | 150 ++++++++++++++++++++++++++++++++++++++++++
mpn/x86_64/mulx/mul_1.asm | 2 +
4 files changed, 157 insertions(+), 138 deletions(-)
diffs (truncated from 321 to 300 lines):
diff -r a99f55d8c95d -r 89f4015788ab ChangeLog
--- a/ChangeLog Tue Mar 26 09:18:46 2013 +0100
+++ b/ChangeLog Wed Mar 27 03:00:32 2013 +0100
@@ -1,3 +1,8 @@
+2013-03-27 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/mulx/aorsmul_1.asm: New file.
+ * mpn/x86_64/mulx/addmul_1.asm: Remove.
+
2013-03-26 Niels Möller <nisse at lysator.liu.se>
Make mpn_cnd_add_n and mpn_cnd_sub_n public.
diff -r a99f55d8c95d -r 89f4015788ab mpn/x86_64/mulx/addmul_1.asm
--- a/mpn/x86_64/mulx/addmul_1.asm Tue Mar 26 09:18:46 2013 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,138 +0,0 @@
-dnl AMD64 mpn_addmul_1 for CPUs with mulx but without adx.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 -
-C AMD K10 -
-C AMD bd1 -
-C AMD bobcat -
-C Intel P4 -
-C Intel PNR -
-C Intel NHM -
-C Intel SBR -
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom -
-C VIA nano -
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0_param',`%rcx') C r9
-
-define(`n', `%rcx')
-define(`v0', `%rdx')
-
-IFDOS(` define(`up', ``%rsi'') ') dnl
-IFDOS(` define(`rp', ``%rcx'') ') dnl
-IFDOS(` define(`v0', ``%r9'') ') dnl
-IFDOS(` define(`r9', ``rdi'') ') dnl
-IFDOS(` define(`n', ``%r8'') ') dnl
-IFDOS(` define(`r8', ``r11'') ') dnl
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_addmul_1)
- mov (up), %r8
-
- push %rbx
- push %r12
- push %r13
- push %r14
-
- lea (up,n_param,8), up
- lea -32(rp,n_param,8), rp
- mov R32(n_param), R32(%rax)
- xchg v0_param, v0 C FIXME: is this insn fast?
-
- neg n
-
- and $3, R8(%rax)
- jz L(b0)
- cmp $2, R8(%rax)
- jz L(b2)
- jg L(b3)
-
-L(b1): mulx %r8, %rbx, %rax
- sub $-1, n
- jz L(wd1)
- mulx (up,n,8), %r9, %r8
- mulx 8(up,n,8), %r11, %r10
- and %rax, %rax C clear cy
- jmp L(lo1)
-
-L(b0): mulx %r8, %r9, %r8
- mulx 8(up,n,8), %r11, %r10
- mulx 16(up,n,8), %r13, %r12
- xor R32(%rax), R32(%rax)
- jmp L(lo0)
-
-L(b3): mulx %r8, %r11, %r10
- mulx 8(up,n,8), %r13, %r12
- mulx 16(up,n,8), %rbx, %rax
- add %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- sub $-3, n
- jz L(wd3)
- and %rax, %rax C clear cy
- jmp L(lo3)
-
-L(b2): mulx %r8, %r13, %r12
- mulx 8(up,n,8), %rbx, %rax
- add %r12, %rbx
- adc $0, %rax
- sub $-2, n
- jz L(wd2)
- mulx (up,n,8), %r9, %r8
- and %rax, %rax C clear cy
- jmp L(lo2)
-
-L(top): add %r9, (rp,n,8)
-L(lo3): mulx (up,n,8), %r9, %r8
- adc %r11, 8(rp,n,8)
-L(lo2): mulx 8(up,n,8), %r11, %r10
- adc %r13, 16(rp,n,8)
-L(lo1): mulx 16(up,n,8), %r13, %r12
- adc %rbx, 24(rp,n,8)
-L(lo0): mulx 24(up,n,8), %rbx, %r14
- adc %rax, %r9
- adc %r8, %r11
- adc %r10, %r13
- adc %r12, %rbx
- mov $0, R32(%rax)
- adc %r14, %rax C rax = carry limb
- add $4, n
- js L(top)
-
-L(end): add %r9, (rp)
-L(wd3): adc %r11, 8(rp)
-L(wd2): adc %r13, 16(rp)
-L(wd1): adc %rbx, 24(rp)
- adc n, %rax
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
- ret
-EPILOGUE()
-ASM_END()
diff -r a99f55d8c95d -r 89f4015788ab mpn/x86_64/mulx/aorsmul_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/mulx/aorsmul_1.asm Wed Mar 27 03:00:32 2013 +0100
@@ -0,0 +1,150 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 ?
+C AMD bobcat -
+C AMD jaguar ?
+C Intel P4 -
+C Intel PNR -
+C Intel NHM -
+C Intel SBR -
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom -
+C VIA nano -
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rdx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`ADCSBB', `adc')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`ADCSBB', `sbb')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`vl', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ mov (up), %r8
+
+ push %rbx
+ push %r12
+ push %r13
+
+ lea (up,n_param,8), up
+ lea -32(rp,n_param,8), rp
+ mov R32(n_param), R32(%rax)
+ xchg v0_param, v0 C FIXME: is this insn fast?
+
+ neg n
+
+ and $3, R8(%rax)
+ jz L(b0)
+ cmp $2, R8(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mulx %r8, %rbx, %rax
+ sub $-1, n
+ jz L(wd1)
+ mulx (up,n,8), %r9, %r8
+ mulx 8(up,n,8), %r11, %r10
+ test R32(%rax), R32(%rax) C clear cy
+ jmp L(lo1)
+
+L(b0): mulx %r8, %r9, %r8
+ mulx 8(up,n,8), %r11, %r10
+ mulx 16(up,n,8), %r13, %r12
+ xor R32(%rax), R32(%rax)
+ jmp L(lo0)
+
+L(b3): mulx %r8, %r11, %r10
+ mulx 8(up,n,8), %r13, %r12
+ mulx 16(up,n,8), %rbx, %rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ sub $-3, n
+ jz L(wd3)
+ test R32(%rax), R32(%rax) C clear cy
+ jmp L(lo3)
+
+L(b2): mulx %r8, %r13, %r12
+ mulx 8(up,n,8), %rbx, %rax
+ add %r12, %rbx
+ adc $0, %rax
+ sub $-2, n
+ jz L(wd2)
+ mulx (up,n,8), %r9, %r8
+ test R32(%rax), R32(%rax) C clear cy
+ jmp L(lo2)
+
+L(top): ADDSUB %r9, (rp,n,8)
+L(lo3): mulx (up,n,8), %r9, %r8
+ ADCSBB %r11, 8(rp,n,8)
+L(lo2): mulx 8(up,n,8), %r11, %r10
+ ADCSBB %r13, 16(rp,n,8)
+L(lo1): mulx 16(up,n,8), %r13, %r12
+ ADCSBB %rbx, 24(rp,n,8)
+ adc %rax, %r9
+L(lo0): mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax C rax = carry limb
+ add $4, n
+ js L(top)
+
+L(end): ADDSUB %r9, (rp)
+L(wd3): ADCSBB %r11, 8(rp)
+L(wd2): ADCSBB %r13, 16(rp)
More information about the gmp-commit
mailing list