[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Tue Dec 25 12:50:15 CET 2012
details: /var/hg/gmp/rev/cfb6bb278b1e
changeset: 15212:cfb6bb278b1e
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Dec 25 12:47:39 2012 +0100
description:
Use LEA for binvert_limb_table.
details: /var/hg/gmp/rev/e18ddf28b9ae
changeset: 15213:e18ddf28b9ae
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Dec 25 12:48:59 2012 +0100
description:
Cleanup.
details: /var/hg/gmp/rev/45715760bd2a
changeset: 15214:45715760bd2a
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Dec 25 12:50:03 2012 +0100
description:
ChangeLog
diffstat:
ChangeLog | 11 ++++++++
mpn/x86_64/bdiv_q_1.asm | 45 +++++++++++++++--------------------
mpn/x86_64/mode1o.asm | 61 ++++++++++++++++++++----------------------------
3 files changed, 55 insertions(+), 62 deletions(-)
diffs (216 lines):
diff -r d5cb4915f5a5 -r 45715760bd2a ChangeLog
--- a/ChangeLog Sun Dec 23 23:06:54 2012 +0100
+++ b/ChangeLog Tue Dec 25 12:50:03 2012 +0100
@@ -1,3 +1,14 @@
+2012-12-25 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/bdiv_q_1.asm: Use LEA for binvert_limb_table.
+
+2012-12-23 Torbjorn Granlund <tege at gmplib.org>
+
+ * tests/mpz/t-get_d.c (check_onebit): Decrease vax limit to avoid
+ overflow in last, unused 'want' value.
+
+ * config.guess: Recognise AMD family 22 as a future bobcat.
+
2012-12-21 Torbjorn Granlund <tege at gmplib.org>
* configure.ac: Rename configure.in.
diff -r d5cb4915f5a5 -r 45715760bd2a mpn/x86_64/bdiv_q_1.asm
--- a/mpn/x86_64/bdiv_q_1.asm Sun Dec 23 23:06:54 2012 +0100
+++ b/mpn/x86_64/bdiv_q_1.asm Tue Dec 25 12:50:03 2012 +0100
@@ -11,7 +11,7 @@
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
@@ -33,13 +33,12 @@
C INPUT PARAMETERS
-C rp rdi
-C up rsi
-C n rdx
-C d rcx
-C di r8 just mpn_pi1_bdiv_q_1
-C shift r9 just mpn_pi1_bdiv_q_1
-
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`d', `%rcx')
+define(`di', `%r8') C just mpn_pi1_bdiv_q_1
+define(`shift', `%r9') C just mpn_pi1_bdiv_q_1
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
@@ -62,11 +61,7 @@
shr R32(%rax)
and $127, R32(%rax) C d/2, 7 bits
-ifdef(`PIC',`
- mov binvert_limb_table@GOTPCREL(%rip), %rdx
-',`
- movabs $binvert_limb_table, %rdx
-')
+ LEA( binvert_limb_table, %rdx)
movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
@@ -103,15 +98,15 @@
mov %rcx, %r11 C d
mov %rdx, %r10 C n
mov %r9, %rcx C shift
-L(com):
- mov (%rsi), %rax C up[0]
+
+L(com): mov (up), %rax C up[0]
dec %r10
jz L(one)
- mov 8(%rsi), %rdx C up[1]
- lea (%rsi,%r10,8), %rsi C up end
- lea (%rdi,%r10,8), %rdi C rp end
+ mov 8(up), %rdx C up[1]
+ lea (up,%r10,8), up C up end
+ lea (rp,%r10,8), rp C rp end
neg %r10 C -n
shrd R8(%rcx), %rdx, %rax
@@ -125,13 +120,11 @@
C rbx carry bit, 0 or 1
C rcx shift
C rdx
- C rsi up end
- C rdi rp end
C r10 counter, limbs, negative
mul %r11 C carry limb in rdx
- mov (%rsi,%r10,8), %rax
- mov 8(%rsi,%r10,8), %r9
+ mov (up,%r10,8), %rax
+ mov 8(up,%r10,8), %r9
shrd R8(%rcx), %r9, %rax
nop
sub %rbx, %rax C apply carry bit
@@ -139,24 +132,24 @@
sub %rdx, %rax C apply carry limb
adc $0, %rbx
L(ent): imul %r8, %rax
- mov %rax, (%rdi,%r10,8)
+ mov %rax, (rp,%r10,8)
inc %r10
jnz L(top)
mul %r11 C carry limb in rdx
- mov (%rsi), %rax C up high limb
+ mov (up), %rax C up high limb
shr R8(%rcx), %rax
sub %rbx, %rax C apply carry bit
sub %rdx, %rax C apply carry limb
imul %r8, %rax
- mov %rax, (%rdi)
+ mov %rax, (rp)
pop %rbx
FUNC_EXIT()
ret
L(one): shr R8(%rcx), %rax
imul %r8, %rax
- mov %rax, (%rdi)
+ mov %rax, (rp)
pop %rbx
FUNC_EXIT()
ret
diff -r d5cb4915f5a5 -r 45715760bd2a mpn/x86_64/mode1o.asm
--- a/mpn/x86_64/mode1o.asm Sun Dec 23 23:06:54 2012 +0100
+++ b/mpn/x86_64/mode1o.asm Tue Dec 25 12:50:03 2012 +0100
@@ -1,20 +1,20 @@
-dnl AMD64 mpn_modexact_1_odd -- exact division style remainder.
+dnl AMD64 mpn_modexact_1_odd -- Hensel norm remainder.
dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2011, 2012 Free
dnl Software Foundation, Inc.
-dnl
+
dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 3 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
@@ -31,37 +31,26 @@
C VIA nano ?
-C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor, mp_limb_t carry);
-C
-C
C The dependent chain in the main loop is
C
C cycles
-C subq %rdx, %rax 1
-C imulq %r9, %rax 4
-C mulq %r8 5
+C sub %rdx, %rax 1
+C imul %r9, %rax 4
+C mul %r8 5
C ----
C total 10
C
-C The movq load from src seems to need to be scheduled back before the jz to
-C achieve this speed, out-of-order execution apparently can't completely
-C hide the latency otherwise.
+C The mov load from src seems to need to be scheduled back before the jz to
+C achieve this speed, out-of-order execution apparently can't completely hide
+C the latency otherwise.
C
-C The l=src[i]-cbit step is rotated back too, since that allows us to avoid
-C it for the first iteration (where there's no cbit).
+C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
+C for the first iteration (where there's no cbit).
C
-C The code alignment used (32-byte) for the loop also seems necessary.
-C Without that the non-PIC case has adcq crossing the 0x60 offset,
-C apparently making it run at 11 cycles instead of 10.
-C
-C Not done:
-C
-C divq for size==1 was measured at about 79 cycles, compared to the inverse
-C at about 25 cycles (both including function call overheads), so that's not
-C used.
+C The code alignment used (32-byte) for the loop also seems necessary. Without
+C that the non-PIC case has adc crossing the 0x60 offset, apparently making it
+C run at 11 cycles instead of 10.
+
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
More information about the gmp-commit
mailing list