[Gmp-commit] /var/hg/gmp: 3 new changesets

Tue Dec 25 12:50:15 CET 2012

details:   /var/hg/gmp/rev/cfb6bb278b1e
changeset: 15212:cfb6bb278b1e
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Dec 25 12:47:39 2012 +0100
description:
Use LEA for binvert_limb_table.

details:   /var/hg/gmp/rev/e18ddf28b9ae
changeset: 15213:e18ddf28b9ae
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Dec 25 12:48:59 2012 +0100
description:
Cleanup.

details:   /var/hg/gmp/rev/45715760bd2a
changeset: 15214:45715760bd2a
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Dec 25 12:50:03 2012 +0100
description:
ChangeLog

diffstat:

 ChangeLog               |  11 ++++++++
 mpn/x86_64/bdiv_q_1.asm |  45 +++++++++++++++--------------------
 mpn/x86_64/mode1o.asm   |  61 ++++++++++++++++++++----------------------------
 3 files changed, 55 insertions(+), 62 deletions(-)

diffs (216 lines):

diff -r d5cb4915f5a5 -r 45715760bd2a ChangeLog

--- a/ChangeLog	Sun Dec 23 23:06:54 2012 +0100
+++ b/ChangeLog	Tue Dec 25 12:50:03 2012 +0100
@@ -1,3 +1,14 @@
+2012-12-25  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86_64/bdiv_q_1.asm: Use LEA for binvert_limb_table.
+
+2012-12-23  Torbjorn Granlund  <tege at gmplib.org>
+
+	* tests/mpz/t-get_d.c (check_onebit): Decrease vax limit to avoid
+	overflow in last, unused 'want' value.
+
+	* config.guess: Recognise AMD family 22 as a future bobcat.
+
 2012-12-21  Torbjorn Granlund  <tege at gmplib.org>
 
 	* configure.ac: Rename configure.in.
diff -r d5cb4915f5a5 -r 45715760bd2a mpn/x86_64/bdiv_q_1.asm
--- a/mpn/x86_64/bdiv_q_1.asm	Sun Dec 23 23:06:54 2012 +0100
+++ b/mpn/x86_64/bdiv_q_1.asm	Tue Dec 25 12:50:03 2012 +0100
@@ -11,7 +11,7 @@
 dnl  by the Free Software Foundation; either version 3 of the License, or (at
 dnl  your option) any later version.
 
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 dnl  License for more details.
@@ -33,13 +33,12 @@
 
 
 C INPUT PARAMETERS
-C rp		rdi
-C up		rsi
-C n		rdx
-C d		rcx
-C di		r8	just mpn_pi1_bdiv_q_1
-C shift		r9	just mpn_pi1_bdiv_q_1
-
+define(`rp',		`%rdi')
+define(`up',		`%rsi')
+define(`n',		`%rdx')
+define(`d',		`%rcx')
+define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1
+define(`shift',		`%r9')		C	just mpn_pi1_bdiv_q_1
 
 ABI_SUPPORT(DOS64)
 ABI_SUPPORT(STD64)
@@ -62,11 +61,7 @@
 	shr	R32(%rax)
 	and	$127, R32(%rax)		C d/2, 7 bits
 
-ifdef(`PIC',`
-	mov	binvert_limb_table@GOTPCREL(%rip), %rdx
-',`
-	movabs	$binvert_limb_table, %rdx
-')
+	LEA(	binvert_limb_table, %rdx)
 
 	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
 
@@ -103,15 +98,15 @@
 	mov	%rcx, %r11		C d
 	mov	%rdx, %r10		C n
 	mov	%r9, %rcx		C shift
-L(com):
-	mov	(%rsi), %rax		C up[0]
+
+L(com):	mov	(up), %rax		C up[0]
 
 	dec	%r10
 	jz	L(one)
 
-	mov	8(%rsi), %rdx		C up[1]
-	lea	(%rsi,%r10,8), %rsi	C up end
-	lea	(%rdi,%r10,8), %rdi	C rp end
+	mov	8(up), %rdx		C up[1]
+	lea	(up,%r10,8), up		C up end
+	lea	(rp,%r10,8), rp		C rp end
 	neg	%r10			C -n
 
 	shrd	R8(%rcx), %rdx, %rax
@@ -125,13 +120,11 @@
 	C rbx	carry bit, 0 or 1
 	C rcx	shift
 	C rdx
-	C rsi	up end
-	C rdi	rp end
 	C r10	counter, limbs, negative
 
 	mul	%r11			C carry limb in rdx
-	mov	(%rsi,%r10,8), %rax
-	mov	8(%rsi,%r10,8), %r9
+	mov	(up,%r10,8), %rax
+	mov	8(up,%r10,8), %r9
 	shrd	R8(%rcx), %r9, %rax
 	nop
 	sub	%rbx, %rax		C apply carry bit
@@ -139,24 +132,24 @@
 	sub	%rdx, %rax		C apply carry limb
 	adc	$0, %rbx
 L(ent):	imul	%r8, %rax
-	mov	%rax, (%rdi,%r10,8)
+	mov	%rax, (rp,%r10,8)
 	inc	%r10
 	jnz	L(top)
 
 	mul	%r11			C carry limb in rdx
-	mov	(%rsi), %rax		C up high limb
+	mov	(up), %rax		C up high limb
 	shr	R8(%rcx), %rax
 	sub	%rbx, %rax		C apply carry bit
 	sub	%rdx, %rax		C apply carry limb
 	imul	%r8, %rax
-	mov	%rax, (%rdi)
+	mov	%rax, (rp)
 	pop	%rbx
 	FUNC_EXIT()
 	ret
 
 L(one):	shr	R8(%rcx), %rax
 	imul	%r8, %rax
-	mov	%rax, (%rdi)
+	mov	%rax, (rp)
 	pop	%rbx
 	FUNC_EXIT()
 	ret
diff -r d5cb4915f5a5 -r 45715760bd2a mpn/x86_64/mode1o.asm
--- a/mpn/x86_64/mode1o.asm	Sun Dec 23 23:06:54 2012 +0100
+++ b/mpn/x86_64/mode1o.asm	Tue Dec 25 12:50:03 2012 +0100
@@ -1,20 +1,20 @@
-dnl  AMD64 mpn_modexact_1_odd -- exact division style remainder.
+dnl  AMD64 mpn_modexact_1_odd -- Hensel norm remainder.
 
 dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2011, 2012 Free
 dnl  Software Foundation, Inc.
-dnl
+
 dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or
-dnl  modify it under the terms of the GNU Lesser General Public License as
-dnl  published by the Free Software Foundation; either version 3 of the
-dnl  License, or (at your option) any later version.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful,
-dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-dnl  Lesser General Public License for more details.
-dnl
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
@@ -31,37 +31,26 @@
 C VIA nano	 ?
 
 
-C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
-C                               mp_limb_t divisor);
-C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
-C                                mp_limb_t divisor, mp_limb_t carry);
-C
-C
 C The dependent chain in the main loop is
 C
 C                            cycles
-C	subq	%rdx, %rax	1
-C	imulq	%r9, %rax	4
-C	mulq	%r8		5
+C	sub	%rdx, %rax	1
+C	imul	%r9, %rax	4
+C	mul	%r8		5
 C			      ----
 C       total		       10
 C
-C The movq load from src seems to need to be scheduled back before the jz to
-C achieve this speed, out-of-order execution apparently can't completely
-C hide the latency otherwise.
+C The mov load from src seems to need to be scheduled back before the jz to
+C achieve this speed, out-of-order execution apparently can't completely hide
+C the latency otherwise.
 C
-C The l=src[i]-cbit step is rotated back too, since that allows us to avoid
-C it for the first iteration (where there's no cbit).
+C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
+C for the first iteration (where there's no cbit).
 C
-C The code alignment used (32-byte) for the loop also seems necessary.
-C Without that the non-PIC case has adcq crossing the 0x60 offset,
-C apparently making it run at 11 cycles instead of 10.
-C
-C Not done:
-C
-C divq for size==1 was measured at about 79 cycles, compared to the inverse
-C at about 25 cycles (both including function call overheads), so that's not
-C used.
+C The code alignment used (32-byte) for the loop also seems necessary.  Without
+C that the non-PIC case has adc crossing the 0x60 offset, apparently making it
+C run at 11 cycles instead of 10.
+
 
 ABI_SUPPORT(DOS64)
 ABI_SUPPORT(STD64)