[Gmp-commit] /var/hg/gmp: 2 new changesets

Thu Mar 29 16:25:27 CEST 2012

details:   /var/hg/gmp/rev/ff62540bec48
changeset: 14782:ff62540bec48
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Mar 29 16:24:26 2012 +0200
description:
Fix a typo.

details:   /var/hg/gmp/rev/411ea70d7f53
changeset: 14783:411ea70d7f53
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Mar 29 16:25:24 2012 +0200
description:
Add gcd_1 for sparc64.

diffstat:

 ChangeLog             |    4 +
 mpn/sparc64/README    |    2 +-
 mpn/sparc64/gcd_1.asm |  144 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 149 insertions(+), 1 deletions(-)

diffs (171 lines):

diff -r 74a39ab15a41 -r 411ea70d7f53 ChangeLog

--- a/ChangeLog	Tue Mar 27 08:25:08 2012 +0200
+++ b/ChangeLog	Thu Mar 29 16:25:24 2012 +0200
@@ -1,3 +1,7 @@
+2012-03-29  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/sparc64/gcd_1.asm: New file.
+
 2012-03-27  Torbjorn Granlund  <tege at gmplib.org>
 
 	* config.guess: Fix typo in coreisbr recognition.
diff -r 74a39ab15a41 -r 411ea70d7f53 mpn/sparc64/README
--- a/mpn/sparc64/README	Tue Mar 27 08:25:08 2012 +0200
+++ b/mpn/sparc64/README	Thu Mar 29 16:25:24 2012 +0200
@@ -65,7 +65,7 @@
 instructions.  No conditional move can issue 1-5 cycles after a load.  (This
 might have been fixed for UltraSPARC-3.)
 
-The UltraSPARC-3 pipeline is very simular to he one of UltraSPARC-1/2 , but is
+The UltraSPARC-3 pipeline is very similar to the one of UltraSPARC-1/2, but is
 somewhat slower.  Branches execute slower, and there may be other new stalls.
 But integer multiply doesn't stall the entire CPU and also has a much lower
 latency.  But it's still not pipelined, and thus useless for our needs.
diff -r 74a39ab15a41 -r 411ea70d7f53 mpn/sparc64/gcd_1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/sparc64/gcd_1.asm	Thu Mar 29 16:25:24 2012 +0200
@@ -0,0 +1,144 @@
+dnl  SPARC64 mpn_gcd_1.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for SPARC by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000, 2001, 2002, 2005, 2009, 2011, 2012 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		  cycles/bit (approx)
+C UltraSPARC 1&2:      ?
+C UltraSPARC 3:        5.0
+C UltraSPARC T1:      12.8
+C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1
+
+C ctz_table[n] is the number of trailing zero bits in n, or MAXSHIFT if n==0.
+C The table has MASK+1 byte entries and is indexed with (x & MASK).
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))	C (1 << MAXSHIFT) - 1
+
+	.section	".rodata"
+ctz_table:
+	.byte	MAXSHIFT		C entry for n == 0
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+
+
+C Threshold of when to call bmod when U is one limb.  Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`BMOD_THRES_LOG2', 14)		C bmod is used when (u0 >> 14) > v0
+
+C INPUT PARAMETERS
+define(`up',    `%i0')		C pointer to the limbs of U
+define(`n',     `%i1')		C limb count of U
+define(`v0',    `%i2')		C single-limb operand V
+C mpn_gcd_1: binary GCD of the n-limb number at up and the single limb v0.
+C The odd result is shifted left by the count of low zero bits of (u0 | v0).
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_gcd_1)
+	save	%sp, -192, %sp		C new window and a 192-byte frame
+	ldx	[up+0], %g1		C U low limb
+	mov	-1, %i4			C twos count, bumped at least once below
+	or	v0, %g1, %g2		C x | y
+C Count the low zero bits of (x | y) in %i4.
+L(twos):
+	inc	%i4
+	andcc	%g2, 1, %g0		C test bit 0
+	bz,a	%xcc, L(twos)		C repeat while bit 0 is clear
+	 srlx	%g2, 1, %g2		C annulled slot: shift only when looping
+C Shift v0 right until it is odd.
+L(divide_strip_y):
+	andcc	v0, 1, %g0
+	bz,a	%xcc, L(divide_strip_y)
+	 srlx	v0, 1, v0
+C NOTE(review): this loop never exits if v0 == 0; presumably callers pass v0 != 0.
+	cmp	n, 1			C if n > 1 we need
+	bnz	%xcc, L(bmod)		C to call bmod_1
+	 nop
+
+C Both U and V are single limbs, reduce with bmod if u0 >> v0.
+	srlx	%g1, BMOD_THRES_LOG2, %g2
+	cmp	%g2, v0
+	bleu	%xcc, L(noreduce)	C skip bmod if (u0 >> BMOD_THRES_LOG2) <= v0
+	 mov	%g1, %o0		C delay slot, always executed: u = u0
+C Call mpn_modexact_1c_odd(up, n, v0, 0); %o0 is used as u afterwards.
+L(bmod):
+	mov	up, %o0
+	mov	n, %o1
+	mov	v0, %o2
+	call	mpn_modexact_1c_odd
+	 mov	0, %o3			C delay slot: fourth argument is 0
+
+L(noreduce):
+C Put the address of ctz_table in %i5.  %g1 and %l7 are clobbered.
+ifdef(`PIC',`
+	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), %l7
+	call	L(LGETPC0)		C helper adds the address of this call to %l7
+	add	%l7, %lo(_GLOBAL_OFFSET_TABLE_+4), %l7
+	sethi	%hi(ctz_table), %g1
+	or	%g1, %lo(ctz_table), %g1
+	ldx	[%l7+%g1], %i5		C table address, presumably read from its GOT slot
+',`
+	sethi	%hh(ctz_table), %l7
+	or	%l7, %hm(ctz_table), %l7
+	sllx	%l7, 32, %l7		C upper 32 bits of the address
+	sethi	%lm(ctz_table), %g1
+	add	%l7, %g1, %l7
+	or	%l7, %lo(ctz_table), %i5	C %i5 = full absolute address
+')
+
+	cmp	%o0, 0
+	bnz	%xcc, L(mid)
+	 andcc	%o0, MASK, %g3		C delay slot: low bits of u, Z flag for L(mid)
+C u == 0: return v0 << twos.  The slot names v0 (%i2) and twos (%i4) as %o2, %o4.
+	return	%i7+8
+	 sllx	%o2, %o4, %o0		C CAUTION: v0 alias for o2
+C Main loop.  At L(top) the flags come from (v - u) and %l4 is the odd u.
+	ALIGN(16)
+L(top):	movcc	%xcc, %l4, v0		C v = min(u,v)
+	movcc	%xcc, %l2, %o0		C u = |v - u|
+	cmp	%g3, 0			C are all MAXSHIFT low bits zero?
+L(mid):	ldub	[%i5+%g3], %g3		C %g3 = ctz_table[low bits of u]
+	bz,a	%xcc, L(shift_alot)	C low bits all zero: shift by MAXSHIFT, retry
+	 srlx	%o0, MAXSHIFT, %o0	C annulled slot: runs only when branching
+	srlx	%o0, %g3, %l4		C new u, odd
+	nop				C force parallel exec of sub insns
+	subcc	v0, %l4, %l2		C v - u, set flags for branch and movcc
+	sub	%l4, v0, %o0		C u - v
+	bnz	%xcc, L(top)		C loop until u == v
+	 and	%l2, MASK, %g3		C extract low MAXSHIFT bits from (v-u)
+C u == v: return v0 << twos.
+	return	%i7+8
+	 sllx	%o2, %o4, %o0		C CAUTION: v0 alias for o2
+
+L(shift_alot):
+	b	L(mid)
+	 andcc	%o0, MASK, %g3		C delay slot: low bits of the shifted u
+C PIC helper: adds %o7 (set by the call insn) to %l7 and returns.
+ifdef(`PIC',`
+L(LGETPC0):
+	retl
+	add	%o7, %l7, %l7
+')
+EPILOGUE()