[PATCH] T3/T4 sparc shifts, plus more timings

David Miller davem at davemloft.net
Wed Mar 27 20:43:54 CET 2013


Just a quick update on pipelining shifts on T3/T4.

I noticed the existing 64-bit sparc ultrasparc1234 shift code
and decided to toy with it on T4.

I first made a 2-way unrolled version; this runs at the expected
3.5 cycles per limb.

I then used the 4-way ultrasparc1234 code as-is with the fanops
(carefully) removed, and this executes at 3.0 cycles per limb.  The
main 4-way unrolled loop executes in 12 cycles.

My suggestion at this point is that we use the ultrasparc1234 code
with the fanops removed, even on T1/T2 since the decrease in the
number of bookkeeping operations will help even on those chips.

Just a note that some of the fanop removals have to be done
non-trivially since they live in delay slots.  In all such cases I
simply moved the first instruction I could from before the branch into
the delay slot.

For reference, here is the 2-way version I was toying with:

dnl  SPARC v9 mpn_lshift for T3/T4.

dnl  Contributed to the GNU project by David Miller.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC T3:	 ?
C UltraSPARC T4:	 3.5

C INPUT PARAMETERS
C The function body opens with a save, so the incoming arguments are read
C from the %i registers.
C   rp   destination limb pointer (all stores go through it)
C   up   source limb pointer (all loads go through it)
C   n    number of limbs; scaled by 8 to get a byte length
C   cnt  left-shift count in bits
define(`rp',     `%i0')
define(`up',     `%i1')
define(`n',      `%i2')
define(`cnt',    `%i3')

C WORKING REGISTERS
C   tcnt    0 - cnt, used as the count of every right shift
C   retval  top source limb shifted right by tcnt; handed back in %o0
C   u0, u1  source limbs most recently loaded from up
C   r0, r1  assembled result limbs about to be stored to rp
define(`tcnt',   `%i4')
define(`retval', `%i5')
define(`u0',     `%l0')
define(`u1',     `%l1')
define(`r0',     `%l6')
define(`r1',     `%l7')

C mpn_lshift: shift the n-limb operand at up left by cnt bits, store the n
C result limbs at rp, and return the bits shifted out of the top limb.
C
C Limbs are 8 bytes.  Both pointers are first advanced past the end of their
C operands and all work then proceeds from the most significant limb down.
C The limb at [up - 8] is loaded unconditionally, so n >= 1 is required.
C NOTE(review): presumably 1 <= cnt <= 63, as GMP documents for mpn_lshift;
C with cnt = 0 the right shifts by tcnt would shift by zero and OR whole
C limbs into their neighbours.  Confirm against callers.
C
C Invariant between stores: %l3 holds (last consumed limb << cnt), i.e. the
C high part of the next result limb, still waiting for its low part.
ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(mpn_lshift)
	C New register window plus a 176-byte stack frame.
	save	%sp, -176, %sp

	C Setup limb pointers and shift count
	C %g1 = n * 8 is the operand length in bytes.  tcnt = -cnt; srlx uses
	C only the low 6 bits of its count, so this acts as 64 - cnt.
	sllx	n, 3, %g1
	sub	%g0, cnt, tcnt
	add	up, %g1, up
	add	rp, %g1, rp

	C Load the most significant limb; its bits shifted out at the top
	C become the return value.  n is biased by -3: a negative result
	C means n was 1 or 2 and the two-limb code is skipped entirely.
	ldx	[up - 8], u1
	subcc	n, 3, n
	srlx	u1, tcnt, retval
	bl,pn	%xcc, L(end12)
	C Delay slot (executed on both paths): start the %l3 invariant.
	 sllx	u1, cnt, %l3

	C n >= 3 here.  Preload the next two limbs and bias n by a further
	C -2 (now original n - 5).  Negative means n was 3 or 4: no further
	C pair can be loaded, so go straight to the wind-down at end34.
	subcc	n, 2, n
	ldx	[up - 16], u0
	ldx	[up - 24], u1
	add	up, -16, up
	bl,pn	%xcc, L(end34)
	C Delay slot (both paths): low part of the first result limb.
	 srlx	u0, tcnt, %l4

	C Main loop, two result limbs per iteration.
	C On entry: %l3 = pending high part, %l4 = u0 >> tcnt, and u0/u1 are
	C the next two source limbs.  Each pass stores two limbs and loads
	C the following two, which exist because n (biased) was >= 0.
L(top):
	sllx	u0, cnt, %l2
	or	%l4, %l3, r0

	ldx	[up - 16], u0
	srlx	u1, tcnt, %l5

	stx	r0, [rp - 8]
	sllx	u1, cnt, %l3

	or	%l2, %l5, r1
	ldx	[up - 24], u1

	srlx	u0, tcnt, %l4
	add	up, -16, up

	stx	r1, [rp - 16]
	subcc	n, 2, n

	bge,pt	%xcc, L(top)
	C Delay slot (both paths): step rp past the two limbs just stored.
	 add	rp, -16, rp

	C Wind-down: same computation as the loop body for the two limbs
	C already held in u0/u1, but without loading any further limbs.
L(end34):
	sllx	u0, cnt, %l2
	or	%l4, %l3, r0

	srlx	u1, tcnt, %l5
	stx	r0, [rp - 8]

	sllx	u1, cnt, %l3
	or	%l2, %l5, r1

	stx	r1, [rp - 16]
	add	rp, -16, rp

	C Undo part of the bias: after the addcc, n is the number of source
	C limbs not yet loaded.  On every path reaching here (with n >= 1 on
	C entry) that is 0 or 1.
L(end12):
	addcc	n, 2, n
	bz,pn	%xcc, L(done)
	 nop
	C One limb per pass: load the next lower limb, OR its top bits into
	C the pending high part and store the result.
L(loop0):
	add	rp, -8, rp
	subcc	n, 1, n
	ldx	[up - 16], u1
	add	up, -8, up
	srlx	u1, tcnt, %l4
	or	%l4, %l3, r0
	stx	r0, [rp + 0]
	bnz,pt	%xcc, L(loop0)
	C Delay slot (both paths): refresh the %l3 invariant.
	 sllx	u1, cnt, %l3
	C The least significant result limb has no lower neighbour, so the
	C pending high part is stored unchanged.
L(done):
	stx	%l3, [rp - 8]
	ret
	C Delay slot of ret: pop the register window and copy retval into
	C the %o0 of the caller.
	restore retval, 0, %o0
EPILOGUE()


More information about the gmp-devel mailing list