[PATCH] T3/T4 sparc shifts, plus more timings

David Miller davem at davemloft.net
Wed Mar 27 20:43:54 CET 2013

Just a quick update on pipelining shifts on T3/T4.

I noticed the existing 64-bit sparc ultrasparc1234 shift code
and decided to toy with it on T4.

I first made a 2-way unrolled version; this runs at the expected
3.5 cycles per limb.

I then used the 4-way ultrasparc1234 code as-is with the fanops
(carefully) removed, and this executes at 3.0 cycles per limb.  The
main 4-way unrolled loop executes in 12 cycles.

My suggestion at this point is that we use the ultrasparc1234 code
with the fanops removed, even on T1/T2 since the decrease in the
number of bookkeeping operations will help even on those chips.

Just a note that some of the fanop removals have to be done
non-trivially since they live in delay slots.  In all such cases I
simply moved the first instruction I could from before the branch into
the delay slot.

For reference, here is the 2-way version I was toying with:

C		   cycles/limb
C UltraSPARC T3:	 ?
C UltraSPARC T4:	 3.5

C Register aliases used by the code below.
C
C rp, up, n and cnt live in %i0-%i3, i.e. they are the values the caller
C passed, as seen after the save at the start of the code:
C   rp   base pointer for the stx stores of result limbs
C   up   base pointer for the ldx loads of source limbs
C   n    limb count (scaled by 8 to form a byte offset)
C   cnt  shift count used by the sllx instructions
define(`rp',     `%i0')
define(`up',     `%i1')
define(`n',      `%i2')
define(`cnt',    `%i3')

C Working registers:
C   tcnt    computed as 0 - cnt, used as the count of the srlx instructions
C   retval  value handed back to the caller by the final restore
C   u0, u1  source limbs loaded from up
C   r0, r1  combined result limbs about to be stored through rp
define(`tcnt',   `%i4')
define(`retval', `%i5')
define(`u0',     `%l0')
define(`u1',     `%l1')
define(`r0',     `%l6')
define(`r1',     `%l7')

	C 2-way unrolled left shift of an n-limb operand by cnt bits, working
	C from the highest-addressed limb downwards.  Every result limb is
	C   (limb << cnt) | (next lower limb >> tcnt).
	C
	C NOTE(review): the branch targets L(top), L(end12), L(end34), L(done)
	C and L(loop0) are referenced below but none of them is defined in this
	C listing; the label lines appear to be missing.  Confirm their
	C positions against the original before assembling.
	C
	C Open a new register window with a 176-byte stack frame.
	save	%sp, -176, %sp

	C Setup limb pointers and shift count
	C %g1 = n * 8.  tcnt = 0 - cnt; the 64-bit shifts only look at the low
	C 6 bits of the count (SPARC V9), so a shift by tcnt acts as a shift by
	C 64 - cnt.  up and rp are advanced by n * 8 so that limbs are addressed
	C with negative offsets from the end.
	sllx	n, 3, %g1
	sub	%g0, cnt, tcnt
	add	up, %g1, up
	add	rp, %g1, rp

	C Load the highest-addressed limb.  retval = that limb >> tcnt, i.e. the
	C bits shifted out of its top; it is handed back by the final restore.
	C n -= 3; if the result is negative, branch to L(end12).  The delay slot
	C primes %l3 with limb << cnt, the high part of the first result limb.
	ldx	[up - 8], u1
	subcc	n, 3, n
	srlx	u1, tcnt, retval
	bl,pn	%xcc, L(end12)
	 sllx	u1, cnt, %l3

	C n -= 2, fetch the next two lower limbs and step up down by 16 bytes.
	C If n went negative, branch to L(end34).  The delay slot computes %l4,
	C the low part of the first result limb.
	subcc	n, 2, n
	ldx	[up - 16], u0
	ldx	[up - 24], u1
	add	up, -16, up
	bl,pn	%xcc, L(end34)
	 srlx	u0, tcnt, %l4

	C Pipelined two-limb body; the bge below branches back to L(top).
	C r0 = %l4 | %l3 is stored at rp - 8 and r1 = %l2 | %l5 at rp - 16,
	C interleaved with the loads and shifts for the following pair.
	sllx	u0, cnt, %l2
	or	%l4, %l3, r0

	ldx	[up - 16], u0
	srlx	u1, tcnt, %l5

	stx	r0, [rp - 8]
	sllx	u1, cnt, %l3

	or	%l2, %l5, r1
	ldx	[up - 24], u1

	srlx	u0, tcnt, %l4
	add	up, -16, up

	stx	r1, [rp - 16]
	subcc	n, 2, n

	C Repeat while n is still >= 0 after the decrement; the delay slot
	C steps rp down by the two limbs just stored.
	bge,pt	%xcc, L(top)
	 add	rp, -16, rp

	C Wind-down: combine and store the two limbs already loaded into u0 and
	C u1, without issuing further loads.  %l3 is left holding u1 << cnt for
	C the next lower result limb.
	sllx	u0, cnt, %l2
	or	%l4, %l3, r0

	srlx	u1, tcnt, %l5
	stx	r0, [rp - 8]

	sllx	u1, cnt, %l3
	or	%l2, %l5, r1

	stx	r1, [rp - 16]
	add	rp, -16, rp

	C n += 2; if the result is zero, branch to L(done).
	C NOTE(review): as listed, the add to rp directly follows the branch and
	C therefore sits in its delay slot, yet it lacks the leading-space
	C delay-slot indent used elsewhere.  A label or filler instruction may
	C be missing here; confirm against the original.
	addcc	n, 2, n
	bz,pn	%xcc, L(done)
	add	rp, -8, rp
	C One limb per pass: n -= 1, load the next lower limb, OR its spilled
	C bits (limb >> tcnt) into %l3 and store the result at rp.  Branch to
	C L(loop0) while n is non-zero; the delay slot computes %l3 = limb << cnt
	C for the next pass or for the final store.
	subcc	n, 1, n
	ldx	[up - 16], u1
	add	up, -8, up
	srlx	u1, tcnt, %l4
	or	%l4, %l3, r0
	stx	r0, [rp + 0]
	bnz,pt	%xcc, L(loop0)
	 sllx	u1, cnt, %l3
	C The last result limb is just %l3 (sllx shifted zeros in at the bottom).
	stx	%l3, [rp - 8]
	C Pop the register window, placing retval in the caller's %o0.
	C NOTE(review): no ret (or other return instruction) is visible before
	C this restore in the listing; confirm that one was not lost.
	restore retval, 0, %o0

