[PATCH] T3/T4 sparc shifts, plus more timings
David Miller
davem at davemloft.net
Wed Mar 27 20:43:54 CET 2013
Just a quick update on pipelining shifts on T3/T4.
I noticed the existing 64-bit sparc ultrasparc1234 shift code
and decided to toy with it on T4.
I first made a 2-way unrolled version, this runs at the expected
3.5 cycles per limb.
I then used the 4-way ultrasparc1234 code as-is with the fanops
(carefully) removed, and this executes at 3.0 cycles per limb. The
main 4-way unrolled loop executes in 12 cycles.
My suggestion at this point is that we use the ultrasparc1234 code
with the fanops removed, even on T1/T2 since the decrease in the
number of bookkeeping operations will help even on those chips.
Just a note that some of the fanop removals have to be done
non-trivially since they live in delay slots. In all such cases I
simply moved the first instruction I could from before the branch into
the delay slot.
For reference, here is the 2-way version I was toying with:
dnl SPARC v9 mpn_lshift for T3/T4.
dnl Contributed to the GNU project by David Miller.
dnl Copyright 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C UltraSPARC T3: ?
C UltraSPARC T4: 3.5
C INPUT PARAMETERS
C rp    base register for the result stores
C up    base register for the source loads
C n     limb count
C cnt   bit count of the left shifts
define(`rp', `%i0')
define(`up', `%i1')
define(`n', `%i2')
define(`cnt', `%i3')
C tcnt is set to 0 - cnt by the routine and is the count of its right shifts.
define(`tcnt', `%i4')
C retval receives the top limb shifted right by tcnt and is handed back to
C the caller in %o0 by the final restore.
define(`retval', `%i5')
C u0 and u1 hold source limbs loaded through up.
define(`u0', `%l0')
define(`u1', `%l1')
C r0 and r1 hold the combined result limbs that are stored through rp.
define(`r0', `%l6')
define(`r1', `%l7')
ASM_START()
C NOTE(review): %g2 and %g3 are declared as scratch here, but no instruction
C of mpn_lshift in this file references them.
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)
C mpn_lshift: shift the n limbs at up left by cnt bits, store n result limbs
C at rp, and return the bits shifted out of the most significant limb.
C Limbs are processed from the highest address downwards, two per pass of
C the main loop.
C
C The top limb is loaded unconditionally, so n must be at least 1.
C NOTE(review): tcnt is 0 - cnt, which acts as 64 - cnt only because the
C 64-bit shifts use just the low six bits of a register count.  A cnt of 0
C would make the right shifts a no-op, so this presumably relies on
C 1 <= cnt <= 63 -- confirm against the mpn_lshift contract.
PROLOGUE(mpn_lshift)
save %sp, -176, %sp
C Setup limb pointers and shift count
C %g1 is n * 8, the operand size in bytes.  Both pointers are advanced by it
C so that they point just past the most significant limb.
sllx n, 3, %g1
sub %g0, cnt, tcnt
add up, %g1, up
add rp, %g1, rp
C Load the most significant limb and derive the return value from it.
ldx [up - 8], u1
subcc n, 3, n
srlx u1, tcnt, retval
C Fewer than 3 limbs: skip the unrolled code.  The delay slot executes on
C both paths and leaves %l3 holding the top limb shifted left by cnt.
bl,pn %xcc, L(end12)
sllx u1, cnt, %l3
C n now becomes the original count minus 5.  Load the next two limbs; from
C here on [up - 8] is always the most recently loaded limb.
subcc n, 2, n
ldx [up - 16], u0
ldx [up - 24], u1
add up, -16, up
C With 3 or 4 limbs there is nothing further to preload, so go straight to
C the drain code.  The delay slot primes %l4 for either path.
bl,pn %xcc, L(end34)
srlx u0, tcnt, %l4
C Main loop, two limbs per pass.  On entry:
C   %l3    = previously consumed limb shifted left by cnt
C   %l4    = u0 shifted right by tcnt
C   u0, u1 = the next two source limbs, already loaded
C Each pass stores two result limbs and loads the two source limbs needed by
C the following pass (or by the drain code at end34).
L(top):
sllx u0, cnt, %l2
or %l4, %l3, r0
ldx [up - 16], u0
srlx u1, tcnt, %l5
stx r0, [rp - 8]
sllx u1, cnt, %l3
or %l2, %l5, r1
ldx [up - 24], u1
srlx u0, tcnt, %l4
add up, -16, up
stx r1, [rp - 16]
subcc n, 2, n
C The branch has no annul bit, so the rp update in the delay slot also
C executes on the final, not-taken pass.
bge,pt %xcc, L(top)
add rp, -16, rp
C Drain: same computation as one loop pass for the two limbs already held in
C u0 and u1, but without loading any further source limbs.
L(end34):
sllx u0, cnt, %l2
or %l4, %l3, r0
srlx u1, tcnt, %l5
stx r0, [rp - 8]
sllx u1, cnt, %l3
or %l2, %l5, r1
stx r1, [rp - 16]
add rp, -16, rp
C Every path arrives here with n equal to -2 or -1 (given n >= 1 on entry),
C so after the add n is 0 or 1: the number of source limbs not yet loaded.
L(end12):
addcc n, 2, n
bz,pn %xcc, L(done)
nop
C Single-limb tail.  Given the note above, this body runs at most once.
C [up - 16] is the next unread limb because [up - 8] was the last one loaded.
L(loop0):
add rp, -8, rp
subcc n, 1, n
ldx [up - 16], u1
add up, -8, up
srlx u1, tcnt, %l4
or %l4, %l3, r0
stx r0, [rp + 0]
C The delay slot refreshes %l3 with the limb just consumed, shifted left.
bnz,pt %xcc, L(loop0)
sllx u1, cnt, %l3
C %l3 holds the last consumed limb shifted left by cnt.  It has no lower
C neighbour to merge with and is stored as the final result limb.
L(done):
stx %l3, [rp - 8]
C The restore in the delay slot of ret copies retval into the caller %o0.
ret
restore retval, 0, %o0
EPILOGUE()
More information about the gmp-devel
mailing list