Better tabselect
David Miller
davem at davemloft.net
Fri Apr 12 05:58:21 CEST 2013
From: David Miller <davem at davemloft.net>
Date: Thu, 11 Apr 2013 19:06:17 -0400 (EDT)
> From: Torbjorn Granlund <tg at gmplib.org>
> Date: Thu, 11 Apr 2013 23:55:18 +0200
>
>> I think we need to write new tabselect also for ppc64, sparc64, and
>> perhaps x86_32. The latter could use a variant of our
>> x64-sse-horis-tabselect-w8.asm, at least on some Intel CPUs.
>
> I'll take a stab at sparc64.
The existing C code approaches 6 cycles/limb on T4. The best I can do
with this new approach, using 4-way unrolling and no pipelining, is
~4.5 cycles/limb:
! Table-select loop sketch for sparc64.  The names used here (n, nents,
! which, tp, rp, stride, tporig, i, j, mask, t0..t3, data0..data3, L())
! are symbolic; their definitions are not part of this snippet.
!
! For each group of four 64-bit words, the inner loop visits all nents
! table entries, ANDs the entry's four words with a mask that is either
! all ones or all zeros, and ORs the result into four accumulators, which
! are then stored to rp.  Every entry is loaded on every pass; `which`
! only feeds the mask computation, never an address or a branch.
!
! NOTE(review): the outer loop ends only when j is exactly zero after
! subtracting 4, so n appears to have to be a non-zero multiple of 4; no
! tail handling is visible here -- confirm.
! NOTE(review): both the mask select and the outer branch test %icc (the
! 32-bit condition codes) -- confirm this is intended for the operand
! ranges in use.
!
! stride = n * 8: the amount tp advances per table entry in the inner loop
! (presumably n limbs of 8 bytes each -- confirm).
sll n, 3, stride
! which = nents - which, so it can be compared directly with the
! down-counter i (i == nents - which on the pass for entry `which`).
sub nents, which, which
! tporig = base of the current four-word group; j = words still to produce.
mov tp, tporig
mov n, j
L(outer_loop):
! Zero the four accumulators for this group of words.
clr data0
clr data1
clr data2
clr data3
! Restart at the first entry; i counts the entries down from nents.
mov tporig, tp
mov nents, i
L(top):
! Loads, mask computation and pointer/counter updates are interleaved
! below (presumably for scheduling -- keep the order as is).
ldx [tp + 0], t0
! mask defaults to 0 ...
clr mask
ldx [tp + 8], t1
cmp which, i
ldx [tp + 16], t2
! ... and becomes -1 (all ones) only when the compare above found equality.
move %icc, -1, mask
ldx [tp + 24], t3
sub i, 1, i
! Step tp to the same four words of the next entry.
add tp, stride, tp
! Keep the loaded words only when mask is all ones, then merge them into
! the accumulators.
and t0, mask, t0
and t1, mask, t1
or t0, data0, data0
and t2, mask, t2
or t1, data1, data1
and t3, mask, t3
or t2, data2, data2
! Loop while entries remain (i != 0).
brnz i, L(top)
! Delay slot: the last merge is executed whether or not the branch is taken.
or t3, data3, data3
! Store the four accumulated words to the result.
stx data0, [rp + 0]
! j -= 4, setting the condition codes tested by the bne below (the
! instructions in between do not modify them).
subcc j, 4, j
stx data1, [rp + 8]
stx data2, [rp + 16]
stx data3, [rp + 24]
! Move the table base on to the next four words of every entry.
add tporig, (4 * 8), tporig
! Repeat while j != 0.
bne,pt %icc, L(outer_loop)
! Delay slot: advance rp past the four words just stored.
add rp, (4 * 8), rp
More information about the gmp-devel
mailing list