rshift for 64bit Core2
Peter Cordes
peter at cordes.ca
Sat Mar 22 12:37:56 CET 2008
On Fri, Mar 21, 2008 at 11:19:14PM +0100, Torbjorn Granlund wrote:
> I reached 1.25 c/l with 4-way unrolling some time ago. This is the
> inner loop:
Thanks for the starting point.  It only goes 1.33 c/l for me, though
(counting overhead, so maybe it's really 1.25).  I haven't profiled it yet,
either.
Instead of trying to set up all the registers for a computed jump into the
middle of the loop, I just used a rolled-up loop that runs until n & 3 == 0,
at which point we're ready to enter the unrolled loop (sketched in C below).
It's not the fastest possible for small n, but it's pretty good and doesn't
fill up the I-cache.  It could also be used as a cleanup loop at the end,
since it already tests for n == 0 as an exit condition.
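Roughly that structure in C, as a minimal sketch (shrd64 and rshift_sketch
are made-up names for illustration; like mpn_rshift, it assumes n >= 1 and
1 <= cnt <= 63):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb_t;

/* what one shrd instruction computes: shift hi:lo right by cnt, keep the low half */
static limb_t shrd64(limb_t lo, limb_t hi, unsigned cnt)
{
    return (lo >> cnt) | (hi << (64 - cnt));
}

static limb_t rshift_sketch(limb_t *dst, const limb_t *src, size_t n, unsigned cnt)
{
    limb_t retval = src[0] << (64 - cnt);  /* bits shifted out of the low limb */
    size_t i = 0;

    /* intro loop: runs at most 3 times, until the number of limb pairs
       left to process is a multiple of 4 */
    while ((n - 1 - i) % 4 != 0) {
        dst[i] = shrd64(src[i], src[i + 1], cnt);
        i++;
    }
    /* 4-way unrolled main loop, entered at a single point */
    for (; i < n - 1; i += 4) {
        dst[i]     = shrd64(src[i],     src[i + 1], cnt);
        dst[i + 1] = shrd64(src[i + 1], src[i + 2], cnt);
        dst[i + 2] = shrd64(src[i + 2], src[i + 3], cnt);
        dst[i + 3] = shrd64(src[i + 3], src[i + 4], cnt);
    }
    dst[n - 1] = src[n - 1] >> cnt;  /* most significant limb */
    return retval;
}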
It also makes it easy to try different unroll factors, because the loop only
has to be entered in one place (in the sketch above, change the modulus from
4 to 8).  Unrolling to 8 limbs/iter can hit 1.20 c/l.  It requires the loop
body to be ALIGN(16)ed, though.  Without that, it's slower than 4 limbs/iter.
The 8-limb version was just a naive copy of the loop body with adjusted
offsets.  I could set up my intro loop to jump into the middle of that,
adjusting only %rsi, %rdi, and %rdx (since the two halves of the loop need
the same limbs in the same registers).  More unrolling makes the intro loop
even worse, though: it can run up to 7 times (the unroll factor minus one)
instead of 3.
Pipelining it deeper might get it down even closer to 1 c/l, even without
unrolling.
The better pipelining means it beats the SSE2 loop all the time.  I get a
flat ~2.4 c/l from SIZE=2500 up to SIZE=120000 (2.5 c/l there), gradually
increasing to ~10.8 c/l beyond that.  (My best SSE2 version, with aligned
128-bit loads/stores, also got 10.8 c/l; the others were slower.)
The "intro loop" column is the relevant one for the function listed below.
C Conroe:    intro loop   computed goto   triple jcc (pollutes the branch predictor)
C SIZE=1;      11.008        10.016          8.992   cycles/limb
C SIZE=2;       6.504         7.008          7.464
C SIZE=3;       5.355         5.333          5.013
C SIZE=4;       4.760         4.260          5.760
C SIZE=5;       3.014         3.206          3.014
C SIZE=6;       3.005         3.509          3.509
C SIZE=7;       2.999         3.163          2.999
C SIZE=8;       3.006         2.934          2.718
C SIZE=9;       2.222         2.596          2.347
C SIZE=10;      2.306         2.693          2.710
C SIZE=496;     1.331         1.571          1.571
C Harpertown: the same
C K8: size 497: ~4.15 c/l
ASM_START()
ALIGN(16)
PROLOGUE(mpn_rshift_core2)
C We would like to get lots of instructions into the OOO execution engine early, so it has plenty to work on...
C cmp $16000, %rdx
C jge mpn_rshift_sse2 C TODO: and not penryn/harpertown unless we can use the aligned version
C regs for the loop. use macros to make register reallocation easy.
define(n,%rdx)
define(reg1,%r9)
define(reg2,%rax)
define(reg3,%r8) C referenced the fewest times
define(reg4,%r11)
C shift count can stay where it is in %rcx
C push reg2
C push reg4
mov (%rsi), reg4 C reg4 = limb0
xor %eax, %eax
shrd %cl, reg4, %rax C %rax = ret val limb. reg4 still = limb0
push %rax C save retval
C mov %rsi, %r9
jmp L(c2_entry)
L(c2_shortloop):
shrd %cl, reg1, reg4
mov reg4, (%rdi)
mov reg1, reg4
add $8, %rdi
C add $8, %r9
L(c2_entry):
dec n C sub looks like it makes things align better, but dec has the same timings
C sub $1, n
jle L(c2_end)
add $8, %rsi
mov (%rsi), reg1 C reg4=limb0 reg1=limb1
test $3, %dl
jnz L(c2_shortloop)
C the short loop's last iter stored the i'th limb to dest and loaded the (i+1)'th limb from src
C reg4=limb(i) reg1=limb(i+1). %rdx=n-i-1, %rdx%4=0 %rsi -> limb(i+1)
C mov (%rsi), reg1
mov 8(%rsi), reg2
lea 16(%rsi),%rsi
C debug: %r9 = %rsi
C mov (%r9), reg4
C mov 8(%r9), reg1
C mov 16(%r9),reg2
C require: reg1=limb1; reg2=limb2; reg3=xxx; reg4=limb0
C loop is <= 18 insns and <= 4 16-byte aligned blocks, so it fits in Core 2's loop stream buffer, so alignment doesn't matter
C If it's misaligned so it doesn't fit in the loop buffer, it runs ~1.57 c/l instead of ~1.33
C further unrolling would push it beyond the size of the loop stream detector (it's already close in bytes); 8 limbs/iter runs at ~1.67 c/l
C use simple addressing modes, not (%rdi,%rdx,8).  That generates too many reads of not-in-flight register values
C ALIGN(16)
L(c2_loop): shrd %cl, reg1, reg4
mov (%rsi), reg3
mov reg4, (%rdi)
C L(c2_10):
shrd %cl, reg2, reg1
mov 8(%rsi), reg4
mov reg1, 8(%rdi)
C L(c2_01):
shrd %cl, reg3, reg2
mov 16(%rsi),reg1
mov reg2, 16(%rdi)
C L(c2_00):
shrd %cl, reg4, reg3
mov 24(%rsi),reg2
lea 32(%rsi),%rsi
mov reg3, 24(%rdi)
lea 32(%rdi),%rdi
sub $4, n
jg L(c2_loop)
C ALIGN(16) would align the branch target, but it doesn't seem to matter.
C L(c2_endshort):
L(c2_end):
pop %rax C return val
shr %cl, reg4 C compute most significant limb
mov reg4, (%rdi) C store it
C pop reg4
C pop reg2
ret
EPILOGUE()
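For anyone who wants to sanity-check a tweaked version: a minimal harness
sketch, assuming this assembles (via GMP's m4 setup) into the symbol
mpn_rshift_core2 with the standard mpn_rshift signature (mp_size_t taken as
long here); ref_rshift is just the naive definition of the operation.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* the asm above, assuming the usual mpn_rshift calling convention */
extern mp_limb_t mpn_rshift_core2(mp_limb_t *rp, const mp_limb_t *up,
                                  long n, unsigned cnt);

/* naive reference, returning the bits shifted out of the low limb */
static mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *up,
                            long n, unsigned cnt)
{
    mp_limb_t ret = up[0] << (64 - cnt);
    for (long i = 0; i < n - 1; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
    rp[n - 1] = up[n - 1] >> cnt;
    return ret;
}

int main(void)
{
    enum { MAXN = 497 };
    static mp_limb_t up[MAXN], r1[MAXN], r2[MAXN];
    srand(42);
    for (long n = 1; n <= MAXN; n++)
        for (unsigned cnt = 1; cnt < 64; cnt += 13) {
            for (long i = 0; i < n; i++)
                up[i] = ((mp_limb_t)rand() << 32) ^ (mp_limb_t)rand();
            mp_limb_t a = mpn_rshift_core2(r1, up, n, cnt);
            mp_limb_t b = ref_rshift(r2, up, n, cnt);
            if (a != b || memcmp(r1, r2, n * sizeof r1[0]) != 0) {
                printf("mismatch at n=%ld cnt=%u\n", n, cnt);
                return 1;
            }
        }
    puts("all ok");
    return 0;
}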
--
#define X(x,y) x##y
Peter Cordes ; e-mail: X(peter at cor , des.ca)
"The gods confound the man who first found out how to distinguish the hours!
Confound him, too, who in this place set up a sundial, to cut and hack
my day so wretchedly into small pieces!" -- Plautus, 200 BC