rshift for 64bit Core2

Peter Cordes peter at cordes.ca
Sat Mar 22 12:37:56 CET 2008


On Fri, Mar 21, 2008 at 11:19:14PM +0100, Torbjorn Granlund wrote:
> I reached 1.25 c/l with 4-way unrolling some time ago.  This is the
> inner loop:

 Thanks for the starting point.  It only goes 1.33c/l for me, though.
(counting overhead, so maybe it's really 1.25).  I haven't profiled it yet,
either.

 Instead of trying to set up all the registers for a computed jump into the
middle of the loop, I just used a rolled-up loop that goes until n&3 == 0,
at which point we're ready to enter the unrolled loop.  It's not the fastest
possible for small n, but it's pretty good and doesn't fill up the Icache.
It could also be used as a cleanup loop at the end, since it already tests
for n==0 as an exit condition.

 It also makes it easy to try different loop unrolling, because the loop
only has to be entered in one place.  Unrolling to 8limbs/iter can hit 1.20c/l.
It requires the loop body to be ALIGN(16)ed, though.  Without that, it's
slower than 4limbs/iter.

 That was just a naive copy of the loop body, adjusting the offsets.  I could
set up my intro loop to jump to the middle of that, only adjusting %rsi,
%rdi, and %rdx.  (Since the two halves of the loop need the same limbs in
the same registers.)  More unrolling makes the intro loop even worse: it can
run up to 7 times, instead of 3.

 Pipelining it deeper might get it down even closer to 1c/l, even without
unrolling.

 The better pipelining means it beats the SSE2 loop all the time.  I get a
flat ~2.4 c/l from SIZE=2500 up to SIZE=120000 (2.5c/l), gradually
increasing to ~10.8 c/l.  (my best SSE2 version with aligned 128bit
loads/stores also got 10.8 c/l.  The others were slower.)


The "intro loop" column is the relevant one for the function listed below.

C conroe: intro loop		computed goto     ; triple jcc version  (pollutes the branch predictor)
C SIZE=1; 11.008		10.016 cycles/limb  8.992
C SIZE=2; 6.504			7.008		    7.464
C SIZE=3; 5.355			5.333 cycles/limb   5.013
C SIZE=4; 4.760			4.260 cycles/limb   5.760
C SIZE=5; 3.014			3.206 cycles/limb   3.014
C SIZE=6; 3.005			3.509 cycles/limb   3.509
C SIZE=7; 2.999			3.163 cycles/limb   2.999
C SIZE=8; 3.006			2.934 cycles/limb   2.718
C SIZE=9; 2.222			2.596 cycles/limb   2.347
C SIZE=10; 2.306		2.693 cycles/limb   2.710
C SIZE=496; 1.331		1.571 cycles/limb 1.571


C Harpertown:
C the same

C K8
C size 497: ~4.15 c/l

ASM_START()
ALIGN(16)
PROLOGUE(mpn_rshift_core2)
C -----------------------------------------------------------------------
C mpn_rshift for 64-bit Core 2: shift an n-limb vector right by cnt bits.
C In (standard mpn args, SysV AMD64 — confirm against gmp-impl.h):
C	%rdi = rp (dest), %rsi = up (src), %rdx = n (limbs), %rcx = cnt
C Out:	%rax = bits shifted out of the low limb, left-justified
C	(i.e. up[0] << (64-cnt)); rp[0..n-1] = {up,n} >> cnt.
C Strategy: a rolled-up intro loop shifts one limb at a time until the
C remaining count is a multiple of 4, then a 4-way-unrolled,
C software-pipelined loop (~1.33 c/l measured on Conroe) does the bulk.
C
C NOTE(review): hand-tracing n=5 and n=9 shows the last unrolled-loop
C iteration's 16(%rsi) and 24(%rsi) loads read up to TWO limbs past the
C end of the source operand (the loaded values are discarded, results are
C correct).  That can fault if the operand ends near a page boundary —
C confirm and guard (e.g. peel the final iteration) before production use.
C -----------------------------------------------------------------------
C 	C would like to get lots of instructions into the OOO execution engine early so it has plenty to work on...
C 	cmp	$16000,	%rdx
C 	jge	mpn_rshift_sse2		C TODO: and not penryn/harpertown unless we can use the aligned version


C regs for the loop.  use macros to make register reallocation easy.
define(n,%rdx)
define(reg1,%r9)
define(reg2,%rax)
define(reg3,%r8)  		C referenced the fewest times
define(reg4,%r11)

	C shift count can stay where it is in %rcx
C 	push	reg2
C 	push	reg4
	mov	(%rsi), reg4		C reg4 = limb0
	xor	%eax,	%eax
	shrd	%cl,	reg4,	%rax	C %rax = ret val limb (limb0 << (64-cnt)).  reg4 still = limb0
	push	%rax			C save retval; popped in the epilogue (reg2 aliases %rax)

C	mov	%rsi,	%r9
	jmp	L(c2_entry)
C intro loop: one limb per iteration, runs until the remaining count n
C is a multiple of 4 so the unrolled loop can be entered at its top.
L(c2_shortloop):
	shrd	%cl,	reg1,	reg4	C reg4 = low 64 bits of (reg1:reg4) >> cnt
	mov	reg4,	(%rdi)
	mov	reg1,	reg4		C rotate: next iteration's low limb
	add	$8,	%rdi
C	add	$8,	%r9
L(c2_entry):
	dec	n		C sub looks like it makes things align better, but dec has the same timings
C 	sub	$1,	n
	jle	L(c2_end)	C n exhausted: only the top limb remains to store
	add	$8,	%rsi
	mov	(%rsi),	reg1	C reg4=limb0 reg1=limb1
	test	$3,	%dl
	jnz	L(c2_shortloop)	C keep rolling until n is a multiple of 4

C loop last iter stores ith limb to dest, and loads i+1st limb from src
C	reg4=limb(i) reg1=limb(i+1).  %rdx=n-i-1, %rdx%4=0  %rsi -> limb(i+1)

C  	mov	(%rsi),	reg1
  	mov	8(%rsi), reg2		C preload limb(i+2) to prime the pipeline
	lea	16(%rsi),%rsi

C debug: %r9 = %rsi
C 	mov	(%r9),	reg4
C 	mov	8(%r9),	reg1
C 	mov	16(%r9),reg2

C require: reg1=limb1; reg2=limb2; reg3=xxx;  reg4=limb0

C loop is <= 18 insn and <= 4 16byte aligned blocks, so fits into Core 2's loop stream buffer, so alignment doesn't matter
C If this is misaligned so it doesn't fit in the loop buffer, it runs ~1.57 c/l instead of ~1.33
C further unrolling will push it beyond the size of the loop stream detector.  (already close in bytes).  8 limbs/iter runs at ~1.67 c/l
C use simple addressing modes, not (%rdi,%rdx,8).  That generates too many reads of not-in-flight register values
C ALIGN(16)
C main loop: 4 limbs/iteration.  Each shrd consumes a limb pair loaded on
C an earlier rotation step, hiding load latency behind the shifts.
L(c2_loop):	shrd	%cl, reg1, reg4
	mov	(%rsi),	reg3
	mov	reg4,	(%rdi)
C	L(c2_10):
	shrd	%cl, reg2, reg1
	mov	8(%rsi), reg4
	mov	reg1,	8(%rdi)
C	L(c2_01):
	shrd	%cl, reg3, reg2
	mov	16(%rsi),reg1
	mov	reg2,	16(%rdi)
C	L(c2_00):
	shrd	%cl, reg4, reg3
	mov	24(%rsi),reg2
	lea	32(%rsi),%rsi		C lea advances pointers without touching flags
	mov	reg3,	24(%rdi)
	lea	32(%rdi),%rdi
	sub	$4, n
	jg	L(c2_loop)
C ALIGN(16)  would align the branch target, but it doesn't seem to matter.
C L(c2_endshort):
L(c2_end):
	pop	%rax			C return val (bits shifted out, saved in prologue)
	shr	%cl,	reg4		C compute most significant limb
	mov	reg4,	(%rdi)		C store it as rp[n-1]
C  	pop	reg4
C  	pop	reg2
	ret
EPILOGUE()


-- 
#define X(x,y) x##y
Peter Cordes ;  e-mail: X(peter at cor , des.ca)

"The gods confound the man who first found out how to distinguish the hours!
 Confound him, too, who in this place set up a sundial, to cut and hack
 my day so wretchedly into small pieces!" -- Plautus, 200 BC


More information about the gmp-devel mailing list