SSE2 rshift for 64bit Core2

Wed Mar 19 05:55:04 CET 2008

On Tue, Mar 18, 2008 at 04:19:36PM -0300, Peter Cordes wrote:
>  The other major limitation of my code currently is that it requires the
> input and output operands to be 16 byte aligned.  I use movdqa to load and
> store.  It might be possible to maintain 2.0 c/l on Core2 while using 64 bit
> loads and stores.

 I rewrote the loop to use movq 64bit loads, instead of 128bit movdqa, but I
haven't eliminated the movdqa for stores.  And any sane caller will have
their 64bit limbs aligned to 64bits, which is all movq needs to go fast.
Just changing movdqa to movdqu for stores give 4.0 c/l instead of 2, IIRC.
So maybe we can unpack the result and use two movqs.

 The loop still runs at 2.0c/l on Conroe, but 2.5 c/l on Harpertown, which
has shorter latency on some SSE shuffle/shift instructions.  That's no good,
because that's slower than the MMX code.  Maybe with more time tweaking, I
could find out what Harpertown/Penryn doesn't like.  K8 gets 3.0c/l on this
code, which is also slower than the MMX code.

 Having limb1 and limb0 in separate registers actually made the code easier.
It doesn't have to load so far ahead, either.  (which is good, because
without 16byte alignment, the next limb in the source might be on the next
page, which might segfault.)  Loading them at different times allows for
doing things in the register you're about to load the new limb to.

 I updated the code in my darcs repo (see last email).  Here's the new
version of the function:

I stripped out a few bench number from what's in the file from my DARCS repo,
http://cordes.ca/repos/gmp-shift/rshift.asm

C version using unaligned loads but aligned stores:
C K8 (2.6 GHz)
C size 1:	16.926	c/l
C size 8:	4.358	c/l (no ALIGN: 4.607)
C size 496:	3.052	c/l. (no ALIGN: 3.547) old: 3.039c/l with store commented out.
C size 10000	5.244	c/l (no ALIGN: 5.257)
C size 10000000	14.053	c/l. (no ALIGN: 14.529)

C Conroe:
C size 1	12.096	c/l
C size 2	6.504	c/l
C size 3	5.376	c/l  (no ALIGN: 5.312)
C size 4	4.040	c/l  (no ALIGN: 3.980, and one less icache line touched?)
C size 8	2.988	c/l  (no ALIGN: 2.988)
C size 496:	2.052	c/l. (no ALIGN: 2.052)
C size 10000	2.320	c/l  (no ALIGN: 2.320)
C size 10000000 11.178  c/l. (no ALIGN: 11.195)  (2.4GHz, g965, dual channel DDR800)

C Harpertown (2.8GHz):
C size 1	12.115	c/l.
C size 8	3.229	c/l. (no ALIGN: 3.134)
C size 496	2.581	c/l. (no ALIGN: 2.558)
C size 10000	2.847	c/l  (no ALIGN: 2.968)
C size 10000000	14.460	c/l. (no ALIGN: 14.403)  (don't know exact hardware config on this box)

ASM_START()
PROLOGUE(mpn_rshift_sse2)
	movq	(%rsi), %xmm7		C %7 = limb0
	movd	%ecx, %xmm1
	sub	$64, %ecx		C cnt must be <=64, so it's ok to operate on small version of it
	neg	%ecx			C we want 64-cnt in ecx as a shift count for getting the return value
	movq	%xmm7, %rax		C %rax = limb0
 	movd	%ecx, %xmm0		C %0=64-cnt=left count=lc; %1=cnt;  C this can go anywhere before the loop.
	shlq	%cl, %rax		C return value=limb0<<lc. shift count has to be in %cl.	 C modifies flags, so it has to go before the loop

	leaq	-8(%rsi,%rdx,8), %rsi     C rsi = rsi + rdx*8 = last limb.
	leaq	-16(%rdi,%rdx,8), %rdi	C rdi = rdi + rdx*8 - 16 = last limb - 1
	negq	%rdx
C rsi=last srclimb+1; rdi=last destlimb+1; rdx=-n; %0=64-cnt; %1=cnt;
C %7=limb0;
C	xor	%ecx, %ecx		C can clear rcx here or later with mov
	addq	$2, %rdx		C n= -n + 2
	jge	L(unal_out)			C skip the loop if n<=2

C	ALIGN(16)			C No alignment needed on Core 2.  This makes n even case only 128B of instructions.
L(unal_loop):
	movq	(%rsi,%rdx,8), %xmm6	C %6 = limb1
	movdqa	%xmm7,	%xmm3
	punpcklqdq %xmm6, %xmm3
	psrlq	%xmm1,	%xmm3

	movq	8(%rsi,%rdx,8), %xmm7	C %7 = limb2
	punpcklqdq %xmm7, %xmm6
	psllq	%xmm0,	%xmm6

	por	%xmm3,	%xmm6
	movdqa	%xmm6,	(%rdi,%rdx,8) C store the result.

C	movq	16(%rsi,%rdx,8),	%xmm6	C %6 = limb3  (about to become limb1)
	addq	$2,	%rdx
	jl	L(unal_loop)

L(unal_out):
C %6=limb1; %7=limb0
C It's actually faster to mov $0 here instead of an xor before the loop.  It saves a ~0.2 cycles/limb on n=4.  It does make the function bigger.
	mov	$0, %ecx		C need to preserve the condition codes if we clear rcx here.
	cmovng	(%rsi), %rcx		C (%rsi,%rdx,8), but the move happens only when rdx is zero.
	movq	%rcx, %xmm6		C %6 = limb1, or 0 if we're off the end of the array.  (can't be garbage because it gets ORed with the result we want)

	punpcklqdq %xmm6, %xmm7
	psllq	%xmm0,	%xmm6
	psrlq	%xmm1,	%xmm7
	por	%xmm7,	%xmm6
	jg	L(unal_endodd)			C n = 1 limb left 	C condition flags still set from loop counter
	movdqa	%xmm6, (%rdi)	C store the result.
	ret

L(unal_endodd):
	movq	%xmm6, 8(%rdi)		C an odd limb to store
	ret

EPILOGUE()

-- 
#define X(x,y) x##y
Peter Cordes ;  e-mail: X(peter at cor , des.ca)

"The gods confound the man who first found out how to distinguish the hours!
 Confound him, too, who in this place set up a sundial, to cut and hack
 my day so wretchedly into small pieces!" -- Plautus, 200 BC