[Gmp-commit] /var/hg/gmp: Use both SSE and XOP trickery, and plain popcnt insn.

mercurial at gmplib.org
Fri Jun 2 00:34:07 UTC 2017


details:   /var/hg/gmp/rev/9392a8f4cc9c
changeset: 17417:9392a8f4cc9c
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Fri Jun 02 02:33:52 2017 +0200
description:
Use both SSE and XOP trickery, and plain popcnt insn.
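
Background note (not part of the commit): the vector path counts bits with a
16-entry nibble table (vpshlb splits each byte into its nibbles, vpperm does
the table lookup), while two limbs of every unrolled 64-byte block, plus any
odd leading/trailing limbs, use the hardware popcnt instruction.  A minimal C
sketch of the underlying nibble-count idea follows; the name mpn_hamdist_ref
is made up for illustration and this is the scalar reference, not the
vectorized code:

  #include <stddef.h>
  #include <stdint.h>

  /* 4-bit population counts, the same data the .asm keeps at L(cnsts).  */
  static const uint8_t nibble_cnt[16] =
    { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };

  /* Hamming distance over n 64-bit limbs: XOR the operands, then count
     set bits nibble by nibble.  The assembly performs 16 bytes of this
     per vpshlb/vpperm step and uses the popcnt instruction for the
     remaining limbs.  */
  uint64_t
  mpn_hamdist_ref (const uint64_t *up, const uint64_t *vp, size_t n)
  {
    uint64_t cnt = 0;
    for (size_t i = 0; i < n; i++)
      {
        uint64_t x = up[i] ^ vp[i];
        for (int b = 0; b < 64; b += 4)
          cnt += nibble_cnt[(x >> b) & 0xf];
      }
    return cnt;
  }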

diffstat:

 mpn/x86_64/bd1/hamdist.asm  |  181 +++++++++++++++++++++----------------------
 mpn/x86_64/bd1/popcount.asm |   43 +++++-----
 2 files changed, 111 insertions(+), 113 deletions(-)

diffs (truncated from 349 to 300 lines):

diff -r bdfde16d199e -r 9392a8f4cc9c mpn/x86_64/bd1/hamdist.asm
--- a/mpn/x86_64/bd1/hamdist.asm	Thu Jun 01 18:24:59 2017 +0200
+++ b/mpn/x86_64/bd1/hamdist.asm	Fri Jun 02 02:33:52 2017 +0200
@@ -34,8 +34,8 @@
 C		    cycles/limb	  good for cpu?
 C AMD K8,K9		n/a
 C AMD K10		n/a
-C AMD bd1	     1.93-2.49		y
-C AMD bd2	     1.81-2.30		y
+C AMD bd1	     1.51-2.0		y
+C AMD bd2	     1.50-1.9		y
 C AMD bd3		 ?
 C AMD bd4		 ?
 C AMD zen		n/a
@@ -53,6 +53,10 @@
 C Intel SLM		n/a
 C VIA nano		n/a
 
+C TODO
+C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C    intend to support old systems.
+
 C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
 C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
 C We fall back to the core2 code.
@@ -65,140 +69,133 @@
 define(`vp',		`%rsi')
 define(`n',		`%rdx')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_hamdist)
+	FUNC_ENTRY(3)
+	cmp	$5, n
+	jl	L(sma)
+
 	lea	L(cnsts)(%rip), %r9
 
-ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
-	     `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
+	xor	R32(%r10), R32(%r10)
+	test	$8, R8(vp)
+	jz	L(ali)
+	mov	(up), %r8
+	xor	(vp), %r8
+	add	$8, up
+	add	$8, vp
+	dec	n
+	popcnt	%r8, %r10
+L(ali):
+
+ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
+	     `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
 	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
-	movdqa	OFF2`'(%r9), %xmm6 	C splat shift counts
-	movdqa	OFF3`'(%r9), %xmm9 	C masks
+	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
+	movdqa	OFF3`'(%r9), %xmm5	C masks
 	pxor	%xmm4, %xmm4
-	pxor	%xmm5, %xmm5		C 0-reg for psadbw
 	pxor	%xmm8, %xmm8		C grand total count
 
-	xor	R32(%r10), R32(%r10)
-
 	mov	R32(n), R32(%rax)
-	and	$7, R32(%rax)
+	and	$6, R32(%rax)
+	lea	-64(up,%rax,8), up
+	lea	-64(vp,%rax,8), vp
 ifdef(`PIC',`
-	movslq	(%r9,%rax,4), %rax
-	add	%r9, %rax
-	jmp	*%rax
+	movslq	(%r9,%rax,2), %r11
+	add	%r9, %r11
+	jmp	*%r11
 ',`
-	jmp	*(%r9,%rax,8)
+	jmp	*(%r9,%rax,4)
 ')
 
-L(1):	mov	(up), %r10
-	add	$8, up
-	xor	(vp), %r10
-	add	$8, vp
-	.byte	0xf3,0x4d,0x0f,0xb8,0xd2	C popcnt %r10,%r10
-	dec	n
-	jnz	L(top)
-	mov	%r10, %rax
-	ret
-
-L(2):	add	$-48, up
-	add	$-48, vp
-	jmp	L(e2)
-
-L(3):	mov	(up), %r10
-	add	$-40, up
-	xor	(vp), %r10
-	add	$-40, vp
-	.byte	0xf3,0x4d,0x0f,0xb8,0xd2	C popcnt %r10,%r10
-	jmp	L(e2)
-
-L(4):	add	$-32, up
-	add	$-32, vp
-	jmp	L(e4)
-
-L(5):	mov	(up), %r10
-	add	$-24, up
-	xor	(vp), %r10
-	add	$-24, vp
-	.byte	0xf3,0x4d,0x0f,0xb8,0xd2	C popcnt %r10,%r10
-	jmp	L(e4)
-
-L(6):	add	$-16, up
-	add	$-16, vp
-	jmp	L(e6)
-
-L(7):	mov	(up), %r10
-	add	$-8, up
-	xor	(vp), %r10
-	add	$-8, vp
-	.byte	0xf3,0x4d,0x0f,0xb8,0xd2	C popcnt %r10,%r10
-	jmp	L(e6)
+L(0):	add	$64, up
+	add	$64, vp
+	sub	$2, n
 
 	ALIGN(32)
 L(top):	lddqu	(up), %xmm0
-	lddqu	(vp), %xmm10
-	pxor	%xmm10, %xmm0
+	pxor	(vp), %xmm0
 	vpshlb	%xmm6, %xmm0, %xmm1
-	pand	%xmm9, %xmm0
-	pand	%xmm9, %xmm1
+	pand	%xmm5, %xmm0
+	pand	%xmm5, %xmm1
 	vpperm	%xmm0, %xmm7, %xmm7, %xmm2
 	vpperm	%xmm1, %xmm7, %xmm7, %xmm3
 	paddb	%xmm2, %xmm3
 	paddb	%xmm3, %xmm4
-L(e6):	lddqu	16(up), %xmm0
-	lddqu	16(vp), %xmm10
-	pxor	%xmm10, %xmm0
+L(6):	lddqu	16(up), %xmm0
+	pxor	16(vp), %xmm0
 	vpshlb	%xmm6, %xmm0, %xmm1
-	pand	%xmm9, %xmm0
-	pand	%xmm9, %xmm1
-	vpperm	%xmm0, %xmm7, %xmm7, %xmm2
-	vpperm	%xmm1, %xmm7, %xmm7, %xmm3
-	paddb	%xmm2, %xmm3
-	paddb	%xmm3, %xmm4
-L(e4):	lddqu	32(up), %xmm0
-	lddqu	32(vp), %xmm10
-	pxor	%xmm10, %xmm0
-	vpshlb	%xmm6, %xmm0, %xmm1
-	pand	%xmm9, %xmm0
-	pand	%xmm9, %xmm1
+	pand	%xmm5, %xmm0
+	pand	%xmm5, %xmm1
 	vpperm	%xmm0, %xmm7, %xmm7, %xmm2
 	vpperm	%xmm1, %xmm7, %xmm7, %xmm3
 	paddb	%xmm2, %xmm3
 	paddb	%xmm3, %xmm4
-L(e2):	lddqu	48(up), %xmm0
-	add	$64, up
-	lddqu	48(vp), %xmm10
-	add	$64, vp
-	pxor	%xmm10, %xmm0
+L(4):	lddqu	32(up), %xmm0
+	pxor	32(vp), %xmm0
 	vpshlb	%xmm6, %xmm0, %xmm1
-	pand	%xmm9, %xmm0
-	pand	%xmm9, %xmm1
+	pand	%xmm5, %xmm0
+	pand	%xmm5, %xmm1
 	vpperm	%xmm0, %xmm7, %xmm7, %xmm2
-	psadbw	%xmm5, %xmm4		C sum to 8 x 16-bit counts
-	paddq	%xmm4, %xmm8		C sum to 2 x 64-bit counts
+	vphaddubq %xmm4, %xmm0			C sum to 8 x 16-bit counts
 	vpperm	%xmm1, %xmm7, %xmm7, %xmm4
+	paddb	%xmm2, %xmm3
 	paddb	%xmm2, %xmm4
+	paddq	%xmm0, %xmm8		C sum to 2 x 64-bit counts
+L(2):	mov	48(up), %r8
+	mov	56(up), %r9
+	add	$64, up
+	xor	48(vp), %r8
+	xor	56(vp), %r9
+	add	$64, vp
+	popcnt	%r8, %r8
+	popcnt	%r9, %r9
+	add	%r8, %r10
+	add	%r9, %r10
 	sub	$8, n
 	jg	L(top)
 
-	psadbw	%xmm5, %xmm4
-	paddq	%xmm4, %xmm8
+	test	$1, R8(n)
+	jz	L(x)
+	mov	(up), %r8
+	xor	(vp), %r8
+	popcnt	%r8, %r8
+	add	%r8, %r10
+L(x):	vphaddubq %xmm4, %xmm0			C sum to 8 x 16-bit counts
+	paddq	%xmm0, %xmm8
 	pshufd	$14, %xmm8, %xmm0
 	paddq	%xmm8, %xmm0
 	movq	%xmm0, %rax
 	add	%r10, %rax
+	FUNC_EXIT()
+	ret
+
+L(sma):	mov	(up), %r8
+	xor	(vp), %r8
+	popcnt	%r8, %rax
+	dec	n
+	jz	L(ed)
+L(tp):	mov	8(up), %r8
+	add	$8, up
+	xor	8(vp), %r8
+	add	$8, vp
+	popcnt	%r8, %r8
+	add	%r8, %rax
+	dec	n
+	jnz	L(tp)
+L(ed):	FUNC_EXIT()
 	ret
 EPILOGUE()
-DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
-	JMPENT(	L(top), L(cnsts))
-	JMPENT(	L(1), L(cnsts))
+DEF_OBJECT(L(cnsts),16)
+	JMPENT(	L(0), L(cnsts))
 	JMPENT(	L(2), L(cnsts))
-	JMPENT(	L(3), L(cnsts))
 	JMPENT(	L(4), L(cnsts))
-	JMPENT(	L(5), L(cnsts))
 	JMPENT(	L(6), L(cnsts))
-	JMPENT(	L(7), L(cnsts))
 	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
 	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
 	.byte	-4,-4,-4,-4,-4,-4,-4,-4
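
Side note (not part of the commit): the header comment above remarks that
OSXSAVE alone is not a reliable signal, since NetBSD sets it yet still
triggers SIGILL for AVX, in which case the code falls back to the core2
routine.  For reference, the conventional user-space probe for "XOP usable"
combines three CPUID/XGETBV checks, roughly as in the sketch below; the
helper name can_use_xop is made up and this is not the check GMP itself
performs:

  #include <cpuid.h>    /* __get_cpuid, GCC/Clang */
  #include <stdint.h>

  /* Illustrative only: CPU advertises XOP, OS advertises OSXSAVE, and
     XCR0 confirms that XMM and YMM state are actually saved.  Per the
     comment in hamdist.asm, even this can be insufficient on some
     systems, hence the runtime fallback to the core2 code.  */
  static int
  can_use_xop (void)
  {
    unsigned a, b, c, d;

    if (!__get_cpuid (1, &a, &b, &c, &d) || !(c & (1u << 27)))          /* OSXSAVE */
      return 0;
    if (!__get_cpuid (0x80000001, &a, &b, &c, &d) || !(c & (1u << 11))) /* XOP */
      return 0;

    uint32_t lo, hi;
    __asm__ ("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
    uint64_t xcr0 = ((uint64_t) hi << 32) | lo;
    return (xcr0 & 6) == 6;     /* XMM and YMM state enabled in XCR0 */
  }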
diff -r bdfde16d199e -r 9392a8f4cc9c mpn/x86_64/bd1/popcount.asm
--- a/mpn/x86_64/bd1/popcount.asm	Thu Jun 01 18:24:59 2017 +0200
+++ b/mpn/x86_64/bd1/popcount.asm	Fri Jun 02 02:33:52 2017 +0200
@@ -34,8 +34,8 @@
 C		    cycles/limb	  good for cpu?
 C AMD K8,K9		n/a
 C AMD K10		n/a
-C AMD bd1	     1.63-1.76		y
-C AMD bd2	     1.62-1.73		y
+C AMD bd1		 1.27		y
+C AMD bd2		 1.24		y
 C AMD bd3		 ?
 C AMD bd4		 ?
 C AMD zen		n/a
@@ -54,9 +54,8 @@
 C VIA nano		n/a
 
 C TODO
-C  * Perform some load-use scheduling for a small speedup.
-C  * The innerloop takes around 13 cycles.  That means that we could do 3 plain
-C    popcnt instructions in parallel and thereby approach 1.17 c/l.
+C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C    intend to support old systems.
 
 C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
 C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
@@ -69,19 +68,23 @@
 define(`up',		`%rdi')
 define(`n',		`%rsi')
 
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
 ASM_START()
 	TEXT
 	ALIGN(32)
 PROLOGUE(mpn_popcount)
+	FUNC_ENTRY(3)
 	lea	L(cnsts)(%rip), %r9
 
 ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
 	     `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
 	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
-	movdqa	OFF2`'(%r9), %xmm6 	C splat shift counts
-	movdqa	OFF3`'(%r9), %xmm9 	C masks
+	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
+	movdqa	OFF3`'(%r9), %xmm9	C masks
 	pxor	%xmm4, %xmm4
-	pxor	%xmm5, %xmm5		C 0-reg for psadbw
+	pxor	%xmm5, %xmm5		C 0-reg
 	pxor	%xmm8, %xmm8		C grand total count

