[Gmp-commit] /var/hg/gmp: 4 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Tue May 30 00:40:51 UTC 2017


details:   /var/hg/gmp/rev/793826d9853a
changeset: 17400:793826d9853a
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Tue May 30 02:01:04 2017 +0200
description:
Replace grabber code with implementation proper.

details:   /var/hg/gmp/rev/fc2807686c78
changeset: 17401:fc2807686c78
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Tue May 30 02:03:29 2017 +0200
description:
Copyright header.

details:   /var/hg/gmp/rev/bdeffa9701b2
changeset: 17402:bdeffa9701b2
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Tue May 30 02:24:10 2017 +0200
description:
Provide popcount and hamdist optimised for conroe/penryn.

details:   /var/hg/gmp/rev/e5ab0550ae5c
changeset: 17403:e5ab0550ae5c
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Tue May 30 02:39:05 2017 +0200
description:
Expand popcnt insns into .byte sequences.

diffstat:

 mpn/x86_64/bd1/hamdist.asm    |  174 +++++++++++++++++++++++++++++++++-
 mpn/x86_64/bd1/popcount.asm   |  149 ++++++++++++++++++++++++++++-
 mpn/x86_64/core2/hamdist.asm  |  209 ++++++++++++++++++++++++++++++++++++++++++
 mpn/x86_64/core2/popcount.asm |  157 ++++++++++++++++++++++++++++++-
 4 files changed, 673 insertions(+), 16 deletions(-)

diffs (truncated from 731 to 300 lines):

diff -r 4b4c00f65d5c -r e5ab0550ae5c mpn/x86_64/bd1/hamdist.asm
--- a/mpn/x86_64/bd1/hamdist.asm	Mon May 29 01:45:55 2017 +0200
+++ b/mpn/x86_64/bd1/hamdist.asm	Tue May 30 02:39:05 2017 +0200
@@ -1,6 +1,6 @@
-dnl  AMD64 mpn_hamdist -- hamming distance.
+dnl  AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.
 
-dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl  Copyright 2010-2017 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -31,8 +31,170 @@
 
 include(`../config.m4')
 
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+C		    cycles/limb	  good for cpu?
+C AMD K8,K9		n/a
+C AMD K10		n/a
+C AMD bd1	     1.93-2.49		y
+C AMD bd2	     1.81-2.30		y
+C AMD bd3		 ?
+C AMD bd4		 ?
+C AMD zen		n/a
+C AMD bobcat		n/a
+C AMD jaguar		n/a
+C Intel P4		n/a
+C Intel PNR		n/a
+C Intel NHM		n/a
+C Intel SBR		n/a
+C Intel IBR		n/a
+C Intel HWL		n/a
+C Intel BWL		n/a
+C Intel SKL		n/a
+C Intel atom		n/a
+C Intel SLM		n/a
+C VIA nano		n/a
 
-MULFUNC_PROLOGUE(mpn_hamdist)
-include_mpn(`x86_64/k10/hamdist.asm')
+define(`up',		`%rdi')	C first operand pointer
+define(`vp',		`%rsi')	C second operand pointer
+define(`n',		`%rdx')	C count of 8-byte limbs
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_hamdist)	C rax = number of set bits in up[] xor vp[] over n limbs
+	lea	L(cnsts)(%rip), %r9	C r9 = jump table followed by constants
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
+	     `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')	C skip 8 jump entries of 4 (PIC) or 8 bytes
+	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
+	movdqa	OFF2`'(%r9), %xmm6 	C splat shift counts
+	movdqa	OFF3`'(%r9), %xmm9 	C masks
+	pxor	%xmm4, %xmm4		C per-byte partial counts
+	pxor	%xmm5, %xmm5		C 0-reg for psadbw
+	pxor	%xmm8, %xmm8		C grand total count
+
+	xor	R32(%r10), R32(%r10)	C scalar count of the odd leading limb, 0 if none
+
+	mov	R32(n), R32(%rax)
+	and	$7, R32(%rax)		C rax = n mod 8 selects the entry point
+ifdef(`PIC',`
+	movslq	(%r9,%rax,4), %rax
+	add	%r9, %rax
+	jmp	*%rax
+',`
+	jmp	*(%r9,%rax,8)
+')
+
+L(1):	mov	(up), %r10		C n mod 8 = 1, take limb 0 with scalar popcnt
+	add	$8, up
+	xor	(vp), %r10		C bits differing in limb 0
+	add	$8, vp
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd2	C popcnt %r10,%r10
+	dec	n
+	jnz	L(top)			C what remains is a multiple of 8 limbs
+	mov	%r10, %rax		C n was 1, the scalar count is the result
+	ret
+
+L(2):	add	$-48, up		C n mod 8 = 2, bias so 48(up) at e2 is limb 0
+	add	$-48, vp
+	jmp	L(e2)
+
+L(3):	mov	(up), %r10		C n mod 8 = 3, take limb 0 with scalar popcnt
+	add	$-40, up		C bias so 48(up) at e2 is limb 1
+	xor	(vp), %r10
+	add	$-40, vp
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd2	C popcnt %r10,%r10
+	jmp	L(e2)
+
+L(4):	add	$-32, up		C n mod 8 = 4, bias so 32(up) at e4 is limb 0
+	add	$-32, vp
+	jmp	L(e4)
+
+L(5):	mov	(up), %r10		C n mod 8 = 5, take limb 0 with scalar popcnt
+	add	$-24, up		C bias so 32(up) at e4 is limb 1
+	xor	(vp), %r10
+	add	$-24, vp
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd2	C popcnt %r10,%r10
+	jmp	L(e4)
+
+L(6):	add	$-16, up		C n mod 8 = 6, bias so 16(up) at e6 is limb 0
+	add	$-16, vp
+	jmp	L(e6)
+
+L(7):	mov	(up), %r10		C n mod 8 = 7, take limb 0 with scalar popcnt
+	add	$-8, up			C bias so 16(up) at e6 is limb 1
+	xor	(vp), %r10
+	add	$-8, vp
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd2	C popcnt %r10,%r10
+	jmp	L(e6)
+
+	ALIGN(32)
+L(top):	lddqu	(up), %xmm0		C 16 bytes = 2 limbs from up
+	lddqu	(vp), %xmm10		C matching 2 limbs from vp
+	pxor	%xmm10, %xmm0		C differing bits
+	vpshlb	%xmm6, %xmm0, %xmm1	C per-byte shift by the -4 counts, high nibble moves down
+	pand	%xmm9, %xmm0		C keep low nibbles
+	pand	%xmm9, %xmm1		C keep former high nibbles
+	vpperm	%xmm0, %xmm7, %xmm7, %xmm2	C table lookup, nibble -> bit count
+	vpperm	%xmm1, %xmm7, %xmm7, %xmm3
+	paddb	%xmm2, %xmm3		C bit count of each byte
+	paddb	%xmm3, %xmm4		C accumulate per-byte counts
+L(e6):	lddqu	16(up), %xmm0		C same step for the next 2 limbs
+	lddqu	16(vp), %xmm10
+	pxor	%xmm10, %xmm0
+	vpshlb	%xmm6, %xmm0, %xmm1
+	pand	%xmm9, %xmm0
+	pand	%xmm9, %xmm1
+	vpperm	%xmm0, %xmm7, %xmm7, %xmm2
+	vpperm	%xmm1, %xmm7, %xmm7, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e4):	lddqu	32(up), %xmm0		C same step for the next 2 limbs
+	lddqu	32(vp), %xmm10
+	pxor	%xmm10, %xmm0
+	vpshlb	%xmm6, %xmm0, %xmm1
+	pand	%xmm9, %xmm0
+	pand	%xmm9, %xmm1
+	vpperm	%xmm0, %xmm7, %xmm7, %xmm2
+	vpperm	%xmm1, %xmm7, %xmm7, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e2):	lddqu	48(up), %xmm0		C last 2 limbs of this 8-limb round
+	add	$64, up			C advance by 8 limbs
+	lddqu	48(vp), %xmm10
+	add	$64, vp
+	pxor	%xmm10, %xmm0
+	vpshlb	%xmm6, %xmm0, %xmm1
+	pand	%xmm9, %xmm0
+	pand	%xmm9, %xmm1
+	vpperm	%xmm0, %xmm7, %xmm7, %xmm2
+	psadbw	%xmm5, %xmm4		C sum to 8 x 16-bit counts
+	paddq	%xmm4, %xmm8		C sum to 2 x 64-bit counts
+	vpperm	%xmm1, %xmm7, %xmm7, %xmm4	C restart xmm4 from this lookup result
+	paddb	%xmm2, %xmm4		C xmm4 now holds only the e2 byte counts
+	sub	$8, n			C one round covers 8 limbs
+	jg	L(top)			C loop while limbs remain
+
+	psadbw	%xmm5, %xmm4		C fold the byte counts still pending in xmm4
+	paddq	%xmm4, %xmm8
+	pshufd	$14, %xmm8, %xmm0	C bring the high qword down to the low qword
+	paddq	%xmm8, %xmm0		C add the two 64-bit halves
+	movq	%xmm0, %rax
+	add	%r10, %rax		C add the scalar count of the odd limb
+	ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16)
+	JMPENT(	L(top), L(cnsts))	C entry for n mod 8 = 0
+	JMPENT(	L(1), L(cnsts))		C entry for n mod 8 = 1
+	JMPENT(	L(2), L(cnsts))		C entry for n mod 8 = 2
+	JMPENT(	L(3), L(cnsts))		C entry for n mod 8 = 3
+	JMPENT(	L(4), L(cnsts))		C entry for n mod 8 = 4
+	JMPENT(	L(5), L(cnsts))		C entry for n mod 8 = 5
+	JMPENT(	L(6), L(cnsts))		C entry for n mod 8 = 6
+	JMPENT(	L(7), L(cnsts))		C entry for n mod 8 = 7
+	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03	C OFF1: bit counts of nibbles 0-7
+	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04	C bit counts of nibbles 8-15
+	.byte	-4,-4,-4,-4,-4,-4,-4,-4	C OFF2: per-byte shift counts for vpshlb
+	.byte	-4,-4,-4,-4,-4,-4,-4,-4
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f	C OFF3: mask keeping the low nibble of each byte
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
diff -r 4b4c00f65d5c -r e5ab0550ae5c mpn/x86_64/bd1/popcount.asm
--- a/mpn/x86_64/bd1/popcount.asm	Mon May 29 01:45:55 2017 +0200
+++ b/mpn/x86_64/bd1/popcount.asm	Tue May 30 02:39:05 2017 +0200
@@ -1,6 +1,6 @@
-dnl  AMD64 mpn_popcount -- population count.
+dnl  AMD64 SSSE3/XOP mpn_popcount -- population count.
 
-dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl  Copyright 2010-2017 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -31,8 +31,145 @@
 
 include(`../config.m4')
 
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+C		    cycles/limb	  good for cpu?
+C AMD K8,K9		n/a
+C AMD K10		n/a
+C AMD bd1	     1.63-1.76		y
+C AMD bd2	     1.62-1.73		y
+C AMD bd3		 ?
+C AMD bd4		 ?
+C AMD zen		n/a
+C AMD bobcat		n/a
+C AMD jaguar		n/a
+C Intel P4		n/a
+C Intel PNR		n/a
+C Intel NHM		n/a
+C Intel SBR		n/a
+C Intel IBR		n/a
+C Intel HWL		n/a
+C Intel BWL		n/a
+C Intel SKL		n/a
+C Intel atom		n/a
+C Intel SLM		n/a
+C VIA nano		n/a
 
-MULFUNC_PROLOGUE(mpn_popcount)
-include_mpn(`x86_64/k10/popcount.asm')
+define(`up',		`%rdi')
+define(`n',		`%rsi')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_popcount)
+	lea	L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
+	     `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
+	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
+	movdqa	OFF2`'(%r9), %xmm6 	C splat shift counts
+	movdqa	OFF3`'(%r9), %xmm9 	C masks
+	pxor	%xmm4, %xmm4
+	pxor	%xmm5, %xmm5		C 0-reg for psadbw
+	pxor	%xmm8, %xmm8		C grand total count
+
+	xor	R32(%rdx), R32(%rdx)
+
+	mov	R32(n), R32(%rax)
+	and	$7, R32(%rax)
+ifdef(`PIC',`
+	movslq	(%r9,%rax,4), %rax
+	add	%r9, %rax
+	jmp	*%rax
+',`
+	jmp	*(%r9,%rax,8)
+')
+
+L(1):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up),%rdx
+	add	$8, up
+	dec	n
+	jnz	L(top)
+	mov	%rdx, %rax
+	ret
+
+L(2):	add	$-48, up
+	jmp	L(e2)
+
+L(3):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
+	add	$-40, up
+	jmp	L(e2)
+
+L(4):	add	$-32, up
+	jmp	L(e4)
+
+L(5):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
+	add	$-24, up
+	jmp	L(e4)
+
+L(6):	add	$-16, up
+	jmp	L(e6)
+
+L(7):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
+	add	$-8, up
+	jmp	L(e6)
+
+	ALIGN(32)
+L(top):	lddqu	(up), %xmm0
+	vpshlb	%xmm6, %xmm0, %xmm1
+	pand	%xmm9, %xmm0
+	pand	%xmm9, %xmm1
+	vpperm	%xmm0, %xmm7, %xmm7, %xmm2
+	vpperm	%xmm1, %xmm7, %xmm7, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e6):	lddqu	16(up), %xmm0
+	vpshlb	%xmm6, %xmm0, %xmm1
+	pand	%xmm9, %xmm0
+	pand	%xmm9, %xmm1


More information about the gmp-commit mailing list