[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Sun May 28 23:46:29 UTC 2017
details: /var/hg/gmp/rev/b9af7bba2f81
changeset: 17398:b9af7bba2f81
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun May 28 17:06:02 2017 +0200
description:
Fix comment typo.
details: /var/hg/gmp/rev/4b4c00f65d5c
changeset: 17399:4b4c00f65d5c
user: Torbjorn Granlund <tg at gmplib.org>
date: Mon May 29 01:45:55 2017 +0200
description:
Replace grabber code with implementation proper.
diffstat:
mpn/x86/k7/addlsh1_n.asm | 2 +-
mpn/x86/k7/sublsh1_n.asm | 2 +-
mpn/x86_64/coreinhm/hamdist.asm | 161 ++++++++++++++++++++++++++++++++++++++-
mpn/x86_64/coreinhm/popcount.asm | 150 +++++++++++++++++++++++++++++++++++-
4 files changed, 307 insertions(+), 8 deletions(-)
diffs (truncated from 367 to 300 lines):
diff -r 020ab7920eab -r 4b4c00f65d5c mpn/x86/k7/addlsh1_n.asm
--- a/mpn/x86/k7/addlsh1_n.asm Mon May 22 00:24:15 2017 +0200
+++ b/mpn/x86/k7/addlsh1_n.asm Mon May 29 01:45:55 2017 +0200
@@ -65,7 +65,7 @@
C
C Breaking carry recurrency might be a good idea. We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
-C force is to 2*2-way unrolling.
+C force us to 2*2-way unrolling.
defframe(PARAM_SIZE, 16)
defframe(PARAM_DBLD, 12)
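
The carry-recurrence comment above (it appears identically in sublsh1_n.asm below) refers to the two carry chains these routines have to track: the bit shifted out of each vp limb and the carry (or borrow) of the add/subtract itself. As a rough illustration of what mpn_addlsh1_n computes and where the two chains come from, here is a minimal portable sketch; the name ref_addlsh1_n and the fixed 64-bit limb type are assumptions for illustration only, not part of this commit.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t mp_limb_t;   /* 64-bit limbs assumed for brevity */

/* Sketch of {rp,n} = {up,n} + 2*{vp,n}, returning the carry-out (0..2).
   Two carry chains are visible: scy, the bit shifted out of each vp limb,
   and acy, the carry of the addition proper.  Per the comment above, keeping
   them in separate registers would force the asm into 2*2-way unrolling. */
mp_limb_t
ref_addlsh1_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, size_t n)
{
  mp_limb_t scy = 0, acy = 0;
  for (size_t i = 0; i < n; i++)
    {
      mp_limb_t sh = (vp[i] << 1) | scy;   /* 2*vp[i] plus previous shift-out */
      scy = vp[i] >> 63;                   /* bit shifted out of this limb */
      mp_limb_t s1 = up[i] + sh;
      mp_limb_t c1 = s1 < sh;              /* carry from up[i] + sh */
      mp_limb_t s2 = s1 + acy;
      mp_limb_t c2 = s2 < s1;              /* carry from adding the old acy */
      rp[i] = s2;
      acy = c1 | c2;                       /* at most one of c1, c2 is set */
    }
  return scy + acy;
}

The same structure applies to mpn_sublsh1_n with borrows in place of carries.
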
diff -r 020ab7920eab -r 4b4c00f65d5c mpn/x86/k7/sublsh1_n.asm
--- a/mpn/x86/k7/sublsh1_n.asm Mon May 22 00:24:15 2017 +0200
+++ b/mpn/x86/k7/sublsh1_n.asm Mon May 29 01:45:55 2017 +0200
@@ -57,7 +57,7 @@
C
C Breaking carry recurrency might be a good idea. We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
-C force is to 2*2-way unrolling.
+C force us to 2*2-way unrolling.
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
diff -r 020ab7920eab -r 4b4c00f65d5c mpn/x86_64/coreinhm/hamdist.asm
--- a/mpn/x86_64/coreinhm/hamdist.asm Mon May 22 00:24:15 2017 +0200
+++ b/mpn/x86_64/coreinhm/hamdist.asm Mon May 29 01:45:55 2017 +0200
@@ -1,6 +1,6 @@
dnl AMD64 mpn_hamdist -- hamming distance.
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -31,8 +31,163 @@
include(`../config.m4')
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 3.26
+C AMD bd1 4.2
+C AMD bd2 4.2
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 1.15
+C AMD bobcat 7.29
+C AMD jaguar 2.53
+C Intel P4 n/a
+C Intel core2 n/a
+C Intel NHM 2.03
+C Intel SBR 1.66
+C Intel IBR 1.62
+C Intel HWL 1.50
+C Intel BWL 1.50
+C Intel SKL 1.50
+C Intel atom n/a
+C Intel SLM 2.55
+C VIA nano n/a
+
+C TODO
+C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
+C Intel hardware. Perhaps mix such a loop with popcnt instructions.
+C * The random placement of the L0, L1, L2, etc blocks is due to branch
+C shortening. More work could be done there.
+
+define(`up', `%rdi')
+define(`vp', `%rsi')
+define(`n', `%rdx')
+
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
-MULFUNC_PROLOGUE(mpn_hamdist)
-include_mpn(`x86_64/k10/hamdist.asm')
+define(`sum', `lea ($1,$2), $2')
+define(`sum', `add $1, $2')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist)
+ FUNC_ENTRY(3)
+ push %rbx
+ push %rbp
+
+ mov (up), %r10
+ xor (vp), %r10
+
+ mov R32(n), R32(%r8)
+ and $3, R32(%r8)
+
+ xor R32(%rcx), R32(%rcx)
+ .byte 0xf3,0x49,0x0f,0xb8,0xc2 C popcnt %r10,%rax
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%r8,4), %r8
+ add %r9, %r8
+ jmp *%r8
+',`
+ jmp *(%r9,%r8,8)
+')
+
+L(3): mov 8(up), %r10
+ mov 16(up), %r11
+ xor 8(vp), %r10
+ xor 16(vp), %r11
+ xor R32(%rbp), R32(%rbp)
+ sub $4, n
+ jle L(x3)
+ mov 24(up), %r8
+ mov 32(up), %r9
+ add $24, up
+ add $24, vp
+ jmp L(e3)
+
+L(0): mov 8(up), %r9
+ xor 8(vp), %r9
+ mov 16(up), %r10
+ mov 24(up), %r11
+ xor R32(%rbx), R32(%rbx)
+ xor 16(vp), %r10
+ xor 24(vp), %r11
+ add $32, up
+ add $32, vp
+ sub $4, n
+ jle L(x4)
+
+ ALIGN(16)
+L(top):
+L(e0): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp
+ mov (up), %r8
+ mov 8(up), %r9
+ sum( %rbx, %rax)
+L(e3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx
+ xor (vp), %r8
+ xor 8(vp), %r9
+ sum( %rbp, %rcx)
+L(e2): .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp
+ mov 16(up), %r10
+ mov 24(up), %r11
+ add $32, up
+ sum( %rbx, %rax)
+L(e1): .byte 0xf3,0x49,0x0f,0xb8,0xd8 C popcnt %r8,%rbx
+ xor 16(vp), %r10
+ xor 24(vp), %r11
+ add $32, vp
+ sum( %rbp, %rcx)
+ sub $4, n
+ jg L(top)
+
+L(x4): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp
+ sum( %rbx, %rax)
+L(x3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx
+ sum( %rbp, %rcx)
+ .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp
+ sum( %rbx, %rax)
+ sum( %rbp, %rcx)
+L(x2): add %rcx, %rax
+L(x1): pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(2): mov 8(up), %r11
+ xor 8(vp), %r11
+ sub $2, n
+ jle L(n2)
+ mov 16(up), %r8
+ mov 24(up), %r9
+ xor R32(%rbx), R32(%rbx)
+ xor 16(vp), %r8
+ xor 24(vp), %r9
+ add $16, up
+ add $16, vp
+ jmp L(e2)
+L(n2): .byte 0xf3,0x49,0x0f,0xb8,0xcb C popcnt %r11,%rcx
+ jmp L(x2)
+
+L(1): dec n
+ jle L(x1)
+ mov 8(up), %r8
+ mov 16(up), %r9
+ xor 8(vp), %r8
+ xor 16(vp), %r9
+ xor R32(%rbp), R32(%rbp)
+ mov 24(up), %r10
+ mov 32(up), %r11
+ add $40, up
+ add $8, vp
+ jmp L(e1)
+
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
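
For reference, the implementation above computes the Hamming distance of {up,n} and {vp,n} as the total population count of their limb-wise XOR: n is reduced mod 4, the jump table L(tab) selects the feed-in path, and a 4-way unrolled loop accumulates popcnt results in %rax and %rcx (the .byte sequences are, per their adjacent comments, hand-coded popcnt instructions). A minimal C sketch of the same operation, with ref_hamdist and __builtin_popcountll as illustrative stand-ins:

#include <stdint.h>
#include <stddef.h>

typedef uint64_t mp_limb_t;

/* Hamming distance of {up,n} and {vp,n}: popcount of the XOR, summed over
   all limbs.  The asm above splits the sum over two accumulators; a plain
   loop is enough to show the operation.  __builtin_popcountll (GCC/Clang)
   stands in for the popcnt instruction. */
mp_limb_t
ref_hamdist (const mp_limb_t *up, const mp_limb_t *vp, size_t n)
{
  mp_limb_t cnt = 0;
  for (size_t i = 0; i < n; i++)
    cnt += (mp_limb_t) __builtin_popcountll (up[i] ^ vp[i]);
  return cnt;
}
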
diff -r 020ab7920eab -r 4b4c00f65d5c mpn/x86_64/coreinhm/popcount.asm
--- a/mpn/x86_64/coreinhm/popcount.asm Mon May 22 00:24:15 2017 +0200
+++ b/mpn/x86_64/coreinhm/popcount.asm Mon May 29 01:45:55 2017 +0200
@@ -1,6 +1,6 @@
dnl AMD64 mpn_popcount -- population count.
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -31,8 +31,152 @@
include(`../config.m4')
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 1.39
+C AMD bd1 4
+C AMD bd2 4
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 0.72
+C AMD bobcat 5.78
+C AMD jaguar 1.27
+C Intel P4 n/a
+C Intel core2 n/a
+C Intel NHM 1.04
+C Intel SBR 1.02
+C Intel IBR 1.0
+C Intel HWL 1.0
+C Intel BWL 1.0
+C Intel SKL 1.0
+C Intel atom n/a
+C Intel SLM 1.34
+C VIA nano n/a
+
+C TODO
+C * We could approach 0.5 c/l for AMD Zen with more unrolling. That would
+C not cause any additional feed-in overhead as we already use a jump table.
+C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
+C Intel hardware. Perhaps mix such a loop with popcnt instructions.
+C * The random placement of the L0, L1, L2, etc blocks is due to branch
+C shortening.
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
-MULFUNC_PROLOGUE(mpn_popcount)
-include_mpn(`x86_64/k10/popcount.asm')
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ FUNC_ENTRY(2)
+
+ mov R32(n), R32(%r8)
+ and $7, R32(%r8)
+
+ .byte 0xf3,0x48,0x0f,0xb8,0x07 C popcnt (up), %rax
+ xor R32(%rcx), R32(%rcx)
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%r8,4), %r8
+ add %r9, %r8
+ jmp *%r8
+',`
+ jmp *(%r9,%r8,8)
+')
+
+L(3): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 8(up), %r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 16(up), %r11
+ add $24, up
+ sub $8, n
+ jg L(e34)
+ add %r10, %rax
+ add %r11, %rax
+L(s1): FUNC_EXIT()
+ ret
+
+L(1): sub $8, n
+ jle L(s1)
+ .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 8(up), %r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 16(up), %r9
+ add $8, up
+ jmp L(e12)
+
+L(7): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 0x8(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 0x10(%rdi),%r11
+ add $-8, up
+ jmp L(e07)
+
+L(0): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11
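
The new popcount.asm follows the same pattern: n is reduced mod 8 and the jump table picks the entry point of an unrolled popcnt loop (the diff is cut short here by the 300-line truncation). Below is a scalar sketch of the operation, followed by a sketch of the pshufb nibble-lookup variant that the TODO comments in both new files suggest for approaching 0.5 c/l; the function names, the AVX2 intrinsics, and the multiple-of-4 limb count are illustrative assumptions, not GMP code.

#include <stdint.h>
#include <stddef.h>
#include <immintrin.h>

typedef uint64_t mp_limb_t;

/* Scalar reference: total population count of {up,n}.
   __builtin_popcountll stands in for the popcnt instruction. */
mp_limb_t
ref_popcount (const mp_limb_t *up, size_t n)
{
  mp_limb_t cnt = 0;
  for (size_t i = 0; i < n; i++)
    cnt += (mp_limb_t) __builtin_popcountll (up[i]);
  return cnt;
}

/* Sketch of the pshufb idea from the TODOs (classic nibble-LUT popcount):
   vpshufb looks up the population count of every 4-bit nibble in a 16-entry
   table, the per-byte counts are added, and vpsadbw folds them into 64-bit
   lane sums.  Assumes AVX2 and n a multiple of 4 limbs; the real thing would
   need feed-in code and could interleave popcnt instructions as the TODO
   suggests. */
mp_limb_t
avx2_popcount (const mp_limb_t *up, size_t n)
{
  const __m256i lut  = _mm256_setr_epi8 (0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4,
                                         0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4);
  const __m256i mask = _mm256_set1_epi8 (0x0f);
  __m256i acc = _mm256_setzero_si256 ();
  for (size_t i = 0; i < n; i += 4)
    {
      __m256i v   = _mm256_loadu_si256 ((const __m256i *) (up + i));
      __m256i lo  = _mm256_and_si256 (v, mask);
      __m256i hi  = _mm256_and_si256 (_mm256_srli_epi16 (v, 4), mask);
      __m256i cnt = _mm256_add_epi8 (_mm256_shuffle_epi8 (lut, lo),
                                     _mm256_shuffle_epi8 (lut, hi));
      acc = _mm256_add_epi64 (acc, _mm256_sad_epu8 (cnt, _mm256_setzero_si256 ()));
    }
  mp_limb_t s[4];
  _mm256_storeu_si256 ((__m256i *) s, acc);
  return s[0] + s[1] + s[2] + s[3];
}
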