[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Sun May 28 23:46:29 UTC 2017
details: /var/hg/gmp/rev/b9af7bba2f81
changeset: 17398:b9af7bba2f81
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun May 28 17:06:02 2017 +0200
description:
Fix comment typo.
details: /var/hg/gmp/rev/4b4c00f65d5c
changeset: 17399:4b4c00f65d5c
user: Torbjorn Granlund <tg at gmplib.org>
date: Mon May 29 01:45:55 2017 +0200
description:
Replace grabber code with implementation proper.
diffstat:
mpn/x86/k7/addlsh1_n.asm | 2 +-
mpn/x86/k7/sublsh1_n.asm | 2 +-
mpn/x86_64/coreinhm/hamdist.asm | 161 ++++++++++++++++++++++++++++++++++++++-
mpn/x86_64/coreinhm/popcount.asm | 150 +++++++++++++++++++++++++++++++++++-
4 files changed, 307 insertions(+), 8 deletions(-)
diffs (truncated from 367 to 300 lines):
diff -r 020ab7920eab -r 4b4c00f65d5c mpn/x86/k7/addlsh1_n.asm
--- a/mpn/x86/k7/addlsh1_n.asm Mon May 22 00:24:15 2017 +0200
+++ b/mpn/x86/k7/addlsh1_n.asm Mon May 29 01:45:55 2017 +0200
@@ -65,7 +65,7 @@
C
C Breaking carry recurrency might be a good idea. We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
-C force is to 2*2-way unrolling.
+C force us to 2*2-way unrolling.
defframe(PARAM_SIZE, 16)
defframe(PARAM_DBLD, 12)
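
The carry-recurrence comment above (it appears identically in sublsh1_n.asm below) refers to the two carry chains these routines have to track: the bit shifted out of each vp limb and the carry (or borrow) of the add/subtract itself. As a rough illustration of what mpn_addlsh1_n computes and where the two chains come from, here is a minimal portable sketch; the name ref_addlsh1_n and the fixed 64-bit limb type are assumptions for illustration only, not part of this commit.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t mp_limb_t;   /* 64-bit limbs assumed for brevity */

/* Sketch of {rp,n} = {up,n} + 2*{vp,n}, returning the carry-out (0..2).
   Two carry chains are visible: scy, the bit shifted out of each vp limb,
   and acy, the carry of the addition proper.  Per the comment above, keeping
   them in separate registers would force the asm into 2*2-way unrolling. */
mp_limb_t
ref_addlsh1_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, size_t n)
{
  mp_limb_t scy = 0, acy = 0;
  for (size_t i = 0; i < n; i++)
    {
      mp_limb_t sh = (vp[i] << 1) | scy;   /* 2*vp[i] plus previous shift-out */
      scy = vp[i] >> 63;                   /* bit shifted out of this limb */
      mp_limb_t s1 = up[i] + sh;
      mp_limb_t c1 = s1 < sh;              /* carry from up[i] + sh */
      mp_limb_t s2 = s1 + acy;
      mp_limb_t c2 = s2 < s1;              /* carry from adding the old acy */
      rp[i] = s2;
      acy = c1 | c2;                       /* at most one of c1, c2 is set */
    }
  return scy + acy;
}

The same structure applies to mpn_sublsh1_n with borrows in place of carries.
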
diff -r 020ab7920eab -r 4b4c00f65d5c mpn/x86/k7/sublsh1_n.asm
--- a/mpn/x86/k7/sublsh1_n.asm Mon May 22 00:24:15 2017 +0200
+++ b/mpn/x86/k7/sublsh1_n.asm Mon May 29 01:45:55 2017 +0200
@@ -57,7 +57,7 @@
C
C Breaking carry recurrency might be a good idea. We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
-C force is to 2*2-way unrolling.
+C force us to 2*2-way unrolling.
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
diff -r 020ab7920eab -r 4b4c00f65d5c mpn/x86_64/coreinhm/hamdist.asm
--- a/mpn/x86_64/coreinhm/hamdist.asm Mon May 22 00:24:15 2017 +0200
+++ b/mpn/x86_64/coreinhm/hamdist.asm Mon May 29 01:45:55 2017 +0200
@@ -1,6 +1,6 @@
dnl AMD64 mpn_hamdist -- hamming distance.
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -31,8 +31,163 @@
include(`../config.m4')
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 3.26
+C AMD bd1 4.2
+C AMD bd2 4.2
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 1.15
+C AMD bobcat 7.29
+C AMD jaguar 2.53
+C Intel P4 n/a
+C Intel core2 n/a
+C Intel NHM 2.03
+C Intel SBR 1.66
+C Intel IBR 1.62
+C Intel HWL 1.50
+C Intel BWL 1.50
+C Intel SKL 1.50
+C Intel atom n/a
+C Intel SLM 2.55
+C VIA nano n/a
+
+C TODO
+C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
+C Intel hardware. Perhaps mix such a loop with popcnt instructions.
+C * The random placement of the L0, L1, L2, etc blocks is due to branch
+C shortening. More work could be done there.
+
+define(`up', `%rdi')
+define(`vp', `%rsi')
+define(`n', `%rdx')
+
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
-MULFUNC_PROLOGUE(mpn_hamdist)
-include_mpn(`x86_64/k10/hamdist.asm')
+define(`sum', `lea ($1,$2), $2')
+define(`sum', `add $1, $2')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist)
+ FUNC_ENTRY(3)
+ push %rbx
+ push %rbp
+
+ mov (up), %r10
+ xor (vp), %r10
+
+ mov R32(n), R32(%r8)
+ and $3, R32(%r8)
+
+ xor R32(%rcx), R32(%rcx)
+ .byte 0xf3,0x49,0x0f,0xb8,0xc2 C popcnt %r10,%rax
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%r8,4), %r8
+ add %r9, %r8
+ jmp *%r8
+',`
+ jmp *(%r9,%r8,8)
+')
+
+L(3): mov 8(up), %r10
+ mov 16(up), %r11
+ xor 8(vp), %r10
+ xor 16(vp), %r11
+ xor R32(%rbp), R32(%rbp)
+ sub $4, n
+ jle L(x3)
+ mov 24(up), %r8
+ mov 32(up), %r9
+ add $24, up
+ add $24, vp
+ jmp L(e3)
+
+L(0): mov 8(up), %r9
+ xor 8(vp), %r9
+ mov 16(up), %r10
+ mov 24(up), %r11
+ xor R32(%rbx), R32(%rbx)
+ xor 16(vp), %r10
+ xor 24(vp), %r11
+ add $32, up
+ add $32, vp
+ sub $4, n
+ jle L(x4)
+
+ ALIGN(16)
+L(top):
+L(e0): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp
+ mov (up), %r8
+ mov 8(up), %r9
+ sum( %rbx, %rax)
+L(e3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx
+ xor (vp), %r8
+ xor 8(vp), %r9
+ sum( %rbp, %rcx)
+L(e2): .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp
+ mov 16(up), %r10
+ mov 24(up), %r11
+ add $32, up
+ sum( %rbx, %rax)
+L(e1): .byte 0xf3,0x49,0x0f,0xb8,0xd8 C popcnt %r8,%rbx
+ xor 16(vp), %r10
+ xor 24(vp), %r11
+ add $32, vp
+ sum( %rbp, %rcx)
+ sub $4, n
+ jg L(top)
+
+L(x4): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp
+ sum( %rbx, %rax)
+L(x3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx
+ sum( %rbp, %rcx)
+ .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp
+ sum( %rbx, %rax)
+ sum( %rbp, %rcx)
+L(x2): add %rcx, %rax
+L(x1): pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(2): mov 8(up), %r11
+ xor 8(vp), %r11
+ sub $2, n
+ jle L(n2)
+ mov 16(up), %r8
+ mov 24(up), %r9
+ xor R32(%rbx), R32(%rbx)
+ xor 16(vp), %r8
+ xor 24(vp), %r9
+ add $16, up
+ add $16, vp
+ jmp L(e2)
+L(n2): .byte 0xf3,0x49,0x0f,0xb8,0xcb C popcnt %r11,%rcx
+ jmp L(x2)
+
+L(1): dec n
+ jle L(x1)
+ mov 8(up), %r8
+ mov 16(up), %r9
+ xor 8(vp), %r8
+ xor 16(vp), %r9
+ xor R32(%rbp), R32(%rbp)
+ mov 24(up), %r10
+ mov 32(up), %r11
+ add $40, up
+ add $8, vp
+ jmp L(e1)
+
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
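
For reference, the implementation above computes the Hamming distance of {up,n} and {vp,n} as the total population count of their limb-wise XOR: n is reduced mod 4, the jump table L(tab) selects the feed-in path, and a 4-way unrolled loop accumulates popcnt results in %rax and %rcx (the .byte sequences are, per their adjacent comments, hand-coded popcnt instructions). A minimal C sketch of the same operation, with ref_hamdist and __builtin_popcountll as illustrative stand-ins:

#include <stdint.h>
#include <stddef.h>

typedef uint64_t mp_limb_t;

/* Hamming distance of {up,n} and {vp,n}: popcount of the XOR, summed over
   all limbs.  The asm above splits the sum over two accumulators; a plain
   loop is enough to show the operation.  __builtin_popcountll (GCC/Clang)
   stands in for the popcnt instruction. */
mp_limb_t
ref_hamdist (const mp_limb_t *up, const mp_limb_t *vp, size_t n)
{
  mp_limb_t cnt = 0;
  for (size_t i = 0; i < n; i++)
    cnt += (mp_limb_t) __builtin_popcountll (up[i] ^ vp[i]);
  return cnt;
}
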
diff -r 020ab7920eab -r 4b4c00f65d5c mpn/x86_64/coreinhm/popcount.asm
--- a/mpn/x86_64/coreinhm/popcount.asm Mon May 22 00:24:15 2017 +0200
+++ b/mpn/x86_64/coreinhm/popcount.asm Mon May 29 01:45:55 2017 +0200
@@ -1,6 +1,6 @@
dnl AMD64 mpn_popcount -- population count.
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -31,8 +31,152 @@
include(`../config.m4')
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 1.39
+C AMD bd1 4
+C AMD bd2 4
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 0.72
+C AMD bobcat 5.78
+C AMD jaguar 1.27
+C Intel P4 n/a
+C Intel core2 n/a
+C Intel NHM 1.04
+C Intel SBR 1.02
+C Intel IBR 1.0
+C Intel HWL 1.0
+C Intel BWL 1.0
+C Intel SKL 1.0
+C Intel atom n/a
+C Intel SLM 1.34
+C VIA nano n/a
+
+C TODO
+C * We could approach 0.5 c/l for AMD Zen with more unrolling. That would
+C not cause any additional feed-in overhead as we already use a jump table.
+C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
+C Intel hardware. Perhaps mix such a loop with popcnt instructions.
+C * The random placement of the L0, L1, L2, etc blocks is due to branch
+C shortening.
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
-MULFUNC_PROLOGUE(mpn_popcount)
-include_mpn(`x86_64/k10/popcount.asm')
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ FUNC_ENTRY(2)
+
+ mov R32(n), R32(%r8)
+ and $7, R32(%r8)
+
+ .byte 0xf3,0x48,0x0f,0xb8,0x07 C popcnt (up), %rax
+ xor R32(%rcx), R32(%rcx)
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%r8,4), %r8
+ add %r9, %r8
+ jmp *%r8
+',`
+ jmp *(%r9,%r8,8)
+')
+
+L(3): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 8(up), %r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 16(up), %r11
+ add $24, up
+ sub $8, n
+ jg L(e34)
+ add %r10, %rax
+ add %r11, %rax
+L(s1): FUNC_EXIT()
+ ret
+
+L(1): sub $8, n
+ jle L(s1)
+ .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 8(up), %r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 16(up), %r9
+ add $8, up
+ jmp L(e12)
+
+L(7): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 0x8(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 0x10(%rdi),%r11
+ add $-8, up
+ jmp L(e07)
+
+L(0): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11
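
The new popcount.asm follows the same pattern: n is reduced mod 8 and the jump table picks the entry point of an unrolled popcnt loop (the diff is cut short here by the 300-line truncation). Below is a scalar sketch of the operation, followed by a sketch of the pshufb nibble-lookup variant that the TODO comments in both new files suggest for approaching 0.5 c/l; the function names, the AVX2 intrinsics, and the multiple-of-4 limb count are illustrative assumptions, not GMP code.

#include <stdint.h>
#include <stddef.h>
#include <immintrin.h>

typedef uint64_t mp_limb_t;

/* Scalar reference: total population count of {up,n}.
   __builtin_popcountll stands in for the popcnt instruction. */
mp_limb_t
ref_popcount (const mp_limb_t *up, size_t n)
{
  mp_limb_t cnt = 0;
  for (size_t i = 0; i < n; i++)
    cnt += (mp_limb_t) __builtin_popcountll (up[i]);
  return cnt;
}

/* Sketch of the pshufb idea from the TODOs (classic nibble-LUT popcount):
   vpshufb looks up the population count of every 4-bit nibble in a 16-entry
   table, the per-byte counts are added, and vpsadbw folds them into 64-bit
   lane sums.  Assumes AVX2 and n a multiple of 4 limbs; the real thing would
   need feed-in code and could interleave popcnt instructions as the TODO
   suggests. */
mp_limb_t
avx2_popcount (const mp_limb_t *up, size_t n)
{
  const __m256i lut  = _mm256_setr_epi8 (0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4,
                                         0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4);
  const __m256i mask = _mm256_set1_epi8 (0x0f);
  __m256i acc = _mm256_setzero_si256 ();
  for (size_t i = 0; i < n; i += 4)
    {
      __m256i v   = _mm256_loadu_si256 ((const __m256i *) (up + i));
      __m256i lo  = _mm256_and_si256 (v, mask);
      __m256i hi  = _mm256_and_si256 (_mm256_srli_epi16 (v, 4), mask);
      __m256i cnt = _mm256_add_epi8 (_mm256_shuffle_epi8 (lut, lo),
                                     _mm256_shuffle_epi8 (lut, hi));
      acc = _mm256_add_epi64 (acc, _mm256_sad_epu8 (cnt, _mm256_setzero_si256 ()));
    }
  mp_limb_t s[4];
  _mm256_storeu_si256 ((__m256i *) s, acc);
  return s[0] + s[1] + s[2] + s[3];
}
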