[Gmp-commit] /var/hg/gmp: Use both SSE and XOP trickery, and plain popcnt insn.
mercurial at gmplib.org
Fri Jun 2 00:34:07 UTC 2017
details: /var/hg/gmp/rev/9392a8f4cc9c
changeset: 17417:9392a8f4cc9c
user: Torbjorn Granlund <tg at gmplib.org>
date: Fri Jun 02 02:33:52 2017 +0200
description:
Use both SSE and XOP trickery, and plain popcnt insn.
diffstat:
mpn/x86_64/bd1/hamdist.asm | 181 +++++++++++++++++++++----------------------
mpn/x86_64/bd1/popcount.asm | 43 +++++-----
2 files changed, 111 insertions(+), 113 deletions(-)
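For orientation, the scalar baseline that both routines accelerate is simple: mpn_popcount sums the population counts of the limbs, and mpn_hamdist sums the population counts of the XOR of corresponding limbs. The C sketch below is illustrative only and not part of this patch; it assumes 64-bit limbs and the GCC/Clang builtin __builtin_popcountll. The new hamdist code takes a plain popcnt loop like this for short operands (n < 5) and, for long ones, mixes XOP nibble-table counting with scalar popcnt on part of each block.

/* Illustrative C sketch, not part of this patch: the scalar computation
   that the bd1 assembly accelerates.  Assumes 64-bit limbs and the
   GCC/Clang builtin __builtin_popcountll. */
#include <stddef.h>

typedef unsigned long long limb_t;   /* stand-in for GMP's mp_limb_t */

/* Population count of an n-limb number. */
static limb_t
ref_popcount (const limb_t *up, size_t n)
{
  limb_t cnt = 0;
  for (size_t i = 0; i < n; i++)
    cnt += (limb_t) __builtin_popcountll (up[i]);
  return cnt;
}

/* Hamming distance of two n-limb numbers: the popcount of their XOR. */
static limb_t
ref_hamdist (const limb_t *up, const limb_t *vp, size_t n)
{
  limb_t cnt = 0;
  for (size_t i = 0; i < n; i++)
    cnt += (limb_t) __builtin_popcountll (up[i] ^ vp[i]);
  return cnt;
}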
diffs (truncated from 349 to 300 lines):
diff -r bdfde16d199e -r 9392a8f4cc9c mpn/x86_64/bd1/hamdist.asm
--- a/mpn/x86_64/bd1/hamdist.asm Thu Jun 01 18:24:59 2017 +0200
+++ b/mpn/x86_64/bd1/hamdist.asm Fri Jun 02 02:33:52 2017 +0200
@@ -34,8 +34,8 @@
C cycles/limb good for cpu?
C AMD K8,K9 n/a
C AMD K10 n/a
-C AMD bd1 1.93-2.49 y
-C AMD bd2 1.81-2.30 y
+C AMD bd1 1.51-2.0 y
+C AMD bd2 1.50-1.9 y
C AMD bd3 ?
C AMD bd4 ?
C AMD zen n/a
@@ -53,6 +53,10 @@
C Intel SLM n/a
C VIA nano n/a
+C TODO
+C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C intend to support old systems.
+
C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some
C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
C We fall back to the core2 code.
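For context on that fallback: the conventional user-space test for XOP is a CPUID feature check combined with an XGETBV read of XCR0, roughly as in the sketch below (GCC/Clang style, x86-64 only). This is not the detection code GMP ships, and, as the comment above notes, some systems can pass parts of such a test yet still deliver SIGILL for AVX/XOP, which is exactly why the runtime fallback to the core2 code exists.

/* Illustrative sketch only; not GMP's actual dispatch code. */
#include <cpuid.h>

static int
xop_usable (void)
{
  unsigned int a, b, c, d;

  if (!__get_cpuid (1, &a, &b, &c, &d))
    return 0;
  if ((c & (1u << 27)) == 0)            /* OSXSAVE: OS enabled xsave */
    return 0;

  unsigned int eax, edx;
  __asm__ ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
  if ((eax & 0x6) != 0x6)               /* XMM and YMM state in XCR0 */
    return 0;

  if (!__get_cpuid (0x80000001, &a, &b, &c, &d))
    return 0;
  return (c >> 11) & 1;                 /* XOP feature bit */
}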
@@ -65,140 +69,133 @@
define(`vp', `%rsi')
define(`n', `%rdx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_hamdist)
+ FUNC_ENTRY(3)
+ cmp $5, n
+ jl L(sma)
+
lea L(cnsts)(%rip), %r9
-ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
- `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
+ xor R32(%r10), R32(%r10)
+ test $8, R8(vp)
+ jz L(ali)
+ mov (up), %r8
+ xor (vp), %r8
+ add $8, up
+ add $8, vp
+ dec n
+ popcnt %r8, %r10
+L(ali):
+
+ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
+ `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
movdqa OFF1`'(%r9), %xmm7 C nibble counts table
- movdqa OFF2`'(%r9), %xmm6 C splat shift counts
- movdqa OFF3`'(%r9), %xmm9 C masks
+ movdqa OFF2`'(%r9), %xmm6 C splat shift counts
+ movdqa OFF3`'(%r9), %xmm5 C masks
pxor %xmm4, %xmm4
- pxor %xmm5, %xmm5 C 0-reg for psadbw
pxor %xmm8, %xmm8 C grand total count
- xor R32(%r10), R32(%r10)
-
mov R32(n), R32(%rax)
- and $7, R32(%rax)
+ and $6, R32(%rax)
+ lea -64(up,%rax,8), up
+ lea -64(vp,%rax,8), vp
ifdef(`PIC',`
- movslq (%r9,%rax,4), %rax
- add %r9, %rax
- jmp *%rax
+ movslq (%r9,%rax,2), %r11
+ add %r9, %r11
+ jmp *%r11
',`
- jmp *(%r9,%rax,8)
+ jmp *(%r9,%rax,4)
')
-L(1): mov (up), %r10
- add $8, up
- xor (vp), %r10
- add $8, vp
- .byte 0xf3,0x4d,0x0f,0xb8,0xd2 C popcnt %r10,%r10
- dec n
- jnz L(top)
- mov %r10, %rax
- ret
-
-L(2): add $-48, up
- add $-48, vp
- jmp L(e2)
-
-L(3): mov (up), %r10
- add $-40, up
- xor (vp), %r10
- add $-40, vp
- .byte 0xf3,0x4d,0x0f,0xb8,0xd2 C popcnt %r10,%r10
- jmp L(e2)
-
-L(4): add $-32, up
- add $-32, vp
- jmp L(e4)
-
-L(5): mov (up), %r10
- add $-24, up
- xor (vp), %r10
- add $-24, vp
- .byte 0xf3,0x4d,0x0f,0xb8,0xd2 C popcnt %r10,%r10
- jmp L(e4)
-
-L(6): add $-16, up
- add $-16, vp
- jmp L(e6)
-
-L(7): mov (up), %r10
- add $-8, up
- xor (vp), %r10
- add $-8, vp
- .byte 0xf3,0x4d,0x0f,0xb8,0xd2 C popcnt %r10,%r10
- jmp L(e6)
+L(0): add $64, up
+ add $64, vp
+ sub $2, n
ALIGN(32)
L(top): lddqu (up), %xmm0
- lddqu (vp), %xmm10
- pxor %xmm10, %xmm0
+ pxor (vp), %xmm0
vpshlb %xmm6, %xmm0, %xmm1
- pand %xmm9, %xmm0
- pand %xmm9, %xmm1
+ pand %xmm5, %xmm0
+ pand %xmm5, %xmm1
vpperm %xmm0, %xmm7, %xmm7, %xmm2
vpperm %xmm1, %xmm7, %xmm7, %xmm3
paddb %xmm2, %xmm3
paddb %xmm3, %xmm4
-L(e6): lddqu 16(up), %xmm0
- lddqu 16(vp), %xmm10
- pxor %xmm10, %xmm0
+L(6): lddqu 16(up), %xmm0
+ pxor 16(vp), %xmm0
vpshlb %xmm6, %xmm0, %xmm1
- pand %xmm9, %xmm0
- pand %xmm9, %xmm1
- vpperm %xmm0, %xmm7, %xmm7, %xmm2
- vpperm %xmm1, %xmm7, %xmm7, %xmm3
- paddb %xmm2, %xmm3
- paddb %xmm3, %xmm4
-L(e4): lddqu 32(up), %xmm0
- lddqu 32(vp), %xmm10
- pxor %xmm10, %xmm0
- vpshlb %xmm6, %xmm0, %xmm1
- pand %xmm9, %xmm0
- pand %xmm9, %xmm1
+ pand %xmm5, %xmm0
+ pand %xmm5, %xmm1
vpperm %xmm0, %xmm7, %xmm7, %xmm2
vpperm %xmm1, %xmm7, %xmm7, %xmm3
paddb %xmm2, %xmm3
paddb %xmm3, %xmm4
-L(e2): lddqu 48(up), %xmm0
- add $64, up
- lddqu 48(vp), %xmm10
- add $64, vp
- pxor %xmm10, %xmm0
+L(4): lddqu 32(up), %xmm0
+ pxor 32(vp), %xmm0
vpshlb %xmm6, %xmm0, %xmm1
- pand %xmm9, %xmm0
- pand %xmm9, %xmm1
+ pand %xmm5, %xmm0
+ pand %xmm5, %xmm1
vpperm %xmm0, %xmm7, %xmm7, %xmm2
- psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts
- paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts
+ vphaddubq %xmm4, %xmm0 C sum to 8 x 16-bit counts
vpperm %xmm1, %xmm7, %xmm7, %xmm4
+ paddb %xmm2, %xmm3
paddb %xmm2, %xmm4
+ paddq %xmm0, %xmm8 C sum to 2 x 64-bit counts
+L(2): mov 48(up), %r8
+ mov 56(up), %r9
+ add $64, up
+ xor 48(vp), %r8
+ xor 56(vp), %r9
+ add $64, vp
+ popcnt %r8, %r8
+ popcnt %r9, %r9
+ add %r8, %r10
+ add %r9, %r10
sub $8, n
jg L(top)
- psadbw %xmm5, %xmm4
- paddq %xmm4, %xmm8
+ test $1, R8(n)
+ jz L(x)
+ mov (up), %r8
+ xor (vp), %r8
+ popcnt %r8, %r8
+ add %r8, %r10
+L(x): vphaddubq %xmm4, %xmm0 C sum to 8 x 16-bit counts
+ paddq %xmm0, %xmm8
pshufd $14, %xmm8, %xmm0
paddq %xmm8, %xmm0
movq %xmm0, %rax
add %r10, %rax
+ FUNC_EXIT()
+ ret
+
+L(sma): mov (up), %r8
+ xor (vp), %r8
+ popcnt %r8, %rax
+ dec n
+ jz L(ed)
+L(tp): mov 8(up), %r8
+ add $8, up
+ xor 8(vp), %r8
+ add $8, vp
+ popcnt %r8, %r8
+ add %r8, %rax
+ dec n
+ jnz L(tp)
+L(ed): FUNC_EXIT()
ret
EPILOGUE()
-DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
- JMPENT( L(top), L(cnsts))
- JMPENT( L(1), L(cnsts))
+DEF_OBJECT(L(cnsts),16)
+ JMPENT( L(0), L(cnsts))
JMPENT( L(2), L(cnsts))
- JMPENT( L(3), L(cnsts))
JMPENT( L(4), L(cnsts))
- JMPENT( L(5), L(cnsts))
JMPENT( L(6), L(cnsts))
- JMPENT( L(7), L(cnsts))
.byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
.byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
.byte -4,-4,-4,-4,-4,-4,-4,-4
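The constant block above is what the vector path indexes: the first 16 bytes are the population counts of the nibble values 0 through 15 (the vpperm lookup table), and the bytes of -4 are the per-byte shift counts given to vpshlb, which shifts right for negative counts so the high nibble of each byte can be looked up as well. A scalar C sketch of the same table-driven idea, for illustration only:

/* Table-driven nibble popcount, the scalar analogue of the vpshlb/vpperm
   trick: split each byte into two 4-bit halves and look both up in a
   16-entry table of nibble popcounts.  Sketch only. */
static const unsigned char nibble_popcount[16] = {
  0, 1, 1, 2, 1, 2, 2, 3,
  1, 2, 2, 3, 2, 3, 3, 4
};

static unsigned
byte_popcount (unsigned char b)
{
  return nibble_popcount[b & 0x0f] + nibble_popcount[b >> 4];
}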
diff -r bdfde16d199e -r 9392a8f4cc9c mpn/x86_64/bd1/popcount.asm
--- a/mpn/x86_64/bd1/popcount.asm Thu Jun 01 18:24:59 2017 +0200
+++ b/mpn/x86_64/bd1/popcount.asm Fri Jun 02 02:33:52 2017 +0200
@@ -34,8 +34,8 @@
C cycles/limb good for cpu?
C AMD K8,K9 n/a
C AMD K10 n/a
-C AMD bd1 1.63-1.76 y
-C AMD bd2 1.62-1.73 y
+C AMD bd1 1.27 y
+C AMD bd2 1.24 y
C AMD bd3 ?
C AMD bd4 ?
C AMD zen n/a
@@ -54,9 +54,8 @@
C VIA nano n/a
C TODO
-C * Perform some load-use scheduling for a small speedup.
-C * The innerloop takes around 13 cycles. That means that we could do 3 plain
-C popcnt instructions in parallel and thereby approach 1.17 c/l.
+C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C intend to support old systems.
C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some
C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
@@ -69,19 +68,23 @@
define(`up', `%rdi')
define(`n', `%rsi')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_popcount)
+ FUNC_ENTRY(3)
lea L(cnsts)(%rip), %r9
ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
`define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
movdqa OFF1`'(%r9), %xmm7 C nibble counts table
- movdqa OFF2`'(%r9), %xmm6 C splat shift counts
- movdqa OFF3`'(%r9), %xmm9 C masks
+ movdqa OFF2`'(%r9), %xmm6 C splat shift counts
+ movdqa OFF3`'(%r9), %xmm9 C masks
pxor %xmm4, %xmm4
- pxor %xmm5, %xmm5 C 0-reg for psadbw
+ pxor %xmm5, %xmm5 C 0-reg
pxor %xmm8, %xmm8 C grand total count