[Gmp-commit] /var/hg/gmp: Provide arm64 popcount and hamdist.
mercurial at gmplib.org
Sat Aug 23 04:53:07 UTC 2014
details: /var/hg/gmp/rev/3bdd8cff2bb2
changeset: 16474:3bdd8cff2bb2
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Aug 23 06:51:42 2014 +0200
description:
Provide arm64 popcount and hamdist.
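
For reference, mpn_popcount returns the number of set bits in the n-limb operand {ap, n}, and mpn_hamdist returns the Hamming distance between {ap, n} and {bp, n}, i.e. the population count of their bitwise exclusive or. A rough scalar model in C (illustrative only; the function names, the plain uint64_t limb type and the GCC/Clang builtin are stand-ins, not GMP's generic code):

    #include <stdint.h>

    /* Rough scalar models of the two mpn entry points; GMP's real prototypes
       use mp_srcptr, mp_size_t and mp_bitcnt_t instead of these types.  */

    static uint64_t
    ref_popcount (const uint64_t *ap, long n)        /* assumes 64-bit limbs */
    {
      uint64_t cnt = 0;
      for (long i = 0; i < n; i++)
        cnt += (uint64_t) __builtin_popcountll (ap[i]);
      return cnt;
    }

    static uint64_t
    ref_hamdist (const uint64_t *ap, const uint64_t *bp, long n)
    {
      uint64_t cnt = 0;
      for (long i = 0; i < n; i++)                   /* count differing bits */
        cnt += (uint64_t) __builtin_popcountll (ap[i] ^ bp[i]);
      return cnt;
    }
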
diffstat:
mpn/arm64/hamdist.asm | 180 +++++++++++++++++++++++++++++++++++++++++++++++++
mpn/arm64/popcount.asm | 156 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 336 insertions(+), 0 deletions(-)
diffs (truncated from 344 to 300 lines):
diff -r a17f8511711b -r 3bdd8cff2bb2 mpn/arm64/hamdist.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm64/hamdist.asm Sat Aug 23 06:51:42 2014 +0200
@@ -0,0 +1,180 @@
+dnl ARM64 Neon mpn_hamdist -- mpn bit hamming distance.
+
+dnl Copyright 2013, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Cortex-A53 ?
+C Cortex-A57 ?
+
+C TODO
+C * Consider greater unrolling.
+C * Arrange to align the pointer, if that helps performance. Use the same
+C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
+C valgrind!)
+C * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+changecom(@&*$)
+
+C INPUT PARAMETERS
+define(`ap', x0)
+define(`bp', x1)
+define(`n', x2)
+
+C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
+C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or
+C 8*(2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
+C allows the huge count code to jump deep into the code (at L(chu)).
+
+define(`maxsize', 0x1fff)
+define(`chunksize',0x1ff0)
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+ mov x11, #maxsize
+ cmp n, x11
+ b.hi L(gt8k)
+
+L(lt8k):
+ movi v4.16b, #0 C clear summation register
+ movi v5.16b, #0 C clear summation register
+
+ tbz n, #0, L(xx0)
+ sub n, n, #1
+ ld1 {v0.1d}, [ap], #8 C load 1 limb
+ ld1 {v16.1d}, [bp], #8 C load 1 limb
+ eor v0.16b, v0.16b, v16.16b
+ cnt v6.16b, v0.16b
+ uadalp v4.8h, v6.16b C could also splat
+
+L(xx0): tbz n, #1, L(x00)
+ sub n, n, #2
+ ld1 {v0.2d}, [ap], #16 C load 2 limbs
+ ld1 {v16.2d}, [bp], #16 C load 2 limbs
+ eor v0.16b, v0.16b, v16.16b
+ cnt v6.16b, v0.16b
+ uadalp v4.8h, v6.16b
+
+L(x00): tbz n, #2, L(000)
+ subs n, n, #4
+ ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs
+ ld1 {v16.2d,v17.2d}, [bp], #32 C load 4 limbs
+ b.ls L(sum)
+
+L(gt4): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs
+ ld1 {v18.2d,v19.2d}, [bp], #32 C load 4 limbs
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ sub n, n, #4
+ cnt v6.16b, v0.16b
+ cnt v7.16b, v1.16b
+ b L(mid)
+
+L(000): subs n, n, #8
+ b.lo L(e0)
+
+L(chu): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs
+ ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs
+ ld1 {v18.2d,v19.2d}, [bp], #32 C load 4 limbs
+ ld1 {v16.2d,v17.2d}, [bp], #32 C load 4 limbs
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ cnt v6.16b, v2.16b
+ cnt v7.16b, v3.16b
+ subs n, n, #8
+ b.lo L(end)
+
+L(top): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs
+ ld1 {v18.2d,v19.2d}, [bp], #32 C load 4 limbs
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ uadalp v4.8h, v6.16b
+ cnt v6.16b, v0.16b
+ uadalp v5.8h, v7.16b
+ cnt v7.16b, v1.16b
+L(mid): ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs
+ ld1 {v16.2d,v17.2d}, [bp], #32 C load 4 limbs
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ subs n, n, #8
+ uadalp v4.8h, v6.16b
+ cnt v6.16b, v2.16b
+ uadalp v5.8h, v7.16b
+ cnt v7.16b, v3.16b
+ b.hs L(top)
+
+L(end): uadalp v4.8h, v6.16b
+ uadalp v5.8h, v7.16b
+L(sum): eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ cnt v6.16b, v0.16b
+ cnt v7.16b, v1.16b
+ uadalp v4.8h, v6.16b
+ uadalp v5.8h, v7.16b
+ add v4.8h, v4.8h, v5.8h
+ C we have 8 16-bit counts
+L(e0): uaddlp v4.4s, v4.8h C we have 4 32-bit counts
+ uaddlp v4.2d, v4.4s C we have 2 64-bit counts
+ mov x0, v4.d[0]
+ mov x1, v4.d[1]
+ add x0, x0, x1
+ ret
+
+C Code for count > maxsize. Splits operand and calls above code.
+define(`ap2', x5) C caller-saves reg not used above
+define(`bp2', x6) C caller-saves reg not used above
+L(gt8k):
+ mov x8, x30
+ mov x7, n C full count (caller-saves reg not used above)
+ mov x4, #0 C total sum (caller-saves reg not used above)
+ mov x9, #chunksize*8 C caller-saves reg not used above
+ mov x10, #chunksize C caller-saves reg not used above
+
+1: add ap2, ap, x9 C point at subsequent block
+ add bp2, bp, x9 C point at subsequent block
+ mov n, #chunksize-8 C count for this invocation, adjusted for entry pt
+ movi v4.16b, #0 C clear chunk summation register
+ movi v5.16b, #0 C clear chunk summation register
+ bl L(chu) C jump deep inside code
+ add x4, x4, x0
+ mov ap, ap2 C put chunk pointer in place for calls
+ mov bp, bp2 C put chunk pointer in place for calls
+ sub x7, x7, x10
+ cmp x7, x11
+ b.hi 1b
+
+ mov n, x7 C count for final invocation
+ bl L(lt8k)
+ add x0, x4, x0
+ mov x30, x8
+ ret
+EPILOGUE()
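
The L(gt8k) path above exists because one pass of the 16-bit NEON counters can absorb at most 8*(2^16-1) = 524280 bits, i.e. 0x1fff 64-bit limbs (see the comment above the defines). Larger operands are therefore split into chunks of chunksize = 0x1ff0 limbs, each handed back to the single-pass code, with the per-chunk results summed in a plain 64-bit register. A scalar sketch of that driver (illustrative; chunk_count and huge_hamdist are hypothetical names, and chunk_count merely stands in for the L(lt8k)/L(chu) code):

    #include <stdint.h>

    #define MAXSIZE   0x1fff   /* limbs one pass of the 16-bit counters can take */
    #define CHUNKSIZE 0x1ff0   /* per-chunk limb count used by the asm           */

    /* Hypothetical stand-in for one pass of the NEON code (L(lt8k)/L(chu));
       only valid for n <= MAXSIZE.  */
    static uint64_t
    chunk_count (const uint64_t *ap, const uint64_t *bp, long n)
    {
      uint64_t c = 0;
      for (long i = 0; i < n; i++)
        c += (uint64_t) __builtin_popcountll (ap[i] ^ bp[i]);
      return c;
    }

    /* Sketch of the L(gt8k) driver: peel off CHUNKSIZE-limb blocks while more
       than MAXSIZE limbs remain, then finish with one ordinary pass.  */
    static uint64_t
    huge_hamdist (const uint64_t *ap, const uint64_t *bp, long n)
    {
      uint64_t total = 0;
      while (n > MAXSIZE)                            /* cmp x7, x11; b.hi 1b */
        {
          total += chunk_count (ap, bp, CHUNKSIZE);
          ap += CHUNKSIZE;
          bp += CHUNKSIZE;
          n -= CHUNKSIZE;
        }
      return total + chunk_count (ap, bp, n);        /* final bl L(lt8k) */
    }
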
diff -r a17f8511711b -r 3bdd8cff2bb2 mpn/arm64/popcount.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm64/popcount.asm Sat Aug 23 06:51:42 2014 +0200
@@ -0,0 +1,156 @@
+dnl ARM64 Neon mpn_popcount -- mpn bit population count.
+
+dnl Copyright 2013, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Cortex-A53 ?
+C Cortex-A57 ?
+
+C TODO
+C * Consider greater unrolling.
+C * Arrange to align the pointer, if that helps performance. Use the same
+C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
+C valgrind!)
+C * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+changecom(@&*$)
+
+C INPUT PARAMETERS
+define(`ap', x0)
+define(`n', x1)
+
+C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
+C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or
+C 8*(2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
+C allows the huge count code to jump deep into the code (at L(chu)).
+
+define(`maxsize', 0x1fff)
+define(`chunksize',0x1ff0)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+ mov x11, #maxsize
+ cmp n, x11
+ b.hi L(gt8k)
+
+L(lt8k):
+ movi v4.16b, #0 C clear summation register
+ movi v5.16b, #0 C clear summation register
+
+ tbz n, #0, L(xx0)
+ sub n, n, #1
+ ld1 {v0.1d}, [ap], #8 C load 1 limb
+ cnt v6.16b, v0.16b
+ uadalp v4.8h, v6.16b C could also splat
+
+L(xx0): tbz n, #1, L(x00)
+ sub n, n, #2
+ ld1 {v0.2d}, [ap], #16 C load 2 limbs
+ cnt v6.16b, v0.16b
+ uadalp v4.8h, v6.16b
+
+L(x00): tbz n, #2, L(000)
+ subs n, n, #4
+ ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs
+ b.ls L(sum)
+
+L(gt4): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs
+ sub n, n, #4
+ cnt v6.16b, v0.16b
+ cnt v7.16b, v1.16b
+ b L(mid)
+
+L(000): subs n, n, #8
+ b.lo L(e0)
+
+L(chu): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs
+ ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs
+ cnt v6.16b, v2.16b
+ cnt v7.16b, v3.16b
+ subs n, n, #8
+ b.lo L(end)
+
+L(top): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs
+ uadalp v4.8h, v6.16b
+ cnt v6.16b, v0.16b
+ uadalp v5.8h, v7.16b
+ cnt v7.16b, v1.16b
+L(mid): ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs
+ subs n, n, #8
+ uadalp v4.8h, v6.16b
+ cnt v6.16b, v2.16b
+ uadalp v5.8h, v7.16b
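
The popcount.asm inner loop is cut off here by the 300-line truncation noted above, but it mirrors hamdist.asm minus the second operand and the eor step: cnt produces per-byte population counts, uadalp accumulates adjacent byte counts into the 16-bit lanes of v4/v5, and only after the loop are the counters widened (uaddlp) and folded into the 64-bit return value. A scalar model of that narrow-counter idea (illustrative; the lane mapping is simplified relative to the NEON code, and the function name is made up):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of the cnt/uadalp scheme: per-byte population counts are
       gathered into 16-bit counters and widened to 64 bits only once, at the
       end.  With n <= 0x1fff limbs each counter stays below 2^16 (8191 limbs
       * 8 bits per byte = 65528 <= 65535), which is what maxsize enforces.  */
    static uint64_t
    narrow_counter_popcount (const uint64_t *ap, long n)
    {
      assert (n <= 0x1fff);                  /* maxsize from the asm comment */
      uint16_t cnt16[8] = { 0 };             /* stands in for v4.8h and v5.8h */
      for (long i = 0; i < n; i++)
        for (int b = 0; b < 8; b++)          /* one counter per byte position */
          cnt16[b] += (uint16_t)
            __builtin_popcountll ((ap[i] >> (8 * b)) & 0xff);
      uint64_t total = 0;                    /* the uaddlp/fold epilogue */
      for (int b = 0; b < 8; b++)
        total += cnt16[b];
      return total;
    }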