[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
Tue Mar 19 18:44:38 CET 2013
details:   /var/hg/gmp/rev/707ab7470ab9
changeset: 15610:707ab7470ab9
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Mar 19 18:36:24 2013 +0100
description:
New arm/neon popcount and hamdist.

details:   /var/hg/gmp/rev/d394216820bf
changeset: 15611:d394216820bf
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Mar 19 18:43:45 2013 +0100
description:
Get printing of clobbered register right.

details:   /var/hg/gmp/rev/90df127e0d2d
changeset: 15612:90df127e0d2d
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Mar 19 18:44:15 2013 +0100
description:
Add cycle numbers.

details:   /var/hg/gmp/rev/69009e987422
changeset: 15613:69009e987422
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Mar 19 18:44:35 2013 +0100
description:
ChangeLog
diffstat:
 ChangeLog                 |   10 ++
 mpn/arm/neon/hamdist.asm  |  182 ++++++++++++++++++++++++++++++++++++++++++++++
 mpn/arm/neon/popcount.asm |  154 ++++++++++++++++++++++++++++++++++++++
 mpn/arm/v6t2/divrem_1.asm |    2 +-
 tests/arm32check.c        |    5 +-
 5 files changed, 350 insertions(+), 3 deletions(-)
diffs (truncated from 398 to 300 lines):
diff -r d218f1e159f3 -r 69009e987422 ChangeLog
--- a/ChangeLog Tue Mar 19 16:12:19 2013 +0100
+++ b/ChangeLog Tue Mar 19 18:44:35 2013 +0100
@@ -1,3 +1,13 @@
+2013-03-19 Torbjorn Granlund <tege at gmplib.org>
+
+ * tests/arm32check.c: Get printing of clobbered register right.
+
+ * mpn/arm/neon/popcount.asm: New file.
+ * mpn/arm/neon/hamdist.asm: New file.
+
+ * tests/Makefile.am (EXTRA_libtests_la_SOURCES): Add arm32call.asm and
+ arm32check.c.
+
2013-03-18 Torbjorn Granlund <tege at gmplib.org>

	* configure.ac (arm*-*-*): Define CALLING_CONVENTIONS_OBJS.
diff -r d218f1e159f3 -r 69009e987422 mpn/arm/neon/hamdist.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/neon/hamdist.asm Tue Mar 19 18:44:35 2013 +0100
@@ -0,0 +1,182 @@
+dnl ARM Neon mpn_hamdist -- mpn bit hamming distance.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C StrongARM:           -
+C XScale:              -
+C Cortex-A8:           ?
+C Cortex-A9:           1.89
+C Cortex-A15:          0.95
+
+C TODO
+C * Explore using vldr and vldm. Does it help on A9? (These loads are
+C 64 bits at a time, which gives the wrong word order in big-endian mode.
+C That does not matter for popcount itself, but it might for the edge
+C loads.)
+C * Arrange to align the pointer, if that helps performance. Use the same
+C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
+C valgrind!)
+C * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`bp', r1)
+define(`n', r2)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters. Therefore, we can sum to 8*(2^16-1) bits, or
+C 8*(2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but one
+C which can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+ cmp n, #chunksize
+ bhi L(gt16k)
+
+L(lt16k):
+ vmov.i64 q8, #0 C clear summation register
+ vmov.i64 q9, #0 C clear summation register
+
+ tst n, #1
+ beq L(xxx0)
+ vmov.i64 d0, #0
+ vmov.i64 d20, #0
+ sub n, n, #1
+ vld1.32 {d0[0]}, [ap]! C load 1 limb
+ vld1.32 {d20[0]}, [bp]! C load 1 limb
+ veor d0, d0, d20
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
+
+L(xxx0):tst n, #2
+ beq L(xx00)
+ sub n, n, #2
+ vld1.32 {d0}, [ap]! C load 2 limbs
+ vld1.32 {d20}, [bp]! C load 2 limbs
+ veor d0, d0, d20
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24
+
+L(xx00):tst n, #4
+ beq L(x000)
+ sub n, n, #4
+ vld1.32 {q0}, [ap]! C load 4 limbs
+ vld1.32 {q10}, [bp]! C load 4 limbs
+ veor q0, q0, q10
+ vcnt.8 q12, q0
+ vpadal.u8 q8, q12
+
+L(x000):tst n, #8
+ beq L(0000)
+
+ subs n, n, #8
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vld1.32 {q10,q11}, [bp]! C load 8 limbs
+ bls L(sum)
+
+L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q14,q15}, [bp]! C load 8 limbs
+ veor q0, q0, q10
+ veor q1, q1, q11
+ sub n, n, #8
+ vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ b L(mid)
+
+L(0000):subs n, n, #16
+ blo L(e0)
+
+ vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vld1.32 {q14,q15}, [bp]! C load 8 limbs
+ vld1.32 {q10,q11}, [bp]! C load 8 limbs
+ veor q2, q2, q14
+ veor q3, q3, q15
+ vcnt.8 q12, q2
+ vcnt.8 q13, q3
+ subs n, n, #16
+ blo L(end)
+
+L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q14,q15}, [bp]! C load 8 limbs
+ veor q0, q0, q10
+ veor q1, q1, q11
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q0
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q1
+L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vld1.32 {q10,q11}, [bp]! C load 8 limbs
+ veor q2, q2, q14
+ veor q3, q3, q15
+ subs n, n, #16
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q2
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q3
+ bhs L(top)
+
+L(end): vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+L(sum): veor q0, q0, q10
+ veor q1, q1, q11
+ vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+ vadd.i16 q8, q8, q9
+ C we have 8 16-bit counts
+L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts
+ vpaddl.u32 q8, q8 C we have 2 64-bit counts
+ vmov.32 r0, d16[0]
+ vmov.32 r1, d17[0]
+ add r0, r0, r1
+ bx lr
+
+C Code for large count. Splits operand and calls above code.
+define(`ap2', r5)
+define(`bp2', r6)
+L(gt16k):
+ push {r4,r5,r6,r14}
+ mov ap2, ap
+ mov bp2, bp
+ mov r3, n C full count
+ mov r4, #0 C total sum
+
+1: mov n, #chunksize C count for this invocation
+ bl L(lt16k) C could jump deep inside code
+ add ap2, ap2, #chunksize*4 C point at next chunk
+ add bp2, bp2, #chunksize*4 C point at next chunk
+ add r4, r4, r0
+ mov ap, ap2 C put chunk pointer in place for call
+ mov bp, bp2 C put chunk pointer in place for call
+ sub r3, r3, #chunksize
+ cmp r3, #chunksize
+ bhi 1b
+
+ mov n, r3 C count for final invocation
+ bl L(lt16k)
+ add r0, r4, r0
+ pop {r4,r5,r6,pc}
+EPILOGUE()
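
To make the chunking logic concrete, here is a minimal scalar C model of the
strategy above: XOR corresponding limbs, popcount the result, and split large
operands into passes of at most 0x3f80 limbs, the bound the asm uses so its
folded 16-bit lane counters cannot overflow. This is an illustrative sketch,
not GMP code; the names limb_t, hamdist_chunk and hamdist_model are
hypothetical, limbs are assumed 32-bit as in this asm, and __builtin_popcount
assumes GCC or Clang.

#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb_t;            /* 32-bit limbs, as this asm assumes */
#define CHUNKSIZE 0x3f80            /* same per-pass limit as the asm */

/* One pass over at most CHUNKSIZE limbs.  In the asm, this bound keeps
   the 8 folded 16-bit counters below 2^16-1: 8*(2^16-1) countable bits
   is 0x3fff limbs, and 0x3f80 stays under that while fitting an 8-bit
   ARM immediate.  */
static uint64_t
hamdist_chunk (const limb_t *ap, const limb_t *bp, size_t n)
{
  uint64_t sum = 0;
  for (size_t i = 0; i < n; i++)
    sum += (uint64_t) __builtin_popcount (ap[i] ^ bp[i]);
  return sum;
}

/* Mirrors L(gt16k): peel off full chunks, then make one final short pass. */
uint64_t
hamdist_model (const limb_t *ap, const limb_t *bp, size_t n)
{
  uint64_t total = 0;
  while (n > CHUNKSIZE)
    {
      total += hamdist_chunk (ap, bp, CHUNKSIZE);
      ap += CHUNKSIZE;
      bp += CHUNKSIZE;
      n -= CHUNKSIZE;
    }
  return total + hamdist_chunk (ap, bp, n);
}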
diff -r d218f1e159f3 -r 69009e987422 mpn/arm/neon/popcount.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/neon/popcount.asm Tue Mar 19 18:44:35 2013 +0100
@@ -0,0 +1,154 @@
+dnl ARM Neon mpn_popcount -- mpn bit population count.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C StrongARM:           -
+C XScale:              -
+C Cortex-A8:           ?
+C Cortex-A9:           1.125
+C Cortex-A15:          0.56
+
+C TODO
+C * Explore using vldr and vldm. Does it help on A9? (These loads are
+C 64 bits at a time, which gives the wrong word order in big-endian mode.
+C That does not matter for popcount itself, but it might for the edge
+C loads.)
+C * Arrange to align the pointer, if that helps performance. Use the same
+C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
+C valgrind!)
+C * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`n', r1)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters. Therefore, we can sum to 8*(2^16-1) bits, or
+C 8*(2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but one
+C which can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+ cmp n, #chunksize
+ bhi L(gt16k)
+
+L(lt16k):
+ vmov.i64 q8, #0 C clear summation register
+ vmov.i64 q9, #0 C clear summation register
+
+ tst n, #1
+ beq L(xxx0)
+ vmov.i64 d0, #0
+ sub n, n, #1
+ vld1.32 {d0[0]}, [ap]! C load 1 limb
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
+
+L(xxx0):tst n, #2
+ beq L(xx00)
+ sub n, n, #2
+ vld1.32 {d0}, [ap]! C load 2 limbs
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24
+
+L(xx00):tst n, #4
+ beq L(x000)
+ sub n, n, #4
+ vld1.32 {q0}, [ap]! C load 4 limbs
+ vcnt.8 q12, q0
+ vpadal.u8 q8, q12
+
+L(x000):tst n, #8
+ beq L(0000)
+
+ subs n, n, #8
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ bls L(sum)
+
+L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ sub n, n, #8
+ vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ b L(mid)
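
For reference, these two files implement the documented GMP mpn calls
mpn_popcount and mpn_hamdist, so existing callers simply get the faster NEON
code on ARM. A minimal usage sketch, with the 4-limb operand values chosen
arbitrarily:

#include <gmp.h>
#include <stdio.h>

int
main (void)
{
  mp_limb_t a[4] = { 0xff, 0x0f, 0x03, 0x01 };
  mp_limb_t b[4] = { 0x00, 0xff, 0x03, 0x00 };

  /* mpn_popcount counts the set bits in {a,4}; mpn_hamdist counts the
     bit positions in which {a,4} and {b,4} differ.  */
  printf ("popcount: %lu\n", (unsigned long) mpn_popcount (a, 4));
  printf ("hamdist:  %lu\n", (unsigned long) mpn_hamdist (a, b, 4));
  return 0;
}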