[Gmp-commit] /var/hg/gmp: 6 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Thu Mar 21 23:10:12 CET 2013
details: /var/hg/gmp/rev/2528ba817c41
changeset: 15627:2528ba817c41
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:03:07 2013 +0100
description:
Move variables out from generated file, make them 'const'.
details: /var/hg/gmp/rev/f89b56f2cade
changeset: 15628:f89b56f2cade
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:03:41 2013 +0100
description:
Add and correct cycle numbers.
details: /var/hg/gmp/rev/4d3c8f773d49
changeset: 15629:4d3c8f773d49
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:04:23 2013 +0100
description:
Fix a comment typo.
details: /var/hg/gmp/rev/02737f9759a9
changeset: 15630:02737f9759a9
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:06:29 2013 +0100
description:
Trim 'sqr_diag_addlsh1' loop.
details: /var/hg/gmp/rev/8a8826f8459d
changeset: 15631:8a8826f8459d
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:09:46 2013 +0100
description:
Add arm/neon lshiftc support.
details: /var/hg/gmp/rev/4b32bd60c390
changeset: 15632:4b32bd60c390
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:10:04 2013 +0100
description:
ChangeLog
diffstat:
ChangeLog | 10 +
gen-trialdivtab.c | 18 +-
longlong.h | 2 +-
mpn/arm/mode1o.asm | 4 +-
mpn/arm/neon/lshiftc.asm | 245 ++++++++++++++++++++++++++++++++++++++++++++
mpn/arm/v6/sqr_basecase.asm | 6 +-
mpn/generic/trialdiv.c | 16 ++-
7 files changed, 287 insertions(+), 14 deletions(-)
diffs (truncated from 417 to 300 lines):
diff -r 055b847ac3d7 -r 4b32bd60c390 ChangeLog
--- a/ChangeLog Wed Mar 20 18:40:58 2013 +0100
+++ b/ChangeLog Thu Mar 21 23:10:04 2013 +0100
@@ -1,3 +1,13 @@
+2013-03-21 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/arm/neon/lshiftc.asm: New file.
+
+ * mpn/arm/v6/sqr_basecase.asm: Trim 'sqr_diag_addlsh1' loop.
+
+ * gen-trialdivtab.c: Output just raw data, remove actual variables.
+ * mpn/generic/trialdiv.c: Put variables from gen-trialdivtab.c here,
+ and make them 'const'.
+
2013-03-20 Torbjorn Granlund <tege at gmplib.org>
* config.guess: Rework arm CPU recognition.
diff -r 055b847ac3d7 -r 4b32bd60c390 gen-trialdivtab.c
--- a/gen-trialdivtab.c Wed Mar 20 18:40:58 2013 +0100
+++ b/gen-trialdivtab.c Thu Mar 21 23:10:04 2013 +0100
@@ -2,7 +2,7 @@
Contributed to the GNU project by Torbjorn Granlund.
-Copyright 2009, 2012 Free Software Foundation, Inc.
+Copyright 2009, 2012, 2013 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -98,7 +98,9 @@
omitted_p = 3;
interval_end = 0;
- printf ("static struct gmp_primes_dtab gmp_primes_dtab[] = {\n");
+/* printf ("static struct gmp_primes_dtab gmp_primes_dtab[] = {\n"); */
+
+ printf ("#ifdef WANT_dtab\n");
for (t = start_p; t <= end_p; t += 2)
{
@@ -120,7 +122,7 @@
if (! isprime (p))
continue;
- printf (" P(%d,", (int) p);
+ printf (" P(%d,", (int) p);
mpz_invert_ui_2exp (inv, p, limb_bits);
printf ("CNST_LIMB(0x"); mpz_out_str (stdout, 16, inv); printf ("),");
@@ -138,10 +140,12 @@
}
interval_end = t;
}
- printf (" P(0,0,0)\n};\n");
+ printf ("#define SMALLEST_OMITTED_PRIME %d\n", (int) omitted_p);
+ printf ("#endif\n");
+ printf ("#ifdef WANT_ptab\n");
- printf ("static struct gmp_primes_ptab gmp_primes_ptab[] = {\n");
+/* printf ("static struct gmp_primes_ptab gmp_primes_ptab[] = {\n"); */
endtok = "";
@@ -193,9 +197,9 @@
interval_end = t;
np++;
}
- printf ("\n};\n");
- printf ("#define SMALLEST_OMITTED_PRIME %d\n", (int) omitted_p);
+ printf ("\n");
+ printf ("#endif\n");
return 0;
}
diff -r 055b847ac3d7 -r 4b32bd60c390 longlong.h
--- a/longlong.h Wed Mar 20 18:40:58 2013 +0100
+++ b/longlong.h Thu Mar 21 23:10:04 2013 +0100
@@ -514,7 +514,7 @@
#define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#endif
-/* This is a bizarre test, but GCC doesn't define useful common symbol. */
+/* This is a bizarre test, but GCC doesn't define any useful common symbol. */
#if defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || \
defined (__ARM_ARCH_5E__) || defined (__ARM_ARCH_5TE__)|| \
defined (__ARM_ARCH_6__) || defined (__ARM_ARCH_6J__) || \
diff -r 055b847ac3d7 -r 4b32bd60c390 mpn/arm/mode1o.asm
--- a/mpn/arm/mode1o.asm Wed Mar 20 18:40:58 2013 +0100
+++ b/mpn/arm/mode1o.asm Thu Mar 21 23:10:04 2013 +0100
@@ -25,8 +25,8 @@
C StrongARM ?
C XScale ?
C Cortex-A8 ?
-C Cortex-A9 10
-C Cortex-A15 ?
+C Cortex-A9 9
+C Cortex-A15 7
define(`up', `r0')
define(`n', `r1')
diff -r 055b847ac3d7 -r 4b32bd60c390 mpn/arm/neon/lshiftc.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/neon/lshiftc.asm Thu Mar 21 23:10:04 2013 +0100
@@ -0,0 +1,245 @@
+dnl ARM Neon mpn_lshiftc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C StrongARM n/a
+C XScale n/a
+C Cortex-A8 ?
+C Cortex-A9 3.5 3.5 Y
+C Cortex-A15 1.75 1.75 Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses. All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands. Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts,
+C which might make it tricky.
+C * Clean up and simplify.
+C * Consider sharing most of the code for lshift and rshift, since the feed-in code,
+C the loop, and most of the wind-down code are identical.
+C * Replace the basecase code with code using 'extension' registers.
+C * Optimise. It is not clear that this loop insn permutation is optimal for
+C either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp', `r0')
+define(`ap', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+
+ define(`IFLSH', `$1')
+ define(`IFRSH', `')
+ define(`X',`0')
+ define(`Y',`1')
+ define(`func',`mpn_lshiftc')
+define(`OPERATION_lshiftc',1)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+IFLSH(` mov r12, n, lsl #2 ')
+IFLSH(` add rp, rp, r12 ')
+IFLSH(` add ap, ap, r12 ')
+
+ cmp n, #4 C SIMD code n limit
+ ble L(base)
+
+ifdef(`OPERATION_lshiftc',`
+ vdup.32 d6, r3 C left shift count is positive
+ sub r3, r3, #64 C right shift count is negative
+ vdup.32 d7, r3
+ mov r12, #-8') C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+ rsb r3, r3, #0 C right shift count is negative
+ vdup.32 d6, r3
+ add r3, r3, #64 C left shift count is positive
+ vdup.32 d7, r3
+ mov r12, #8') C rshift pointer update offset
+
+IFLSH(` sub ap, ap, #8 ')
+ vld1.32 {d19}, [ap], r12 C load initial 2 limbs
+ vshl.u64 d18, d19, d7 C retval
+
+ tst rp, #4 C is rp 64-bit aligned already?
+ beq L(rp_aligned) C yes, skip
+ vmvn d19, d19
+IFLSH(` add ap, ap, #4 ') C move back ap pointer
+IFRSH(` sub ap, ap, #4 ') C move back ap pointer
+ vshl.u64 d4, d19, d6
+ sub n, n, #1 C first limb handled
+IFLSH(` sub rp, rp, #4 ')
+ vst1.32 {d4[Y]}, [rp]IFRSH(!) C store first limb, rp gets aligned
+ vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(` sub rp, rp, #8 ')
+ subs n, n, #6
+ vmvn d19, d19
+ blt L(two_or_three_more)
+ tst n, #2
+ beq L(2)
+
+L(1): vld1.32 {d17}, [ap], r12
+ vshl.u64 d5, d19, d6
+ vmvn d17, d17
+ vld1.32 {d16}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ sub n, n, #2
+ b L(mid)
+
+L(2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vmvn d16, d16
+ vld1.32 {d17}, [ap], r12
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ subs n, n, #4
+ blt L(end)
+
+L(top): vmvn d17, d17
+ vld1.32 {d16}, [ap], r12
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+L(mid): vmvn d16, d16
+ vld1.32 {d17}, [ap], r12
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ subs n, n, #4
+ bge L(top)
+
+L(end): tst n, #1
+ beq L(evn)
+
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+ b L(cj1)
+
+L(evn): vmvn d17, d17
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+ vmvn.u8 d17, #0
+ vorr d2, d5, d0
+ vshl.u64 d0, d17, d7
+ vorr d3, d4, d0
+ b L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+ tst n, #1
+ beq L(l2)
+
+L(l3): vshl.u64 d5, d19, d6
+ vld1.32 {d17}, [ap], r12
+L(cj1): vmov.u8 d16, #0
+IFLSH(` add ap, ap, #4 ')
+ vmvn d17, d17
+ vld1.32 {d16[Y]}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vmvn d16, d16
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+IFLSH(` add rp, rp, #4 ')
+ vst1.32 {d5[Y]}, [rp]
+ vmov.32 r0, d18[X]
+ bx lr
+
+L(l2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vmvn d16, d16
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vmvn.u8 d17, #0
+ vorr d2, d4, d1
More information about the gmp-commit
mailing list