[Gmp-commit] /var/hg/gmp: 6 new changesets
mercurial at gmplib.org
Mon Nov 12 21:06:38 CET 2012
details: /var/hg/gmp/rev/1e723be62081
changeset: 15109:1e723be62081
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Nov 12 20:54:09 2012 +0100
description:
Remove a redundant abort().
details: /var/hg/gmp/rev/228bb37c6023
changeset: 15110:228bb37c6023
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Nov 12 20:56:00 2012 +0100
description:
Fix comment typo.
details: /var/hg/gmp/rev/4e17fc006374
changeset: 15111:4e17fc006374
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Nov 12 20:59:17 2012 +0100
description:
Tune, simplify.
details: /var/hg/gmp/rev/b8fde68617eb
changeset: 15112:b8fde68617eb
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Nov 12 21:00:26 2012 +0100
description:
Fix typo.
details: /var/hg/gmp/rev/94c55274b650
changeset: 15113:94c55274b650
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Nov 12 21:02:19 2012 +0100
description:
Add ARM64 support. Add AVR support.
details: /var/hg/gmp/rev/889e31898397
changeset: 15114:889e31898397
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Nov 12 21:05:42 2012 +0100
description:
Update recommendations for M-R counts. Misc wording improvements.
diffstat:
ChangeLog | 5 +
doc/gmp.texi | 14 +++--
longlong.h | 38 +++++++++++++++
mpn/generic/perfsqr.c | 2 +-
mpn/powerpc64/mode64/divrem_1.asm | 98 ++++++++++----------------------------
mpn/s390_64/lshift.asm | 2 +-
tests/mpz/t-popcount.c | 3 +-
7 files changed, 80 insertions(+), 82 deletions(-)
diffs (truncated from 319 to 300 lines):
diff -r 16951630978f -r 889e31898397 ChangeLog
--- a/ChangeLog Mon Nov 12 20:52:35 2012 +0100
+++ b/ChangeLog Mon Nov 12 21:05:42 2012 +0100
@@ -1,5 +1,10 @@
2012-11-12 Torbjorn Granlund <tege at gmplib.org>
+ * longlong.h: Add ARM64 support.
+ * longlong.h: Add AVR support.
+
+ * mpn/powerpc64/mode64/divrem_1.asm: Tune, simplify.
+
* mpq/md_2exp.c: Use MPN_COPY_INCR, not MPN_COPY_DECR.
* tests/mpq/t-md_2exp.c (check_random): New function.
diff -r 16951630978f -r 889e31898397 doc/gmp.texi
--- a/doc/gmp.texi Mon Nov 12 20:52:35 2012 +0100
+++ b/doc/gmp.texi Mon Nov 12 21:05:42 2012 +0100
@@ -3494,9 +3494,10 @@
@var{n} is definitely composite.
This function does some trial divisions, then some Miller-Rabin probabilistic
-primality tests. @var{reps} controls how many such tests are done, 5 to 10 is
-a reasonable number, more will reduce the chances of a composite being
-returned as ``probably prime''.
+primality tests. The argument @var{reps} controls how many such tests are
+done; a higher value will reduce the chances of a composite being returned as
+``probably prime''. 25 is a reasonable number; a composite number will then be
+identified as a prime with a probability of less than @m{2^{-50}}.
Miller-Rabin and similar tests can be more properly called compositeness
tests. Numbers which fail are known to be composite but those which pass
@@ -8343,9 +8344,10 @@
divisions saved. When @math{d} is a single limb some simplifications arise,
providing good speedups on a number of processors.
-@code{mpn_divexact_by3}, @code{mpn_modexact_1_odd} and the @code{mpn_redc_X}
-functions differ subtly in how they return @math{r}, leading to some negations
-in the above formula, but all are essentially the same.
+The functions @code{mpn_divexact_by3}, @code{mpn_modexact_1_odd} and the
+internal @code{mpn_redc_X} functions differ subtly in how they return @math{r},
+leading to some negations in the above formula, but all are essentially the
+same.
@cindex Divisibility algorithm
@cindex Congruence algorithm
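
As a usage sketch of the updated gmp.texi recommendation above (illustrative only, not part of the patch; the test value is arbitrary): with reps = 25, a composite slips through as "probably prime" with probability below 2^-50.

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t n;
  mpz_init_set_str (n, "2305843009213693951", 10);   /* 2^61 - 1, a Mersenne prime */

  /* 25 Miller-Rabin reps, per the new recommendation.
     Returns 2 = definitely prime, 1 = probably prime, 0 = composite.  */
  int r = mpz_probab_prime_p (n, 25);
  printf ("mpz_probab_prime_p: %d\n", r);

  mpz_clear (n);
  return 0;
}
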
diff -r 16951630978f -r 889e31898397 longlong.h
--- a/longlong.h Mon Nov 12 20:52:35 2012 +0100
+++ b/longlong.h Mon Nov 12 21:05:42 2012 +0100
@@ -259,6 +259,15 @@
#endif /* clz using mpn */
#endif /* __alpha */
+#if defined (__AVR) && W_TYPE_SIZE == 8
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ unsigned short __p = (unsigned short) (m0) * (m1); \
+ (ph) = __p >> 8; \
+ (pl) = __p; \
+ } while (0)
+#endif /* AVR */
+
#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS 1
@@ -520,6 +529,35 @@
#endif
#endif /* __arm__ */
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "rZ" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ do { \
+ if (__builtin_constant_p (bl)) \
+ { \
+ __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
+ } \
+ else /* only bh might be a constant */ \
+ __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "rZ" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
+ } while (0)
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (m0), "r" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define count_leading_zeros(count, x) \
+ __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 64
+#endif /* __aarch64__ */
+
#if defined (__clipper__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
({union {UDItype __ll; \
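
For reference (not part of the patch), the values these longlong.h primitives produce can be written portably as below; the AArch64 macros above compute the same results with umulh/adds/adc, and the AVR umul_ppmm is the same idea at W_TYPE_SIZE == 8.  This sketch assumes a compiler with unsigned __int128; the _ref helper names are illustrative.

#include <stdint.h>

/* umul_ppmm (ph, pl, m0, m1): split the full 64x64->128 product.  */
static void
umul_ppmm_ref (uint64_t *ph, uint64_t *pl, uint64_t m0, uint64_t m1)
{
  unsigned __int128 p = (unsigned __int128) m0 * m1;
  *ph = (uint64_t) (p >> 64);   /* high limb, what ARM64 umulh yields */
  *pl = (uint64_t) p;           /* low limb, a plain 64x64->64 multiply */
}

/* add_ssaaaa (sh, sl, ah, al, bh, bl): two-limb add with carry propagation.  */
static void
add_ssaaaa_ref (uint64_t *sh, uint64_t *sl,
                uint64_t ah, uint64_t al, uint64_t bh, uint64_t bl)
{
  *sl = al + bl;
  *sh = ah + bh + (*sl < al);   /* carry out of the low-limb addition */
}
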
diff -r 16951630978f -r 889e31898397 mpn/generic/perfsqr.c
--- a/mpn/generic/perfsqr.c Mon Nov 12 20:52:35 2012 +0100
+++ b/mpn/generic/perfsqr.c Mon Nov 12 21:05:42 2012 +0100
@@ -185,7 +185,7 @@
/* Check that we have even multiplicity of 2, and then check that the rest is
a possible perfect square. Leave disabled until we can determine this
really is an improvement. It it is, it could completely replace the
- simple probe above, since this should through out more non-squares, but at
+ simple probe above, since this should throw out more non-squares, but at
the expense of somewhat more cycles. */
{
mp_limb_t lo;
diff -r 16951630978f -r 889e31898397 mpn/powerpc64/mode64/divrem_1.asm
--- a/mpn/powerpc64/mode64/divrem_1.asm Mon Nov 12 20:52:35 2012 +0100
+++ b/mpn/powerpc64/mode64/divrem_1.asm Mon Nov 12 21:05:42 2012 +0100
@@ -1,7 +1,7 @@
dnl PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.
-dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010 Free Software Foundation,
-dnl Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2012 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -22,11 +22,11 @@
C cycles/limb
C norm unorm frac
-C POWER3/PPC630 16-34 16-34 ~11
-C POWER4/PPC970 29 19
-C POWER5 29 29 ~20
-C POWER6 50 59 ~42
-C POWER7 25 25 ~14
+C POWER3/PPC630 16-34 16-34 ~11 outdated figures
+C POWER4/PPC970 28 28 19
+C POWER5 29 29 ~19
+C POWER6 49 59 ~42
+C POWER7 24.5 23 ~14
C INPUT PARAMETERS
C qp = r3
@@ -113,10 +113,11 @@
sldi r6, r6, 3
ALIGN(16)
L(uloop):
- addi r11, r31, 1
ldx r8, r26, r6
+ nop
mulld r0, r31, r3
mulhdu r10, r31, r3
+ addi r11, r31, 1
srd r9, r8, r5
addi r6, r6, -8
or r9, r7, r9
@@ -124,12 +125,11 @@
adde r10, r10, r11
mulld r31, r10, r30
subf r31, r31, r9
- subfc r0, r0, r31 C r >= ql
- subfe r0, r0, r0 C r0 = -(r >= ql)
- not r7, r0
- add r10, r7, r10 C qh -= (r >= ql)
- andc r0, r30, r0
- add r31, r31, r0
+ subfc r0, r31, r0 C r <= ql
+ subfe r0, r0, r0 C r0 = -(r <= ql)
+ and r9, r30, r0
+ add r31, r31, r9
+ add r10, r0, r10 C qh -= (r >= ql)
cmpld cr7, r31, r30
bge- cr7, L(164)
L(123):
@@ -166,19 +166,19 @@
L(ufloop):
addi r11, r31, 1
nop
- mulld r7, r3, r31
+ mulld r0, r3, r31
mulhdu r10, r3, r31
add r10, r10, r11
mulld r31, r9, r10
ifelse(0,1,`
- subfc r0, r7, r31
+ subfc r0, r0, r31
subfe r0, r0, r0 C r0 = -(r >= ql)
not r7, r0
add r10, r7, r10 C qh -= (r >= ql)
andc r0, r30, r0
add r31, r31, r0
',`
- cmpld cr7, r31, r7
+ cmpld cr7, r31, r0
blt cr7, L(29)
add r31, r30, r31
addi r10, r10, -1
@@ -219,12 +219,11 @@
and r0, r0, r7
subf r31, r0, r31
L(8):
-L(10):
mr r3, r30
CALL( mpn_invert_limb)
- nop
+ li r27, 0
addic. r6, r28, -1
- blt- cr0, L(150)
+ blt- cr0, L(110)
mtctr r28
sldi r6, r6, 3
ALIGN(16)
@@ -234,68 +233,23 @@
mulld r0, r31, r3
mulhdu r10, r31, r3
addi r6, r6, -8
- addc r7, r0, r8
+ addc r0, r0, r8
adde r10, r10, r11
mulld r31, r10, r30
subf r31, r31, r8 C r = nl - qh * d
- subfc r0, r7, r31 C r >= ql
- subfe r0, r0, r0 C r0 = -(r >= ql)
- not r7, r0
- add r10, r7, r10 C qh -= (r >= ql)
- andc r0, r30, r0
- add r31, r31, r0
+ subfc r0, r31, r0 C r <= ql
+ subfe r0, r0, r0 C r0 = -(r <= ql)
+ and r9, r30, r0
+ add r31, r31, r9
+ add r10, r0, r10 C qh -= (r >= ql)
cmpld cr7, r31, r30
bge- cr7, L(167)
L(51):
std r10, 0(r29)
addi r29, r29, -8
bdnz L(nloop)
+ b L(110)
-L(150):
- addic. r9, r25, -1
- blt- cr0, L(152)
- mtctr r25
- neg r9, r30
- ALIGN(16)
-L(nfloop):
- addi r11, r31, 1
- nop
- mulld r7, r3, r31
- mulhdu r10, r3, r31
- add r10, r10, r11
- mulld r31, r9, r10
-ifelse(0,1,`
- subfc r0, r7, r31
- subfe r0, r0, r0 C r0 = -(r >= ql)
- not r7, r0
- add r10, r7, r10 C qh -= (r >= ql)
- andc r0, r30, r0
- add r31, r31, r0
-',`
- cmpld cr7, r31, r7
- blt cr7, L(28)
- add r31, r30, r31
- addi r10, r10, -1
-L(28):
-')
- std r10, 0(r29)
- addi r29, r29, -8
- bdnz L(nfloop)
-L(152):
- addi r1, r1, 176
- mr r3, r31
- ld r0, 16(r1)
- lwz r12, 8(r1)
- mtlr r0
- ld r25, -56(r1)
- ld r26, -48(r1)
- mtcrf 8, r12
- ld r27, -40(r1)
- ld r28, -32(r1)
- ld r29, -24(r1)
- ld r30, -16(r1)
- ld r31, -8(r1)
- blr
L(164):
subf r31, r30, r31
addi r10, r10, 1
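
The rescheduled subfc/subfe/and/add sequences in both loops compute the usual branch-free adjustment of a candidate quotient/remainder pair.  Roughly, in C (a sketch with illustrative names, assuming a 64-bit limb; not code from this patch):

/* After forming candidate quotient qh, its low limb ql, and the
   candidate remainder r = nl - qh * d:  */
typedef unsigned long long limb;

static void
divstep_fixup (limb *qh, limb *r, limb ql, limb d)
{
  limb mask = -(limb) (*r > ql);   /* all ones iff the candidate quotient was one too big */
  *qh += mask;                     /* qh -= 1 in that case */
  *r  += mask & d;                 /* and add d back into the remainder */

  if (*r >= d)                     /* final correction; the cmpld/bge to L(164) path above */
    {
      *r -= d;
      *qh += 1;
    }
}
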
diff -r 16951630978f -r 889e31898397 mpn/s390_64/lshift.asm
--- a/mpn/s390_64/lshift.asm Mon Nov 12 20:52:35 2012 +0100
+++ b/mpn/s390_64/lshift.asm Mon Nov 12 21:05:42 2012 +0100
@@ -32,7 +32,7 @@
C * One could assume more pipelining could approach 2.5 c/l, but we have not
C found any 8-way loop that runs better than the current 4-way loop.
C * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
-C similrly to the x86_64 sqr_basecase feed-in.
+C similarly to the x86_64 sqr_basecase feed-in.
C INPUT PARAMETERS
define(`rp', `%r2')
diff -r 16951630978f -r 889e31898397 tests/mpz/t-popcount.c