[Gmp-commit] /var/hg/gmp: 5 new changesets

mercurial at gmplib.org
Fri Feb 24 20:58:11 UTC 2017


details:   /var/hg/gmp/rev/ef1792bf399e
changeset: 17299:ef1792bf399e
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Fri Feb 24 20:32:18 2017 +0100
description:
(powerpc add_sssaaaa): Fix typo.

details:   /var/hg/gmp/rev/4758daf902b2
changeset: 17300:4758daf902b2
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Fri Feb 24 21:01:18 2017 +0100
description:
Fix typo.

details:   /var/hg/gmp/rev/f169ebae5637
changeset: 17301:f169ebae5637
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Fri Feb 24 21:01:25 2017 +0100
description:
Fix typo.

details:   /var/hg/gmp/rev/613212bc409a
changeset: 17302:613212bc409a
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Fri Feb 24 21:52:10 2017 +0100
description:
Define add_sssaaaa also for arm64.

details:   /var/hg/gmp/rev/7e57d6fda760
changeset: 17303:7e57d6fda760
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Fri Feb 24 21:53:58 2017 +0100
description:
Avoid branching on flags.

diffstat:

 mpn/arm64/copyd.asm         |   8 ++++----
 mpn/arm64/copyi.asm         |   8 ++++----
 mpn/generic/div_qr_1n_pi2.c |  17 +++++++++++++----
 mpn/generic/div_qr_1u_pi2.c |  15 ++++++++++++---
 mpn/generic/div_qr_2.c      |  21 ++++++++++++++-------
 5 files changed, 47 insertions(+), 22 deletions(-)

diffs (181 lines):

diff -r fb46501c49fc -r 7e57d6fda760 mpn/arm64/copyd.asm
--- a/mpn/arm64/copyd.asm	Thu Feb 23 14:33:10 2017 +0100
+++ b/mpn/arm64/copyd.asm	Fri Feb 24 21:53:58 2017 +0100
@@ -58,9 +58,9 @@
 
 L(al2):	sub	up, up, #16
 	ld1	{v26.2d}, [up]
-	subs	n, n, #6
+	sub	n, n, #6
 	sub	rp, rp, #16			C offset rp for loop
-	b.lt	L(end)
+	tbnz	n, #63, L(end)
 
 	sub	up, up, #16			C offset up for loop
 	mov	x12, #-16
@@ -70,8 +70,8 @@
 	st1	{v26.2d}, [rp], x12
 	ld1	{v26.2d}, [up], x12
 	st1	{v22.2d}, [rp], x12
-	subs	n, n, #4
-	b.ge	L(top)
+	sub	n, n, #4
+	tbz	n, #63, L(top)
 
 	add	up, up, #16			C undo up offset
 
diff -r fb46501c49fc -r 7e57d6fda760 mpn/arm64/copyi.asm
--- a/mpn/arm64/copyi.asm	Thu Feb 23 14:33:10 2017 +0100
+++ b/mpn/arm64/copyi.asm	Fri Feb 24 21:53:58 2017 +0100
@@ -53,16 +53,16 @@
 	st1	{v22.1d}, [rp], #8
 
 L(al2):	ld1	{v26.2d}, [up], #16
-	subs	n, n, #6
-	b.lt	L(end)
+	sub	n, n, #6
+	tbnz	n, #63, L(end)
 
 	ALIGN(16)
 L(top):	ld1	{v22.2d}, [up], #16
 	st1	{v26.2d}, [rp], #16
 	ld1	{v26.2d}, [up], #16
 	st1	{v22.2d}, [rp], #16
-	subs	n, n, #4
-	b.ge	L(top)
+	sub	n, n, #4
+	tbz	n, #63, L(top)
 
 L(end):	st1	{v26.2d}, [rp], #16
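
The two copy-loop diffs above implement changeset 17303, "Avoid branching
on flags": the old code subtracted with "subs" and branched on the NZCV
flags ("b.lt"/"b.ge"), while the new code uses a flag-free "sub" and
branches directly on bit 63 of the counter with "tbnz"/"tbz", keeping the
loop's branch off the flag register.  A minimal C sketch of the same
control flow (illustrative only, not GMP code; for a two's-complement
int64_t, "bit 63 set" is exactly n < 0):

#include <stdint.h>
#include <string.h>

/* Copy n 64-bit limbs, four per iteration, steering the loop off the
   sign bit of the counter the way the updated asm does.  */
static void
copy_limbs (uint64_t *rp, const uint64_t *up, int64_t n)
{
  n -= 4;                         /* "sub n, n, #4": no flags written */
  while (n >= 0)                  /* "tbz n, #63, L(top)": sign bit clear */
    {
      memcpy (rp, up, 4 * sizeof (uint64_t));
      rp += 4;
      up += 4;
      n -= 4;
    }
  n += 4;                         /* 0 <= n < 4 limbs remain */
  while (n-- > 0)                 /* tail; the real asm handles this
                                     differently, around L(end) */
    *rp++ = *up++;
}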
 
diff -r fb46501c49fc -r 7e57d6fda760 mpn/generic/div_qr_1n_pi2.c
--- a/mpn/generic/div_qr_1n_pi2.c	Thu Feb 23 14:33:10 2017 +0100
+++ b/mpn/generic/div_qr_1n_pi2.c	Fri Feb 24 21:53:58 2017 +0100
@@ -1,4 +1,4 @@
-/* mpn_div_qr_1u_pi2.
+/* mpn_div_qr_1n_pi2.
 
    THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
    ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
@@ -46,8 +46,10 @@
 #include "longlong.h"
 
 /* Define some longlong.h-style macros, but for wider operations.
-   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
-     carry-out into an additional sum operand.
+   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+     an additional sum operand.
+   * add_csaac accepts two addends and a carry in, and generates a sum and a
+     carry out.  A little like a "full adder".
 */
 #if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
 
@@ -69,12 +71,19 @@
 	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
 #endif
 
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0))
+#endif
+
 #if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
 /* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
    processor running in 32-bit mode, since the carry flag then gets the 32-bit
    carry.  */
 #define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
-  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
+  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
 	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
 	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
 #endif
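
Two things change in this file: an add_sssaaaa definition is added for
arm64 (changeset 17302), and the powerpc variant gets the typo fix from
changesets 17299-17301.  The typo matters because %0 is the write-only
output for s2 while %3 is the input holding s2's old value, so
"addze %0,%0" read a register that had not been written yet;
"addze %0,%3" adds the carry into the old s2 as intended.  For reference,
the contract of add_sssaaaa can be sketched in portable C (a minimal
sketch along the lines of the generic fallback these files use when no
asm variant applies; the macro name here is illustrative):

/* (s1,s0) = (a1,a0) + (b1,b0), with the carry out of the high limb
   accumulated into s2.  Assumes mp_limb_t from gmp-impl.h.  */
#define add_sssaaaa_sketch(s2, s1, s0, a1, a0, b1, b0)			\
  do {									\
    mp_limb_t __s0 = (a0) + (b0);	/* low-limb sum */		\
    mp_limb_t __c0 = __s0 < (a0);	/* carry out of the low sum */	\
    mp_limb_t __s1 = (a1) + (b1);	/* high-limb sum */		\
    mp_limb_t __c1 = __s1 < (a1);	/* carry out of the high sum */	\
    __s1 += __c0;			/* fold in the low carry */	\
    __c1 += __s1 < __c0;		/* at most one carry survives */\
    (s0) = __s0;							\
    (s1) = __s1;							\
    (s2) += __c1;			/* propagate into third limb */	\
  } while (0)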
diff -r fb46501c49fc -r 7e57d6fda760 mpn/generic/div_qr_1u_pi2.c
--- a/mpn/generic/div_qr_1u_pi2.c	Thu Feb 23 14:33:10 2017 +0100
+++ b/mpn/generic/div_qr_1u_pi2.c	Fri Feb 24 21:53:58 2017 +0100
@@ -46,8 +46,10 @@
 #include "longlong.h"
 
 /* Define some longlong.h-style macros, but for wider operations.
-   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
-     carry-out into an additional sum operand.
+   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+     an additional sum operand.
+   * add_csaac accepts two addends and a carry in, and generates a sum and a
+     carry out.  A little like a "full adder".
 */
 #if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
 
@@ -69,12 +71,19 @@
 	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
 #endif
 
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0))
+#endif
+
 #if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
 /* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
    processor running in 32-bit mode, since the carry flag then gets the 32-bit
    carry.  */
 #define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
-  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
+  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
 	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
 	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
 #endif
diff -r fb46501c49fc -r 7e57d6fda760 mpn/generic/div_qr_2.c
--- a/mpn/generic/div_qr_2.c	Thu Feb 23 14:33:10 2017 +0100
+++ b/mpn/generic/div_qr_2.c	Fri Feb 24 21:53:58 2017 +0100
@@ -49,10 +49,10 @@
 #endif
 
 /* Define some longlong.h-style macros, but for wider operations.
-   * add_sssaaaa is like longlong.h's add_ssaaaa but the propagating
-     carry-out into an additional sum operand.
-   * add_csaac accepts two addends and a carry in, and generates a sum
-     and a carry out.  A little like a "full adder".
+   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+     an additional sum operand.
+   * add_csaac accepts two addends and a carry in, and generates a sum and a
+     carry out.  A little like a "full adder".
 */
 #if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
 
@@ -84,12 +84,19 @@
 	     "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
 #endif
 
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) __CLOBBER_CC)
+#endif
+
 #if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
 /* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
    processor running in 32-bit mode, since the carry flag then gets the 32-bit
    carry.  */
 #define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
-  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
+  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
 	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
 	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
 #endif
@@ -282,9 +289,9 @@
    Return the most significant limb of the quotient.
 
    Preconditions:
-   1. qp must either not overlap with the input operands at all, or
+   1. qp must either not overlap with the other operands at all, or
       qp >= np + 2 must hold true.  (This means that it's possible to put
-      the quotient in the high part of {np,nn}, right above the remainder.
+      the quotient in the high part of {np,nn}, right above the remainder.)
    2. nn >= 2.  */
 
 mp_limb_t
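
Precondition 1 allows the division to run in place.  A hypothetical
wrapper showing that layout (assuming the internal prototype
mp_limb_t mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np,
mp_size_t nn, mp_srcptr dp) from gmp-impl.h; the wrapper name is
illustrative):

#include "gmp-impl.h"	/* internal header; mpn_div_qr_2 is not public API */

/* Divide {np,nn} by the two-limb divisor {dp,2} entirely within the
   numerator area: qp = np + 2 satisfies precondition 1, so the nn-2
   stored quotient limbs land right above the 2-limb remainder at np.
   The most significant quotient limb is returned.  */
static mp_limb_t
div_qr_2_in_place (mp_ptr np, mp_size_t nn, mp_srcptr dp)
{
  ASSERT (nn >= 2);		/* precondition 2 */
  return mpn_div_qr_2 (np + 2, np, np, nn, dp);
}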

