[Gmp-commit] /home/hgfiles/gmp: New public functions mpn_com and mpn_neg. (W...

Mon Dec 28 16:33:52 CET 2009

details:   /home/hgfiles/gmp/rev/4aaca500e158
changeset: 13248:4aaca500e158
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Dec 28 16:33:49 2009 +0100
description:
New public functions mpn_com and mpn_neg.  (Were internal and with _n suffix.)

diffstat:

 ChangeLog                   |    6 +
 configure.in                |    6 +-
 doc/gmp.texi                |   10 ++
 doc/tasks.html              |    4 +-
 gmp-h.in                    |   17 ++-
 gmp-impl.h                  |    8 +-
 mpn/Makefile.am             |    4 +-
 mpn/alpha/com.asm           |  165 ++++++++++++++++++++++++++++++++++++++++++
 mpn/alpha/com_n.asm         |  165 ------------------------------------------
 mpn/asm-defs.m4             |    4 +-
 mpn/generic/binvert.c       |    2 +-
 mpn/generic/com.c           |   34 ++++++++
 mpn/generic/invert.c        |    2 +-
 mpn/generic/invertappr.c    |    4 +-
 mpn/generic/mul_fft.c       |    4 +-
 mpn/generic/neg.c           |   23 +++++
 mpn/generic/neg_n.c         |   23 -----
 mpn/powerpc32/750/com.asm   |   68 +++++++++++++++++
 mpn/powerpc32/750/com_n.asm |   68 -----------------
 mpn/powerpc32/vmx/copyd.asm |    2 +-
 mpn/powerpc32/vmx/copyi.asm |    2 +-
 mpn/powerpc64/com.asm       |   74 +++++++++++++++++++
 mpn/powerpc64/com_n.asm     |   74 -------------------
 mpn/x86/k6/mmx/com.asm      |   92 +++++++++++++++++++++++
 mpn/x86/k6/mmx/com_n.asm    |   92 -----------------------
 mpn/x86/k7/mmx/com.asm      |  114 +++++++++++++++++++++++++++++
 mpn/x86/k7/mmx/com_n.asm    |  114 -----------------------------
 mpn/x86/pentium/com.asm     |  170 ++++++++++++++++++++++++++++++++++++++++++++
 mpn/x86/pentium/com_n.asm   |  170 --------------------------------------------
 mpn/x86_64/com.asm          |   77 +++++++++++++++++++
 mpn/x86_64/com_n.asm        |   77 -------------------
 mpz/aorsmul_i.c             |    6 +-
 mpz/cfdiv_r_2exp.c          |    2 +-
 tests/devel/try.c           |   14 +-
 tests/mpn/t-instrument.c    |    6 +-
 tests/mpz/bit.c             |    2 +-
 tests/refmpn.c              |   16 +---
 tests/refmpz.c              |    4 +-
 tests/tests.h               |    4 +-
 tune/common.c               |    4 +-
 tune/many.pl                |    2 +-
 tune/speed.c                |    2 +-
 tune/speed.h                |    2 +-
 43 files changed, 889 insertions(+), 850 deletions(-)

diffs (truncated from 2192 to 300 lines):

diff -r 9f83d5200872 -r 4aaca500e158 ChangeLog

--- a/ChangeLog	Mon Dec 28 15:37:05 2009 +0100
+++ b/ChangeLog	Mon Dec 28 16:33:49 2009 +0100
@@ -1,5 +1,11 @@
 2009-12-28  Torbjorn Granlund  <tege at gmplib.org>
 
+	* configure.in (gmp_mpn_functions_optional): Move "com" from here...
+	(gmp_mpn_functions): ...to here.
+	* mpn/generic/com.c: New file.
+	* (mpn_com): New name for mpn_com_n.  Make public.
+	* (mpn_neg): Analogous changes.
+
 	* tune/tuneup.c (tune_mu_div, tune_mu_bdiv): Set step_factor.
 
 	* tune/common.c, tune/speed.c, tune/speed.h: Support measuring
diff -r 9f83d5200872 -r 4aaca500e158 configure.in
--- a/configure.in	Mon Dec 28 15:37:05 2009 +0100
+++ b/configure.in	Mon Dec 28 16:33:49 2009 +0100
@@ -2481,7 +2481,7 @@
 #       can optionally provide the latter as an extra entrypoint.  Likewise
 #       divrem_1 and pre_divrem_1.
 
-gmp_mpn_functions_optional="umul udiv com_n				\
+gmp_mpn_functions_optional="umul udiv					\
   invert_limb sqr_diagonal						\
   mul_2 mul_3 mul_4							\
   addmul_2 addmul_3 addmul_4 addmul_5 addmul_6 addmul_7 addmul_8	\
@@ -2491,7 +2491,7 @@
   add_n_sub_n addaddmul_1msb0"
 
 gmp_mpn_functions="$extra_functions					   \
-  add add_1 add_n sub sub_1 sub_n neg_n mul_1 addmul_1			   \
+  add add_1 add_n sub sub_1 sub_n neg com mul_1 addmul_1		   \
   submul_1 lshift rshift dive_1 diveby3 divis divrem divrem_1 divrem_2     \
   fib2_ui mod_1 mod_34lsub1 mode1o pre_divrem_1 pre_mod_1 dump		   \
   mod_1_1 mod_1_2 mod_1_3 mod_1_4 lshiftc				   \
@@ -2997,7 +2997,7 @@
 #undef HAVE_NATIVE_mpn_bdiv_dbm1c
 #undef HAVE_NATIVE_mpn_bdiv_q_1
 #undef HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#undef HAVE_NATIVE_mpn_com_n
+#undef HAVE_NATIVE_mpn_com
 #undef HAVE_NATIVE_mpn_copyd
 #undef HAVE_NATIVE_mpn_copyi
 #undef HAVE_NATIVE_mpn_divexact_1
diff -r 9f83d5200872 -r 4aaca500e158 doc/gmp.texi
--- a/doc/gmp.texi	Mon Dec 28 15:37:05 2009 +0100
+++ b/doc/gmp.texi	Mon Dec 28 16:33:49 2009 +0100
@@ -5167,6 +5167,11 @@
 @var{s2n}.
 @end deftypefun
 
+@deftypefun mp_limb_t mpn_neg (mp_limb_t *@var{rp}, const mp_limb_t *@var{sp}, mp_size_t @var{n})
+Perform the negation of @{@var{sp}, @var{n}@}, and write the result to
+@{@var{rp}, @var{n}@}.  Return carry-out.
+@end deftypefun
+
 @deftypefun void mpn_mul_n (mp_limb_t *@var{rp}, const mp_limb_t *@var{s1p}, const mp_limb_t *@var{s2p}, mp_size_t @var{n})
 Multiply @{@var{s1p}, @var{n}@} and @{@var{s2p}, @var{n}@}, and write the
 2*@var{n}-limb result to @var{rp}.
@@ -5525,6 +5530,11 @@
 @{@var{rp}, @var{n}@}.
 @end deftypefun
 
+@deftypefun void mpn_com (mp_limb_t *@var{rp}, const mp_limb_t *@var{sp}, mp_size_t @var{n})
+Perform the bitwise complement of @{@var{sp}, @var{n}@}, and write the result
+to @{@var{rp}, @var{n}@}.
+@end deftypefun
+
 @deftypefun void mpn_copyi (mp_limb_t *@var{rp}, const mp_limb_t *@var{s1p}, mp_size_t @var{n})
 Copy from @{@var{s1p}, @var{n}@} to @{@var{rp}, @var{n}@}, increasingly.
 @end deftypefun
diff -r 9f83d5200872 -r 4aaca500e158 doc/tasks.html
--- a/doc/tasks.html	Mon Dec 28 15:37:05 2009 +0100
+++ b/doc/tasks.html	Mon Dec 28 16:33:49 2009 +0100
@@ -37,7 +37,7 @@
 
 <hr>
 <!-- NB. timestamp updated automatically by emacs -->
-  This file current as of 28 Nov 2009.  An up-to-date version is available at
+  This file current as of 28 Dec 2009.  An up-to-date version is available at
   <a href="http://gmplib.org/tasks.html">http://gmplib.org/tasks.html</a>.
   Please send comments about this page to gmp-devel<font>@</font>gmplib.org.
 
@@ -436,7 +436,7 @@
      <code>mpn_rshift</code> already provided.
 <li> Cray T3E: Experiment with optimization options.  In particular,
      -hpipeline3 seems promising.  We should at least up -O to -O2 or -O3.
-<li> Cray: <code>mpn_com_n</code> and <code>mpn_and_n</code> etc very probably
+<li> Cray: <code>mpn_com</code> and <code>mpn_and_n</code> etc very probably
      wants a pragma like <code>MPN_COPY_INCR</code>.
 <li> Cray vector systems: <code>mpn_lshift</code>, <code>mpn_rshift</code>,
      <code>mpn_popcount</code> and <code>mpn_hamdist</code> are nice and small
diff -r 9f83d5200872 -r 4aaca500e158 gmp-h.in
--- a/gmp-h.in	Mon Dec 28 15:37:05 2009 +0100
+++ b/gmp-h.in	Mon Dec 28 16:33:49 2009 +0100
@@ -1571,9 +1571,14 @@
 #define mpn_sqr __MPN(sqr)
 __GMP_DECLSPEC void mpn_sqr __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
 
-#define mpn_neg_n __MPN(neg_n)
-#if __GMP_INLINE_PROTOTYPES || defined (__GMP_FORCE_mpn_neg_n)
-__GMP_DECLSPEC mp_limb_t mpn_neg_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#define mpn_neg __MPN(neg)
+#if __GMP_INLINE_PROTOTYPES || defined (__GMP_FORCE_mpn_neg)
+__GMP_DECLSPEC mp_limb_t mpn_neg __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+
+#define mpn_com __MPN(com)
+#if __GMP_INLINE_PROTOTYPES || defined (__GMP_FORCE_mpn_com)
+__GMP_DECLSPEC void mpn_com __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
 #endif
 
 #define mpn_perfect_square_p __MPN(perfect_square_p)
@@ -2158,12 +2163,12 @@
 }
 #endif
 
-#if defined (__GMP_EXTERN_INLINE) || defined (__GMP_FORCE_mpn_neg_n)
-#if ! defined (__GMP_FORCE_mpn_neg_n)
+#if defined (__GMP_EXTERN_INLINE) || defined (__GMP_FORCE_mpn_neg)
+#if ! defined (__GMP_FORCE_mpn_neg)
 __GMP_EXTERN_INLINE
 #endif
 mp_limb_t
-mpn_neg_n (mp_ptr __gmp_rp, mp_srcptr __gmp_up, mp_size_t __gmp_n)
+mpn_neg (mp_ptr __gmp_rp, mp_srcptr __gmp_up, mp_size_t __gmp_n)
 {
   mp_limb_t __gmp_ul, __gmp_cy;
   __gmp_cy = 0;
diff -r 9f83d5200872 -r 4aaca500e158 gmp-impl.h
--- a/gmp-impl.h	Mon Dec 28 15:37:05 2009 +0100
+++ b/gmp-impl.h	Mon Dec 28 16:33:49 2009 +0100
@@ -2084,11 +2084,9 @@
 #endif
 
 
-#if HAVE_NATIVE_mpn_com_n
-#define mpn_com_n __MPN(com_n)
-__GMP_DECLSPEC void    mpn_com_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
-#else
-#define mpn_com_n(d,s,n)                                \
+#if ! HAVE_NATIVE_mpn_com
+#undef mpn_com
+#define mpn_com(d,s,n)                                  \
   do {                                                  \
     mp_ptr     __d = (d);                               \
     mp_srcptr  __s = (s);                               \
diff -r 9f83d5200872 -r 4aaca500e158 mpn/Makefile.am
--- a/mpn/Makefile.am	Mon Dec 28 15:37:05 2009 +0100
+++ b/mpn/Makefile.am	Mon Dec 28 16:33:49 2009 +0100
@@ -35,7 +35,7 @@
   addmul_1.c addmul_2.c addmul_3.c addmul_4.c addmul_5.c addmul_6.c	    \
   addmul_7.c addmul_8.c							    \
   and_n.c andn_n.c							    \
-  cmp.c com_n.c copyd.c copyi.c						    \
+  cmp.c com.c copyd.c copyi.c						    \
   dive_1.c diveby3.c divis.c divrem.c divrem_1.c divrem_2.c		    \
   sbpi1_bdiv_qr.c sbpi1_bdiv_q.c					    \
   sbpi1_div_qr.c sbpi1_div_q.c sbpi1_divappr_q.c			    \
@@ -60,7 +60,7 @@
   toom_interpolate_5pts.c toom_interpolate_6pts.c toom_interpolate_7pts.c   \
   toom_interpolate_8pts.c toom_interpolate_12pts.c toom_interpolate_16pts.c \
   invertappr.c invert.c binvert.c mulmod_bnm1.c sqrmod_bnm1.c		    \
-  mullo_n.c mullo_basecase.c nand_n.c neg_n.c nior_n.c perfsqr.c	    \
+  mullo_n.c mullo_basecase.c nand_n.c neg.c nior_n.c perfsqr.c	    \
   popcount.c pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \
   rootrem.c scan0.c scan1.c set_str.c			    \
   sqr_basecase.c sqr_diagonal.c						    \
diff -r 9f83d5200872 -r 4aaca500e158 mpn/alpha/com.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/alpha/com.asm	Mon Dec 28 16:33:49 2009 +0100
@@ -0,0 +1,165 @@
+dnl  Alpha mpn_com -- mpn one's complement.
+
+dnl  Copyright 2003 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C      cycles/limb
+C EV4:    4.75
+C EV5:    2.0
+C EV6:    1.5
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
+C 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
+C will be 1.5+2/N c/l.
+C
+C 2 cycles of loop control are unavoidable, for pointer updates and the
+C taken branch bubble, but also since ldq cannot issue two cycles after stq
+C (and with a run of stqs that means neither of two cycles at the end of the
+C loop).
+C
+C The fbeq is forced into the second cycle of the loop using unops, since
+C the first time through it must wait for the cvtqt result.  Once that
+C result is ready (a 1 cycle stall) then both the branch and following loads
+C can issue together.
+C
+C The main loop handles an odd count of limbs, being two limbs loaded before
+C each size test, plus one pipelined around from the previous iteration (or
+C setup in the entry sequence).
+C
+C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
+C entry sequence, and an increment of the pointers.  For an odd size there's
+C no increment and the first store in the loop (r24) is a repeat of dst[0].
+C
+C Note that the load for r24 after the possible pointer increment is done
+C before the explicit store to dst[0], in case src==dst.
+
+
+ASM_START()
+
+FLOAT64(L(dat), 2.0)
+
+	ALIGN(16)
+
+PROLOGUE(mpn_com,gp)
+
+	C r16	dst
+	C r17	src
+	C r18	size
+
+	lda	r30, -16(r30)		C temporary stack space
+	lda	r7, -3(r18)		C size - 3
+
+	ldq	r20, 0(r17)		C src[0]
+	srl	r7, 1, r6		C (size-3)/2
+
+	stq	r6, 8(r30)		C (size-3)/2
+	and	r7, 1, r5		C 1 if size even
+
+	LEA(	r8, L(dat))
+	s8addq	r5, r17, r17		C skip src[0] if even
+
+	ornot	r31, r20, r20		C ~src[0]
+	unop
+
+	ldt	f0, 8(r30)		C (size-3)/2
+	ldq	r24, 0(r17)		C src[0 or 1]
+
+	stq	r20, 0(r16)		C dst[0]
+	s8addq	r5, r16, r19		C skip dst[0] if even
+
+	ldt	f1, 0(r8)		C data 2.0
+	lda	r30, 16(r30)		C restore stack
+	unop
+	cvtqt	f0, f0			C (size-3)/2 as float
+
+	ornot	r31, r24, r24
+	blt	r7, L(done_1)		C if size<=2
+	unop
+	unop
+
+
+	C 16-byte alignment here
+L(top):
+	C r17	src, incrementing
+	C r19	dst, incrementing
+	C r24	dst[i] result, ready to store
+	C f0	(size-3)/2, decrementing
+	C f1	2.0
+
+	ldq	r20, 8(r17)		C src[i+1]
+	ldq	r21, 16(r17)		C src[i+2]
+	unop
+	unop
+
+	fbeq	f0, L(done_2)
+	unop
+	ldq	r22, 24(r17)		C src[i+3]
+	ldq	r23, 32(r17)		C src[i+4]
+
+	stq	r24, 0(r19)		C dst[i]
+	ornot	r31, r20, r20
+	subt	f0, f1, f0		C count -= 2
+	unop
+
+	stq	r20, 8(r19)		C dst[i+1]
+	ornot	r31, r21, r21
+	unop
+	unop