[Gmp-commit] /home/hgfiles/gmp: New public functions mpn_com and mpn_neg. (Were internal and with _n suffix.)
mercurial at gmplib.org
mercurial at gmplib.org
Mon Dec 28 16:33:52 CET 2009
details: /home/hgfiles/gmp/rev/4aaca500e158
changeset: 13248:4aaca500e158
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Dec 28 16:33:49 2009 +0100
description:
New public functions mpn_com and mpn_neg. (Were internal and with _n suffix.)
diffstat:
ChangeLog | 6 +
configure.in | 6 +-
doc/gmp.texi | 10 ++
doc/tasks.html | 4 +-
gmp-h.in | 17 ++-
gmp-impl.h | 8 +-
mpn/Makefile.am | 4 +-
mpn/alpha/com.asm | 165 ++++++++++++++++++++++++++++++++++++++++++
mpn/alpha/com_n.asm | 165 ------------------------------------------
mpn/asm-defs.m4 | 4 +-
mpn/generic/binvert.c | 2 +-
mpn/generic/com.c | 34 ++++++++
mpn/generic/invert.c | 2 +-
mpn/generic/invertappr.c | 4 +-
mpn/generic/mul_fft.c | 4 +-
mpn/generic/neg.c | 23 +++++
mpn/generic/neg_n.c | 23 -----
mpn/powerpc32/750/com.asm | 68 +++++++++++++++++
mpn/powerpc32/750/com_n.asm | 68 -----------------
mpn/powerpc32/vmx/copyd.asm | 2 +-
mpn/powerpc32/vmx/copyi.asm | 2 +-
mpn/powerpc64/com.asm | 74 +++++++++++++++++++
mpn/powerpc64/com_n.asm | 74 -------------------
mpn/x86/k6/mmx/com.asm | 92 +++++++++++++++++++++++
mpn/x86/k6/mmx/com_n.asm | 92 -----------------------
mpn/x86/k7/mmx/com.asm | 114 +++++++++++++++++++++++++++++
mpn/x86/k7/mmx/com_n.asm | 114 -----------------------------
mpn/x86/pentium/com.asm | 170 ++++++++++++++++++++++++++++++++++++++++++++
mpn/x86/pentium/com_n.asm | 170 --------------------------------------------
mpn/x86_64/com.asm | 77 +++++++++++++++++++
mpn/x86_64/com_n.asm | 77 -------------------
mpz/aorsmul_i.c | 6 +-
mpz/cfdiv_r_2exp.c | 2 +-
tests/devel/try.c | 14 +-
tests/mpn/t-instrument.c | 6 +-
tests/mpz/bit.c | 2 +-
tests/refmpn.c | 16 +---
tests/refmpz.c | 4 +-
tests/tests.h | 4 +-
tune/common.c | 4 +-
tune/many.pl | 2 +-
tune/speed.c | 2 +-
tune/speed.h | 2 +-
43 files changed, 889 insertions(+), 850 deletions(-)
diffs (truncated from 2192 to 300 lines):
diff -r 9f83d5200872 -r 4aaca500e158 ChangeLog
--- a/ChangeLog Mon Dec 28 15:37:05 2009 +0100
+++ b/ChangeLog Mon Dec 28 16:33:49 2009 +0100
@@ -1,5 +1,11 @@
2009-12-28 Torbjorn Granlund <tege at gmplib.org>
+ * configure.in (gmp_mpn_functions_optional) Move "com" from here...
+ (gmp_mpn_functions): ...to here.
+ * mpn/generic/com.c: New file.
+ * (mpn_com): New name for mpn_com_n. Make public.
+ * (mpn_neg): Analogous changes.
+
* tune/tuneup.c (tune_mu_div, tune_mu_bdiv): Set step_factor.
* tune/common.c, tune/speed.c, tune/speed.h: Support measuring
diff -r 9f83d5200872 -r 4aaca500e158 configure.in
--- a/configure.in Mon Dec 28 15:37:05 2009 +0100
+++ b/configure.in Mon Dec 28 16:33:49 2009 +0100
@@ -2481,7 +2481,7 @@
# can optionally provide the latter as an extra entrypoint. Likewise
# divrem_1 and pre_divrem_1.
-gmp_mpn_functions_optional="umul udiv com_n \
+gmp_mpn_functions_optional="umul udiv \
invert_limb sqr_diagonal \
mul_2 mul_3 mul_4 \
addmul_2 addmul_3 addmul_4 addmul_5 addmul_6 addmul_7 addmul_8 \
@@ -2491,7 +2491,7 @@
add_n_sub_n addaddmul_1msb0"
gmp_mpn_functions="$extra_functions \
- add add_1 add_n sub sub_1 sub_n neg_n mul_1 addmul_1 \
+ add add_1 add_n sub sub_1 sub_n neg com mul_1 addmul_1 \
submul_1 lshift rshift dive_1 diveby3 divis divrem divrem_1 divrem_2 \
fib2_ui mod_1 mod_34lsub1 mode1o pre_divrem_1 pre_mod_1 dump \
mod_1_1 mod_1_2 mod_1_3 mod_1_4 lshiftc \
@@ -2997,7 +2997,7 @@
#undef HAVE_NATIVE_mpn_bdiv_dbm1c
#undef HAVE_NATIVE_mpn_bdiv_q_1
#undef HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#undef HAVE_NATIVE_mpn_com_n
+#undef HAVE_NATIVE_mpn_com
#undef HAVE_NATIVE_mpn_copyd
#undef HAVE_NATIVE_mpn_copyi
#undef HAVE_NATIVE_mpn_divexact_1
diff -r 9f83d5200872 -r 4aaca500e158 doc/gmp.texi
--- a/doc/gmp.texi Mon Dec 28 15:37:05 2009 +0100
+++ b/doc/gmp.texi Mon Dec 28 16:33:49 2009 +0100
@@ -5167,6 +5167,11 @@
@var{s2n}.
@end deftypefun
+@deftypefun mp_limb_t mpn_neg (mp_limb_t *@var{rp}, const mp_limb_t *@var{sp}, mp_size_t @var{n})
+Perform the negation of @{@var{sp}, @var{n}@}, and write the result to
+@{@var{rp}, @var{n}@}. Return carry-out.
+@end deftypefun
+
@deftypefun void mpn_mul_n (mp_limb_t *@var{rp}, const mp_limb_t *@var{s1p}, const mp_limb_t *@var{s2p}, mp_size_t @var{n})
Multiply @{@var{s1p}, @var{n}@} and @{@var{s2p}, @var{n}@}, and write the
2*@var{n}-limb result to @var{rp}.
@@ -5525,6 +5530,11 @@
@{@var{rp}, @var{n}@}.
@end deftypefun
+@deftypefun void mpn_com (mp_limb_t *@var{rp}, const mp_limb_t *@var{sp}, mp_size_t @var{n})
+Perform the bitwise complement of @{@var{sp}, @var{n}@}, and write the result
+to @{@var{rp}, @var{n}@}.
+@end deftypefun
+
@deftypefun void mpn_copyi (mp_limb_t *@var{rp}, const mp_limb_t *@var{s1p}, mp_size_t @var{n})
Copy from @{@var{s1p}, @var{n}@} to @{@var{rp}, @var{n}@}, increasingly.
@end deftypefun
diff -r 9f83d5200872 -r 4aaca500e158 doc/tasks.html
--- a/doc/tasks.html Mon Dec 28 15:37:05 2009 +0100
+++ b/doc/tasks.html Mon Dec 28 16:33:49 2009 +0100
@@ -37,7 +37,7 @@
<hr>
<!-- NB. timestamp updated automatically by emacs -->
- This file current as of 28 Nov 2009. An up-to-date version is available at
+ This file current as of 28 Dec 2009. An up-to-date version is available at
<a href="http://gmplib.org/tasks.html">http://gmplib.org/tasks.html</a>.
Please send comments about this page to gmp-devel<font>@</font>gmplib.org.
@@ -436,7 +436,7 @@
<code>mpn_rshift</code> already provided.
<li> Cray T3E: Experiment with optimization options. In particular,
-hpipeline3 seems promising. We should at least up -O to -O2 or -O3.
-<li> Cray: <code>mpn_com_n</code> and <code>mpn_and_n</code> etc very probably
+<li> Cray: <code>mpn_com</code> and <code>mpn_and_n</code> etc very probably
wants a pragma like <code>MPN_COPY_INCR</code>.
<li> Cray vector systems: <code>mpn_lshift</code>, <code>mpn_rshift</code>,
<code>mpn_popcount</code> and <code>mpn_hamdist</code> are nice and small
diff -r 9f83d5200872 -r 4aaca500e158 gmp-h.in
--- a/gmp-h.in Mon Dec 28 15:37:05 2009 +0100
+++ b/gmp-h.in Mon Dec 28 16:33:49 2009 +0100
@@ -1571,9 +1571,14 @@
#define mpn_sqr __MPN(sqr)
__GMP_DECLSPEC void mpn_sqr __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
-#define mpn_neg_n __MPN(neg_n)
-#if __GMP_INLINE_PROTOTYPES || defined (__GMP_FORCE_mpn_neg_n)
-__GMP_DECLSPEC mp_limb_t mpn_neg_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#define mpn_neg __MPN(neg)
+#if __GMP_INLINE_PROTOTYPES || defined (__GMP_FORCE_mpn_neg)
+__GMP_DECLSPEC mp_limb_t mpn_neg __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+
+#define mpn_com __MPN(com)
+#if __GMP_INLINE_PROTOTYPES || defined (__GMP_FORCE_mpn_com)
+__GMP_DECLSPEC void mpn_com __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
#endif
#define mpn_perfect_square_p __MPN(perfect_square_p)
@@ -2158,12 +2163,12 @@
}
#endif
-#if defined (__GMP_EXTERN_INLINE) || defined (__GMP_FORCE_mpn_neg_n)
-#if ! defined (__GMP_FORCE_mpn_neg_n)
+#if defined (__GMP_EXTERN_INLINE) || defined (__GMP_FORCE_mpn_neg)
+#if ! defined (__GMP_FORCE_mpn_neg)
__GMP_EXTERN_INLINE
#endif
mp_limb_t
-mpn_neg_n (mp_ptr __gmp_rp, mp_srcptr __gmp_up, mp_size_t __gmp_n)
+mpn_neg (mp_ptr __gmp_rp, mp_srcptr __gmp_up, mp_size_t __gmp_n)
{
mp_limb_t __gmp_ul, __gmp_cy;
__gmp_cy = 0;
diff -r 9f83d5200872 -r 4aaca500e158 gmp-impl.h
--- a/gmp-impl.h Mon Dec 28 15:37:05 2009 +0100
+++ b/gmp-impl.h Mon Dec 28 16:33:49 2009 +0100
@@ -2084,11 +2084,9 @@
#endif
-#if HAVE_NATIVE_mpn_com_n
-#define mpn_com_n __MPN(com_n)
-__GMP_DECLSPEC void mpn_com_n __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
-#else
-#define mpn_com_n(d,s,n) \
+#if ! HAVE_NATIVE_mpn_com
+#undef mpn_com
+#define mpn_com(d,s,n) \
do { \
mp_ptr __d = (d); \
mp_srcptr __s = (s); \
diff -r 9f83d5200872 -r 4aaca500e158 mpn/Makefile.am
--- a/mpn/Makefile.am Mon Dec 28 15:37:05 2009 +0100
+++ b/mpn/Makefile.am Mon Dec 28 16:33:49 2009 +0100
@@ -35,7 +35,7 @@
addmul_1.c addmul_2.c addmul_3.c addmul_4.c addmul_5.c addmul_6.c \
addmul_7.c addmul_8.c \
and_n.c andn_n.c \
- cmp.c com_n.c copyd.c copyi.c \
+ cmp.c com.c copyd.c copyi.c \
dive_1.c diveby3.c divis.c divrem.c divrem_1.c divrem_2.c \
sbpi1_bdiv_qr.c sbpi1_bdiv_q.c \
sbpi1_div_qr.c sbpi1_div_q.c sbpi1_divappr_q.c \
@@ -60,7 +60,7 @@
toom_interpolate_5pts.c toom_interpolate_6pts.c toom_interpolate_7pts.c \
toom_interpolate_8pts.c toom_interpolate_12pts.c toom_interpolate_16pts.c \
invertappr.c invert.c binvert.c mulmod_bnm1.c sqrmod_bnm1.c \
- mullo_n.c mullo_basecase.c nand_n.c neg_n.c nior_n.c perfsqr.c \
+ mullo_n.c mullo_basecase.c nand_n.c neg.c nior_n.c perfsqr.c \
popcount.c pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \
rootrem.c scan0.c scan1.c set_str.c \
sqr_basecase.c sqr_diagonal.c \
diff -r 9f83d5200872 -r 4aaca500e158 mpn/alpha/com.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/alpha/com.asm Mon Dec 28 16:33:49 2009 +0100
@@ -0,0 +1,165 @@
+dnl Alpha mpn_com -- mpn one's complement.
+
+dnl Copyright 2003 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C EV4: 4.75
+C EV5: 2.0
+C EV6: 1.5
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
+C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop
+C will be 1.5+2/N c/l.
+C
+C 2 cycles of loop control are unavoidable, for pointer updates and the
+C taken branch bubble, but also since ldq cannot issue two cycles after stq
+C (and with a run of stqs that means neither of two cycles at the end of the
+C loop).
+C
+C The fbeq is forced into the second cycle of the loop using unops, since
+C the first time through it must wait for the cvtqt result. Once that
+C result is ready (a 1 cycle stall) then both the branch and following loads
+C can issue together.
+C
+C The main loop handles an odd count of limbs, being two limbs loaded before
+C each size test, plus one pipelined around from the previous iteration (or
+C setup in the entry sequence).
+C
+C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
+C entry sequence, and an increment of the pointers. For an odd size there's
+C no increment and the first store in the loop (r24) is a repeat of dst[0].
+C
+C Note that the load for r24 after the possible pointer increment is done
+C before the explicit store to dst[0], in case src==dst.
+
+
+ASM_START()
+
+FLOAT64(L(dat), 2.0)
+
+ ALIGN(16)
+
+PROLOGUE(mpn_com,gp)
+
+ C r16 dst
+ C r17 src
+ C r18 size
+
+ lda r30, -16(r30) C temporary stack space
+ lda r7, -3(r18) C size - 3
+
+ ldq r20, 0(r17) C src[0]
+ srl r7, 1, r6 C (size-3)/2
+
+ stq r6, 8(r30) C (size-3)/2
+ and r7, 1, r5 C 1 if size even
+
+ LEA( r8, L(dat))
+ s8addq r5, r17, r17 C skip src[0] if even
+
+ ornot r31, r20, r20 C ~src[0]
+ unop
+
+ ldt f0, 8(r30) C (size-3)/2
+ ldq r24, 0(r17) C src[0 or 1]
+
+ stq r20, 0(r16) C dst[0]
+ s8addq r5, r16, r19 C skip dst[0] if even
+
+ ldt f1, 0(r8) C data 2.0
+ lda r30, 16(r30) C restore stack
+ unop
+ cvtqt f0, f0 C (size-3)/2 as float
+
+ ornot r31, r24, r24
+ blt r7, L(done_1) C if size<=2
+ unop
+ unop
+
+
+ C 16-byte alignment here
+L(top):
+ C r17 src, incrementing
+ C r19 dst, incrementing
+ C r24 dst[i] result, ready to store
+ C f0 (size-3)/2, decrementing
+ C f1 2.0
+
+ ldq r20, 8(r17) C src[i+1]
+ ldq r21, 16(r17) C src[i+2]
+ unop
+ unop
+
+ fbeq f0, L(done_2)
+ unop
+ ldq r22, 24(r17) C src[i+3]
+ ldq r23, 32(r17) C src[i+4]
+
+ stq r24, 0(r19) C dst[i]
+ ornot r31, r20, r20
+ subt f0, f1, f0 C count -= 2
+ unop
+
+ stq r20, 8(r19) C dst[i+1]
+ ornot r31, r21, r21
+ unop
+ unop
More information about the gmp-commit
mailing list