gmp
view mpn/generic/toom_interpolate_5pts.c @ 12619:79e81acfb9b3
Merged changes to toom4_sqr and toom44_mul and toom_eval_dgr3_pm1.
| author | Niels M?ller <nisse@lysator.liu.se> |
|---|---|
| date | Mon, 08 Jun 2009 21:13:58 +0200 |
| parents | fcdc521a539e |
| children | 1d026ed9f9bf |
line source
1 /* mpn_toom_interpolate_5pts -- Interpolate for toom3, 33, 42.
3 Contributed to the GNU project by Robert Harley.
4 Improvements by Paul Zimmermann and Marco Bodrato.
6 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
7 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
8 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10 Copyright 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2009 Free Software
11 Foundation, Inc.
13 This file is part of the GNU MP Library.
15 The GNU MP Library is free software; you can redistribute it and/or modify it
16 under the terms of the GNU Lesser General Public License as published by the
17 Free Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
20 The GNU MP Library is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
23 for more details.
25 You should have received a copy of the GNU Lesser General Public License
26 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
28 #include "gmp.h"
29 #include "gmp-impl.h"
31 void
32 mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
33 mp_size_t k, mp_size_t twor, int sa,
34 mp_limb_t vinf0)
35 {
36 mp_limb_t cy, saved;
37 mp_size_t twok = k + k;
38 mp_size_t kk1 = twok + 1;
39 mp_ptr c1, v1, c3, vinf;
41 c1 = c + k;
42 v1 = c1 + k;
43 c3 = v1 + k;
44 vinf = c3 + k;
46 #define v0 (c)
47 /* (1) v2 <- v2-vm1 < v2+|vm1|, (16 8 4 2 1) - (1 -1 1 -1 1) =
48 thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k) (15 9 3 3 0)
49 */
50 if (sa <= 0)
51 ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
52 else
53 ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));
55 /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
56 v0 v1 hi(vinf) |vm1| v2-vm1 EMPTY */
58 ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1)); /* v2 <- v2 / 3 */
59 /* (5 3 1 1 0)*/
61 /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
62 v0 v1 hi(vinf) |vm1| (v2-vm1)/3 EMPTY */
64 /* (2) vm1 <- tm1 := (v1 - sa*vm1) / 2 [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
65 tm1 >= 0 (0 1 0 1 0)
66 No carry comes out from {v1, kk1} +/- {vm1, kk1},
67 and the division by two is exact */
68 if (sa <= 0)
69 {
70 #ifdef HAVE_NATIVE_mpn_rsh1add_n
71 mpn_rsh1add_n (vm1, v1, vm1, kk1);
72 #else
73 ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
74 ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
75 #endif
76 }
77 else
78 {
79 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
80 mpn_rsh1sub_n (vm1, v1, vm1, kk1);
81 #else
82 ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
83 ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
84 #endif
85 }
87 /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
88 v0 v1 hi(vinf) tm1 (v2-vm1)/3 EMPTY */
90 /* (3) v1 <- t1 := v1 - v0 (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
91 t1 >= 0
92 */
93 vinf[0] -= mpn_sub_n (v1, v1, c, twok);
95 /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
96 v0 v1-v0 hi(vinf) tm1 (v2-vm1)/3 EMPTY */
98 /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
99 t2 >= 0 [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
100 */
101 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
102 mpn_rsh1sub_n (v2, v2, v1, kk1);
103 #else
104 ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
105 ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
106 #endif
108 /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
109 v0 v1-v0 hi(vinf) tm1 (v2-vm1-3t1)/6 EMPTY */
111 /* (5) v1 <- t1-tm1 (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
112 result is v1 >= 0
113 */
114 ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));
116 /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
117 cy = mpn_add_n (c1, c1, vm1, kk1);
118 MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
119 /* Memory allocated for vm1 is now free, it can be recycled ...*/
121 /* (6) v2 <- v2 - 2*vinf, (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
122 result is v2 >= 0 */
123 saved = vinf[0]; /* Remember v1's highest byte (will be overwritten). */
124 vinf[0] = vinf0; /* Set the right value for vinf0 */
125 #ifdef HAVE_NATIVE_mpn_sublsh1_n
126 cy = mpn_sublsh1_n (v2, v2, vinf, twor);
127 #else
128 /* Overwrite unused vm1 */
129 cy = mpn_lshift (vm1, vinf, twor, 1);
130 cy += mpn_sub_n (v2, v2, vm1, twor);
131 #endif
132 MPN_DECR_U (v2 + twor, kk1 - twor, cy);
134 /* Current matrix is
135 [1 0 0 0 0; vinf
136 0 1 0 0 0; v2
137 1 0 1 0 0; v1
138 0 1 0 1 0; vm1
139 0 0 0 0 1] v0
140 Some vaues already are in-place (we added vm1 in the correct position)
141 | vinf| v1 | v0 |
142 | vm1 |
143 One still is in a separated area
144 | +v2 |
145 We have to compute v1-=vinf; vm1 -= v2,
146 |-vinf|
147 | -v2 |
148 Carefully reordering operations we can avoid to compute twice the sum
149 of the high half of v2 plus the low half of vinf.
150 */
152 /* Add the high half of t2 in {vinf} */
153 if ( LIKELY(twor > k + 1) ) { /* This is the expected flow */
154 cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
155 MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
156 } else { /* triggered only by very unbalanced cases like
157 (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
158 ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
159 }
160 /* (7) v1 <- v1 - vinf, (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
161 result is >= 0 */
162 /* Side effect: we also subtracted (high half) vm1 -= v2 */
163 cy = mpn_sub_n (v1, v1, vinf, twor); /* vinf is at most twor long. */
164 vinf0 = vinf[0]; /* Save again the right value for vinf0 */
165 vinf[0] = saved;
166 MPN_DECR_U (v1 + twor, kk1 - twor, cy); /* Treat the last bytes. */
168 /* (8) vm1 <- vm1-v2 (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
169 Operate only on the low half.
170 */
171 cy = mpn_sub_n (c1, c1, v2, k);
172 MPN_DECR_U (v1, kk1, cy);
174 /********************* Beginning the final phase **********************/
176 /* Most of the recomposition was done */
178 /* add t2 in {c+3k, ...}, but only the low half */
179 cy = mpn_add_n (c3, c3, v2, k);
180 vinf[0] += cy;
181 ASSERT(vinf[0] >= cy); /* No carry */
182 MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */
184 #undef v0
185 }
