mul speed

Jason Moxham j.l.moxham@maths.soton.ac.uk
Tue, 29 Jul 2003 17:10:04 +0100


--Boundary-00=_cxpJ/3T+eD/tw6+
Content-Type: text/plain;
  charset="us-ascii"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline


attached is a preliminary version of a 
mpn_mulmod_2exp_m1
ie calculating {z,n}= {x,n}*{y,n} mod 2^k-1
where n=ceil(k/GMP_NUMB_BITS)

and some timings compairing it to 
mpn_mul_n(z,x,y,n)
mpn_mulhighhalf(z,x,y,n); // {z,n}= floor({x,n}*{y,n}/2^k)
mpn_mullowhalf(z,x,y,n); // {z,n}= {x,n}*{y,n} mod 2^k

eg
n         mul		high		low 		mod 2^k-1
1097  1.0000       1.217      1.2222     1.7144

so mulmod_2exp_m1 is 1.71x faster than a mul

and a barrett division could be done in a time
2      for mpn_mul+mpn_mul
1.64 for mpn_high+mpn_low
1.41 for mpn_high+mpn_mulmod_2exp_m1

possible improvements include "unrolling the recursion" , using FFT for 
mul_mod_2exp_p1 part , using a new mpn_addlshift/mpn_addrshift fn's

--Boundary-00=_cxpJ/3T+eD/tw6+
Content-Type: text/plain;
  charset="us-ascii";
  name="times"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="times"

n  mul      mulhigh  mullow   mulmod_2exp_m1
2 1.000000 0.588235 1.428571 0.416667
3 1.000000 0.650000 0.866667 0.074713
4 1.000000 0.680000 0.772727 0.151786
5 1.000000 0.766667 0.884615 0.073248
6 1.000000 0.769231 0.909091 0.112782
7 1.000000 0.734694 0.923077 0.108761
8 1.000000 0.793103 0.978723 0.211009
9 1.000000 0.850746 1.096154 0.114000
10 1.000000 0.881579 1.015152 0.152968
11 1.000000 0.929412 1.053333 0.153101
12 1.000000 0.918367 1.058824 0.231959
13 1.000000 0.944444 1.062500 0.189591
14 1.000000 0.966667 1.094340 0.244211
15 1.000000 0.984848 1.101695 0.234657
16 1.000000 1.006944 1.115385 0.403900
17 1.000000 1.038462 1.125000 0.200993
18 1.000000 1.047337 1.141935 0.262611
19 1.000000 1.060109 1.147929 0.259705
20 1.000000 1.091837 1.175824 0.351974
21 1.000000 1.094340 1.183673 0.305665
22 1.000000 1.118943 1.198113 0.365994
23 1.000000 1.132231 1.212389 0.346835
24 1.000000 1.135135 1.219917 0.485950
25 1.000000 1.152727 1.233463 0.383777
26 1.000000 1.133562 1.203636 0.437831
27 1.000000 1.172078 1.240550 0.429251
28 1.000000 1.140673 1.211039 0.530583
29 1.000000 1.171014 1.246914 0.465438
30 1.000000 1.134986 1.167139 0.513076
31 1.000000 1.175393 1.240331 0.502237
32 1.000000 1.145000 1.208443 0.720126
33 1.000000 1.188095 1.250627 0.459062
34 1.000000 1.151927 1.215311 0.482431
35 1.000000 1.188720 1.248292 0.489286
36 1.000000 1.148760 1.216630 0.576763
37 1.000000 1.177866 1.239085 0.529778
38 1.000000 1.145833 1.202783 0.553522
39 1.000000 1.189091 1.245714 0.554237
40 1.000000 1.163763 1.221207 0.706878
41 1.000000 1.191275 1.250000 0.577705
42 1.000000 1.179032 1.232715 0.645760
43 1.000000 1.207812 1.254870 0.617412
44 1.000000 1.198485 1.234009 0.721057
45 1.000000 1.231778 1.270677 0.663786
46 1.000000 1.201418 0.018295 0.681965
47 1.000000 1.202957 1.255259 0.690054
48 1.000000 1.192408 1.256552 0.901088
49 1.000000 1.237975 1.307487 0.725519
50 1.000000 1.208128 1.260925 0.769412
51 1.000000 1.233254 1.287141 0.733286
52 1.000000 1.220809 1.278450 0.862041
53 1.000000 1.259798 1.326651 0.784519
54 1.000000 1.262061 1.318442 0.850702
55 1.000000 1.242553 1.299221 0.815073
56 1.000000 1.197949 1.262703 0.950366
57 1.000000 1.245226 1.306962 0.810864
58 1.000000 1.236559 1.272636 0.888967
59 1.000000 1.221163 1.267062 0.852295
60 1.000000 1.197588 1.230696 0.953471
61 1.000000 1.245009 1.288263 0.876677
62 1.000000 1.229825 1.275705 0.941572
63 1.000000 1.220034 1.272321 0.893977
64 1.000000 0.011788 1.228719 1.178071
65 1.000000 1.230333 1.279089 0.900297
66 1.000000 1.238510 1.282199 0.891614
67 1.000000 1.219985 1.267096 0.916764
68 1.000000 1.209690 1.251370 0.953461
69 1.000000 1.245731 1.273141 0.942697
70 1.000000 1.227273 1.273204 0.924959
71 1.000000 1.228491 1.265069 0.981962
72 1.000000 1.206634 1.233922 1.033136
73 1.000000 1.237127 1.271588 0.979088
74 1.000000 1.375491 1.426920 1.090390
75 1.000000 1.207717 1.234714 1.005892
76 1.000000 1.194199 1.234681 1.028789
77 1.000000 1.230486 1.289118 1.033557
78 1.000000 1.220217 1.265918 1.018072
79 1.000000 1.236277 1.270386 1.054453
80 1.000000 1.216706 1.261660 1.168256
81 1.000000 1.250000 1.293103 1.056338
82 1.000000 0.030744 1.281486 1.048931
83 1.000000 1.241091 1.267918 1.076292
84 1.000000 1.229258 1.268018 1.110454
85 1.000000 1.266523 1.299338 1.102948
86 1.000000 1.270285 1.308193 1.093920
87 1.000000 1.254771 1.295527 1.124827
88 1.000000 1.238240 1.284365 1.195312
89 1.000000 1.273774 0.119114 1.135147
90 1.000000 1.262849 1.300403 1.121739
91 1.000000 1.277751 1.301728 1.176261
92 1.000000 1.254762 1.285993 1.183738
93 1.000000 1.269411 1.297323 1.154402
94 1.000000 1.269710 1.272055 1.160067
95 1.000000 1.241087 1.292343 1.184602
96 1.000000 1.249228 1.285520 1.347930
97 1.000000 1.259740 1.299687 1.151108
98 1.000000 1.263113 1.300834 1.181021
99 1.000000 1.244408 1.297064 1.212268
100 1.000000 1.254132 1.289295 1.230239
101 1.000000 1.277551 1.305799 1.216952
102 1.000000 1.267886 1.316550 1.218487
103 1.000000 1.279651 1.312449 1.230359
104 1.000000 1.273083 1.302121 1.318476
105 1.000000 1.321140 1.352918 1.255397
106 1.000000 1.304943 1.323053 1.236311
107 1.000000 1.298246 1.336664 1.275394
108 1.000000 1.297967 1.332448 1.294139
109 1.000000 1.286279 1.334700 1.276194
110 1.000000 1.292998 1.326335 1.249046
111 1.000000 1.258591 1.313768 1.303379
112 1.000000 1.249051 1.280410 1.374715
113 1.000000 1.266330 1.309540 1.280123
114 1.000000 1.280590 1.295354 1.284465
115 1.000000 1.283355 1.316396 1.313725
116 1.000000 1.269831 1.301133 1.352493
117 1.000000 1.291009 1.320369 1.336224
118 1.000000 1.250787 1.293103 1.309720
119 1.000000 1.255092 1.272236 1.339913
120 1.000000 1.235820 1.269072 1.403220
121 1.000000 1.260553 1.296782 1.327046
122 1.000000 1.281974 1.309963 1.339623
123 1.000000 1.271842 1.313720 1.357165
124 1.000000 1.242861 1.284735 1.382419
125 1.000000 1.254608 1.269231 1.359551
126 1.000000 1.251989 1.278132 1.352255
127 1.000000 1.243889 1.269209 1.365936
128 1.000000 1.214680 1.246319 1.541317
129 1.000000 1.260477 1.280824 1.360733
130 1.000000 1.279686 1.305464 1.378898
131 1.000000 1.277986 1.304895 1.396138
132 1.000000 1.267702 1.300918 1.360836
133 1.000000 1.246324 1.285790 1.384131
134 1.000000 1.245013 1.280379 1.389269
135 1.000000 1.243298 1.274896 1.406178
136 1.000000 1.228922 1.260780 1.399829
137 1.000000 1.257538 1.292026 1.398571
138 1.000000 1.239711 1.294546 1.389159
139 1.000000 1.245220 1.283567 1.424276
140 1.000000 1.234795 1.277561 1.385348
141 1.000000 1.237839 1.272968 1.432089
142 1.000000 1.228018 1.265398 1.423869
143 1.000000 1.209281 1.261278 1.442235
144 1.000000 1.197719 1.237864 1.452794
145 1.000000 1.222296 1.256837 1.411569
146 1.000000 1.247901 1.273219 1.441552
147 1.000000 1.242966 1.263076 1.454197
148 1.000000 1.232779 1.260265 1.425468
149 1.000000 1.223637 1.256975 1.451183
150 1.000000 1.223941 1.255324 1.429597
151 1.000000 1.220105 1.235107 1.460150
152 1.000000 1.207211 1.220360 1.479431
153 1.000000 1.230031 1.263854 1.451893
154 1.000000 1.239417 1.272935 1.457252
155 1.000000 1.250854 1.274683 1.492684
156 1.000000 1.245520 1.268763 1.464528
157 1.000000 1.241773 1.271333 1.470369
158 1.000000 1.251670 1.260586 1.488551
159 1.000000 1.240597 1.266878 1.493349
160 1.000000 1.220670 1.254771 1.543936
161 1.000000 1.253706 1.272079 1.484473
162 1.000000 1.249062 1.276371 1.471043
163 1.000000 1.261155 1.264711 1.501562
164 1.000000 1.245886 1.279476 1.478499
165 1.000000 1.267085 1.287296 1.510680
166 1.000000 1.249954 1.279409 1.489440
167 1.000000 1.248866 1.261220 1.507554
168 1.000000 1.229234 1.261617 1.519947
169 1.000000 1.266714 1.288420 1.512386
170 1.000000 1.269339 1.284240 1.521281
171 1.000000 1.265522 1.294942 1.519575
172 1.000000 1.266897 1.285789 1.511446
173 1.000000 1.267955 1.284205 1.534561
174 1.000000 1.262018 1.289965 1.532580
175 1.000000 1.265757 1.285470 1.539455
176 1.000000 1.257723 1.268801 1.576389
177 1.000000 1.249752 1.273324 1.491910
178 1.000000 1.256837 1.267486 1.506814
179 1.000000 1.238056 1.241102 1.481982
180 1.000000 1.229355 1.226783 1.483457
181 1.000000 1.235093 1.280136 1.547902
182 1.000000 1.256256 1.272173 1.548009
183 1.000000 1.247439 1.264579 1.533320
184 1.000000 1.252301 1.271949 1.562086
185 1.000000 1.246823 1.280749 1.527767
186 1.000000 1.232502 1.254694 1.524210
187 1.000000 1.253472 1.272295 1.550224
188 1.000000 1.231457 1.246871 1.529412
189 1.000000 1.232948 1.252259 1.541242
190 1.000000 1.216733 1.236366 1.528163
191 1.000000 1.220871 1.234623 1.526200
192 1.000000 1.198117 1.218051 1.588213
193 1.000000 1.250824 1.274573 1.545696
194 1.000000 1.233739 1.256661 1.536633
195 1.000000 1.249295 1.265904 1.558156
196 1.000000 1.246543 1.269959 1.560413
197 1.000000 1.250349 1.262150 1.548030
198 1.000000 1.257408 1.273454 1.565420
199 1.000000 1.246224 1.268698 1.566005
200 1.000000 1.240000 1.256757 1.500000
220 1.000000 1.204545 1.232558 1.452055
242 1.000000 1.213592 1.237624 1.506024
266 1.000000 1.230769 1.263158 1.548387
292 1.000000 1.229630 1.248120 1.551402
321 1.000000 1.279221 1.296053 1.628099
353 1.000000 1.234637 1.241573 1.613139
388 1.000000 1.230769 1.236715 1.630573
426 1.000000 1.105263 1.209877 1.651685
468 1.000000 1.217857 1.222222 1.705000
514 1.000000 1.451411 1.451411 1.953586
565 1.000000 1.205882 1.209115 1.695489
621 1.000000 1.217799 1.223529 1.716172
683 1.000000 1.172691 1.175050 1.493606
751 1.000000 1.181501 1.183566 1.675743
826 1.000000 1.186082 1.193303 1.686022
908 1.000000 1.214660 1.222661 1.774379
998 1.000000 1.209091 1.218786 1.758678
1097 1.000000 1.217434 1.222222 1.714483
1206 1.000000 1.203046 1.205085 1.744785
1326 1.000000 1.188584 1.208670 1.735232
1458 1.000000 1.220166 1.227214 1.742935
1603 1.000000 1.222590 1.225290 1.736307
1763 1.000000 1.215996 1.217746 1.729564
1939 1.000000 1.167699 1.166735 1.632217
2132 1.000000 1.205692 1.207431 1.700711
2345 1.000000 1.214308 1.208620 1.639831
2579 1.000000 1.205866 1.209150 1.666667
2836 1.000000 1.266509 1.259972 1.735617
3119 1.000000 1.228391 1.233660 1.631110
3430 1.000000 1.205929 1.212170 1.502211
3773 1.000000 1.216433 1.215698 1.649396
4150 1.000000 1.227382 1.232630 1.659173
4565 1.000000 1.236473 1.238626 1.659978
5021 1.000000 1.198976 1.212109 1.619478
5523 1.000000 1.209964 1.225144 1.634790
6075 1.000000 1.175139 1.184656 1.615138
6682 1.000000 1.197656 1.202585 1.664056
7350 1.000000 1.195607 1.192717 1.674015
8085 1.000000 1.171926 1.184158 1.648815
8893 1.000000 1.200630 1.200336 1.686113
9782 1.000000 1.340563 1.347656 1.929432
10760 1.000000 1.305261 1.311359 1.910029
11836 1.000000 1.171663 1.181283 1.713657
13019 1.000000 1.101954 1.103534 1.759975
14320 1.000000 1.001331 1.170679 1.826989
15752 1.000000 0.998925 0.963461 1.572146
17327 1.000000 1.002155 1.003688 1.537980
19059 1.000000 1.000239 1.000418 1.455052
20964 1.000000 0.985597 1.001937 1.453793
23060 1.000000 1.002124 1.001004 1.437382
25366 1.000000 0.999515 1.001116 1.358188
27902 1.000000 0.999086 1.001197 1.396673
30692 1.000000 1.000415 1.001546 1.313564
33761 1.000000 0.994741 1.000999 1.446909
37137 1.000000 1.000055 0.999531 1.268196
40850 1.000000 1.001057 1.001719 1.377347
44935 1.000000 1.003231 1.002772 1.207230
49428 1.000000 0.999151 1.000195 1.320900

--Boundary-00=_cxpJ/3T+eD/tw6+
Content-Type: text/x-csrc;
  charset="us-ascii";
  name="mulmod_2exp.c"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="mulmod_2exp.c"


/*

if GMP_NUMB_BITS==32 then the "first" split for will use shifts of 16 bits
and the "second" split will use shift of 8 bits , this could be done by just 
moving memory rather than rshifts or by adjusting pointers(if mp_limb_t can 
have a alignment different from sizeof(mp_limb_t))
or we can just handle the unaligned pieces at both ends so that the middle part
of the operands is still aligned
*/

#define WANT_ASSERT	1

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include "gmp_plus.h"
#include "gmp-impl.h"


//#define __GMP_UNLIKELY(x) (x)

// let m=ceil( k/GMP_NUMB_BITS)=floor( (k-1)/GMP_NUMB_BITS)+1
// let n=ceil(2k/GMP_NUMB_BITS)=floor((2k-1)/GMP_NUMB_BITS)+1
// {z,m}= ({y,n} % 2^(2k) ) % 2^k-1   not fully reduced ie 0 <= {z,m} <= 2^k-1
// requires tmp space of n-m+1 limbs and we know that m <= n-m+1 <= m+1
void	mpn_mod_2exp_m1(mp_ptr z,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mp_limb_t c,mask;mp_size_t m,n,s;

ASSERT(k>0);
m=(k-1)/GMP_NUMB_BITS+1;
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y,m));
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y+m,m));
s=k%GMP_NUMB_BITS;
if(s==0)
  {ASSERT_MPN(y,m);ASSERT_MPN(y+m,m);
   c=mpn_add_n(z,y,y+m,m);
   ASSERT_NOCARRY(mpn_add_1(z,z,m,c));
   return;}
//ASSERT_TEMP_SPACE(tmp,m+1);
n=(2*k-1)/GMP_NUMB_BITS+1;
ASSERT_MPN(y,n);ASSERT(!MPN_OVERLAP_P(tmp,n-m+1,y,n));
mpn_rshift(tmp,y+m-1,n-m+1,s);// could use a mpn_addlshift or mpn_addrshift to save the tmp
c=0;if(m!=1)c=mpn_add_n(z,y,tmp,m-1);
mask=(((mp_limb_t)1)<<s)-1;
c=(y[m-1]&mask)+(tmp[m-1]&mask)+c;
// if we can assume {y,n}<2^(2k) then can replace (tmp[m-]1&mask) with tmp[m-1] in above line
z[m-1]=c&mask;
if(z[m-1]!=c)ASSERT_NOCARRY(mpn_add_1(z,z,m,1));
ASSERT((z[m-1]&mask)==z[m-1]);
ASSERT_MPN(z,m);
return;}

// let m=ceil( k/GMP_NUMB_BITS)=floor( (k-1)/GMP_NUMB_BITS)+1
// let n=ceil(2k/GMP_NUMB_BITS)=floor((2k-1)/GMP_NUMB_BITS)+1
// {z,m}= ({y,n} % 2^(2k) ) % 2^k+1   fully reduced ie 0 <= {z,m} <= 2^k 
// NOTE: return value is true iff {z,m}=2^k
// requires tmp space of n-m+1 limbs and we know that m <= n-m+1 <= m+1
mp_limb_t	mpn_mod_2exp_p1(mp_ptr z,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mp_limb_t c,mask;mp_size_t m,n,s;

ASSERT(k>0);
m=(k-1)/GMP_NUMB_BITS+1;
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y,m));
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y+m,m));
s=k%GMP_NUMB_BITS;
if(s==0)
  {ASSERT_MPN(y,m);ASSERT_MPN(y+m,m);
   c=mpn_sub_n(z,y,y+m,m);
   return mpn_add_1(z,z,m,c);}
//ASSERT_TEMP_SPACE(tmp,m+1);
n=(2*k-1)/GMP_NUMB_BITS+1;
ASSERT_MPN(y,n);ASSERT(!MPN_OVERLAP_P(tmp,n-m+1,y,n));
mpn_rshift(tmp,y+m-1,n-m+1,s);// could use a mpn_sublshift or mpn_subrshift to save the tmp
c=0;if(m!=1)c=mpn_sub_n(z,y,tmp,m-1);
mask=(((mp_limb_t)1)<<s)-1;
c=(y[m-1]&mask)-(tmp[m-1]&mask)-c;
// if we can assume {y,n}<2^(2k) then can replace (tmp[m-]1&mask) with tmp[m-1] in above line
z[m-1]=c&mask;
if(z[m-1]!=c)ASSERT_NOCARRY(mpn_add_1(z,z,m,1));
ASSERT_MPN(z,m);
return  (z[m-1]&mask)!=z[m-1];}

// let m=ceil( k/GMP_NUMB_BITS)=floor( (k-1)/GMP_NUMB_BITS)+1
// let n=ceil(2k/GMP_NUMB_BITS)=floor((2k-1)/GMP_NUMB_BITS)+1
// requires tmp space of n-m+1 limbs and we know that m <= n-m+1 <= m+1
// {z,m}= {x,m}*{y,m} % 2^(2k) % 2^k-1  not fully reduced
// z requires 2*m space
void	basic_mpn_mulmod_2exp_m1(mp_ptr z,mp_srcptr x,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mp_size_t m;

m=(k-1)/GMP_NUMB_BITS+1;
mpn_mul_n(z,x,y,m);
mpn_mod_2exp_m1(z,z,k,tmp);
return;}

// let m=ceil( k/GMP_NUMB_BITS)=floor( (k-1)/GMP_NUMB_BITS)+1
// let n=ceil(2k/GMP_NUMB_BITS)=floor((2k-1)/GMP_NUMB_BITS)+1
// requires tmp space of n-m+1 limbs and we know that m <= n-m+1 <= m+1
// {z,m}= {x,m}*{y,m} % 2^(2k) % 2^k+1
// z requires 2*m space
mp_limb_t	mpn_mulmod_2exp_p1(mp_ptr z,mp_srcptr x,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mp_size_t m;mp_limb_t c;

m=(k-1)/GMP_NUMB_BITS+1;
mpn_mul_n(z,x,y,m);
return mpn_mod_2exp_p1(z,z,k,tmp);}

/*
let n=Bk where B=GMP_NUMB_BITS
want a*b%(2^(2n)-1)

calc l=a*b%(2^n-1)  h=a*b%(2^n+1) 

as  2^(n-1) == (2^n-1)^(-1) mod 2^n+1
and 2^(n-1) == (2^n+1)^(-1) mod 2^n-1

let s=S+c2^n=h+l
let d=D-b2^n=l-h
let z=(2^n+1)*2^(n-1)*l + (2^n-1)*2^(n-1)*h
     =2^(n-1)(s*2^n+d)
     =rotr(2n,(D+c)2^n+S-b)
then z == a*b%(2^(2n)-1) mod 2^(2n)-1
*/

// let m=ceil( k/GMP_NUMB_BITS)=floor( (k-1)/GMP_NUMB_BITS)+1
// let n=ceil(2k/GMP_NUMB_BITS)=floor((2k-1)/GMP_NUMB_BITS)+1
// requires tmp space of n-m+1 limbs and we know that m <= n-m+1 <= m+1
// {z,m}= {x,m}*{y,m} % 2^(2k) % 2^k-1  not fully reduced
// z requires 2*m+???? space
void	mpn_mulmod_2exp_m1(mp_ptr z,mp_srcptr x,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mp_size_t m,n,nmod;mp_limb_t cx,c,cy;// fix and check allocation

if((k&1)!=0 || k<=2*GMP_NUMB_BITS)
  {m=(k-1)/GMP_NUMB_BITS+1;
   mpn_mul_n(z,x,y,m);
   mpn_mod_2exp_m1(z,z,k,tmp);
   return;}
n=k/2;ASSERT(n>GMP_NUMB_BITS);
m=(n-1)/GMP_NUMB_BITS+1;ASSERT(m>=2);
mp_limb_t h[MAX(m,2*m)],l[MAX(m,2*m)],xt[m+1],yt[m+1];
mpn_mod_2exp_m1(xt,x,n,tmp);
mpn_mod_2exp_m1(yt,y,n,tmp);
mpn_mulmod_2exp_m1(l,xt,yt,n,tmp);
cx=mpn_mod_2exp_p1(xt,x,n,tmp);
cy=mpn_mod_2exp_p1(yt,y,n,tmp);
nmod=n%GMP_NUMB_BITS;
if(nmod==0)
  {if(cx==0)
     {if(cy==0){c=mpn_mulmod_2exp_p1(h,xt,yt,n,tmp);}  // this is the common case , must test the rare cases
      else{MPN_ZERO(h,m);c=mpn_sub_n(h,h,yt,m);c=mpn_add_1(h,h,m,c);}}
   else
     {MPN_ZERO(h,m);
      if(cy==0){c=mpn_sub_n(h,h,xt,m);c=mpn_add_1(h,h,m,c);}
      else{h[0]=1;c=0;}}
   cx=mpn_add_n(z,l,h,m)+c;ASSERT(cx==0 || cx==1);
   cy=mpn_sub_n(z+m,l,h,m)+c;ASSERT(cy==0 || cy==1);
   z[2*m]=0;
   ASSERT_NOCARRY(mpn_add_1(z+m,z+m,n+1,cx));
   ASSERT_NOCARRY(mpn_sub_1(z,z,2*m+1,cy));
   ASSERT(z[2*m]==0 || z[2*m]==1);
   c=mpn_rshift(z,z,2*m+1,1);
   z[2*m-1]+=c;
   if(z[2*m-1]<c)ASSERT_NOCARRY(mpn_add_1(z,z,2*m,1));}
else
  {mpn_mulmod_2exp_p1(h,xt,yt,n,tmp);
   if(UNLIKELY(cx&cy)){MPN_ZERO(h,m);h[0]=1;}
   ASSERT_NOCARRY(mpn_add_n(z,l,h,m));				// z[0 to m-1] is sum is n+1 bits
   cx=z[m-1];							// z[0 to m-2],cx is sum is n+1 bits
   cy=mpn_sub_n(z+m-1,l,h,m);					// z[m-1 to 2m-2] is diff and cy is borrow is more than n bits
   z[2*m-2]&=(((mp_limb_t)1)<<nmod)-1;				// z[m-1 to 2m-2] is D and cy is borrow is n bits
   cx-=mpn_sub_1(z,z,m-1,cy);					// z[0 to m-2],cx is sum-borrow=F is n+1 bits
   z[2*m-1]=mpn_lshift(z+m-1,z+m-1,m,nmod-1);			// z[m-1 to 2m-1] is E is n+n%32-1 bits
   c=mpn_rshift(z,z,m-1,1);c>>=GMP_NUMB_BITS-1;
   if((cx&1)!=0)z[m-2]|=((mp_limb_t)1)<<(GMP_NUMB_BITS-1);	
   cx>>=1;					// z[0 to m-2],cx is G is n bits , c is carry out
   						// z[0 to 2m-1] is G-cx2^?+E2^(32(m-1)) is 2n-1 bits
   ASSERT_NOCARRY(mpn_add_1(z+m-1,z+m-1,m+1,cx));	// z[0 to 2m-1] is 2n bits 
   							// z[0 to (2n-1)/32] is 2n bits
   c<<=(k-1)%GMP_NUMB_BITS;
   cx=z[(k-1)/GMP_NUMB_BITS]&c;
   cx>>=(k-1)%GMP_NUMB_BITS;
   z[(k-1)/GMP_NUMB_BITS]^=c;
   ASSERT_NOCARRY(mpn_add_1(z,z,(k-1)/GMP_NUMB_BITS+1,cx));}
ASSERT(k%GMP_NUMB_BITS==0 || z[(k-1)/GMP_NUMB_BITS]>>(k%GMP_NUMB_BITS)==0);
return;}

int	main(void)
{mpz_t a,b,c,d,e,f,m;mp_limb_t tmp[100000];
gmp_randstate_t rnd;
int cc,s=1,ccc,cccc;
struct timeval tv1,tv2;
double t1,t2,t3,t4;

mpz_init2(a,32*1000000);mpz_init2(b,32*1000000);mpz_init2(c,32*1000000);mpz_init2(d,32*1000000);
mpz_init2(e,32*1000000);mpz_init2(m,32*1000000);
gmp_randinit_default(rnd);s=1;
for(cc=0;cc<1000000;cc++)
   {//s=rand()%1000+1;
    s++;if(s>200)s=(s-1)*1.1;
    cccc=1;if(s<200)cccc=100;
    if(s>100000)break;
    mpz_set_ui(m,1);mpz_mul_2exp(m,m,32*s);mpz_sub_ui(m,m,1);
    mpz_urandomm(a,rnd,m);
    mpz_urandomm(b,rnd,m);
    mpz_mul(c,a,b);
    mpz_mod(c,c,m);
  
    gettimeofday(&tv1,0);
    for(ccc=0;ccc<cccc;ccc++)mpn_mul_n(e->_mp_d,a->_mp_d,b->_mp_d,s);
    gettimeofday(&tv2,0);
    t1=tv2.tv_sec-tv1.tv_sec;
    t1*=1000000;
    t1+=tv2.tv_usec-tv1.tv_usec;
  
    gettimeofday(&tv1,0);
    for(ccc=0;ccc<cccc;ccc++)mpn_mulhighhalf(e->_mp_d,a->_mp_d,b->_mp_d,s);
    gettimeofday(&tv2,0);
    t2=tv2.tv_sec-tv1.tv_sec;
    t2*=1000000;
    t2+=tv2.tv_usec-tv1.tv_usec;
  
    gettimeofday(&tv1,0);
    for(ccc=0;ccc<cccc;ccc++)mpn_mullowhalf(e->_mp_d,a->_mp_d,b->_mp_d,s);
    gettimeofday(&tv2,0);
    t3=tv2.tv_sec-tv1.tv_sec;
    t3*=1000000;
    t3+=tv2.tv_usec-tv1.tv_usec;
  
    gettimeofday(&tv1,0);
    for(ccc=0;ccc<cccc;ccc++)mpn_mulmod_2exp_m1(d->_mp_d,a->_mp_d,b->_mp_d,s*32,tmp);
    gettimeofday(&tv2,0);
    t4=tv2.tv_sec-tv1.tv_sec;
    t4*=1000000;
    t4+=tv2.tv_usec-tv1.tv_usec;
  
  
    printf("%d %f %f %f %f\n",s,t1/t1,t1/t2,t1/t3,t1/t4);
    //gmp_printf("%Zd %Zd %Zd\n",a,b,c);
    //gmp_printf("%Nd %Nd %Nd\n",a->_mp_d,s,b->_mp_d,s,d->_mp_d,s);
    //if(mpn_cmp(d->_mp_d,c->_mp_d,s)!=0)abort();
    //printf("OK");
    fflush(0);
   }


return 0;}



/***********************************************************************************************
	
		CRAP BELOW
		
***********************************************************************************************/


#if 0

/*

let n=Bk where B=GMP_NUMB_BITS
want a*b%(2^(2n)-1)

calc l=a*b%(2^n-1)  h=a*b%(2^n+1) 

as  2^(n-1) == (2^n-1)^(-1) mod 2^n+1
and 2^(n-1) == (2^n+1)^(-1) mod 2^n-1

let s=S+c2^n=h+l
let d=D-b2^n=l-h
let z=(2^n+1)*2^(n-1)*l + (2^n-1)*2^(n-1)*h
     =2^(n-1)(s*2^n+d)
     =rotr(2n,(D+c)2^k+S-b)
then z == a*b%(2^(2n)-1) mod 2^(2n)-1
*/

// {z,m}= {x,m}*{y,m} % (2^k-1) not fully reduced
// z needs 2m limbs
// multiplies k bits by k bits to give k bits
void	mpn_mulmod_2exp_m1(mp_ptr z,mp_srcptr x,mp_srcptr y,mp_size_t k)
{mp_limb_t cx,cy,c,xt[k/2],yt[k/2],l[k],h[k],tmp[k];
 mp_size_t n,m;

if((k&1)!=0)
  {// k is odd so cannot do the fast way
   m=(k-1)/GMP_NUMB_BITS+1;
   // do we want to mask out any above k bits or do we assume they are zero or just not fully reduced ???
   mpn_mul_n(z,x,y,m);
   mpn_mod_2exp_m1(z,z,k);
   return;}
ASSERT((k&1)==0);
// so k is even so we can do it the fast way
n=k/2;
mpn_mod_2exp_m1(xt,x,n);
mpn_mod_2exp_m1(yt,y,n);
mpn_mulmod_2exp_m1(l,xt,yt,n);
mpn_mod_2exp_p1(xt,x,n);
mpn_mod_2exp_p1(yt,y,n);
mpn_mulmod_2exp_p1(h,xt,yt,n);
mpn_add_n(s,l,h,?);
mpn_sub_n(d,l,h,?);


 
   ASSERT_NOCARRY(mpn_add_n(z,l,h,n));
   cy=mpn_sub_n(z+n,l,h,n)+c;ASSERT(cy==0 || cy==1);
   z[k]=0;
   ASSERT_NOCARRY(mpn_add_1(z+n,z+n,n+1,cx));
   ASSERT_NOCARRY(mpn_sub_1(z,z,k+1,cy));
   ASSERT(z[k]==0 || z[k]==1);
   c=mpn_rshift(z,z,k+1,1);
   z[k-1]+=c;
   if(z[k-1]<c)ASSERT_NOCARRY(mpn_add_1(z,z,k,1));
  }
else
  {m=(k-1)/GMP_NUMB_BITS+1;
   mpn_mul_n(z,x,y,m);
   mpn_mod_2exp_m1(z,z,k,tmp);
  }
return;}

#endif






#if 0
// {z,k}= {y,2k} % 2^(k*GMP_NUMB_BITS)-1   not fully reduced ie 0<={z,k}<=2^(k*GMP_NUMB_BITS)-1
void	mpn_mod_limbexp_m1(mp_ptr z,mp_srcptr y,mp_size_t k)
{mp_limb_t c;

ASSERT(k>0);
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y,k));
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y+k,k));
c=mpn_add_n(z,y,y+k,k);
ASSERT_NOCARRY(mpn_add_1(z,z,k,c));
return;}


// {z,k bits}= {y,2k bits note ignores any bits above 2k} % 2^k-1   not fully reduced ie 0<={z,k bits}<=2^k-1
// requires tmp space of 2*(floor((k-1)/GMP_NUMB_BITS)+1)=2*ceil(k/GMP_NUMB_BITS)
void	mpn_mod_2exp_m1(mp_ptr z,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mp_limb_t c,mask;mp_size_t m,n,s;

ASSERT(k>0);
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y,k));
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y+k,k));
if(k%GMP_NUMB_BITS==0){mpn_mod_limbexp_m1(z,y,k/GMP_NUMB_BITS);return;}
ASSERT(k%GMP_NUMB_BITS!=0);
m=(k-1)/GMP_NUMB_BITS+1;
s=k%GMP_NUMB_BITS;
n=(2*k-1)/GMP_NUMB_BITS+2-m;
mpn_rshift(tmp,y+m-1,n,s);
ASSERT(n==m || (n==m+1 && tmp[m]==0));
mask=(((mp_limb_t)1)<<s)-1;
tmp[m-1]&=mask;
MPN_COPY(tmp+m,y,m);// remove this copy or use mpn_addlshift or mpn_addrshift
(tmp+m)[m-1]&=mask;
ASSERT_NOCARRY(mpn_add_n(z,tmp+m,tmp,m));
if((z[m-1]&(~mask))!=0)
  {ASSERT_NOCARRY(mpn_add_1(z,z,m,1));
   z[m-1]&=mask;}
return;}

// {z,k}+ret*2^(k*GMP_NUMB_BITS) = {y,2k} % (2^(k*GMP_NUMB_BITS)+1)
mp_limb_t	mpn_mod_limbexp_p1(mp_ptr z,mp_srcptr y,mp_size_t k)
{mp_limb_t carry;

ASSERT(k>0);
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y,k));
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y+k,k));
carry=mpn_sub_n(z,y,y+k,k);
return mpn_add_1(z,z,k,carry);}

// {z,k bits}+ret*2^k= {y,2k bits note ignores any bits above 2k} % 2^k+1
// requires tmp space of 2*(floor((k-1)/GMP_NUMB_BITS)+1)=2*ceil(k/GMP_NUMB_BITS)
mp_limb_t	mpn_mod_2exp_p1(mp_ptr z,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mp_limb_t c,mask;mp_size_t m,n,s;

ASSERT(k>0);
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y,k));
ASSERT(MPN_SAME_OR_SEPARATE_P(z,y+k,k));
if(k%GMP_NUMB_BITS==0){mpn_mod_limbexp_p1(z,y,k/GMP_NUMB_BITS);return;}
ASSERT(k%GMP_NUMB_BITS!=0);
m=(k-1)/GMP_NUMB_BITS+1;
s=k%GMP_NUMB_BITS;
n=(2*k-1)/GMP_NUMB_BITS+2-m;
mpn_rshift(tmp,y+m-1,n,s);
ASSERT(n==m || (n==m+1 && tmp[m]==0));
mask=(((mp_limb_t)1)<<s)-1;
tmp[m-1]&=mask;
MPN_COPY(tmp+m,y,m);// remove this copy or use mpn_addlshift or mpn_addrshift
(tmp+m)[m-1]&=mask;
c=mpn_sub_n(z,tmp+m,tmp,m);
c=mpn_add_1(z,z,m,c);
z[m-1]&=mask;
return c;}

// {z,k}+ret*2^()= {x,k}*{y,k} % (2^(k*GMP_NUMB_BITS)+1)
// z needs 2k limbs
mp_limb_t	mpn_mulmod_limbexp_p1(mp_ptr z,mp_srcptr x,mp_srcptr y,mp_size_t k)
{mpn_mul_n(z,x,y,k);return mpn_mod_limbexp_p1(z,z,k);}

// NOTE must not any bits set above k bits
mp_limb_t	mpn_mulmod_2exp_p1(mp_ptr z,mp_srcptr x,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mpn_mul_n(z,x,y,(k-1)/GMP_NUMB_BITS+1);return mpn_mod_2exp_p1(z,z,k,tmp);}


/*

let n=Bk where B=GMP_NUMB_BITS
want a*b%(2^(2n)-1)

calc l=a*b%(2^n-1)  h=a*b%(2^n+1) 

as  2^(n-1) == (2^n-1)^(-1) mod 2^n+1
and 2^(n-1) == (2^n+1)^(-1) mod 2^n-1

let s=S+c2^n=h+l
let d=D-b2^n=l-h
let z=(2^n+1)*2^(n-1)*l + (2^n-1)*2^(n-1)*h
     =2^(n-1)(s*2^n+d)
     =rotr(2n,(D+c)2^k+S-b)
then z == a*b%(2^(2n)-1) mod 2^(2n)-1
*/

// {z,m}= {x,m}*{y,m} % (2^k-1) not fully reduced
// z needs 2m limbs
void	mpn_mulmod_2exp_m1(mp_ptr z,mp_srcptr x,mp_srcptr y,mp_size_t k)
{mp_limb_t cx,cy,c,xt[k/2],yt[k/2],l[k],h[k],tmp[k];
 mp_size_t n,m;

if(k%GMP_NUMB_BITS==0){mpn_mulmod_limbexp_m1(z,x,y,k/GMP_NUMB_BITS);return;}
ASSERT(k%GMP_NUMB_BITS!=0);
if((k&1)==0)
  {n=k/2;
   mpn_mod_2exp_m1(xt,x,n,tmp);
   mpn_mod_2exp_m1(yt,y,n,tmp);
   mpn_mulmod_2exp_m1(l,xt,yt,n);
   mpn_mod_2exp_p1(xt,x,n,tmp);
   mpn_mod_2exp_p1(yt,y,n,tmp);
   mpn_mulmod_2exp_p1(h,xt,yt,n,tmp);
   // so l[0 to n-1]       is our low  product not fully reduced 0<= l      <=2^n-1
   // so h[0 to n-1]       is our high product     fully reduced 0<= h <=2^n 
   ASSERT_NOCARRY(mpn_add_n(z,l,h,n));
   cy=mpn_sub_n(z+n,l,h,n)+c;ASSERT(cy==0 || cy==1);
   z[k]=0;
   ASSERT_NOCARRY(mpn_add_1(z+n,z+n,n+1,cx));
   ASSERT_NOCARRY(mpn_sub_1(z,z,k+1,cy));
   ASSERT(z[k]==0 || z[k]==1);
   c=mpn_rshift(z,z,k+1,1);
   z[k-1]+=c;
   if(z[k-1]<c)ASSERT_NOCARRY(mpn_add_1(z,z,k,1));
  }
else
  {m=(k-1)/GMP_NUMB_BITS+1;
   mpn_mul_n(z,x,y,m);
   mpn_mod_2exp_m1(z,z,k,tmp);
  }
return;}



// {z,k}= {x,k}*{y,k} % (2^(k*GMP_NUMB_BITS)-1) not fully reduced
// z needs 2k limbs
void	mpn_mulmod_limbexp_m1(mp_ptr z,mp_srcptr x,mp_srcptr y,mp_size_t k)
{mp_limb_t cx,cy,c,xt[k/2],yt[k/2],l[k],h[k];
 mp_size_t n;

if((k&1)==0)
  {n=k/2;
   c=mpn_add_n(xt,x,x+n,n);
   ASSERT_NOCARRY(mpn_add_1(xt,xt,n,c));
   c=mpn_add_n(yt,y,y+n,n);
   ASSERT_NOCARRY(mpn_add_1(yt,yt,n,c));
   mpn_mulmod_limbexp_m1(l,xt,yt,n);
   c=mpn_sub_n(xt,x,x+n,n);
   cx=mpn_add_1(xt,xt,n,c);
   c=mpn_sub_n(yt,y,y+n,n);
   cy=mpn_add_1(yt,yt,n,c);
   
// FIXME
// if(LIKELY(cx==0 && cy==0))
//     {c=mpn_mulmod_limbexp_p1(h,xt,yt,n);}
//   else
//     {MPN_ZERO(h,n);
//      c=mpn_sub_n(h,h,yt,n);c=mpn_add_1(h,h,n,c);// lose value of c here
//      c=mpn_sub_n(h,h,xt,n);c=mpn_add_1(h,h,n,c);// nearly correct
//      if(cx!=0 && cy!=0){h[0]=1;c=0;}
//     }
   if(cx==0)
     {if(cy==0){c=mpn_mulmod_limbexp_p1(h,xt,yt,n);}  // this is the common case , must test the rare cases
      else{MPN_ZERO(h,n);c=mpn_sub_n(h,h,yt,n);c=mpn_add_1(h,h,n,c);}}
   else
     {MPN_ZERO(h,n);
      if(cy==0){c=mpn_sub_n(h,h,xt,n);c=mpn_add_1(h,h,n,c);}
      else{h[0]=1;c=0;}}     
   
   // so l[0 to n-1]       is our low  product not fully reduced 0<= l      <=2^n-1
   // so h[0 to n-1]+c*2^n is our high product     fully reduced 0<= h+c2^n <=2^n 
   cx=mpn_add_n(z,l,h,n)+c;ASSERT(cx==0 || cx==1);
   cy=mpn_sub_n(z+n,l,h,n)+c;ASSERT(cy==0 || cy==1);
   z[k]=0;
   ASSERT_NOCARRY(mpn_add_1(z+n,z+n,n+1,cx));
   ASSERT_NOCARRY(mpn_sub_1(z,z,k+1,cy));
   ASSERT(z[k]==0 || z[k]==1);
   c=mpn_rshift(z,z,k+1,1);
   z[k-1]+=c;
   if(z[k-1]<c)ASSERT_NOCARRY(mpn_add_1(z,z,k,1));
  }
else
  {mpn_mul_n(z,x,y,k);
   c=mpn_add_n(z,z,z+k,k);
   ASSERT_NOCARRY(mpn_add_1(z,z,k,c));}
return;}



// {z,k bits}= {y,2k bits note ignores any bits above 2k} % 2^k-1   not fully reduced ie 0<={z,k bits}<=2^k-1
void	test_mpn_mod_2exp_m1(mp_ptr z,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mpz_t yt,mod;mp_size_t m;

mpz_init2(yt,2*k);
m=(2*k-1)/GMP_NUMB_BITS+1;
MPN_COPY(PTR(yt),y,m);SIZ(yt)=m;
mpz_init_set_ui(mod,1);mpz_mul_2exp(mod,mod,2*k);
mpz_mod(yt,yt,mod);
mpz_set_ui(mod,1);mpz_mul_2exp(mod,mod,k);mpz_sub_ui(mod,mod,1);
mpz_mod(yt,yt,mod);
m=(k-1)/GMP_NUMB_BITS+1;
MPN_COPY(z,PTR(yt),m);
if(SIZ(yt)<m)MPN_ZERO(z+SIZ(yt),m-SIZ(yt));
mpz_clear(mod);mpz_clear(yt);
return;}

// {z,k bits}= {y,2k bits note ignores any bits above 2k} % 2^k+1
mp_limb_t	test_mpn_mod_2exp_p1(mp_ptr z,mp_srcptr y,mp_size_t k,mp_ptr tmp)
{mpz_t yt,mod;mp_limb_t c;mp_size_t m;

mpz_init2(yt,2*k);
m=(2*k-1)/GMP_NUMB_BITS+1;
MPN_COPY(PTR(yt),y,m);SIZ(yt)=m;
mpz_init_set_ui(mod,1);mpz_mul_2exp(mod,mod,2*k);
mpz_mod(yt,yt,mod);
mpz_set_ui(mod,1);mpz_mul_2exp(mod,mod,k);mpz_add_ui(mod,mod,1);
mpz_mod(yt,yt,mod);
mpz_sub_ui(mod,mod,1);
m=(k-1)/GMP_NUMB_BITS+1;
MPN_COPY(z,PTR(yt),m);
if(mpz_cmp(yt,mod)==0){c=1;}
else{c=0;if(SIZ(yt)<m)MPN_ZERO(z+SIZ(yt),m-SIZ(yt));}
mpz_clear(mod);mpz_clear(yt);
return c;}

/*
int	main(void)
{mp_limb_t y[1000],zero[1000],one[1000],tmp[1000],z1[1000],z2[1000],m,c,k;

for(c=0;c<100;c++)
   {mpn_random(y,100);MPN_ZERO(zero,100);MPN_ZERO(one,100);
    for(k=1;k<100*GMP_NUMB_BITS/2;k++)
       {mpn_lshift(one,one,100,1);mpn_add_1(one,one,100,1);
        mpn_mod_2exp_p1(z1,y,k,tmp);
        test_mpn_mod_2exp_p1(z2,y,k,tmp);
        m=(k-1)/GMP_NUMB_BITS+1;
        if(mpn_cmp(z1,z2,m)!=0){printf("failed plus1 %d\n",k);}
        mpn_mod_2exp_m1(z1,y,k,tmp);
        test_mpn_mod_2exp_m1(z2,y,k,tmp);
        m=(k-1)/GMP_NUMB_BITS+1;
        if(mpn_cmp(z1,z2,m)!=0)
          {if(!(    (mpn_cmp(z1,zero,m)==0 && mpn_cmp(z2,one,m)==0)
                 || (mpn_cmp(z2,zero,m)==0 && mpn_cmp(z1,one,m)==0)
               )
             ){printf("failed minus1 %d\n",k);}}
       }
   }
return 0;}
*/


#endif




--Boundary-00=_cxpJ/3T+eD/tw6+--