mpz_prevprime

Seth Troisi braintwo at gmail.com
Fri Mar 6 21:55:48 UTC 2020


I tried using a segmented sieve. this avoids multiplication or mod in the
inner loop.
It's an improvement and the code is easy to comprehend.

$ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime -p mpz_nextprime
$ hg import ~/Downloads/nextprime_segment.patch --no-commit
$ make -j4; make -C tune/ speed -j4
$ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime -P mpz_nextprime2

$ pr -m -t mpz_nextprime.data mpz_nextprime2.data | awk '{ print
$0,"\t"$4/$2 }'
100 3.658593750e-05    100     3.326367187e-05 0.909193
110 3.904296875e-05    110     3.598828125e-05 0.921761
121 4.675585937e-05    121     4.157031250e-05 0.889093
133 8.379882812e-05    133     7.977343750e-05 0.951964
146 9.650195312e-05    146     8.803710937e-05 0.912283
160 1.245546875e-04    160     1.032539062e-04 0.828985
176 1.269101562e-04    176     1.173007813e-04 0.924282
193 1.977187500e-04    193     1.846230469e-04 0.933766
212 2.087812500e-04    212     1.933378906e-04 0.926031
233 2.356660156e-04    233     2.218164063e-04 0.941232
256 3.030214844e-04    256     2.868691406e-04 0.946696
281 4.203593750e-04    281     3.908398437e-04 0.929775
500 2.132431641e-03    500     2.013937500e-03 0.944432
1000 2.307201758e-02    1000     2.284832031e-02 0.990304

This can undoubtedly benefit from a larger sieve also,
which I'm testing in a different patch.

On Thu, Mar 5, 2020 at 6:05 PM Seth Troisi <braintwo at gmail.com> wrote:

> I tried using a segmented sieve. this avoids multiplication or mod in the
> inner loop.
> It's an improvement and the code is easy to comprehend.
>
>
>
>
> On Wed, Feb 12, 2020 at 10:25 AM Niels Möller <nisse at lysator.liu.se>
> wrote:
>
>> "Marco Bodrato" <bodrato at mail.dm.unipi.it> writes:
>>
>> > Ciao,
>> >
>> > Il Mer, 12 Febbraio 2020 7:26 am, Niels Möller ha scritto:
>> >> And for a small prime gap (common case), this cost should be the main
>> >> part of the sieving. So it would make sense to optimize it. If we also
>> >> use the ptab, we could compute modulo single-limb prime products using
>> >> precomputed constants.
>> >
>> > Yes, of course.
>>
>> Another version below, using ptab. Timing:
>>
>> $ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime
>>         mpz_nextprime
>> 100       0.000116762
>> 110       0.000126770
>> 121       0.000147203
>> 133       0.000257994
>> 146       0.000291051
>> 160       0.000343168
>> 176       0.000386605
>> 193       0.000625994
>> 212       0.000670035
>> 233       0.000763240
>> 256       0.000942154
>> 281       0.001418639
>> 500       0.006632588
>> 1000      0.073935992
>>
>> So also close to 15% speedup for 100 bits, and the same as previous
>> version, almost 25%, for 1000 bits.
>>
>> I used the very crude tuning
>>
>>   limit = (nbits < 200) ? nbits / 2 : PTAB_SIZE;
>>
>> to avoid regressing for smaller sizes.
>>
>> > And the next question will be: should we generate on the fly an even
>> > larger table of primes when the required size grows beyond the
>> > precomputed table?
>>
>> Don't know. I don't quite understand how large a table could be
>> beneficial, but I guess it might make sense to at least limit it to size
>> of L1 cache.
>>
>> Regards,
>> /Niels
>>
>> /* mpz_nextprime(p,t) - compute the next prime > t and store that in p.
>>
>> Copyright 1999-2001, 2008, 2009, 2012 Free Software Foundation, Inc.
>>
>> Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
>>
>> This file is part of the GNU MP Library.
>>
>> The GNU MP Library is free software; you can redistribute it and/or modify
>> it under the terms of either:
>>
>>   * the GNU Lesser General Public License as published by the Free
>>     Software Foundation; either version 3 of the License, or (at your
>>     option) any later version.
>>
>> or
>>
>>   * the GNU General Public License as published by the Free Software
>>     Foundation; either version 2 of the License, or (at your option) any
>>     later version.
>>
>> or both in parallel, as here.
>>
>> The GNU MP Library is distributed in the hope that it will be useful, but
>> WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
>> or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
>> for more details.
>>
>> You should have received copies of the GNU General Public License and the
>> GNU Lesser General Public License along with the GNU MP Library.  If not,
>> see https://www.gnu.org/licenses/.  */
>>
>> #include "gmp-impl.h"
>> #include "longlong.h"
>>
>> #if 1
>> struct gmp_primes_dtab {
>>   mp_limb_t p;
>>   mp_limb_t binv;
>>   mp_limb_t lim;
>> };
>>
>> struct gmp_primes_ptab {
>>   mp_limb_t ppp;        /* primes, multiplied together */
>>   mp_limb_t cps[7];     /* ppp values pre-computed for mpn_mod_1s_4p */
>>   gmp_uint_least32_t idx:24;    /* index of  first primes in dtab */
>>   gmp_uint_least32_t np :8;     /* number of primes related to this entry
>> */
>> };
>>
>> static const struct gmp_primes_dtab dtab[] =
>> {
>> #define WANT_dtab
>> #define P(p,inv,lim) {p, inv,lim}
>> #include "trialdivtab.h"
>> #undef WANT_dtab
>> #undef P
>> };
>>
>> #define DTAB_SIZE (sizeof (dtab) / sizeof (dtab[0]))
>>
>> static const struct gmp_primes_ptab ptab[] =
>> {
>> #define WANT_ptab
>> #include "trialdivtab.h"
>> #undef WANT_ptab
>> };
>>
>> #define PTAB_SIZE (sizeof (ptab) / sizeof (ptab[0]))
>>
>> void
>> mpz_nextprime (mpz_ptr p, mpz_srcptr n)
>> {
>>   mp_limb_t *moduli;
>>   unsigned long difference;
>>   int i;
>>   unsigned prime_limit;
>>   mp_bitcnt_t nbits;
>>   unsigned incr;
>>   TMP_DECL;
>>
>>   /* First handle tiny numbers */
>>   if (mpz_cmp_ui (n, 2) < 0)
>>     {
>>       mpz_set_ui (p, 2);
>>       return;
>>     }
>>   mpz_add_ui (p, n, 1);
>>   mpz_setbit (p, 0);
>>
>>   if (mpz_cmp_ui (p, 7) <= 0)
>>     return;
>>
>>   MPN_SIZEINBASE_2EXP(nbits, PTR(p), SIZ(p), 1);
>>
>>   if (nbits < 200)
>>     {
>>       prime_limit = nbits / 2;
>>       if (SIZ(p) == 1)
>>         {
>>           /* If the prime list includes any prime >= p, do plain
>>              search. */
>>           mp_limb_t limb = PTR(p)[0];
>>           unsigned last = ptab[prime_limit-1].idx +
>> ptab[prime_limit-1].np - 1;
>>           if (dtab[last].p >= limb)
>>             {
>>               while (dtab[last-1].p >= limb)
>>                 last--;
>>               PTR(p)[0] = dtab[last].p;
>>               return;
>>             }
>>         }
>>     }
>>   else
>>     prime_limit = PTAB_SIZE;
>>
>>   TMP_MARK;
>>
>>   /* Compute residues modulo small odd primes */
>>   moduli = TMP_ALLOC_TYPE (prime_limit, mp_limb_t);
>>
>>   for (;;)
>>     {
>>       for (i = 0; i < prime_limit; i++)
>>         {
>>           mp_limb_t ppp = ptab[i].ppp;
>>           const mp_limb_t *cps = ptab[i].cps;
>>           moduli[i] = mpn_mod_1s_4p (PTR(p), SIZ(p), ppp << cps[1], cps);
>>         }
>>
>> #define INCR_LIMIT 0x10000      /* deep science */
>>
>>       for (difference = incr = 0; incr < INCR_LIMIT; difference += 2)
>>         {
>>           /* First check divisibility based on prime list */
>>           for (i = 0; i < prime_limit; i++)
>>             {
>>               /* FIXME: Ensure that this addition can't overflow */
>>               mp_limb_t m = moduli[i] + incr;
>>               int idx = ptab[i].idx;
>>               int np = ptab[i].np;
>>               int j;
>>
>>               for (j = 0; j < np; j++)
>>                 if (m * dtab[idx+j].binv <= dtab[idx+j].lim)
>>                   goto next;
>>             }
>>
>>           mpz_add_ui (p, p, difference);
>>           difference = 0;
>>
>>           /* Miller-Rabin test */
>>           if (mpz_millerrabin (p, 25))
>>             goto done;
>>         next:;
>>           incr += 2;
>>         }
>>       mpz_add_ui (p, p, difference);
>>       difference = 0;
>>     }
>>  done:
>>   TMP_FREE;
>> }
>> #else
>> static const unsigned char primegap[] =
>> {
>>   2,2,4,2,4,2,4,6,2,6,4,2,4,6,6,2,6,4,2,6,4,6,8,4,2,4,2,4,14,4,6,
>>   2,10,2,6,6,4,6,6,2,10,2,4,2,12,12,4,2,4,6,2,10,6,6,6,2,6,4,2,10,14,4,2,
>>   4,14,6,10,2,4,6,8,6,6,4,6,8,4,8,10,2,10,2,6,4,6,8,4,2,4,12,8,4,8,4,6,
>>
>> 12,2,18,6,10,6,6,2,6,10,6,6,2,6,6,4,2,12,10,2,4,6,6,2,12,4,6,8,10,8,10,8,
>>
>> 6,6,4,8,6,4,8,4,14,10,12,2,10,2,4,2,10,14,4,2,4,14,4,2,4,20,4,8,10,8,4,6,
>>   6,14,4,6,6,8,6,12
>> };
>>
>> #define NUMBER_OF_PRIMES 167
>>
>> void
>> mpz_nextprime (mpz_ptr p, mpz_srcptr n)
>> {
>>   unsigned short *moduli;
>>   unsigned long difference;
>>   int i;
>>   unsigned prime_limit;
>>   unsigned long prime;
>>   mp_size_t pn;
>>   mp_bitcnt_t nbits;
>>   unsigned incr;
>>   TMP_SDECL;
>>
>>   /* First handle tiny numbers */
>>   if (mpz_cmp_ui (n, 2) < 0)
>>     {
>>       mpz_set_ui (p, 2);
>>       return;
>>     }
>>   mpz_add_ui (p, n, 1);
>>   mpz_setbit (p, 0);
>>
>>   if (mpz_cmp_ui (p, 7) <= 0)
>>     return;
>>
>>   pn = SIZ(p);
>>   MPN_SIZEINBASE_2EXP(nbits, PTR(p), pn, 1);
>>   if (nbits / 2 >= NUMBER_OF_PRIMES)
>>     prime_limit = NUMBER_OF_PRIMES - 1;
>>   else
>>     prime_limit = nbits / 2;
>>
>>   TMP_SMARK;
>>
>>   /* Compute residues modulo small odd primes */
>>   moduli = TMP_SALLOC_TYPE (prime_limit, unsigned short);
>>
>>   for (;;)
>>     {
>>       /* FIXME: Compute lazily? */
>>       prime = 3;
>>       for (i = 0; i < prime_limit; i++)
>>         {
>>           moduli[i] = mpz_tdiv_ui (p, prime);
>>           prime += primegap[i];
>>         }
>>
>> #define INCR_LIMIT 0x10000      /* deep science */
>>
>>       for (difference = incr = 0; incr < INCR_LIMIT; difference += 2)
>>         {
>>           /* First check residues */
>>           prime = 3;
>>           for (i = 0; i < prime_limit; i++)
>>             {
>>               unsigned r;
>>               /* FIXME: Reduce moduli + incr and store back, to allow for
>>                  division-free reductions.  Alternatively, table
>> primes[]'s
>>                  inverses (mod 2^16).  */
>>               r = (moduli[i] + incr) % prime;
>>               prime += primegap[i];
>>
>>               if (r == 0)
>>                 goto next;
>>             }
>>
>>           mpz_add_ui (p, p, difference);
>>           difference = 0;
>>
>>           /* Miller-Rabin test */
>>           if (mpz_millerrabin (p, 25))
>>             goto done;
>>         next:;
>>           incr += 2;
>>         }
>>       mpz_add_ui (p, p, difference);
>>       difference = 0;
>>     }
>>  done:
>>   TMP_SFREE;
>> }
>>
>> #endif
>>
>> --
>> Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
>> Internet email is subject to wholesale government surveillance.
>>
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: nextprime_segment.patch
Type: text/x-patch
Size: 2956 bytes
Desc: not available
URL: <https://gmplib.org/list-archives/gmp-devel/attachments/20200306/cd403b27/attachment.bin>


More information about the gmp-devel mailing list