/* Compute {up,n}^(-1) mod 2^(n*GMP_NUMB_BITS).

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
   ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
   RELEASE.

Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"


/* Defining this macro makes the guard below true and leaves out the
   __GMP_EXTERN_INLINE qualifier, so mpn_neg_n is emitted here as an ordinary
   function definition.  */
#define __GMP_FORCE_mpn_neg_n

/* Write the two's complement negation of {__gmp_up,__gmp_n} to
   {__gmp_rp,__gmp_n}, working from the least significant limb upwards.

   Returns 1 if any source limb was nonzero, 0 otherwise.  At least one limb
   is always processed, since the count is only tested after the first pass.
   Each source limb is read before the corresponding result limb is stored.  */
#if defined (__GMP_EXTERN_INLINE) || defined (__GMP_FORCE_mpn_neg_n)
#if ! defined (__GMP_FORCE_mpn_neg_n)
__GMP_EXTERN_INLINE
#endif
mp_limb_t
mpn_neg_n (mp_ptr __gmp_rp, mp_srcptr __gmp_up, mp_size_t __gmp_n)
{
  mp_limb_t borrow = 0;

  do
    {
      mp_limb_t limb = *__gmp_up++;

      /* Negate the limb, taking away the borrow from the lower limbs.  */
      *__gmp_rp++ = -limb - borrow;

      /* Once a nonzero limb has been seen, every higher limb borrows.  */
      if (limb != 0)
	borrow = 1;
    }
  while (--__gmp_n != 0);

  return borrow;
}
#endif


/*
  r[k+1] = r[k] - r[k] * (u*r[k] - 1)
  r[k+1] = r[k] + r[k] - r[k]*(u*r[k])
*/

/* This is intended for constant THRESHOLDs only, where the compiler can
   completely fold the result.  */
/* Count how many of the four bounds a, b, c, d the value n reaches.  */
#define LOG2C_NIBBLE(n, a, b, c, d) \
 (((n) >= (a)) + ((n) >= (b)) + ((n) >= (c)) + ((n) >= (d)))

/* Number of powers of two from 2^0 up to 2^15 that do not exceed n, i.e. the
   bit length of n, saturating at 16.  The argument is evaluated many times.  */
#define LOG2C(n) \
 (LOG2C_NIBBLE (n,    0x1,    0x2,    0x4,    0x8) + \
  LOG2C_NIBBLE (n,   0x10,   0x20,   0x40,   0x80) + \
  LOG2C_NIBBLE (n,  0x100,  0x200,  0x400,  0x800) + \
  LOG2C_NIBBLE (n, 0x1000, 0x2000, 0x4000, 0x8000))

/* Bit count of mp_size_t, capped at 48; the common starting point for the
   number of entries in the precision table used by mpn_binvert.  */
#define NPOWS_LIMIT \
 (sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t))

#if TUNE_PROGRAM_BUILD
/* Tuning builds keep the full limit and do not subtract anything for
   BINV_NEWTON_THRESHOLD (presumably because it is not a compile-time constant
   there -- confirm against the tune program).  */
#define NPOWS \
 ((NPOWS_LIMIT))
#else
/* Normal builds drop the entries corresponding to the bit length of
   BINV_NEWTON_THRESHOLD.  */
#define NPOWS \
 ((NPOWS_LIMIT) - LOG2C (BINV_NEWTON_THRESHOLD))
#endif

/* Return a scratch size, in limbs, for an n-limb operand; presumably the
   amount mpn_binvert expects in its scratch argument.

   When FFT multiplication is compiled in and n is above twice
   MUL_FFT_MODF_THRESHOLD, the result is the FFT size chosen for n.  Otherwise
   it is three times n/2 rounded up.  */
mp_size_t
mpn_binvert_itch (mp_size_t n)
{
  mp_size_t half_up;

#if WANT_FFT
  if (ABOVE_THRESHOLD (n, 2 * MUL_FFT_MODF_THRESHOLD))
    {
      int k = mpn_fft_best_k (n, 0);

      return mpn_fft_next_size (n, k);
    }
#endif

  /* n - floor(n/2), i.e. n/2 rounded up.  */
  half_up = n - (n >> 1);
  return 3 * half_up;
}

/* Store in {rp,n} the inverse of {up,n} modulo 2^(n*GMP_NUMB_BITS), using
   'scratch' as temporary space (presumably sized by mpn_binvert_itch (n)).

   A low-precision inverse is obtained first by dividing 1 by the low limbs of
   {up,n}; it is then extended by Newton steps
       r' = r - r * (u*r - 1)
   each of which raises the precision to the next recorded size, until all n
   limbs are known.  binvert_limb is applied to up[0], which presumably has to
   be odd.  */
void
mpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
{
  mp_size_t precs[NPOWS];	/* precisions still to reach, largest first */
  mp_size_t *top = precs;
  mp_size_t cur, next;
  mp_ptr tp = scratch;
  mp_limb_t u0inv;

  /* Record the precisions from n downwards, halving (rounding up) until the
     size drops to the base case range.  'cur' ends up as the base case
     size.  */
  cur = n;
  while (ABOVE_THRESHOLD (cur, BINV_NEWTON_THRESHOLD))
    {
      *top++ = cur;
      cur = (cur + 1) >> 1;
    }

  /* Base case: form the value 1 in {tp,cur} and divide it by {up,cur} with a
     low-overhead O(n^2) algorithm.  FIXME: We should call some
     divide-and-conquer lsb division function here for an operand
     subrange.  */
  MPN_ZERO (tp, cur);
  tp[0] = 1;
  binvert_limb (u0inv, up[0]);
  if (BELOW_THRESHOLD (cur, DC_BDIV_Q_THRESHOLD))
    mpn_sb_bdiv_q (rp, tp, cur, up, cur, -u0inv);
  else
    mpn_dc_bdiv_q (rp, tp, cur, up, cur, -u0inv);

  /* Newton iterations: each pass extends {rp,cur} to {rp,next}.  */
  while (cur < n)
    {
      mp_size_t gain;		/* number of new limbs produced this pass */

      next = *--top;
      gain = next - cur;

      /* tp = u * r, with u taken to 'next' limbs and r to 'cur' limbs.  */
#if WANT_FFT
      if (ABOVE_THRESHOLD (next, 2 * MUL_FFT_MODF_THRESHOLD))
	{
	  int k = mpn_fft_best_k (next, 0);
	  mp_size_t m = mpn_fft_next_size (next, k);
	  mp_size_t i;

	  mpn_mul_fft (tp, m, up, next, rp, cur, k);

	  /* If the low 'cur' limbs are not exactly the value 1 (limb 0 above
	     1, or any higher limb nonzero), add 1 to the limbs above them.
	     Presumably this compensates for wrap-around in the FFT
	     product.  */
	  i = cur;
	  while (--i >= 0)
	    {
	      mp_limb_t expected = (i == 0);

	      if (tp[i] > expected)
		{
		  mpn_add_1 (tp + cur, tp + cur, gain, 1);
		  break;
		}
	    }
	}
      else
#endif
	mpn_mul (tp, up, next, rp, cur);

      /* The new high limbs of r are the negated low part of
	 r * (limbs of u*r from position 'cur' upwards).  */
      mpn_mullow_n (rp + cur, rp, tp + cur, gain);
      mpn_neg_n (rp + cur, rp + cur, gain);

      cur = next;
    }
}
