[Gmp-commit] /var/hg/gmp: Add a 4th and 5th div1 method.

mercurial at gmplib.org mercurial at gmplib.org
Sun Sep 22 22:21:01 UTC 2019


details:   /var/hg/gmp/rev/bb72edd77eaa
changeset: 17921:bb72edd77eaa
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Mon Sep 23 00:20:59 2019 +0200
description:
Add a 4th and 5th div1 method.

diffstat:

 mpn/generic/hgcd2.c |  218 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 218 insertions(+), 0 deletions(-)

diffs (228 lines):

diff -r 3e04a9bbba13 -r bb72edd77eaa mpn/generic/hgcd2.c
--- a/mpn/generic/hgcd2.c	Fri Sep 20 20:36:30 2019 +0200
+++ b/mpn/generic/hgcd2.c	Mon Sep 23 00:20:59 2019 +0200
@@ -140,6 +140,224 @@
     }
   return res;
 }
+
+#elif HGCD2_DIV1_METHOD == 4
+
+/* Table quotients.  We extract the NBITS most significant bits of the
+   numerator limb, and the corresponding bits from the divisor limb, and use
+   these to form an index into the table.  We only get NBITS/2 bits of
+   progress, and the table is very large.  The hit rate for the table with
+   NBITS = 6 is about 80%, meaning we get a lot of poorly predicated branches.
+
+   Possible improvements:
+
+   * Don't identify the failure case early, by looking at the extracted divisor
+     bits.  Instead do the table thing, and only invoke the slow path if the
+     remainder is off after one conditional adjustment.  (This will require a
+     table where the 0 entries are properly filled in.)
+
+   * Perhaps extract the highest NBITS of the divisor instead of the same bits
+     as from the numerator.  That would require another count_leading_zeros,
+     and a post-multiply shift of the quotient.
+
+   * Compress the table.  Its values are tiny now, and there are lots of zero
+     entries (which are never used).
+
+   * Round the table entries more cleverly?
+*/
+
+#define NBITS 6		/* Must be even. */
+static const unsigned char tab[1 << (2 * NBITS - 1)] = {
+0,0,0,0,0,0,0,0,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
+0,0,0,0,0,0,0,0,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
+0,0,0,0,0,0,0,0,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
+0,0,0,0,0,0,0,0,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1) << NBITS);
+
+mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+  int ncnt;
+  size_t nbi, dbi;
+  mp_limb_t q0;
+  mp_limb_t r0;
+  mp_double_limb_t res;
+
+  count_leading_zeros (ncnt, n0);
+  nbi = n0 << ncnt >> (64 - NBITS);
+  dbi = d0 << ncnt >> (64 - NBITS);
+
+  if (UNLIKELY (dbi < (1 << NBITS/2)))
+    {
+      q0 = n0 / d0;
+      r0 = n0 - q0 * d0;
+    }
+  else
+    {
+      mp_limb_t mask;
+      q0 = tabp[(nbi << NBITS) + dbi];
+      r0 = n0 - q0 * d0;
+      mask = -(mp_limb_t) (r0 >= d0);
+      q0 -= mask;
+      r0 -= d0 & mask;
+    }
+
+  res.d1 = q0;
+  res.d0 = r0;
+  return res;
+}
+
+
+#elif HGCD2_DIV1_METHOD == 5
+
+/* Table inverses of divisors.  We don't bother with suppressing the msb from
+   the tables.  We index with the NBITS most significant divisor bits,
+   including the always-set highest bit, but use addressing trickery via tabp
+   to suppress it.  */
+
+#ifndef NBITS
+#define NBITS 8
+#endif
+
+#if NBITS == 7
+static const unsigned char tab[64] = {
+252,248,244,240,236,233,230,226,223,220,217,214,211,208,206,203,
+200,198,195,193,191,188,186,184,182,180,178,176,174,172,170,168,
+166,164,163,161,159,158,156,155,153,152,150,149,147,146,144,143,
+142,140,139,138,137,136,134,133,132,131,130,129,128,127,126,125
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 8
+static const unsigned short tab[128] = {
+508,504,500,496,492,488,485,481,477,474,470,467,464,460,457,454,
+451,447,444,441,438,435,432,430,427,424,421,418,416,413,410,408,
+405,403,400,398,395,393,390,388,386,383,381,379,377,374,372,370,
+368,366,364,362,360,358,356,354,352,350,348,346,344,342,340,339,
+337,335,333,332,330,328,326,325,323,322,320,318,317,315,314,312,
+311,309,308,306,305,303,302,300,299,298,296,295,293,292,291,289,
+288,287,285,284,283,282,280,279,278,277,276,274,273,272,271,270,
+269,267,266,265,264,263,262,261,260,259,258,257,256,255,254,253
+};
+static const unsigned short *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 9
+static const unsigned short tab[256] = {
+1020,1016,1012,1008,1004,1000,996,992,988,985,981,977,974,970,966,963,
+959,956,952,949,945,942,938,935,932,928,925,922,919,915,912,909,
+906,903,899,896,893,890,887,884,881,878,875,872,869,866,864,861,
+858,855,852,849,847,844,841,838,836,833,830,828,825,822,820,817,
+815,812,810,807,805,802,800,797,795,792,790,787,785,783,780,778,
+776,773,771,769,767,764,762,760,758,755,753,751,749,747,744,742,
+740,738,736,734,732,730,728,726,724,722,720,718,716,714,712,710,
+708,706,704,702,700,698,696,695,693,691,689,687,685,684,682,680,
+678,676,675,673,671,669,668,666,664,663,661,659,657,656,654,653,
+651,649,648,646,644,643,641,640,638,636,635,633,632,630,629,627,
+626,624,623,621,620,618,617,615,614,612,611,609,608,607,605,604,
+602,601,600,598,597,595,594,593,591,590,589,587,586,585,583,582,
+581,579,578,577,575,574,573,572,570,569,568,567,565,564,563,562,
+560,559,558,557,556,554,553,552,551,550,549,547,546,545,544,543,
+542,540,539,538,537,536,535,534,533,532,530,529,528,527,526,525,
+524,523,522,521,520,519,518,517,516,515,514,513,512,511,510,509
+};
+static const unsigned short *tabp = tab - (1 << (NBITS - 1));
+#else
+#error No table for provided NBITS
+#endif
+
+
+mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+  int ncnt, dcnt;
+  mp_limb_t q0;
+  mp_limb_t r0;
+  mp_double_limb_t res;
+
+  count_leading_zeros (ncnt, n0);
+  count_leading_zeros (dcnt, d0);
+
+  if (UNLIKELY (dcnt - ncnt >= NBITS - 2))
+    {
+      q0 = n0 / d0;
+      r0 = n0 - q0 * d0;
+    }
+  else
+    {
+      size_t dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
+      mp_limb_t qbits = tabp[dbi];
+      mp_limb_t mask;
+      q0 = ((n0 << ncnt) >> (NBITS + 1)) * qbits >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
+      r0 = n0 - q0 * d0;
+      mask = -(mp_limb_t) (r0 >= d0);
+      q0 -= mask;
+      r0 -= d0 & mask;
+    }
+
+  res.d1 = q0;
+  res.d0 = r0;
+  return res;
+}
+
 #else
 #error Unknown HGCD2_DIV1_METHOD
 #endif


More information about the gmp-commit mailing list