[Gmp-commit] /var/hg/gmp: Add a 4th and 5th div1 method.
mercurial at gmplib.org
mercurial at gmplib.org
Sun Sep 22 22:21:01 UTC 2019
details: /var/hg/gmp/rev/bb72edd77eaa
changeset: 17921:bb72edd77eaa
user: Torbjorn Granlund <tg at gmplib.org>
date: Mon Sep 23 00:20:59 2019 +0200
description:
Add a 4th and 5th div1 method.
diffstat:
mpn/generic/hgcd2.c | 218 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 218 insertions(+), 0 deletions(-)
diffs (228 lines):
diff -r 3e04a9bbba13 -r bb72edd77eaa mpn/generic/hgcd2.c
--- a/mpn/generic/hgcd2.c Fri Sep 20 20:36:30 2019 +0200
+++ b/mpn/generic/hgcd2.c Mon Sep 23 00:20:59 2019 +0200
@@ -140,6 +140,224 @@
}
return res;
}
+
+#elif HGCD2_DIV1_METHOD == 4
+
+/* Table quotients. We extract the NBITS most significant bits of the
+ numerator limb, and the corresponding bits from the divisor limb, and use
+ these to form an index into the table. We only get NBITS/2 bits of
+ progress, and the table is very large. The hit rate for the table with
+ NBITS = 6 is about 80%, meaning we get a lot of poorly predicated branches.
+
+ Possible improvements:
+
+ * Don't identify the failure case early, by looking at the extracted divisor
+ bits. Instead do the table thing, and only invoke the slow path if the
+ remainder is off after one conditional adjustment. (This will require a
+ table where the 0 entries are properly filled in.)
+
+ * Perhaps extract the highest NBITS of the divisor instead of the same bits
+ as from the numerator. That would require another count_leading_zeros,
+ and a post-multiply shift of the quotient.
+
+ * Compress the table. Its values are tiny now, and there are lots of zero
+ entries (which are never used).
+
+ * Round the table entries more cleverly?
+*/
+
+#define NBITS 6 /* Must be even. */
+static const unsigned char tab[1 << (2 * NBITS - 1)] = {
+0,0,0,0,0,0,0,0,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,4,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
+0,0,0,0,0,0,0,0,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
+0,0,0,0,0,0,0,0,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
+0,0,0,0,0,0,0,0,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
+0,0,0,0,0,0,0,0,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1) << NBITS);
+
+mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+ int ncnt;
+ size_t nbi, dbi;
+ mp_limb_t q0;
+ mp_limb_t r0;
+ mp_double_limb_t res;
+
+ count_leading_zeros (ncnt, n0);
+ nbi = n0 << ncnt >> (64 - NBITS);
+ dbi = d0 << ncnt >> (64 - NBITS);
+
+ if (UNLIKELY (dbi < (1 << NBITS/2)))
+ {
+ q0 = n0 / d0;
+ r0 = n0 - q0 * d0;
+ }
+ else
+ {
+ mp_limb_t mask;
+ q0 = tabp[(nbi << NBITS) + dbi];
+ r0 = n0 - q0 * d0;
+ mask = -(mp_limb_t) (r0 >= d0);
+ q0 -= mask;
+ r0 -= d0 & mask;
+ }
+
+ res.d1 = q0;
+ res.d0 = r0;
+ return res;
+}
+
+
+#elif HGCD2_DIV1_METHOD == 5
+
+/* Table inverses of divisors. We don't bother with suppressing the msb from
+ the tables. We index with the NBITS most significant divisor bits,
+ including the always-set highest bit, but use addressing trickery via tabp
+ to suppress it. */
+
+#ifndef NBITS
+#define NBITS 8
+#endif
+
+#if NBITS == 7
+static const unsigned char tab[64] = {
+252,248,244,240,236,233,230,226,223,220,217,214,211,208,206,203,
+200,198,195,193,191,188,186,184,182,180,178,176,174,172,170,168,
+166,164,163,161,159,158,156,155,153,152,150,149,147,146,144,143,
+142,140,139,138,137,136,134,133,132,131,130,129,128,127,126,125
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 8
+static const unsigned short tab[128] = {
+508,504,500,496,492,488,485,481,477,474,470,467,464,460,457,454,
+451,447,444,441,438,435,432,430,427,424,421,418,416,413,410,408,
+405,403,400,398,395,393,390,388,386,383,381,379,377,374,372,370,
+368,366,364,362,360,358,356,354,352,350,348,346,344,342,340,339,
+337,335,333,332,330,328,326,325,323,322,320,318,317,315,314,312,
+311,309,308,306,305,303,302,300,299,298,296,295,293,292,291,289,
+288,287,285,284,283,282,280,279,278,277,276,274,273,272,271,270,
+269,267,266,265,264,263,262,261,260,259,258,257,256,255,254,253
+};
+static const unsigned short *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 9
+static const unsigned short tab[256] = {
+1020,1016,1012,1008,1004,1000,996,992,988,985,981,977,974,970,966,963,
+959,956,952,949,945,942,938,935,932,928,925,922,919,915,912,909,
+906,903,899,896,893,890,887,884,881,878,875,872,869,866,864,861,
+858,855,852,849,847,844,841,838,836,833,830,828,825,822,820,817,
+815,812,810,807,805,802,800,797,795,792,790,787,785,783,780,778,
+776,773,771,769,767,764,762,760,758,755,753,751,749,747,744,742,
+740,738,736,734,732,730,728,726,724,722,720,718,716,714,712,710,
+708,706,704,702,700,698,696,695,693,691,689,687,685,684,682,680,
+678,676,675,673,671,669,668,666,664,663,661,659,657,656,654,653,
+651,649,648,646,644,643,641,640,638,636,635,633,632,630,629,627,
+626,624,623,621,620,618,617,615,614,612,611,609,608,607,605,604,
+602,601,600,598,597,595,594,593,591,590,589,587,586,585,583,582,
+581,579,578,577,575,574,573,572,570,569,568,567,565,564,563,562,
+560,559,558,557,556,554,553,552,551,550,549,547,546,545,544,543,
+542,540,539,538,537,536,535,534,533,532,530,529,528,527,526,525,
+524,523,522,521,520,519,518,517,516,515,514,513,512,511,510,509
+};
+static const unsigned short *tabp = tab - (1 << (NBITS - 1));
+#else
+#error No table for provided NBITS
+#endif
+
+
+mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+ int ncnt, dcnt;
+ mp_limb_t q0;
+ mp_limb_t r0;
+ mp_double_limb_t res;
+
+ count_leading_zeros (ncnt, n0);
+ count_leading_zeros (dcnt, d0);
+
+ if (UNLIKELY (dcnt - ncnt >= NBITS - 2))
+ {
+ q0 = n0 / d0;
+ r0 = n0 - q0 * d0;
+ }
+ else
+ {
+ size_t dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
+ mp_limb_t qbits = tabp[dbi];
+ mp_limb_t mask;
+ q0 = ((n0 << ncnt) >> (NBITS + 1)) * qbits >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
+ r0 = n0 - q0 * d0;
+ mask = -(mp_limb_t) (r0 >= d0);
+ q0 -= mask;
+ r0 -= d0 & mask;
+ }
+
+ res.d1 = q0;
+ res.d0 = r0;
+ return res;
+}
+
#else
#error Unknown HGCD2_DIV1_METHOD
#endif
More information about the gmp-commit
mailing list