udiv_qr_3by2 vs divappr

Niels Möller nisse at lysator.liu.se
Tue Jul 31 20:34:48 UTC 2018


tg at gmplib.org (Torbjörn Granlund) writes:

> It is very neat.  Perhaps remove the dn and dn offset-by-2?

New patch at the end of this message. The new code has a few indexing cleanups, and
there's an #if 0 to change to enable the new algorithm.

I have also had another look at the analysis, and I'm fairly confident
the algorithm is correct also for the corner cases of d_0 == 0 and {n1,
n0} very close to {d1, d0}.

I couldn't measure any nice speedup on my local machine (but benchmarking
is very noisy on that machine). On shell, I see a slight speedup. I
measure ratio to mul_basecase, to get nicer numbers.

On shell, before:

$ ./speed-orig -s 3-50 -r mpn_mul_basecase mpn_sbpi1_div_qr
overhead 0.000000002 secs, precision 10000 units of 2.86e-10 secs, CPU
freq 3500.09 MHz
        mpn_mul_basecase mpn_sbpi1_div_qr
3        #0.000000016        3.4867
4        #0.000000021        3.5958
5        #0.000000030        3.2876
6        #0.000000039        3.2126
7        #0.000000052        2.9068
8        #0.000000065        2.8136
9        #0.000000083        2.5651
10       #0.000000100        2.4419
11       #0.000000122        2.2774
12       #0.000000137        2.2546
13       #0.000000166        2.1592
14       #0.000000189        2.0516
15       #0.000000280        1.5683
16       #0.000000239        2.0217
17       #0.000000272        1.9248
18       #0.000000308        1.8982
19       #0.000000344        1.8586
20       #0.000000374        1.7873
21       #0.000000533        1.3847
22       #0.000000452        1.7306
23       #0.000000500        1.6640
24       #0.000000530        1.6621
25       #0.000000751        1.2999
26       #0.000000635        1.6316
27       #0.000000678        1.6053
28       #0.000000733        1.5545
29       #0.000000783        1.5318
30       #0.000000830        1.5225
31       #0.000000882        1.4979
32       #0.000000956        1.5405
33       #0.000001015        1.5636
34       #0.000001066        1.5133
35       #0.000001117        1.5049
36       #0.000001264        1.3767
37       #0.000001229        1.4766
38       #0.000001396        1.3529
39       #0.000001434        1.3824
40       #0.000001477        1.3749
41       #0.000001541        1.3732
42       #0.000001610        1.3665
43       #0.000001678        1.3590
44       #0.000001723        1.3641
45       #0.000001804        1.3567
46       #0.000001872        1.3543
47       #0.000001942        1.4840
48       #0.000002001        1.4982
49       #0.000002084        1.4695
50       #0.000002158        1.4617

On shell, after:

$ ./speed-divappr -s 3-50 -r mpn_mul_basecase mpn_sbpi1_div_qr
overhead 0.000000002 secs, precision 10000 units of 2.86e-10 secs, CPU
freq 3500.09 MHz
        mpn_mul_basecase mpn_sbpi1_div_qr
3        #0.000000016        3.2304
4        #0.000000021        3.3812
5        #0.000000030        3.0853
6        #0.000000039        3.0069
7        #0.000000053        2.7129
8        #0.000000064        2.6349
9        #0.000000082        2.3887
10       #0.000000100        2.3060
11       #0.000000120        2.1199
12       #0.000000137        2.1152
13       #0.000000163        1.9874
14       #0.000000242        1.5294
15       #0.000000218        1.8738
16       #0.000000239        1.9060
17       #0.000000275        1.8474
18       #0.000000309        1.7585
19       #0.000000342        1.8170
20       #0.000000365        1.7681
21       #0.000000409        1.6864
22       #0.000000452        1.6789
23       #0.000000502        1.6029
24       #0.000000527        1.6192
25       #0.000000589        1.6232
26       #0.000000634        1.5864
27       #0.000000677        1.5566
28       #0.000000737        1.5085
29       #0.000000782        1.4899
30       #0.000000826        1.4832
31       #0.000000875        1.4676
32       #0.000000955        1.5075
33       #0.000001013        1.4935
34       #0.000001064        1.4753
35       #0.000001118        1.4600
36       #0.000001154        1.4733
37       #0.000001220        1.4669
38       #0.000001280        1.4440
39       #0.000001433        1.3355
40       #0.000001475        1.3528
41       #0.000001543        1.3432
42       #0.000001606        1.3406
43       #0.000001673        1.3355
44       #0.000001723        1.3386
45       #0.000001798        1.3312
46       #0.000002137        1.1611
47       #0.000001940        1.5360
48       #0.000001998        1.4594
49       #0.000002080        1.4407
50       #0.000002165        1.4310

So it seems to be approximately 5% faster for the smaller sizes.

Now speedup isn't the main objective here; I'm happy it didn't regress, and
I hope it's not regressing on other important machines.

The thing is, if we can pull this off, then it should be reasonably
straightforward to write a corresponding routine handling the
unnormalized case without complete up-front shifting of both inputs.

One would only need to normalize top limbs of d, by the time the
reciprocal is computed. Then pass inverse and shift count to the main
loop. The loop would then need to compute n1 and n0 (used for quotient
approximation) from reading and shifting the top three limbs of n.


diff -r 1ad8cc22b714 mpn/generic/sbpi1_div_qr.c
--- a/mpn/generic/sbpi1_div_qr.c	Tue Jul 03 11:16:06 2018 +0200
+++ b/mpn/generic/sbpi1_div_qr.c	Tue Jul 31 22:01:12 2018 +0200
@@ -39,6 +39,91 @@
 #include "gmp-impl.h"
 #include "longlong.h"
 
+
+#if 0
+static inline mp_limb_t
+divappr_2 (mp_limb_t n2, mp_limb_t n1,
+	   mp_limb_t d1, mp_limb_t d0, mp_limb_t dinv)
+{
+  mp_limb_t q1, q0, r1, t1, t0, mask;
+
+  umul_ppmm (q1, q0, n2, dinv);
+  add_ssaaaa (q1, q0, q1, q0, n2, n1);
+
+  umul_ppmm (t1, t0, q1, d0);
+  q1++; /* May overflow */
+  r1 = n1 - d1 * q1 - t1 - 1 - (t0 > -d0);
+
+  mask = - (mp_limb_t) (r1 >= q0);
+  q1 += mask;
+  r1 += mask & (d1 + 1);
+  q1 += (r1 >= d1 - 1);
+
+  return q1;
+}
+
+mp_limb_t
+mpn_sbpi1_div_qr (mp_ptr qp,
+		  mp_ptr np, mp_size_t nn,
+		  mp_srcptr dp, mp_size_t dn,
+		  mp_limb_t dinv)
+{
+  mp_limb_t qh;
+  mp_size_t i;
+  mp_limb_t d1, d0, d1m2;
+
+  ASSERT (dn > 2);
+  ASSERT (nn >= dn);
+  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+
+  np += nn;
+
+  qh = mpn_cmp (np - dn, dp, dn) >= 0;
+  if (qh != 0)
+    mpn_sub_n (np - dn, np - dn, dp, dn);
+
+  qp += nn - dn;
+
+  d1 = dp[dn - 1];
+  d0 = dp[dn - 2];
+
+  /* d - 2 = {d1m2, d0 - 2} */
+  d1m2 = d1 - (d0 < 2);
+
+  /*   np -= 2; */
+
+  for (i = nn - dn; i > 0; i--)
+    {
+      mp_limb_t n1, n0;
+      mp_limb_t cy;
+      mp_limb_t q;
+      np--;
+      n1 = np[0];
+      n0 = np[-1];
+
+      if (UNLIKELY (n1 >= d1m2)	&& (n1 > d1m2 || n0 >= (d0-2)))
+	q = GMP_NUMB_MASK;
+      else
+	q = divappr_2 (n1, n0, d1, d0, dinv);
+
+      cy = mpn_submul_1 (np - dn, dp, dn, q);
+      ASSERT (cy >= n1);
+
+      if (UNLIKELY (cy != n1))
+	{
+	  ASSERT (cy - 1 == n1);
+	  ASSERT_CARRY(mpn_add_n (np - dn, np - dn, dp, dn));
+	  q--;
+	}
+
+      *--qp = q;
+    }
+
+  return qh;
+}
+
+#else
+
 mp_limb_t
 mpn_sbpi1_div_qr (mp_ptr qp,
 		  mp_ptr np, mp_size_t nn,
@@ -107,3 +192,4 @@
 
   return qh;
 }
+#endif

> And moving the np decrement earlier would change np[-1] to np[0], a very
> slight improvement.

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


More information about the gmp-devel mailing list