udiv_qr_4by2
Marco Bodrato
bodrato at mail.dm.unipi.it
Tue Sep 3 19:55:03 UTC 2019
Ciao,
While a discussion on division is emerging, I am posting a proposed patch
that, although completely unrelated to it, rearranges the operations in
udiv_qr_4by2.
It should be faster, although I do not know how often it is actually used:
almost all thresholds in
https://gmplib.org/devel/thres/DIV_QR_2_PI2_THRESHOLD.html are
MP_SIZE_T_MAX /* never */
The patch replaces 1 add_SSSaaaa and 2 += with 1 add_SSaaaa and 1 ++.
*** /tmp/extdiff.bZsVXA/gmp.746b5528f6a5/mpn/generic/div_qr_2.c 2019-08-25 10:32:39.000000000 +0200
--- /home/bodrato/gmp/mpn/generic/div_qr_2.c 2019-08-26 21:47:22.562877220 +0200
***************
*** 111,136 ****
/* Typically used with r1, r0 same as n3, n2. Other types of overlap
between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
do { \
mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
mp_limb_t _t1, _t0; \
! mp_limb_t _c, _mask; \
\
! umul_ppmm (_q3,_q2a, n3, di1); \
umul_ppmm (_q2,_q1, n2, di1); \
umul_ppmm (_q2c,_q1c, n3, di0); \
! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c); \
umul_ppmm (_q1d,_q0, n2, di0); \
! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d); \
! \
! /* [q3,q2,q1,q0] += [n3,n2,n1,n0] + [ 0, 1, 0, 0] */ \
! _q3 += n3; \
! _q0 += n0; _c = n0 > _q0; \
! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2, _c); \
! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, CNST_LIMB(1), n1); \
\
umul_ppmm (_t1,_t0, _q2, d0); \
_t1 += _q2 * d1 + _q3 * d0; \
\
sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
\
--- 111,134 ----
/* Typically used with r1, r0 same as n3, n2. Other types of overlap
between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
do { \
mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
mp_limb_t _t1, _t0; \
! mp_limb_t _mask; \
\
! /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */ \
umul_ppmm (_q2,_q1, n2, di1); \
+ umul_ppmm (_q3,_q2a, n3, di1); \
+ ++_q2; /* _q2 cannot overflow */ \
+ add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a); \
umul_ppmm (_q2c,_q1c, n3, di0); \
! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c); \
umul_ppmm (_q1d,_q0, n2, di0); \
! add_sssaaaa (_q2c,_q1d,_q0, _q1d,_q0, n1,n0); /* _q2c cannot overflow */ \
! add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d); \
\
umul_ppmm (_t1,_t0, _q2, d0); \
_t1 += _q2 * d1 + _q3 * d0; \
\
sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
\
Ĝis,
m
--
http://bodrato.it/papers/
More information about the gmp-devel
mailing list