mpn_sqrtrem{1,2}  patch for pure C implem
Adrien ProstBoucle
adrien.prostboucle at laposte.net
Sat Apr 1 19:02:37 UTC 2017
On Sat, 20170401 at 18:15 +0200, Marco Bodrato wrote:
> Sorry, but even correcting the obvious typos, it doesn't pass the
> tests.
I think I have found the error.
The final correction was wrong.
I hope it's OK now, but... I still can't compile GMP with ABI=32.
Like you suggested I launched: ./configure ABI=32 && make && make check
The 5th or 6th source file does not compile:
libtool: compile: gcc DHAVE_CONFIG_H I. I.. D__GMP_WITHIN_GMP I.. DOPERATION_com m32 O2 pedantic fomitframepointer mtune=core2 march=core2 c com.c fPIC DPIC o .libs/com.o
In file included from ../gmpimpl.h:147:0,
from com.c:31:
../fib_table.h:4:1: warning: data definition has no type or storage class
Error, error, this data is for 64 bits
^~~~~
../fib_table.h:4:1: warning: type defaults to ‘int’ in declaration of ‘Error’ [Wimplicitint]
../fib_table.h:4:8: warning: type defaults to ‘int’ in declaration of ‘error’ [Wimplicitint]
Error, error, this data is for 64 bits
^~~~~
../fib_table.h:4:20: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘data’
Error, error, this data is for 64 bits
^~~~
Previously I was thinking there was an install issue with my multilib packages.
But I can compile my own programs with 32bit and x32 ABIs.
Only GMP doesn't want to compile...
> On the other side, I tested the proposed sqrtrem1 for ABI=64 on shell (
>
> Before the patch:
> $ (cd tests/devel; make sqrtrem_1_2)&&time tests/devel/sqrtrem_1_2 c
> Corner cases tested, up to bits:
> \ 63
> Values of a single limb, tested.
>
> > user 1m56.807s
>
> After the patch:
> $ (cd tests/devel; make sqrtrem_1_2)&&time tests/devel/sqrtrem_1_2 c
> Corner cases tested, up to bits:
> \ 63
> Values of a single limb, tested.
>
> > user 1m47.889s
>
> The code passed the test with an overall 8% gain in speed.
I'm relieved to know it works for other people :]
> May I suggest to save one more operation with:
>
> invroot = invroot * (((CNST_LIMB(3) << (GMP_LIMB_BITS29)) 
> (a0 >> 27) * invroot * invroot));
>
> another 1% can be gained...
Great catch!
I was too focused on using as many significant bits possible, didn't realize it was not useful there.
I also saved one instruction in the final correction step (for now lightly tested, just make check).
The new diff is at the end of this message.
Regards,
Adrien
diff r 6393a907d1b5 mpn/generic/sqrtrem.c
 a/mpn/generic/sqrtrem.c Wed Mar 08 19:57:44 2017 +0100
+++ b/mpn/generic/sqrtrem.c Sat Apr 01 20:52:22 2017 +0200
@@ 49,121 +49,144 @@
#define USE_DIVAPPR_Q 1
#define TRACE(x)
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
 0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
 0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
 0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
 0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
 0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
 0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
 0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
 0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
 0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
 0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
 0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
 0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
 0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
 0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
 0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
 0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
 0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
 0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
 0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
 0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
 0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
 0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
 0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
 0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
 0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
 0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
 0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
 0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
 0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
 0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
 0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
 0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
 0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
 0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
 0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
 0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
 0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
 0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
 0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
 0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
 0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
 0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
 0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
 0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
 0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
 0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
 0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
 0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00 /* sqrt(1/1f8)..sqrt(1/1ff) */
+/* Table of precalculated inverse square root 1/sqrt(a)
+ Note: x must be normalized, i.e. the 2 MSBs are not both zero
+ Access the table by taking the 9 MSBs of the input value, minus 128
+ Then add 256 to the table value
+ For 32bit values: values in the table are shifted 24 bits (multiplied by 2^24)
+ For 64bit values: values in the table are shifted 40 bits (multiplied by 2^40) */
+static const unsigned char invsqrttab[384] = {
+ 255, 253, 251, 249, 247, 245, 243, 242, 240, 238, 236, 234, 233, 231, 229, 227,
+ 226, 224, 222, 221, 219, 218, 216, 215, 213, 211, 210, 208, 207, 205, 204, 203,
+ 201, 200, 198, 197, 195, 194, 193, 191, 190, 189, 187, 186, 185, 184, 182, 181,
+ 180, 179, 177, 176, 175, 174, 173, 171, 170, 169, 168, 167, 166, 165, 164, 162,
+ 161, 160, 159, 158, 157, 156, 155, 154, 153, 152, 151, 150, 149, 148, 147, 146,
+ 145, 144, 143, 142, 141, 140, 139, 138, 138, 137, 136, 135, 134, 133, 132, 131,
+ 130, 130, 129, 128, 127, 126, 125, 125, 124, 123, 122, 121, 121, 120, 119, 118,
+ 117, 117, 116, 115, 114, 114, 113, 112, 111, 111, 110, 109, 108, 108, 107, 106,
+ 106, 105, 104, 103, 103, 102, 101, 101, 100, 99, 99, 98, 97, 97, 96, 95,
+ 95, 94, 94, 93, 92, 92, 91, 90, 90, 89, 89, 88, 87, 87, 86, 86,
+ 85, 84, 84, 83, 83, 82, 81, 81, 80, 80, 79, 79, 78, 78, 77, 76,
+ 76, 75, 75, 74, 74, 73, 73, 72, 72, 71, 71, 70, 70, 69, 69, 68,
+ 67, 67, 66, 66, 65, 65, 65, 64, 64, 63, 63, 62, 62, 61, 61, 60,
+ 60, 59, 59, 58, 58, 57, 57, 56, 56, 56, 55, 55, 54, 54, 53, 53,
+ 52, 52, 52, 51, 51, 50, 50, 49, 49, 49, 48, 48, 47, 47, 47, 46,
+ 46, 45, 45, 45, 44, 44, 43, 43, 43, 42, 42, 41, 41, 41, 40, 40,
+ 39, 39, 39, 38, 38, 37, 37, 37, 36, 36, 36, 35, 35, 35, 34, 34,
+ 33, 33, 33, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28,
+ 28, 27, 27, 27, 26, 26, 26, 25, 25, 25, 24, 24, 24, 24, 23, 23,
+ 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9,
+ 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4,
+ 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0,
};
/* Compute s = floor(sqrt(a0)), and *rp = a0  s^2. */
+/* mpn_sqrtrem1: Compute s = floor(sqrt(a0)), and *rp = a0  s^2 */
#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
+#if GMP_NUMB_BITS <= 32
+/* 32bit code */
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
 mp_limb_t a1;
#endif
 mp_limb_t x0, t2, t, x2;
 unsigned abits;
+ mp_limb_t root, invroot;
+ mp_limb_signed_t accu;
ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
 ASSERT_ALWAYS (GMP_LIMB_BITS == 32  GMP_LIMB_BITS == 64);
+ ASSERT_ALWAYS (GMP_LIMB_BITS == 32);
ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
 /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
 since we can do the former without division. As part of the last
 iteration convert from 1/sqrt(a) to sqrt(a). */
+ /* Get 9 bits of the inverse square root */
+ invroot = ((mp_limb_t)invsqrttab[(a0 >> 23)  128]) + 256;
 abits = a0 >> (GMP_LIMB_BITS  1  8); /* extract bits for table lookup */
 x0 = 0x100  invsqrttab[abits  0x80]; /* initial 1/sqrt(a) */
+ /* Here we have an approximation of 1/sqrt(x), shifted by 24 bits
+ Exactly 9 bits are used */
 /* x0 is now an 8 bits approximation of 1/sqrt(a0) */
+ /* Get the root: it is shifted by 24  9 = 15 */
+ root = (a0 >> 9) * invroot;
+ root = root >> 15;
+ /* Here we have the root, not shifted */
#if GMP_NUMB_BITS > 32
 a1 = a0 >> (GMP_LIMB_BITS  1  32);
 t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000)  0x30000  a1 * x0 * x0) >> 16;
 x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

 /* x0 is now a 16 bits approximation of 1/sqrt(a0) */
+ /* Perform one Newton iteration for the square root:
+ r = r + (a  r^2) / 2r */
 t2 = x0 * (a0 >> (328));
 t = t2 >> 25;
 t = ((mp_limb_signed_t) ((a0 << 14)  t * t  MAGIC) >> (328));
 x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
 x0 >>= 32;
#else
 t2 = x0 * (a0 >> (168));
 t = t2 >> 13;
 t = ((mp_limb_signed_t) ((a0 << 6)  t * t  MAGIC) >> (168));
 x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
 x0 >>= 16;
#endif
+ accu = a0  root * root;
+ accu = (accu >> 3) * invroot;
+ /* Here we have (a  r^2) / r, shifted by 24  3 = 21
+ Add a bit of rounding to ease correction */
+ root += ((accu + (1 << 21)) >> 21) / 2;
 /* x0 is now a full limb approximation of sqrt(a0) */
+ /* Correction: subtract 1 to the root if needed */
+ accu = a0  root * root;
+ if (accu < 0) {
+ accu += 2*root  1;
+ root;
+ }
 x2 = x0 * x0;
 if (x2 + 2*x0 <= a0  1)
 {
 x2 += 2*x0 + 1;
 x0++;
 }

 *rp = a0  x2;
 return x0;
+ *rp = accu;
+ return root;
}
+#else
+/* 64bit code */
+
+static mp_limb_t
+mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
+{
+ mp_limb_t root, invroot;
+ mp_limb_t accu;
+
+ ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
+ ASSERT_ALWAYS (GMP_LIMB_BITS == 64);
+ ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
+
+ // Get 9 bits of the inverse square root
+ invroot = ((mp_limb_t)invsqrttab[(a0 >> 55)  128]) + 256;
+
+ /* Here we have an approximation of 1/sqrt(x), shifted by 40 bits
+ Exactly 9 bits are used */
+
+ /* Perform one Newton iteration for the inverse square root:
+ y = y * (3  a * y^2) / 2*/
+
+ /* First, do a * y^2, using as many significant bits as possible */
+ accu = (a0 >> 27) * invroot * invroot;
+ /* Here we have a * y^2 shifted by 2 * 40  27 = 53
+ Subtract that to the value 3, shifted accordingly */
+ accu = ((mp_limb_t)3 << 53)  accu;
+ /* Now the multiplication by y / 2: the division by 2 gets part of the shift */
+ invroot = invroot * accu;
+
+ /* Here the inverse square root is shifted by 40 + 53 + 1 = 94 bits
+ 63 or 64 bits are used */
+
+ // Get the root
+ root = (a0 >> 32) * (invroot >> 32);
+ // Here we have the root, shifted by 94  64 = 30
+ root = root >> 30;
+ /* Here we have the root, not shifted */
+
+ /* Perform one Newton iteration for the square root:
+ r = r + (a  r^2) / 2r */
+
+ accu = a0  root * root;
+ accu = (accu >> 20) * (invroot >> 27);
+ // Here we have (a  r^2) / r, shifted by 94  47 = 47
+ root += (accu >> 47) / 2;
+
+ /* Correction: add 1 to the root if needed (test: r^2 + 2r  a < 0) */
+ accu = a0  root * root;
+ if ( (mp_limb_signed_t)(2*root  accu) < 0) {
+ accu = 2*root + 1;
+ root++;
+ }
+
+ *rp = accu;
+ return root;
+}
+
+#endif
#define Prec (GMP_NUMB_BITS >> 1)
#if ! defined(SQRTREM2_INPLACE)
More information about the gmpdevel
mailing list