Illegal instruction fix

William Blanke bill at chia.net
Wed Jan 6 00:03:19 UTC 2021


At Chia, we are using GMP inside Python binary wheels. This means we
compile GMP on one machine and then run this code on different machines,
possibly with different CPUs.

When GMP 6.2 was released, we encountered illegal-instruction crashes on
Intel CPUs older than Haswell. We traced this to the LZCNT instruction
being executed on these older CPUs, even though a fat binary build was
selected on the post-Haswell compile machine. Unfortunately, this still
happens with GMP 6.2.1.

To fix this we have created the patch below. It chooses whether to use the
LZCNT, MULX, and TZCNT instructions based on CPU support via runtime CPUID
checks for LZCNT, BMI2, and BMI1 capabilities respectively.

We would like to submit this for upstream inclusion in future versions of
GMP. We hope that it helps and we thank you for all your efforts!

Cheers
Bill

longlong.h

1042a1043,1122
>
> #ifndef RUNTIMECPUID
> #define RUNTIMECPUID
>
> /* Run-time CPU feature detection, so one binary can run on CPUs with and
>    without LZCNT, BMI1 (TZCNT) and BMI2 (MULX).  The cache variables are
>    defined once in compat.c.  Helpers are `static inline': a plain C99
>    `inline' in a header emits no external definition, so the link fails
>    whenever the compiler decides not to inline a call (e.g. at -O0).  */
> extern int bCheckedBMI, bBMI1, bBMI2;
> extern int bCheckedLZCNT, bLZCNT;
>
> /* Run CPUID for LEAF (sub-leaf 0): EAX, EBX, ECX, EDX go to info[0..3].
>    Compilers matched by neither branch leave info as the caller set it.  */
> static inline void queryCPUID(unsigned int leaf, int info[4])
> {
> #if defined(_MSC_VER)
>     __cpuid(info, (int) leaf);
> #elif defined(__GNUC__) || defined(__clang__)
> #if defined(__i386__) && defined(__PIC__)
>     /* 32-bit PIC code may reserve EBX, so swap it out around CPUID.  */
>     __asm__ __volatile__ (
>                 "xchg{l} {%%}ebx, %k1;"
>                 "cpuid;"
>                 "xchg{l} {%%}ebx, %k1;"
>                 : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3])
>                 : "a"(leaf), "c"(0));
> #else
>     __asm__ __volatile__ ("cpuid"
>                 : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
>                 : "a"(leaf), "c"(0));
> #endif
> #endif
> }
>
> /* Fill bBMI1/bBMI2 from CPUID leaf 7 (EBX bits 3 and 8) on the first call.
>    Leaf 7 is read only if leaf 0 reports it: a CPU may return unrelated data
>    for an out-of-range leaf.  Results are stored before the "checked" marker;
>    without atomics, racing first calls may repeat the query or fall back.  */
> static inline void hasBMI(void)
> {
>     int info[4] = {0};
>     int bmi1 = 0, bmi2 = 0;
>     if (bCheckedBMI)
>         return;
>     queryCPUID(0, info);
>     if ((unsigned int) info[0] >= 7) {
>         queryCPUID(7, info);
>         bmi1 = ((info[1] & (1 << 3)) != 0);
>         bmi2 = ((info[1] & (1 << 8)) != 0);
>     }
>     bBMI1 = bmi1;
>     bBMI2 = bmi2;
>     bCheckedBMI = 1;
> }
>
> /* Nonzero if the CPU reports BMI1 (gates TZCNT) resp. BMI2 (gates MULX).  */
> static inline int hasBMI1(void) { hasBMI(); return bBMI1; }
> static inline int hasBMI2(void) { hasBMI(); return bBMI2; }
>
> /* Nonzero if the CPU reports LZCNT (leaf 0x80000001, ECX bit 5), cached in
>    bLZCNT.  The leaf is read only when leaf 0x80000000 reports it.  */
> static inline int hasLZCNT(void)
> {
>     int info[4] = {0};
>     int lzcnt = 0;
>     if (bCheckedLZCNT)
>         return bLZCNT;
>     queryCPUID(0x80000000u, info);
>     if ((unsigned int) info[0] >= 0x80000001u) {
>         queryCPUID(0x80000001u, info);
>         lzcnt = ((info[2] & (1 << 5)) != 0);
>     }
>     bLZCNT = lzcnt;
>     bCheckedLZCNT = 1;
>     return lzcnt;
> }
>
> #endif /* RUNTIMECPUID */
>
1053,1055d1132
< #if X86_ASM_MULX \
<    && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
<        || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
1057c1134,1135
<   __asm__ ("mulx\t%3, %q0, %q1" \
---
>   if(hasBMI2()) {                                                       \
>       __asm__ ("mulx\t%3, %q0, %q1" \
1059,1062c1137,1139
<   : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
< #else
< #define umul_ppmm(w1, w0, u, v) \
<   __asm__ ("mulq\t%3" \
---
>   : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)));                  \
>   } else {                                                              \
>       __asm__ ("mulq\t%3" \
1064,1065c1141,1142
<   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
< #endif
---
>   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)));                  \
>   }
1071,1074d1147
< #if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
<   || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \
<   || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \
<   || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
1076,1080c1149,1162
<   do { \
<     /* This is lzcnt, spelled for older assemblers.  Destination and */ \
<     /* source must be a 64-bit registers, hence cast and %q.         */ \
<     __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
<   } while (0)
---
>   if(hasLZCNT()) {                                                      \
>     do { \
>       /* This is lzcnt, spelled for older assemblers.  Destination and */ \
>       /* source must be a 64-bit registers, hence cast and %q.         */ \
>       __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
>      } while (0); \
>   } else { \
>     do { \
>       UDItype __cbtmp; \
>       ASSERT ((x) != 0); \
>       __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
>       (count) = __cbtmp ^ 63; \
>     } while (0);                                                         \
>   }
1082,1090d1163
< #else
< #define count_leading_zeros(count, x) \
<   do { \
<     UDItype __cbtmp; \
<     ASSERT ((x) != 0); \
<     __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
<     (count) = __cbtmp ^ 63; \
<   } while (0)
< #endif
1092,1093d1164
< #if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
<   || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
1095,1099c1166,1177
<   do { \
<     /* This is tzcnt, spelled for older assemblers.  Destination and */ \
<     /* source must be a 64-bit registers, hence cast and %q.         */ \
<     __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
<   } while (0)
---
>   if(hasBMI1()) {                                                       \
>     do { \
>       /* This is tzcnt, spelled for older assemblers.  Destination and */ \
>       /* source must be a 64-bit registers, hence cast and %q.         */ \
>       __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
>     } while (0);                                                        \
>   } else {                                                              \
>     do {                                                                \
>       ASSERT ((x) != 0);                                                \
>       __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));    \
>     } while (0);                                                        \
>   }
1101,1107d1178
< #else
< #define count_trailing_zeros(count, x) \
<   do { \
<     ASSERT ((x) != 0); \
<     __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
<   } while (0)
< #endif

compat.c

33a34,39
> /* RUNTIMECPUID: definitions of the caches declared extern in longlong.h */
> int bCheckedBMI = 0;   /* set to 1 by hasBMI() on its first call */
> int bBMI1 = 0;         /* CPUID leaf 7 EBX bit 3, as stored by hasBMI() */
> int bBMI2 = 0;         /* CPUID leaf 7 EBX bit 8, as stored by hasBMI() */
> int bCheckedLZCNT = 0; /* set to 1 by hasLZCNT() on its first call */
> int bLZCNT = 0;        /* CPUID leaf 0x80000001 ECX bit 5, from hasLZCNT() */


More information about the gmp-bugs mailing list