Skip to content

Commit df4617b

Browse files
daviesrob authored and jkbonfield committed
Check XCR0 register before using AVX2 / AVX512 instructions
Along with checking CPUID, it's necessary to look in the XCR0 register to check that AVX, AVX2 and AVX512 instructions can be used. (The operating system can write to this register to selectively enable or disable these features.) See the Intel 64 and IA-32 Architectures Software Developer’s Manual Vol. 1 sections 13.2 and 13.3 for details. XCR0 is read using the XGETBV instruction. While there is an intrinsic for this, using it requires specific compiler options that we may not want to use for htscodecs/rANS_static4x16pr.c compilation. The intrinsic also didn't work until GCC 9. As some versions of binutils still in use may not recognise the XGETBV mnemonic, the instruction is written as a byte stream in the inline assembly.
1 parent 4b2c1de commit df4617b

File tree

1 file changed

+29
-6
lines changed

1 file changed

+29
-6
lines changed

htscodecs/rANS_static4x16pr.c

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,9 @@ static void htscodecs_tls_cpu_init(void) {
864864
unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
865865
unsigned int have_xsave UNUSED = 0;
866866
unsigned int have_avx UNUSED = 0;
867+
uint64_t xcr0 UNUSED = 0ULL;
868+
const uint64_t xcr0_can_use_avx UNUSED = (1ULL << 2);
869+
const uint64_t xcr0_can_use_avx512 UNUSED = (7ULL << 5);
867870
// These may be unused, depending on HAVE_* config.h macros
868871

869872
int level = __get_cpuid_max(0, NULL);
@@ -880,21 +883,41 @@ static void htscodecs_tls_cpu_init(void) {
880883
#if defined(bit_SSE4_1)
881884
have_sse4_1 = ecx & bit_SSE4_1;
882885
#endif
883-
#if defined(bit_XSAVE) && defined(bit_OSXSAVE)
884-
have_xsave = (ecx & bit_XSAVE) && (ecx & bit_OSXSAVE);
885-
#endif
886886
#if defined(bit_AVX)
887887
have_avx = ecx & bit_AVX;
888+
#endif
889+
#if defined(bit_XSAVE) && defined(bit_OSXSAVE)
890+
have_xsave = (ecx & bit_XSAVE) && (ecx & bit_OSXSAVE);
891+
if (have_xsave) {
892+
/* OSXSAVE tells us it's safe to use XGETBV to read XCR0
893+
which then describes if AVX / AVX512 instructions can be
894+
executed. See Intel 64 and IA-32 Architectures Software
895+
Developer’s Manual Vol. 1 sections 13.2 and 13.3.
896+
897+
Use inline assembly for XGETBV here to avoid problems
898+
with builtins either not working correctly, or requiring
899+
specific compiler options to be in use. Also emit raw
900+
bytes here as older toolchains may not have the XGETBV
901+
instruction.
902+
*/
903+
__asm__ volatile (".byte 0x0f, 0x01, 0xd0" :
904+
"=d" (edx), "=a" (eax) :
905+
"c" (0));
906+
xcr0 = ((uint64_t) edx << 32) | eax;
907+
}
888908
#endif
889909
}
890-
// AVX2 and AVX512F depend on XSAVE, OSXSAVE and AVX
891-
if (level >= 7 && have_xsave && have_avx) {
910+
// AVX2 and AVX512F depend on XSAVE, AVX and bit 2 of XCR0.
911+
if (level >= 7 && have_xsave && have_avx
912+
&& (xcr0 & xcr0_can_use_avx) == xcr0_can_use_avx) {
892913
__cpuid_count(7, 0, eax, ebx, ecx, edx);
893914
#if defined(bit_AVX2)
894915
have_avx2 = ebx & bit_AVX2;
895916
#endif
896917
#if defined(bit_AVX512F)
897-
have_avx512f = ebx & bit_AVX512F;
918+
// AVX512 depends on bits 5:7 of XCR0
919+
if ((xcr0 & xcr0_can_use_avx512) == xcr0_can_use_avx512)
920+
have_avx512f = ebx & bit_AVX512F;
898921
#endif
899922
}
900923

0 commit comments

Comments
 (0)