88// For now, we only try to enable SIMD instructions for x86-64 Intel CPUs.
99// In the future, we should carefully enable support for ARM NEON and POWER
1010// as well as AMD. See https://sourceforge.net/p/predef/wiki/Architectures.
11+ #define HAS_CPUID_SUPPORT
1112#if defined(__x86_64__ ) && defined(__GNUC__ )
1213# include <cpuid.h> // __cpuid_count()
13- # define HAS_CPUID_SUPPORT
14- # if defined(__clang__ )
15- # include <immintrin.h> // _xgetbv()
16- # endif
17- # define HAS_XGETBV_SUPPORT
14+ # include <immintrin.h> // _xgetbv()
1815#elif defined(_M_X64 ) || defined(__amd64__ ) || defined(_M_AMD64 )
1916# include <intrin.h> // __cpuidex()
20- # define HAS_CPUID_SUPPORT
2117# include <immintrin.h> // _xgetbv()
22- # define HAS_XGETBV_SUPPORT
2318#else
2419# undef HAS_CPUID_SUPPORT
25- # undef HAS_XGETBV_SUPPORT
2620#endif
2721
2822// Below, we declare macros for guarding the detection of SSE, AVX/AVX2
2923// and AVX-512 instructions. If the compiler does not even recognize the
3024// corresponding flags or if we are not on an 64-bit platform we do not
3125// even try to inspect the output of CPUID for those specific features.
3226#ifdef HAS_CPUID_SUPPORT
27+ #if defined(_Py_CPUINFO_USE_XGETBV_FUNC ) \
28+ || defined(_Py_CPUINFO_USE_XGETBV_OPCODE )
29+ # define HAS_XGETBV_SUPPORT
30+ #endif
31+
3332#if defined(_Py_CAN_COMPILE_SIMD_SSE_INSTRUCTIONS ) \
3433 || defined(_Py_CAN_COMPILE_SIMD_SSE2_INSTRUCTIONS ) \
3534 || defined(_Py_CAN_COMPILE_SIMD_SSE3_INSTRUCTIONS ) \
@@ -159,19 +158,10 @@ static uint64_t /* should only be used after calling cpuid(1, 0, ...) */
159158get_xgetbv (uint32_t index )
160159{
161160 assert (index == 0 ); // only XCR0 is supported for now
162- # if defined(HAS_CPUID_SUPPORT ) && defined(__x86_64__ ) && defined(__GNUC__ )
163- # if defined(__clang__ )
164- # if _Py__has_builtin (__builtin_ia32_xgetbv )
161+ #if defined(_Py_CPUINFO_USE_XGETBV_FUNC )
162+ /* directly use the compiler's helper if -mxsave is available */
165163 return (uint64_t )_xgetbv (index );
166- # else
167- /*
168- * Without -mxsave support, directly using xgetbv() with raw opcode
169- * may still fail on some platforms (e.g., AMD64 + FreeBSD + clang).
170- * To be on the safe side, we assume that XGETBV is not supported.
171- */
172- return 0 ;
173- # endif
174- # else /* gcc & icc */
164+ #elif defined(__x86_64__ ) && defined(__GNUC__ )
175165 uint32_t eax = 0 , edx = 0 ;
176166 __asm__ volatile (
177167 /* raw opcode for xgetbv for compatibility with older toolchains */
@@ -180,14 +170,15 @@ get_xgetbv(uint32_t index)
180170 : "c" (index )
181171 );
182172 return ((uint64_t )edx << 32 ) | eax ;
183- # endif
184- # elif defined(HAS_CPUID_SUPPORT ) && defined(_M_X64 )
173+ #elif defined(_M_X64 )
185174 return (uint64_t )_xgetbv (index );
186- # else
175+ #else
187176 (void )index ;
188177 return 0 ;
189- # endif
178+ #endif
190179}
180+ #else
181+ #define get_xgetbv (_INDEX ) 0
191182#endif
192183
193184/* Highest Function Parameter and Manufacturer ID (LEAF=0, SUBLEAF=0). */
@@ -364,14 +355,12 @@ detect_cpuid_xsave_state(_Py_cpuid_features *flags)
364355 assert (flags -> maxleaf >= 1 );
365356 (void )flags ;
366357 // Keep the ordering and newlines as they are declared in the structure.
367- #ifdef HAS_XGETBV_SUPPORT
368358 uint64_t xcr0 = flags -> xsave && flags -> osxsave ? get_xgetbv (0 ) : 0 ;
369359 flags -> xcr0_sse = XSAVE_CHECK_REG (xcr0 , XCR0_SSE );
370360 flags -> xcr0_avx = XSAVE_CHECK_REG (xcr0 , XCR0_AVX );
371361 flags -> xcr0_avx512_opmask = XSAVE_CHECK_REG (xcr0 , XCR0_AVX512_OPMASK );
372362 flags -> xcr0_avx512_zmm_hi256 = XSAVE_CHECK_REG (xcr0 , XCR0_AVX512_ZMM_HI256 );
373363 flags -> xcr0_avx512_hi16_zmm = XSAVE_CHECK_REG (xcr0 , XCR0_AVX512_HI16_ZMM );
374- #endif
375364}
376365#endif
377366
0 commit comments