44
55#include "blake3_impl.h"
66
7+ #if defined(_MSC_VER )
8+ #include <Windows.h>
9+ #endif
10+
711#if defined(IS_X86 )
812#if defined(_MSC_VER )
913#include <intrin.h>
1014#elif defined(__GNUC__ )
1115#include <immintrin.h>
1216#else
13- #error " Unimplemented!"
17+ #undef IS_X86 /* Unimplemented! */
1418#endif
1519#endif
1620
17- /* Atomic access abstraction (since MSVC does not do C11 yet) */
18- #if defined(_MSC_VER ) && !defined(__clang__ )
19- #if !defined(IS_X86 )
20- #include <intrin.h>
21- #endif
22- #pragma warning(disable : 5105)
23- #ifndef FORCEINLINE
24- #define FORCEINLINE inline __forceinline
25- #endif
26- typedef volatile long atomic32_t ;
27- static FORCEINLINE int32_t atomic_load32 (atomic32_t * src ) {
28- return _InterlockedOr (src , 0 );
29- }
30- static FORCEINLINE void atomic_store32 (atomic32_t * dst , int32_t val ) {
31- _InterlockedExchange (dst , val );
32- }
21+ #if !defined(BLAKE3_ATOMICS )
22+ #if defined(__has_include )
23+ #if __has_include (< stdatomic .h > ) && !defined(_MSC_VER )
24+ #define BLAKE3_ATOMICS 1
3325#else
34- #include <stdatomic.h>
35- #ifndef FORCEINLINE
36- #define FORCEINLINE inline __attribute__((__always_inline__))
37- #endif
38- typedef volatile _Atomic (int32_t ) atomic32_t ;
39- static FORCEINLINE int32_t atomic_load32 (atomic32_t * src ) {
40- return atomic_load_explicit (src , memory_order_relaxed );
41- }
42- static FORCEINLINE void atomic_store32 (atomic32_t * dst , int32_t val ) {
43- atomic_store_explicit (dst , val , memory_order_relaxed );
44- }
26+ #define BLAKE3_ATOMICS 0
27+ #endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
28+ #else
29+ #define BLAKE3_ATOMICS 0
30+ #endif /* defined(__has_include) */
31+ #endif /* BLAKE3_ATOMICS */
32+
33+ #if BLAKE3_ATOMICS
34+ #define ATOMIC_INT _Atomic int
35+ #define ATOMIC_LOAD (x ) x
36+ #define ATOMIC_STORE (x , y ) x = y
37+ #elif defined(_MSC_VER )
38+ #define ATOMIC_INT LONG
39+ #define ATOMIC_LOAD (x ) InterlockedOr(&x, 0)
40+ #define ATOMIC_STORE (x , y ) InterlockedExchange(&x, y)
41+ #else
42+ #define ATOMIC_INT int
43+ #define ATOMIC_LOAD (x ) x
44+ #define ATOMIC_STORE (x , y ) x = y
4545#endif
4646
4747#define MAYBE_UNUSED (x ) (void)((x))
@@ -89,7 +89,6 @@ static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
8989#endif
9090}
9191
92- #endif
9392
9493enum cpu_feature {
9594 SSE2 = 1 << 0 ,
@@ -106,24 +105,25 @@ enum cpu_feature {
106105#if !defined(BLAKE3_TESTING )
107106static /* Allow the variable to be controlled manually for testing */
108107#endif
109- atomic32_t g_cpu_features = UNDEFINED ;
108+ ATOMIC_INT g_cpu_features = UNDEFINED ;
110109
111110LLVM_ATTRIBUTE_USED
112111#if !defined(BLAKE3_TESTING )
113112static
114113#endif
115114 enum cpu_feature
116115 get_cpu_features (void ) {
117- enum cpu_feature _cpu_features ;
118- _cpu_features = (enum cpu_feature )atomic_load32 (& g_cpu_features );
119- if (_cpu_features != UNDEFINED ) {
120- return _cpu_features ;
116+
117+ /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
118+ enum cpu_feature features = ATOMIC_LOAD (g_cpu_features );
119+ if (features != UNDEFINED ) {
120+ return features ;
121121 } else {
122122#if defined(IS_X86 )
123123 uint32_t regs [4 ] = {0 };
124124 uint32_t * eax = & regs [0 ], * ebx = & regs [1 ], * ecx = & regs [2 ], * edx = & regs [3 ];
125125 (void )edx ;
126- enum cpu_feature features = 0 ;
126+ features = 0 ;
127127 cpuid (regs , 0 );
128128 const int max_id = * eax ;
129129 cpuid (regs , 1 );
@@ -133,7 +133,7 @@ static
133133 if (* edx & (1UL << 26 ))
134134 features |= SSE2 ;
135135#endif
136- if (* ecx & (1UL << 0 ))
136+ if (* ecx & (1UL << 9 ))
137137 features |= SSSE3 ;
138138 if (* ecx & (1UL << 19 ))
139139 features |= SSE41 ;
@@ -156,15 +156,15 @@ static
156156 }
157157 }
158158 }
159- atomic_store32 ( & g_cpu_features , ( int32_t ) features );
159+ ATOMIC_STORE ( g_cpu_features , features );
160160 return features ;
161161#else
162162 /* How to detect NEON? */
163- atomic_store32 (& g_cpu_features , 0 );
164163 return 0 ;
165164#endif
166165 }
167166}
167+ #endif
168168
169169void blake3_compress_in_place (uint32_t cv [8 ],
170170 const uint8_t block [BLAKE3_BLOCK_LEN ],
0 commit comments