Skip to content

Commit c346d94

Browse files
committed
Fix the build and implementation of the 16-byte atomics for MSVC.
Credit for the cmake fix here goes to Saleem Abdulrasool. The substantive fix is embarrassing; I didn't pay close attention to the intrinsic's argument order and just assumed that the first argument for the replacement value was the low half (the part you'd find at index 0 if it were an array), but in fact it's the high half (the part you'd find at index 1). I also change the code to be much more reinterpret_casty, which isolates the type-punning mostly "within" the intrinsic, and which seems to match how other code uses it.
1 parent f569be2 commit c346d94

File tree

3 files changed

+40
-39
lines changed

3 files changed

+40
-39
lines changed

cmake/modules/AddSwift.cmake

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -248,18 +248,6 @@ function(_add_host_variant_c_compile_flags target)
248248
endif()
249249
endif()
250250

251-
# The concurrency library uses double-word atomics. MSVC's std::atomic
252-
# uses a spin lock for this, so to get reasonable behavior we have to
253-
# implement it ourselves using _InterlockedCompareExchange128.
254-
# clang-cl requires us to enable the `cx16` feature to use this intrinsic.
255-
if(SWIFT_HOST_VARIANT_SDK STREQUAL WINDOWS)
256-
if(SWIFT_HOST_VARIANT_ARCH STREQUAL x86_64)
257-
if(CMAKE_C_COMPILER_ID MATCHES Clang)
258-
target_compile_options(${target} PRIVATE -mcx16)
259-
endif()
260-
endif()
261-
endif()
262-
263251
if(LLVM_ENABLE_ASSERTIONS)
264252
target_compile_options(${target} PRIVATE -UNDEBUG)
265253
else()

include/swift/Runtime/Atomic.h

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,7 @@ class alignas(Size) atomic_impl {
6666
}
6767
};
6868

69-
// FIXME: get this to build reliably
70-
#if 0 && defined(_WIN64)
69+
#if defined(_WIN64)
7170
#include <intrin.h>
7271

7372
/// MSVC's std::atomic uses an inline spin lock for 16-byte atomics,
@@ -76,11 +75,7 @@ class alignas(Size) atomic_impl {
7675
/// AMD processors that lack cmpxchg16b, so we just use the intrinsic.
7776
template <class Value>
7877
class alignas(2 * sizeof(void*)) atomic_impl<Value, 2 * sizeof(void*)> {
79-
// MSVC is not strict about aliasing, so we can get away with this.
80-
union {
81-
volatile Value atomicValue;
82-
volatile __int64 atomicArray[2];
83-
};
78+
volatile Value atomicValue;
8479
public:
8580
constexpr atomic_impl(Value initialValue) : atomicValue(initialValue) {}
8681

@@ -98,10 +93,14 @@ class alignas(2 * sizeof(void*)) atomic_impl<Value, 2 * sizeof(void*)> {
9893
__int64 resultArray[2] = {};
9994
#if SWIFT_HAS_MSVC_ARM_ATOMICS
10095
if (order != std::memory_order_acquire) {
101-
(void) _InterlockedCompareExchange128_nf(atomicArray, 0, 0, resultArray);
96+
(void) _InterlockedCompareExchange128_nf(
97+
reinterpret_cast<volatile __int64*>(&atomicValue),
98+
0, 0, resultArray);
10299
} else {
103100
#endif
104-
(void) _InterlockedCompareExchange128(atomicArray, 0, 0, resultArray);
101+
(void) _InterlockedCompareExchange128(
102+
reinterpret_cast<volatile __int64*>(&atomicValue),
103+
0, 0, resultArray);
105104
#if SWIFT_HAS_MSVC_ARM_ATOMICS
106105
}
107106
#endif
@@ -116,31 +115,33 @@ class alignas(2 * sizeof(void*)) atomic_impl<Value, 2 * sizeof(void*)> {
116115
failureOrder == std::memory_order_consume);
117116
assert(successOrder == std::memory_order_relaxed ||
118117
successOrder == std::memory_order_release);
119-
__int64 newValueArray[2];
120-
memcpy(newValueArray, &newValue, sizeof(Value));
121118
#if SWIFT_HAS_MSVC_ARM_ATOMICS
122119
if (successOrder == std::memory_order_relaxed &&
123120
failureOrder != std::memory_order_acquire) {
124-
return _InterlockedCompareExchange128_nf(atomicArray,
125-
newValueArray[0],
126-
newValueArray[1],
127-
reinterpret_cast<__int64*>(&oldValue));
121+
return _InterlockedCompareExchange128_nf(
122+
reinterpret_cast<volatile __int64*>(&atomicValue),
123+
reinterpret_cast<const __int64*>(&newValue)[1],
124+
reinterpret_cast<const __int64*>(&newValue)[0],
125+
reinterpret_cast<__int64*>(&oldValue));
128126
} else if (successOrder == std::memory_order_relaxed) {
129-
return _InterlockedCompareExchange128_acq(atomicArray,
130-
newValueArray[0],
131-
newValueArray[1],
132-
reinterpret_cast<__int64*>(&oldValue));
127+
return _InterlockedCompareExchange128_acq(
128+
reinterpret_cast<volatile __int64*>(&atomicValue),
129+
reinterpret_cast<const __int64*>(&newValue)[1],
130+
reinterpret_cast<const __int64*>(&newValue)[0],
131+
reinterpret_cast<__int64*>(&oldValue));
133132
} else if (failureOrder != std::memory_order_acquire) {
134-
return _InterlockedCompareExchange128_rel(atomicArray,
135-
newValueArray[0],
136-
newValueArray[1],
137-
reinterpret_cast<__int64*>(&oldValue));
133+
return _InterlockedCompareExchange128_rel(
134+
reinterpret_cast<volatile __int64*>(&atomicValue),
135+
reinterpret_cast<const __int64*>(&newValue)[1],
136+
reinterpret_cast<const __int64*>(&newValue)[0],
137+
reinterpret_cast<__int64*>(&oldValue));
138138
} else {
139139
#endif
140-
return _InterlockedCompareExchange128(atomicArray,
141-
newValueArray[0],
142-
newValueArray[1],
143-
reinterpret_cast<__int64*>(&oldValue));
140+
return _InterlockedCompareExchange128(
141+
reinterpret_cast<volatile __int64*>(&atomicValue),
142+
reinterpret_cast<const __int64*>(&newValue)[1],
143+
reinterpret_cast<const __int64*>(&newValue)[0],
144+
reinterpret_cast<__int64*>(&oldValue));
144145
#if SWIFT_HAS_MSVC_ARM_ATOMICS
145146
}
146147
#endif

stdlib/cmake/modules/AddSwiftStdlib.cmake

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,18 @@ function(_add_target_variant_c_compile_flags)
261261
endif()
262262
endif()
263263

264+
# The concurrency library uses double-word atomics. MSVC's std::atomic
265+
# uses a spin lock for this, so to get reasonable behavior we have to
266+
# implement it ourselves using _InterlockedCompareExchange128.
267+
# clang-cl requires us to enable the `cx16` feature to use this intrinsic.
268+
if(SWIFT_HOST_VARIANT_ARCH STREQUAL x86_64)
269+
if(SWIFT_COMPILER_IS_MSVC_LIKE)
270+
list(APPEND result /clang:-mcx16)
271+
else()
272+
list(APPEND result -mcx16)
273+
endif()
274+
endif()
275+
264276
if(${CFLAGS_SDK} STREQUAL ANDROID)
265277
if(${CFLAGS_ARCH} STREQUAL x86_64)
266278
# NOTE(compnerd) Android NDK 21 or lower will generate library calls to

0 commit comments

Comments (0)