intel · martygrant · Nov 4, 2024 · Oct 17, 2024 · Oct 18, 2024 · Oct 25, 2024
@@ -1513,7 +1513,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
     const llvm::Triple::SubArchType DeviceSubArch = DeviceTriple.getSubArch();
     if (DeviceTriple.isNVPTX() || DeviceTriple.isAMDGPU() ||
         (DeviceTriple.isSPIR() &&
-         DeviceSubArch != llvm::Triple::SPIRSubArch_fpga))
+         DeviceSubArch != llvm::Triple::SPIRSubArch_fpga) ||
+        LangOpts.SYCLIsNativeCPU)
       Builder.defineMacro("SYCL_USE_NATIVE_FP_ATOMICS");
     // Enable generation of USM address spaces for FPGA.
     if (DeviceSubArch == llvm::Triple::SPIRSubArch_fpga) {

@@ -0,0 +1,28 @@
+//===------- FAtomicsNativeCPU.h - Materializes FP Atomics ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A transformation pass that materializes floating points atomics by emitting
+// corresponding atomicrmw instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class ModulePass;
+
+class FAtomicsNativeCPU : public PassInfoMixin<FAtomicsNativeCPU> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+};
+
+} // namespace llvm
@@ -4,6 +4,7 @@ add_llvm_component_library(LLVMSYCLNativeCPUUtils
   RenameKernelSYCLNativeCPU.cpp
   ConvertToMuxBuiltinsSYCLNativeCPU.cpp
   FixABIMuxBuiltinsSYCLNativeCPU.cpp
+  FAtomicsNativeCPU.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR

@@ -0,0 +1,49 @@
+//===------- FAtomicsNativeCPU.cpp - Materializes FP Atomics --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A transformation pass that materializes floating points atomics by emitting
+// corresponding atomicrmw instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/SYCLLowerIR/FAtomicsNativeCPU.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/AtomicOrdering.h"
+
+using namespace llvm;
+
+PreservedAnalyses FAtomicsNativeCPU::run(Module &M,
+                                         ModuleAnalysisManager &MAM) {
+  bool ModuleChanged = false;
+  auto &Ctx = M.getContext();
+  for (auto &F : M) {
+    AtomicRMWInst::BinOp OpCode;
+    if (F.getName().starts_with("_Z21__spirv_AtomicFAddEXT")) {
+      OpCode = AtomicRMWInst::BinOp::FAdd;
+    } else if (F.getName().starts_with("_Z21__spirv_AtomicFMinEXT")) {
+      OpCode = AtomicRMWInst::BinOp::FMin;
+    } else if (F.getName().starts_with("_Z21__spirv_AtomicFMaxEXT")) {
+      OpCode = AtomicRMWInst::BinOp::FMax;
+    } else {
+      continue;
+    }
+
+    BasicBlock *BB = BasicBlock::Create(Ctx, "entry", &F);
+    IRBuilder<> Builder(BB);
+    // Currently we drop arguments 1 and 2 (scope and memory ordering),
+    // defaulting to Monotonic ordering and System scope.
+    auto A =
+        Builder.CreateAtomicRMW(OpCode, F.getArg(0), F.getArg(3), MaybeAlign(),
+                                AtomicOrdering::Monotonic, SyncScope::System);
+    Builder.CreateRet(A);
+  }
+  return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
+#include "llvm/SYCLLowerIR/FAtomicsNativeCPU.h"
 #include "llvm/SYCLLowerIR/FixABIMuxBuiltinsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
@@ -70,6 +71,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
     OptimizationLevel OptLevel) {
   MPM.addPass(SpecConstantsPass(SpecConstantsPass::HandlingMode::emulation));
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
+  MPM.addPass(FAtomicsNativeCPU());
 #ifdef NATIVECPU_USE_OCK
   MPM.addPass(compiler::utils::PrepareBarriersPass());
   MPM.addPass(compiler::utils::TransferKernelMetadataPass());

@@ -0,0 +1,40 @@
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -S -emit-llvm  -o %t_temp.ll %s
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
+#include <sycl/sycl.hpp>
+
+constexpr sycl::memory_order order = sycl::memory_order::relaxed;
+constexpr sycl::memory_scope scope = sycl::memory_scope::work_group;
+constexpr sycl::access::address_space space =
+    sycl::access::address_space::global_space;
+
+class Test;
+using namespace sycl;
+int main() {
+  queue q;
+  const size_t N = 32;
+  float sum = 0;
+  std::vector<float> output(N);
+  std::fill(output.begin(), output.end(), 0.f);
+  {
+    buffer<float> sum_buf(&sum, 1);
+    q.submit([&](handler &cgh) {
+       auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
+       cgh.parallel_for<Test>(range<1>(N), [=](item<1> it) {
+         int gid = it.get_id(0);
+         auto atm = atomic_ref<float, order, scope, space>(sum[0]);
+         atm.fetch_add(1.f, order);
+         //CHECK-DAG: float @_Z21__spirv_AtomicFAddEXT{{.*}}(ptr {{.*}} %[[ARG0:.*]], i32 {{.*}}, i32 {{.*}}, float {{.*}} %[[ARG3:.*]])
+         //CHECK:   %[[RES:.*]] = atomicrmw fadd ptr addrspace(1) %[[ARG0]], float %[[ARG3]] monotonic, align 4
+         //CHECK:   ret float %[[RES]]
+         atm.fetch_max(1.f, order);
+         //CHECK-DAG: float @_Z21__spirv_AtomicFMaxEXT{{.*}}(ptr {{.*}} %[[ARG0:.*]], i32 {{.*}}, i32 {{.*}}, float {{.*}} %[[ARG3:.*]])
+         //CHECK:   %[[RES:.*]] = atomicrmw fmax ptr addrspace(1) %[[ARG0]], float %[[ARG3]] monotonic, align 4
+         //CHECK:   ret float %[[RES]]
+         atm.fetch_min(1.f, order);
+         //CHECK-DAG: float @_Z21__spirv_AtomicFMinEXT{{.*}}(ptr {{.*}} %[[ARG0:.*]], i32 {{.*}}, i32 {{.*}}, float {{.*}} %[[ARG3:.*]])
+         //CHECK:   %[[RES:.*]] = atomicrmw fmin ptr addrspace(1) %[[ARG0]], float %[[ARG3]] monotonic, align 4
+         //CHECK:   ret float %[[RES]]
+       });
+     }).wait_and_throw();
+  }
+}