Skip to content

Commit 132f763

Browse files
author
Konrad Kusiak
authored
[SYCL][CUDA] Implement root group barrier (#14828)
This PR adds an algorithm for doing a GPU wide barrier in CUDA backend. Rough outline of the algorithm: - Every `0th` thread from each workgroup performs `atomic.add(1)` - The same thread checks the atomic result with `ld.acquire` in a loop until it's equal to total amount of workgroups. - All threads call group-wide `barrier.sync` One caveat to this is that there is no initialization of the atomic start value. So if we call this barrier several times in a kernel, on the second iteration, the start value will already contain the result from previous barrier. That's why we actually spin the while loop while `current value % totalWgroups != 0`.
1 parent 6532637 commit 132f763

File tree

2 files changed

+54
-2
lines changed

2 files changed

+54
-2
lines changed

libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,54 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
4545
// Control barrier dispatch for the PTX backend.
//
// scope     - SPIR-V execution scope of the barrier (Subgroup / Workgroup /
//             Device, ...).
// memory    - SPIR-V memory scope the accompanying fence applies to.
// semantics - SPIR-V memory-semantics bitmask; the low 5 bits encode the
//             memory order (e.g. SequentiallyConsistent).
_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
__spirv_ControlBarrier(unsigned int scope, unsigned int memory,
                       unsigned int semantics) {
  // Extract the memory-order field from the semantics bitmask.
  unsigned int order = semantics & 0x1F;
  if (scope == Subgroup) {
    // use a full mask as barriers are required to be convergent and exited
    // threads can safely be in the mask
    __nvvm_bar_warp_sync(0xFFFFFFFF);
  } else if (scope == Device && memory == Device &&
             order == SequentiallyConsistent &&
             __clc_nvvm_reflect_arch() >= 700) {
    // Device-wide (root-group) barrier. Gated on sm_70+ because the
    // atom.add.release / ld.acquire instructions used below require it.
    //
    // NOTE(review): this relies on %envreg1/%envreg2 holding the two halves
    // of a pointer to a zero-initialized device-global scratch region set up
    // by the runtime — confirm against the PTX ISA's description of the
    // driver-defined %envreg<32> registers and the launching runtime.
    unsigned int env1, env2;
    __asm__ __volatile__("mov.u32 %0, %%envreg1;" : "=r"(env1));
    __asm__ __volatile__("mov.u32 %0, %%envreg2;" : "=r"(env2));
    long long envreg1 = env1;
    long long envreg2 = env2;
    // Bit field insert operation. Place 32 bits of envreg2 next to 32 bits of
    // envreg1: s64[envreg2][envreg1]. The resulting value is the address in
    // device global memory region, where atomic operations can be performed.
    long long atomicAddr;
    __asm__ __volatile__("bfi.b64 %0, %1, %2, 32, 32;"
                         : "=l"(atomicAddr)
                         : "l"(envreg1), "l"(envreg2));
    if (!atomicAddr) {
      // No scratch address was provided; a device-wide barrier is impossible,
      // so abort rather than silently skip synchronization.
      __builtin_trap();
    } else {
      unsigned int tidX = __nvvm_read_ptx_sreg_tid_x();
      unsigned int tidY = __nvvm_read_ptx_sreg_tid_y();
      unsigned int tidZ = __nvvm_read_ptx_sreg_tid_z();
      // Only thread (0,0,0) of each CTA takes part in the inter-CTA
      // handshake; all other threads wait at the CTA-wide barrier below.
      if (tidX + tidY + tidZ == 0) {
        // Increment address by 4 to get the precise region initialized to 0.
        atomicAddr += 4;
        unsigned int nctaidX = __nvvm_read_ptx_sreg_nctaid_x();
        unsigned int nctaidY = __nvvm_read_ptx_sreg_nctaid_y();
        unsigned int nctaidZ = __nvvm_read_ptx_sreg_nctaid_z();
        unsigned int totalNctaid = nctaidX * nctaidY * nctaidZ;

        // Do atomic.add(1) for each CTA and spin ld.acquire in a loop until all
        // CTAs have performed the addition
        unsigned int prev, current;
        __asm__ __volatile__("atom.add.release.gpu.u32 %0,[%1],1;"
                             : "=r"(prev)
                             : "l"(atomicAddr));
        // The counter is never reset between barriers: after k completed
        // barriers it holds k * totalNctaid. Hence we spin while the value is
        // not a multiple of totalNctaid, rather than comparing against an
        // absolute count.
        do {
          __asm__ __volatile__("ld.acquire.gpu.u32 %0,[%1];"
                               : "=r"(current)
                               : "l"(atomicAddr));
        } while (current % totalNctaid != 0);
      }
      // CTA-wide rendezvous: releases the non-leader threads of this CTA once
      // the leader has observed every CTA's arrival.
      __nvvm_barrier_sync(0);
    }
  } else {
    // Workgroup (and any remaining) scope: a plain CTA-wide barrier.
    __syncthreads();
  }
}

sycl/test-e2e/GroupAlgorithm/root_group.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// Fails with opencl non-cpu, enable when fixed.
22
// XFAIL: (opencl && !cpu && !accelerator)
3-
// RUN: %{build} -I . -o %t.out
3+
// RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
44
// RUN: %{run} %t.out
55

66
// Disabled temporarily while investigation into the failure is ongoing.
@@ -10,6 +10,7 @@
1010
#include <cstdlib>
1111
#include <type_traits>
1212

13+
#include <sycl/builtins.hpp>
1314
#include <sycl/detail/core.hpp>
1415
#include <sycl/ext/oneapi/experimental/root_group.hpp>
1516
#include <sycl/group_barrier.hpp>
@@ -53,10 +54,17 @@ void testRootGroup() {
5354
sycl::accessor data{dataBuf, h};
5455
h.parallel_for<
5556
class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
57+
volatile float X = 1.0f;
58+
volatile float Y = 1.0f;
5659
auto root = it.ext_oneapi_get_root_group();
5760
data[root.get_local_id()] = root.get_local_id();
5861
sycl::group_barrier(root);
59-
62+
// Delay half of the workgroups with extra work to check that the barrier
63+
// synchronizes the whole device.
64+
if (it.get_group(0) % 2 == 0) {
65+
X += sycl::sin(X);
66+
Y += sycl::cos(Y);
67+
}
6068
root =
6169
sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
6270
int sum = data[root.get_local_id()] +

0 commit comments

Comments
 (0)