Skip to content

Commit 132f763

Browse files
author
Konrad Kusiak
authored
[SYCL][CUDA] Implement root group barrier (#14828)
This PR adds an algorithm for doing a GPU wide barrier in CUDA backend. Rough outline of the algorithm: - Every `0th` thread from each workgroup performs `atomic.add(1)` - The same thread checks the atomic result with `ld.acquire` in a loop until it's equal to total amount of workgroups. - All threads call group-wide `barrier.sync` One caveat to this is that there is no initialization of the atomic start value. So if we call this barrier several times in a kernel, on the second iteration, the start value will already contain the result from previous barrier. That's why we actually spin the while loop while `current value % totalWgroups != 0`.
1 parent 6532637 commit 132f763

File tree

2 files changed

+54
-2
lines changed

2 files changed

+54
-2
lines changed

libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,54 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
4545
// Control barrier dispatch for the PTX backend.
//
// scope     - SPIR-V execution scope of the barrier (Subgroup / Workgroup /
//             Device, ...).
// memory    - SPIR-V memory scope the accompanying fence applies to.
// semantics - SPIR-V memory-semantics bitmask; the low 5 bits encode the
//             memory order (e.g. SequentiallyConsistent).
_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
__spirv_ControlBarrier(unsigned int scope, unsigned int memory,
                       unsigned int semantics) {
  // Extract the memory-order field from the semantics bitmask.
  unsigned int order = semantics & 0x1F;
  if (scope == Subgroup) {
    // use a full mask as barriers are required to be convergent and exited
    // threads can safely be in the mask
    __nvvm_bar_warp_sync(0xFFFFFFFF);
  } else if (scope == Device && memory == Device &&
             order == SequentiallyConsistent &&
             __clc_nvvm_reflect_arch() >= 700) {
    // Device-wide (root-group) barrier. Gated on sm_70+ because the
    // atom.add.release / ld.acquire instructions used below require it.
    //
    // NOTE(review): this relies on %envreg1/%envreg2 holding the two halves
    // of a pointer to a zero-initialized device-global scratch region set up
    // by the runtime — confirm against the PTX ISA's description of the
    // driver-defined %envreg<32> registers and the launching runtime.
    unsigned int env1, env2;
    __asm__ __volatile__("mov.u32 %0, %%envreg1;" : "=r"(env1));
    __asm__ __volatile__("mov.u32 %0, %%envreg2;" : "=r"(env2));
    long long envreg1 = env1;
    long long envreg2 = env2;
    // Bit field insert operation. Place 32 bits of envreg2 next to 32 bits of
    // envreg1: s64[envreg2][envreg1]. The resulting value is the address in
    // device global memory region, where atomic operations can be performed.
    long long atomicAddr;
    __asm__ __volatile__("bfi.b64 %0, %1, %2, 32, 32;"
                         : "=l"(atomicAddr)
                         : "l"(envreg1), "l"(envreg2));
    if (!atomicAddr) {
      // No scratch address was provided; a device-wide barrier is impossible,
      // so abort rather than silently skip synchronization.
      __builtin_trap();
    } else {
      unsigned int tidX = __nvvm_read_ptx_sreg_tid_x();
      unsigned int tidY = __nvvm_read_ptx_sreg_tid_y();
      unsigned int tidZ = __nvvm_read_ptx_sreg_tid_z();
      // Only thread (0,0,0) of each CTA takes part in the inter-CTA
      // handshake; all other threads wait at the CTA-wide barrier below.
      if (tidX + tidY + tidZ == 0) {
        // Increment address by 4 to get the precise region initialized to 0.
        atomicAddr += 4;
        unsigned int nctaidX = __nvvm_read_ptx_sreg_nctaid_x();
        unsigned int nctaidY = __nvvm_read_ptx_sreg_nctaid_y();
        unsigned int nctaidZ = __nvvm_read_ptx_sreg_nctaid_z();
        unsigned int totalNctaid = nctaidX * nctaidY * nctaidZ;

        // Do atomic.add(1) for each CTA and spin ld.acquire in a loop until all
        // CTAs have performed the addition
        unsigned int prev, current;
        __asm__ __volatile__("atom.add.release.gpu.u32 %0,[%1],1;"
                             : "=r"(prev)
                             : "l"(atomicAddr));
        // The counter is never reset between barriers: after k completed
        // barriers it holds k * totalNctaid. Hence we spin while the value is
        // not a multiple of totalNctaid, rather than comparing against an
        // absolute count.
        do {
          __asm__ __volatile__("ld.acquire.gpu.u32 %0,[%1];"
                               : "=r"(current)
                               : "l"(atomicAddr));
        } while (current % totalNctaid != 0);
      }
      // CTA-wide rendezvous: releases the non-leader threads of this CTA once
      // the leader has observed every CTA's arrival.
      __nvvm_barrier_sync(0);
    }
  } else {
    // Workgroup (and any remaining) scope: a plain CTA-wide barrier.
    __syncthreads();
  }
}

sycl/test-e2e/GroupAlgorithm/root_group.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// Fails with opencl non-cpu, enable when fixed.
22
// XFAIL: (opencl && !cpu && !accelerator)
3-
// RUN: %{build} -I . -o %t.out
3+
// RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
44
// RUN: %{run} %t.out
55

66
// Disabled temporarily while investigation into the failure is ongoing.
@@ -10,6 +10,7 @@
1010
#include <cstdlib>
1111
#include <type_traits>
1212

13+
#include <sycl/builtins.hpp>
1314
#include <sycl/detail/core.hpp>
1415
#include <sycl/ext/oneapi/experimental/root_group.hpp>
1516
#include <sycl/group_barrier.hpp>
@@ -53,10 +54,17 @@ void testRootGroup() {
5354
sycl::accessor data{dataBuf, h};
5455
h.parallel_for<
5556
class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
57+
volatile float X = 1.0f;
58+
volatile float Y = 1.0f;
5659
auto root = it.ext_oneapi_get_root_group();
5760
data[root.get_local_id()] = root.get_local_id();
5861
sycl::group_barrier(root);
59-
62+
// Delay half of the workgroups with extra work to check that the barrier
63+
// synchronizes the whole device.
64+
if (it.get_group(0) % 2 == 0) {
65+
X += sycl::sin(X);
66+
Y += sycl::cos(Y);
67+
}
6068
root =
6169
sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
6270
int sum = data[root.get_local_id()] +

0 commit comments

Comments
 (0)