Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit 48d213a

Browse files
authored
Merge pull request #111 from NVIDIA/barrier-parity
Add parity waiting for `cuda::std::barrier`
2 parents 5fe780d + f882b5d commit 48d213a

File tree

3 files changed

+233
-51
lines changed

3 files changed

+233
-51
lines changed
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// UNSUPPORTED: nvrtc, pre-sm-70
10+
11+
// uncomment for a really verbose output detailing what test steps are being launched
12+
// #define DEBUG_TESTERS
13+
14+
#include "helpers.h"
15+
16+
#include <atomic>
17+
#include <cuda/barrier>
18+
19+
template<typename Barrier>
20+
struct barrier_and_token
21+
{
22+
using barrier_t = Barrier;
23+
using token_t = typename barrier_t::arrival_token;
24+
25+
barrier_t barrier;
26+
cuda::std::atomic<bool> parity_waiting{false};
27+
28+
template<typename ...Args>
29+
__host__ __device__
30+
barrier_and_token(Args && ...args) : barrier{ cuda::std::forward<Args>(args)... }
31+
{
32+
}
33+
};
34+
35+
struct barrier_arrive_and_wait
36+
{
37+
using async = cuda::std::true_type;
38+
39+
template<typename Data>
40+
__host__ __device__
41+
static void perform(Data & data)
42+
{
43+
while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false)
44+
{
45+
data.parity_waiting.wait(false);
46+
}
47+
data.barrier.arrive_and_wait();
48+
}
49+
};
50+
51+
template <bool Phase>
52+
struct barrier_parity_wait
53+
{
54+
using async = cuda::std::true_type;
55+
56+
template<typename Data>
57+
__host__ __device__
58+
static void perform(Data & data)
59+
{
60+
data.parity_waiting.store(true, cuda::std::memory_order_release);
61+
data.parity_waiting.notify_all();
62+
data.barrier.wait_parity(Phase);
63+
}
64+
};
65+
66+
struct clear_token
67+
{
68+
template<typename Data>
69+
__host__ __device__
70+
static void perform(Data & data)
71+
{
72+
data.parity_waiting.store(false, cuda::std::memory_order_release);
73+
}
74+
};
75+
76+
// Step sequence run by the test: two identical rounds, the first waiting on
// parity `false` and the second on parity `true`. Each round is one parity
// waiter, two arrive-and-wait steps, a fence, and a reset of the flag.
typedef performer_list<
    // round one
    barrier_parity_wait<false>,
    barrier_arrive_and_wait,
    barrier_arrive_and_wait,
    async_tester_fence,
    clear_token,
    // round two
    barrier_parity_wait<true>,
    barrier_arrive_and_wait,
    barrier_arrive_and_wait,
    async_tester_fence,
    clear_token
> aw_aw_pw;
88+
89+
void kernel_invoker()
90+
{
91+
validate_not_movable<
92+
barrier_and_token<cuda::std::barrier<>>,
93+
aw_aw_pw
94+
>(2);
95+
validate_not_movable<
96+
barrier_and_token<cuda::barrier<cuda::thread_scope_system>>,
97+
aw_aw_pw
98+
>(2);
99+
}
100+
101+
int main(int arg, char ** argv)
102+
{
103+
#ifndef __CUDA_ARCH__
104+
kernel_invoker();
105+
#endif
106+
107+
return 0;
108+
}
109+

include/cuda/std/barrier

Lines changed: 41 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> {
4040
template<thread_scope>
4141
friend class pipeline;
4242

43-
using std::__barrier_base<_CompletionF, _Sco>::__try_wait;
44-
4543
public:
4644
barrier() = default;
4745

@@ -88,24 +86,6 @@ class barrier<thread_scope_block, std::__empty_completion> : public __block_scop
8886
public:
8987
using arrival_token = typename __barrier_base::arrival_token;
9088

91-
private:
92-
struct __poll_tester {
93-
barrier const* __this;
94-
arrival_token __phase;
95-
96-
_LIBCUDACXX_INLINE_VISIBILITY
97-
__poll_tester(barrier const* __this_, arrival_token&& __phase_)
98-
: __this(__this_)
99-
, __phase(_CUDA_VSTD::move(__phase_))
100-
{}
101-
102-
inline _LIBCUDACXX_INLINE_VISIBILITY
103-
bool operator()() const
104-
{
105-
return __this->__try_wait(__phase);
106-
}
107-
};
108-
10989
_LIBCUDACXX_INLINE_VISIBILITY
11090
bool __try_wait(arrival_token __phase) const {
11191
#if __CUDA_ARCH__ >= 800
@@ -124,14 +104,35 @@ private:
124104
else
125105
#endif
126106
{
127-
return __barrier.__try_wait(std::move(__phase));
107+
return __barrier.try_wait(std::move(__phase));
128108
}
129109
}
130110

131111
template<thread_scope>
132112
friend class pipeline;
133113

134-
public:
114+
// Non-blocking parity test for the block-scope barrier.
// When compiled for SM80+ and the barrier object lives in shared memory, the
// test is issued as an `mbarrier.test_wait.parity` instruction; in every
// other case it is forwarded to the wrapped barrier's try_wait_parity().
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const {
#if __CUDA_ARCH__ >= 800
    if (__isShared(&__barrier)) {
        // The PTX instruction takes a 32-bit shared-space address and a 32-bit parity operand.
        std::uint32_t const __smem_addr = static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier));
        std::uint32_t const __parity_bits = static_cast<std::uint32_t>(__parity);
        int __ready = 0;
        asm volatile ("{\n\t"
                ".reg .pred p;\n\t"
                "mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t"
                "selp.b32 %0, 1, 0, p;\n\t"
                "}"
            : "=r"(__ready)
            : "r"(__smem_addr), "r"(__parity_bits)
            : "memory");
        return __ready != 0;
    }
    else
#endif
    {
        return __barrier.try_wait_parity(__parity);
    }
}
135+
135136
barrier() = default;
136137

137138
barrier(const barrier &) = delete;
@@ -216,7 +217,24 @@ public:
216217
_LIBCUDACXX_INLINE_VISIBILITY
217218
void wait(arrival_token && __phase) const
218219
{
219-
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
220+
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester<barrier>(this, _CUDA_VSTD::move(__phase)));
221+
}
222+
223+
// Public, non-blocking check for the given arrival token; returns whatever
// __try_wait() reports for it.
_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait(arrival_token __phase) const
{
    bool const __done = __try_wait(_CUDA_VSTD::move(__phase));
    return __done;
}
227+
228+
// Public, non-blocking parity check; returns whatever __try_wait_parity()
// reports for the given parity.
inline _LIBCUDACXX_INLINE_VISIBILITY
bool try_wait_parity(bool __parity) const
{
    bool const __done = __try_wait_parity(__parity);
    return __done;
}
233+
234+
// Blocking parity wait: hands a parity poll tester bound to this barrier to
// the backoff polling helper. The tester calls try_wait_parity(__parity).
inline _LIBCUDACXX_INLINE_VISIBILITY
void wait_parity(bool __parity) const
{
    _CUDA_VSTD::__libcpp_thread_poll_with_backoff(
        std::__barrier_poll_tester_parity<barrier>(this, __parity));
}
221239

222240
inline _LIBCUDACXX_INLINE_VISIBILITY

libcxx/include/barrier

Lines changed: 83 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,12 @@ class __barrier_base {
209209
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<ptrdiff_t, _Sco> __expected, __arrived;
210210
_LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion;
211211
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<bool, _Sco> __phase;
212+
213+
// Returns true when the phase flag, read with acquire ordering, no longer
// equals `__old_phase`.
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_phase(bool __old_phase) const
{
    bool const __current_phase = __phase.load(memory_order_acquire);
    return __current_phase != __old_phase;
}
212218
public:
213219
using arrival_token = bool;
214220

@@ -241,11 +247,15 @@ public:
241247
return __old_phase;
242248
}
243249
_LIBCUDACXX_INLINE_VISIBILITY
244-
bool __try_wait(arrival_token __old_phase) const
250+
bool try_wait(arrival_token __old) const
245251
{
246-
return __phase != __old_phase;
252+
return __try_wait_phase(__old);
253+
}
254+
_LIBCUDACXX_INLINE_VISIBILITY
255+
bool __try_wait_parity(bool __parity) const
256+
{
257+
return __try_wait_phase(__parity);
247258
}
248-
249259
_LIBCUDACXX_INLINE_VISIBILITY
250260
void wait(arrival_token&& __old_phase) const
251261
{
@@ -270,6 +280,42 @@ public:
270280
}
271281
};
272282

283+
template<class __Barrier>
284+
struct __barrier_poll_tester {
285+
__Barrier const* __this;
286+
typename __Barrier::arrival_token __phase;
287+
288+
_LIBCUDACXX_INLINE_VISIBILITY
289+
__barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_)
290+
: __this(__this_)
291+
, __phase(_CUDA_VSTD::move(__phase_))
292+
{}
293+
294+
_LIBCUDACXX_INLINE_VISIBILITY
295+
bool operator()() const
296+
{
297+
return __this->try_wait(__phase);
298+
}
299+
};
300+
301+
template<class __Barrier>
302+
struct __barrier_poll_tester_parity {
303+
__Barrier const* __this;
304+
bool __parity;
305+
306+
_LIBCUDACXX_INLINE_VISIBILITY
307+
__barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_)
308+
: __this(__this_)
309+
, __parity(__parity_)
310+
{}
311+
312+
inline _LIBCUDACXX_INLINE_VISIBILITY
313+
bool operator()() const
314+
{
315+
return __this->try_wait_parity(__parity);
316+
}
317+
};
318+
273319
template<int _Sco>
274320
class __barrier_base<__empty_completion, _Sco> {
275321

@@ -285,29 +331,23 @@ public:
285331
using arrival_token = uint64_t;
286332

287333
private:
288-
struct __poll_tester {
289-
__barrier_base const* __this;
290-
arrival_token __phase;
291-
292-
_LIBCUDACXX_INLINE_VISIBILITY
293-
__poll_tester(__barrier_base const* __this_, arrival_token&& __phase_)
294-
: __this(__this_)
295-
, __phase(_CUDA_VSTD::move(__phase_))
296-
{}
297-
298-
inline _LIBCUDACXX_INLINE_VISIBILITY
299-
bool operator()() const
300-
{
301-
return __this->__try_wait(__phase);
302-
}
303-
};
304-
305-
static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
334+
static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
306335
uint64_t __init(ptrdiff_t __count) _NOEXCEPT
307336
{
308337
return (((1u << 31) - __count) << 32)
309338
| ((1u << 31) - __count);
310339
}
340+
// Loads the packed barrier word with acquire ordering and returns true when
// its phase bit differs from `__phase`.
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_phase(uint64_t __phase) const
{
    uint64_t const __snapshot = __phase_arrived_expected.load(memory_order_acquire);
    bool const __phase_changed = (__snapshot & __phase_bit) != __phase;
    return __phase_changed;
}
346+
// Maps the boolean parity onto a phase value (the phase bit for true, zero
// for false) and defers to __try_wait_phase().
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const
{
    uint64_t const __phase_for_parity = __parity ? __phase_bit : 0;
    return __try_wait_phase(__phase_for_parity);
}
311351

312352
public:
313353
__barrier_base() = default;
@@ -323,10 +363,20 @@ public:
323363
__barrier_base& operator=(__barrier_base const&) = delete;
324364

325365
_LIBCUDACXX_INLINE_VISIBILITY
326-
bool __try_wait(arrival_token __phase) const
366+
bool __try_wait(arrival_token __old) const
327367
{
328-
uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
329-
return ((__current & __phase_bit) != __phase);
368+
return __try_wait_phase(__old & __phase_bit);
369+
}
370+
371+
// Public, non-blocking parity check; returns whatever __try_wait_parity()
// reports for the given parity.
_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait_parity(bool __parity) const
{
    bool const __done = __try_wait_parity(__parity);
    return __done;
}
376+
// Public, non-blocking check for the given arrival token; returns whatever
// __try_wait() reports for it.
_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait(arrival_token __old) const
{
    bool const __done = __try_wait(__old);
    return __done;
}
331381

332382
_LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY
@@ -340,17 +390,22 @@ public:
340390
}
341391
return __old & __phase_bit;
342392
}
343-
inline _LIBCUDACXX_INLINE_VISIBILITY
393+
_LIBCUDACXX_INLINE_VISIBILITY
344394
void wait(arrival_token&& __phase) const
345395
{
346-
__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
396+
__libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase)));
347397
}
348-
inline _LIBCUDACXX_INLINE_VISIBILITY
398+
// Blocking parity wait: hands a parity poll tester bound to this barrier to
// the backoff polling helper. The tester calls try_wait_parity(__parity).
_LIBCUDACXX_INLINE_VISIBILITY
void wait_parity(bool __parity) const
{
    __libcpp_thread_poll_with_backoff(
        __barrier_poll_tester_parity<__barrier_base>(this, __parity));
}
403+
// Arrives at the barrier and then waits on the token that the arrival returned.
_LIBCUDACXX_INLINE_VISIBILITY
void arrive_and_wait()
{
    arrival_token __token = arrive();
    wait(_CUDA_VSTD::move(__token));
}
353-
inline _LIBCUDACXX_INLINE_VISIBILITY
408+
_LIBCUDACXX_INLINE_VISIBILITY
354409
void arrive_and_drop()
355410
{
356411
__phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);

0 commit comments

Comments
 (0)