Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit 6a2e2ac

Browse files
authored
Merge pull request #193 from NVIDIA/revert-111-barrier-parity
Revert "Add parity waiting"
2 parents e6030e0 + 5fdf7f2 commit 6a2e2ac

File tree

4 files changed: 51 additions and 238 deletions.

.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp

Lines changed: 0 additions & 109 deletions
This file was deleted.

docs/releases/changelog.md

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -22,11 +22,6 @@ Supported ABI Versions: 4 (default), 3, and 2.
2222

2323
Included in: CUDA Toolkit 11.5.
2424

25-
### New Features
26-
27-
- #111: Adds two APIs to `cuda::std::barrier` for waiting on the parity of the barrier.
28-
- Thanks to Olivier Giroux for this contribution.
29-
3025
### Issues Fixed
3126

3227
- #179: Refactors the atomic layer to allow for layering the host device/host abstractions.

include/cuda/std/barrier

Lines changed: 23 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,8 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> {
4040
template<thread_scope>
4141
friend class pipeline;
4242

43+
using std::__barrier_base<_CompletionF, _Sco>::__try_wait;
44+
4345
public:
4446
barrier() = default;
4547

@@ -86,6 +88,24 @@ class barrier<thread_scope_block, std::__empty_completion> : public __block_scop
8688
public:
8789
using arrival_token = typename __barrier_base::arrival_token;
8890

91+
private:
92+
struct __poll_tester {
93+
barrier const* __this;
94+
arrival_token __phase;
95+
96+
_LIBCUDACXX_INLINE_VISIBILITY
97+
__poll_tester(barrier const* __this_, arrival_token&& __phase_)
98+
: __this(__this_)
99+
, __phase(_CUDA_VSTD::move(__phase_))
100+
{}
101+
102+
inline _LIBCUDACXX_INLINE_VISIBILITY
103+
bool operator()() const
104+
{
105+
return __this->__try_wait(__phase);
106+
}
107+
};
108+
89109
_LIBCUDACXX_INLINE_VISIBILITY
90110
bool __try_wait(arrival_token __phase) const {
91111
#if __CUDA_ARCH__ >= 800
@@ -104,35 +124,14 @@ public:
104124
else
105125
#endif
106126
{
107-
return __barrier.try_wait(std::move(__phase));
127+
return __barrier.__try_wait(std::move(__phase));
108128
}
109129
}
110130

111131
template<thread_scope>
112132
friend class pipeline;
113133

114-
_LIBCUDACXX_INLINE_VISIBILITY
115-
bool __try_wait_parity(bool __parity) const {
116-
#if __CUDA_ARCH__ >= 800
117-
if (__isShared(&__barrier)) {
118-
int __ready = 0;
119-
asm volatile ("{\n\t"
120-
".reg .pred p;\n\t"
121-
"mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t"
122-
"selp.b32 %0, 1, 0, p;\n\t"
123-
"}"
124-
: "=r"(__ready)
125-
: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "r"(static_cast<std::uint32_t>(__parity))
126-
: "memory");
127-
return bool(__ready);
128-
}
129-
else
130-
#endif
131-
{
132-
return __barrier.try_wait_parity(__parity);
133-
}
134-
}
135-
134+
public:
136135
barrier() = default;
137136

138137
barrier(const barrier &) = delete;
@@ -217,24 +216,7 @@ public:
217216
_LIBCUDACXX_INLINE_VISIBILITY
218217
void wait(arrival_token && __phase) const
219218
{
220-
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester<barrier>(this, _CUDA_VSTD::move(__phase)));
221-
}
222-
223-
_LIBCUDACXX_INLINE_VISIBILITY
224-
bool try_wait(arrival_token __phase) const {
225-
return __try_wait(_CUDA_VSTD::move(__phase));
226-
}
227-
228-
inline _LIBCUDACXX_INLINE_VISIBILITY
229-
bool try_wait_parity(bool __parity) const
230-
{
231-
return __try_wait_parity(__parity);
232-
}
233-
234-
inline _LIBCUDACXX_INLINE_VISIBILITY
235-
void wait_parity(bool __parity) const
236-
{
237-
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester_parity<barrier>(this, __parity));
219+
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
238220
}
239221

240222
inline _LIBCUDACXX_INLINE_VISIBILITY

libcxx/include/barrier

Lines changed: 28 additions & 83 deletions
Original file line number | Diff line number | Diff line change
@@ -209,12 +209,6 @@ class __barrier_base {
209209
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<ptrdiff_t, _Sco> __expected, __arrived;
210210
_LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion;
211211
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<bool, _Sco> __phase;
212-
213-
_LIBCUDACXX_INLINE_VISIBILITY
214-
bool __try_wait_phase(bool __old_phase) const
215-
{
216-
return __phase.load(memory_order_acquire) != __old_phase;
217-
}
218212
public:
219213
using arrival_token = bool;
220214

@@ -247,15 +241,11 @@ public:
247241
return __old_phase;
248242
}
249243
_LIBCUDACXX_INLINE_VISIBILITY
250-
bool try_wait(arrival_token __old) const
251-
{
252-
return __try_wait_phase(__old);
253-
}
254-
_LIBCUDACXX_INLINE_VISIBILITY
255-
bool __try_wait_parity(bool __parity) const
244+
bool __try_wait(arrival_token __old_phase) const
256245
{
257-
return __try_wait_phase(__parity);
246+
return __phase != __old_phase;
258247
}
248+
259249
_LIBCUDACXX_INLINE_VISIBILITY
260250
void wait(arrival_token&& __old_phase) const
261251
{
@@ -280,42 +270,6 @@ public:
280270
}
281271
};
282272

283-
template<class __Barrier>
284-
struct __barrier_poll_tester {
285-
__Barrier const* __this;
286-
typename __Barrier::arrival_token __phase;
287-
288-
_LIBCUDACXX_INLINE_VISIBILITY
289-
__barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_)
290-
: __this(__this_)
291-
, __phase(_CUDA_VSTD::move(__phase_))
292-
{}
293-
294-
_LIBCUDACXX_INLINE_VISIBILITY
295-
bool operator()() const
296-
{
297-
return __this->try_wait(__phase);
298-
}
299-
};
300-
301-
template<class __Barrier>
302-
struct __barrier_poll_tester_parity {
303-
__Barrier const* __this;
304-
bool __parity;
305-
306-
_LIBCUDACXX_INLINE_VISIBILITY
307-
__barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_)
308-
: __this(__this_)
309-
, __parity(__parity_)
310-
{}
311-
312-
inline _LIBCUDACXX_INLINE_VISIBILITY
313-
bool operator()() const
314-
{
315-
return __this->try_wait_parity(__parity);
316-
}
317-
};
318-
319273
template<int _Sco>
320274
class __barrier_base<__empty_completion, _Sco> {
321275

@@ -331,23 +285,29 @@ public:
331285
using arrival_token = uint64_t;
332286

333287
private:
334-
static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
288+
struct __poll_tester {
289+
__barrier_base const* __this;
290+
arrival_token __phase;
291+
292+
_LIBCUDACXX_INLINE_VISIBILITY
293+
__poll_tester(__barrier_base const* __this_, arrival_token&& __phase_)
294+
: __this(__this_)
295+
, __phase(_CUDA_VSTD::move(__phase_))
296+
{}
297+
298+
inline _LIBCUDACXX_INLINE_VISIBILITY
299+
bool operator()() const
300+
{
301+
return __this->__try_wait(__phase);
302+
}
303+
};
304+
305+
static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
335306
uint64_t __init(ptrdiff_t __count) _NOEXCEPT
336307
{
337308
return (((1u << 31) - __count) << 32)
338309
| ((1u << 31) - __count);
339310
}
340-
_LIBCUDACXX_INLINE_VISIBILITY
341-
bool __try_wait_phase(uint64_t __phase) const
342-
{
343-
uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
344-
return ((__current & __phase_bit) != __phase);
345-
}
346-
_LIBCUDACXX_INLINE_VISIBILITY
347-
bool __try_wait_parity(bool __parity) const
348-
{
349-
return __try_wait_phase(__parity ? __phase_bit : 0);
350-
}
351311

352312
public:
353313
__barrier_base() = default;
@@ -363,20 +323,10 @@ public:
363323
__barrier_base& operator=(__barrier_base const&) = delete;
364324

365325
_LIBCUDACXX_INLINE_VISIBILITY
366-
bool __try_wait(arrival_token __old) const
367-
{
368-
return __try_wait_phase(__old & __phase_bit);
369-
}
370-
371-
_LIBCUDACXX_INLINE_VISIBILITY
372-
bool try_wait_parity(bool __parity) const
373-
{
374-
return __try_wait_parity(__parity);
375-
}
376-
_LIBCUDACXX_INLINE_VISIBILITY
377-
bool try_wait(arrival_token __old) const
326+
bool __try_wait(arrival_token __phase) const
378327
{
379-
return __try_wait(__old);
328+
uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
329+
return ((__current & __phase_bit) != __phase);
380330
}
381331

382332
_LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY
@@ -390,22 +340,17 @@ public:
390340
}
391341
return __old & __phase_bit;
392342
}
393-
_LIBCUDACXX_INLINE_VISIBILITY
343+
inline _LIBCUDACXX_INLINE_VISIBILITY
394344
void wait(arrival_token&& __phase) const
395345
{
396-
__libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase)));
346+
__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
397347
}
398-
_LIBCUDACXX_INLINE_VISIBILITY
399-
void wait_parity(bool __parity) const
400-
{
401-
__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity));
402-
}
403-
_LIBCUDACXX_INLINE_VISIBILITY
348+
inline _LIBCUDACXX_INLINE_VISIBILITY
404349
void arrive_and_wait()
405350
{
406351
wait(arrive());
407352
}
408-
_LIBCUDACXX_INLINE_VISIBILITY
353+
inline _LIBCUDACXX_INLINE_VISIBILITY
409354
void arrive_and_drop()
410355
{
411356
__phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);

0 commit comments

Comments
 (0)