Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 0 additions & 109 deletions .upstream-tests/test/heterogeneous/barrier_parity.pass.cpp

This file was deleted.

5 changes: 0 additions & 5 deletions docs/releases/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,6 @@ Supported ABI Versions: 4 (default), 3, and 2.

Included in: CUDA Toolkit 11.5.

### New Features

- #111: Adds two APIs to `cuda::std::barrier` for waiting on the parity of the barrier.
- Thanks to Olivier Giroux for this contribution.

### Issues Fixed

- #179: Refactors the atomic layer to allow for layering the host device/host abstractions.
Expand Down
64 changes: 23 additions & 41 deletions include/cuda/std/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> {
template<thread_scope>
friend class pipeline;

using std::__barrier_base<_CompletionF, _Sco>::__try_wait;

public:
barrier() = default;

Expand Down Expand Up @@ -86,6 +88,24 @@ class barrier<thread_scope_block, std::__empty_completion> : public __block_scop
public:
using arrival_token = typename __barrier_base::arrival_token;

private:
// Nullary predicate that pairs a barrier pointer with a phase token.
// wait() hands an instance of this to __libcpp_thread_poll_with_backoff,
// which invokes operator() to test whether the phase has completed.
struct __poll_tester {
barrier const* __this;
arrival_token __phase;

// Stores the barrier pointer and moves the phase token into the tester.
_LIBCUDACXX_INLINE_VISIBILITY
__poll_tester(barrier const* __this_, arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}

// Returns the result of the barrier's __try_wait for the stored token.
inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->__try_wait(__phase);
}
};

_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait(arrival_token __phase) const {
#if __CUDA_ARCH__ >= 800
Expand All @@ -104,35 +124,14 @@ public:
else
#endif
{
return __barrier.try_wait(std::move(__phase));
return __barrier.__try_wait(std::move(__phase));
}
}

template<thread_scope>
friend class pipeline;

// Single, non-blocking test of the barrier against the given parity value.
// Returns true when the test succeeds, false otherwise.
//
// When compiled for __CUDA_ARCH__ >= 800 and __isShared(&__barrier) holds,
// the test is issued as the PTX instruction
// mbarrier.test_wait.parity.shared.b64; in every other case the call is
// forwarded to __barrier.try_wait_parity(__parity).
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const {
#if __CUDA_ARCH__ >= 800
if (__isShared(&__barrier)) {
int __ready = 0;
// Operands: %0 = __ready (output), %1 = 32-bit shared-space address of
// __barrier from __cvta_generic_to_shared, %2 = __parity widened to 32 bits.
// The predicate produced by test_wait is turned into 1/0 by selp.
asm volatile ("{\n\t"
".reg .pred p;\n\t"
"mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t"
"selp.b32 %0, 1, 0, p;\n\t"
"}"
: "=r"(__ready)
: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "r"(static_cast<std::uint32_t>(__parity))
: "memory");
return bool(__ready);
}
else
#endif
{
// Fallback path: delegate to the wrapped barrier object.
return __barrier.try_wait_parity(__parity);
}
}

public:
barrier() = default;

barrier(const barrier &) = delete;
Expand Down Expand Up @@ -217,24 +216,7 @@ public:
_LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token && __phase) const
{
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester<barrier>(this, _CUDA_VSTD::move(__phase)));
}

// Public non-blocking test for a phase token; forwards the token (by move)
// to __try_wait and returns its result.
_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait(arrival_token __phase) const {
return __try_wait(_CUDA_VSTD::move(__phase));
}

// Public non-blocking test against a parity value; forwards to
// __try_wait_parity and returns its result.
inline _LIBCUDACXX_INLINE_VISIBILITY
bool try_wait_parity(bool __parity) const
{
return __try_wait_parity(__parity);
}

inline _LIBCUDACXX_INLINE_VISIBILITY
void wait_parity(bool __parity) const
{
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester_parity<barrier>(this, __parity));
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
}

inline _LIBCUDACXX_INLINE_VISIBILITY
Expand Down
111 changes: 28 additions & 83 deletions libcxx/include/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -209,12 +209,6 @@ class __barrier_base {
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<ptrdiff_t, _Sco> __expected, __arrived;
_LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion;
_LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<bool, _Sco> __phase;

// Returns true when the current value of __phase, read with an acquire load,
// differs from __old_phase.
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_phase(bool __old_phase) const
{
return __phase.load(memory_order_acquire) != __old_phase;
}
public:
using arrival_token = bool;

Expand Down Expand Up @@ -247,15 +241,11 @@ public:
return __old_phase;
}
_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait(arrival_token __old) const
{
return __try_wait_phase(__old);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const
bool __try_wait(arrival_token __old_phase) const
{
return __try_wait_phase(__parity);
return __phase != __old_phase;
}

_LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token&& __old_phase) const
{
Expand All @@ -280,42 +270,6 @@ public:
}
};

// Nullary predicate, generic over the barrier type, that pairs a barrier
// pointer with one of its arrival tokens. Each call to operator() performs
// one try_wait on the barrier with the stored token.
template<class __Barrier>
struct __barrier_poll_tester {
__Barrier const* __this;
typename __Barrier::arrival_token __phase;

// Stores the barrier pointer and moves the token into the tester.
_LIBCUDACXX_INLINE_VISIBILITY
__barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}

// Returns the result of __Barrier::try_wait for the stored token.
_LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->try_wait(__phase);
}
};

// Nullary predicate, generic over the barrier type, that pairs a barrier
// pointer with a parity value. Each call to operator() performs one
// try_wait_parity on the barrier with the stored parity.
template<class __Barrier>
struct __barrier_poll_tester_parity {
__Barrier const* __this;
bool __parity;

// Stores the barrier pointer and the parity value to test against.
_LIBCUDACXX_INLINE_VISIBILITY
__barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_)
: __this(__this_)
, __parity(__parity_)
{}

// Returns the result of __Barrier::try_wait_parity for the stored parity.
inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->try_wait_parity(__parity);
}
};

template<int _Sco>
class __barrier_base<__empty_completion, _Sco> {

Expand All @@ -331,23 +285,29 @@ public:
using arrival_token = uint64_t;

private:
static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
// Nullary predicate that pairs a __barrier_base pointer with a phase token.
// wait() hands an instance of this to __libcpp_thread_poll_with_backoff,
// which invokes operator() to test whether the phase has completed.
struct __poll_tester {
__barrier_base const* __this;
arrival_token __phase;

// Stores the barrier pointer and moves the phase token into the tester.
_LIBCUDACXX_INLINE_VISIBILITY
__poll_tester(__barrier_base const* __this_, arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}

// Returns the result of __barrier_base::__try_wait for the stored token.
inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->__try_wait(__phase);
}
};

// Computes a 64-bit initial state word from __count: the quantity
// (2^31 - __count) is OR-ed into the word twice, once shifted into the upper
// 32 bits and once unshifted in the lower bits.
// NOTE(review): the two copies stay in separate halves only while
// 0 <= __count <= 2^31 — confirm callers guarantee that range.
// NOTE(review): presumably the halves are the counter fields packed into
// __phase_arrived_expected — confirm against the member's layout.
static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
uint64_t __init(ptrdiff_t __count) _NOEXCEPT
{
return (((1u << 31) - __count) << 32)
| ((1u << 31) - __count);
}
// Acquire-loads __phase_arrived_expected, isolates the bit selected by
// __phase_bit, and returns true when it differs from __phase.
// __phase is compared as-is, so it is expected to already be either 0 or
// __phase_bit (the callers in this class pass it in that form).
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_phase(uint64_t __phase) const
{
uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
return ((__current & __phase_bit) != __phase);
}
// Maps the boolean parity onto the phase-bit encoding (true -> __phase_bit,
// false -> 0) and returns the result of __try_wait_phase for that value.
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait_parity(bool __parity) const
{
return __try_wait_phase(__parity ? __phase_bit : 0);
}

public:
__barrier_base() = default;
Expand All @@ -363,20 +323,10 @@ public:
__barrier_base& operator=(__barrier_base const&) = delete;

// Non-blocking test for an arrival token: masks the token down to the bit
// selected by __phase_bit and returns the result of __try_wait_phase.
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait(arrival_token __old) const
{
return __try_wait_phase(__old & __phase_bit);
}

// Public non-blocking test against a parity value; forwards to
// __try_wait_parity and returns its result.
_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait_parity(bool __parity) const
{
return __try_wait_parity(__parity);
}
_LIBCUDACXX_INLINE_VISIBILITY
bool try_wait(arrival_token __old) const
bool __try_wait(arrival_token __phase) const
{
return __try_wait(__old);
uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
return ((__current & __phase_bit) != __phase);
}

_LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY
Expand All @@ -390,22 +340,17 @@ public:
}
return __old & __phase_bit;
}
_LIBCUDACXX_INLINE_VISIBILITY
inline _LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token&& __phase) const
{
__libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase)));
__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
}
_LIBCUDACXX_INLINE_VISIBILITY
void wait_parity(bool __parity) const
{
__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity));
}
_LIBCUDACXX_INLINE_VISIBILITY
// Calls arrive() and passes the token it returns straight to wait().
inline _LIBCUDACXX_INLINE_VISIBILITY
void arrive_and_wait()
{
wait(arrive());
}
_LIBCUDACXX_INLINE_VISIBILITY
inline _LIBCUDACXX_INLINE_VISIBILITY
void arrive_and_drop()
{
__phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);
Expand Down