diff --git a/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp b/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp deleted file mode 100644 index 2c4460995b..0000000000 --- a/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp +++ /dev/null @@ -1,109 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: nvrtc, pre-sm-70 - -// uncomment for a really verbose output detailing what test steps are being launched -// #define DEBUG_TESTERS - -#include "helpers.h" - -#include -#include - -template -struct barrier_and_token -{ - using barrier_t = Barrier; - using token_t = typename barrier_t::arrival_token; - - barrier_t barrier; - cuda::std::atomic parity_waiting{false}; - - template - __host__ __device__ - barrier_and_token(Args && ...args) : barrier{ cuda::std::forward(args)... } - { - } -}; - -struct barrier_arrive_and_wait -{ - using async = cuda::std::true_type; - - template - __host__ __device__ - static void perform(Data & data) - { - while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false) - { - data.parity_waiting.wait(false); - } - data.barrier.arrive_and_wait(); - } -}; - -template -struct barrier_parity_wait -{ - using async = cuda::std::true_type; - - template - __host__ __device__ - static void perform(Data & data) - { - data.parity_waiting.store(true, cuda::std::memory_order_release); - data.parity_waiting.notify_all(); - data.barrier.wait_parity(Phase); - } -}; - -struct clear_token -{ - template - __host__ __device__ - static void perform(Data & data) - { - data.parity_waiting.store(false, cuda::std::memory_order_release); - } -}; - -using aw_aw_pw = performer_list< - barrier_parity_wait, - barrier_arrive_and_wait, - barrier_arrive_and_wait, - async_tester_fence, - clear_token, - barrier_parity_wait, - barrier_arrive_and_wait, - barrier_arrive_and_wait, - async_tester_fence, - clear_token ->; - -void kernel_invoker() -{ - validate_not_movable< - barrier_and_token>, - aw_aw_pw - >(2); - validate_not_movable< - barrier_and_token>, - aw_aw_pw - >(2); -} - -int main(int arg, char ** argv) -{ -#ifndef __CUDA_ARCH__ - kernel_invoker(); -#endif - - return 0; -} - diff --git a/docs/releases/changelog.md b/docs/releases/changelog.md index cde1871e83..77ff291c15 100644 --- a/docs/releases/changelog.md +++ b/docs/releases/changelog.md @@ -22,11 +22,6 @@ Supported ABI Versions: 4 (default), 3, and 2. Included in: CUDA Toolkit 11.5. -### New Features - -- #111: Adds two APIs to `cuda::std::barrier` for waiting on the parity of the barrier. - - Thanks to Olivier Giroux for this contribution. - ### Issues Fixed - #179: Refactors the atomic layer to allow for layering the host device/host abstractions. diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index e7af6f138c..b35e927e34 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -40,6 +40,8 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> { template friend class pipeline; + using std::__barrier_base<_CompletionF, _Sco>::__try_wait; + public: barrier() = default; @@ -86,6 +88,24 @@ class barrier : public __block_scop public: using arrival_token = typename __barrier_base::arrival_token; +private: + struct __poll_tester { + barrier const* __this; + arrival_token __phase; + + _LIBCUDACXX_INLINE_VISIBILITY + __poll_tester(barrier const* __this_, arrival_token&& __phase_) + : __this(__this_) + , __phase(_CUDA_VSTD::move(__phase_)) + {} + + inline _LIBCUDACXX_INLINE_VISIBILITY + bool operator()() const + { + return __this->__try_wait(__phase); + } + }; + _LIBCUDACXX_INLINE_VISIBILITY bool __try_wait(arrival_token __phase) const { #if __CUDA_ARCH__ >= 800 @@ -104,35 +124,14 @@ public: else #endif { - return __barrier.try_wait(std::move(__phase)); + return __barrier.__try_wait(std::move(__phase)); } } template friend class pipeline; - _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait_parity(bool __parity) const { -#if __CUDA_ARCH__ >= 800 - if (__isShared(&__barrier)) { - int __ready = 0; - asm volatile ("{\n\t" - ".reg .pred p;\n\t" - "mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t" - "selp.b32 %0, 1, 0, p;\n\t" - "}" - : "=r"(__ready) - : "r"(static_cast(__cvta_generic_to_shared(&__barrier))), "r"(static_cast(__parity)) - : "memory"); - return bool(__ready); - } - else -#endif - { - return __barrier.try_wait_parity(__parity); - } - } - +public: barrier() = default; barrier(const barrier &) = delete; @@ -217,24 +216,7 @@ public: _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token && __phase) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester(this, _CUDA_VSTD::move(__phase))); - } - - _LIBCUDACXX_INLINE_VISIBILITY - bool try_wait(arrival_token __phase) const { - return __try_wait(_CUDA_VSTD::move(__phase)); - } - - inline _LIBCUDACXX_INLINE_VISIBILITY - bool try_wait_parity(bool __parity) const - { - return __try_wait_parity(__parity); - } - - inline _LIBCUDACXX_INLINE_VISIBILITY - void wait_parity(bool __parity) const - { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester_parity(this, __parity)); + _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase))); } inline _LIBCUDACXX_INLINE_VISIBILITY diff --git a/libcxx/include/barrier b/libcxx/include/barrier index d8be55452c..762df1934d 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -209,12 +209,6 @@ class __barrier_base { _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __expected, __arrived; _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion; _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __phase; - - _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait_phase(bool __old_phase) const - { - return __phase.load(memory_order_acquire) != __old_phase; - } public: using arrival_token = bool; @@ -247,15 +241,11 @@ public: return __old_phase; } _LIBCUDACXX_INLINE_VISIBILITY - bool try_wait(arrival_token __old) const - { - return __try_wait_phase(__old); - } - _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait_parity(bool __parity) const + bool __try_wait(arrival_token __old_phase) const { - return __try_wait_phase(__parity); + return __phase != __old_phase; } + _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token&& __old_phase) const { @@ -280,42 +270,6 @@ public: } }; -template -struct __barrier_poll_tester { - __Barrier const* __this; - typename __Barrier::arrival_token __phase; - - _LIBCUDACXX_INLINE_VISIBILITY - __barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_) - : __this(__this_) - , __phase(_CUDA_VSTD::move(__phase_)) - {} - - _LIBCUDACXX_INLINE_VISIBILITY - bool operator()() const - { - return __this->try_wait(__phase); - } -}; - -template -struct __barrier_poll_tester_parity { - __Barrier const* __this; - bool __parity; - - _LIBCUDACXX_INLINE_VISIBILITY - __barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_) - : __this(__this_) - , __parity(__parity_) - {} - - inline _LIBCUDACXX_INLINE_VISIBILITY - bool operator()() const - { - return __this->try_wait_parity(__parity); - } -}; - template class __barrier_base<__empty_completion, _Sco> { @@ -331,23 +285,29 @@ public: using arrival_token = uint64_t; private: - static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR + struct __poll_tester { + __barrier_base const* __this; + arrival_token __phase; + + _LIBCUDACXX_INLINE_VISIBILITY + __poll_tester(__barrier_base const* __this_, arrival_token&& __phase_) + : __this(__this_) + , __phase(_CUDA_VSTD::move(__phase_)) + {} + + inline _LIBCUDACXX_INLINE_VISIBILITY + bool operator()() const + { + return __this->__try_wait(__phase); + } + }; + + static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR uint64_t __init(ptrdiff_t __count) _NOEXCEPT { return (((1u << 31) - __count) << 32) | ((1u << 31) - __count); } - _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait_phase(uint64_t __phase) const - { - uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); - return ((__current & __phase_bit) != __phase); - } - _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait_parity(bool __parity) const - { - return __try_wait_phase(__parity ? __phase_bit : 0); - } public: __barrier_base() = default; @@ -363,20 +323,10 @@ public: __barrier_base& operator=(__barrier_base const&) = delete; _LIBCUDACXX_INLINE_VISIBILITY - bool __try_wait(arrival_token __old) const - { - return __try_wait_phase(__old & __phase_bit); - } - - _LIBCUDACXX_INLINE_VISIBILITY - bool try_wait_parity(bool __parity) const - { - return __try_wait_parity(__parity); - } - _LIBCUDACXX_INLINE_VISIBILITY - bool try_wait(arrival_token __old) const + bool __try_wait(arrival_token __phase) const { - return __try_wait(__old); + uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); + return ((__current & __phase_bit) != __phase); } _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY @@ -390,22 +340,17 @@ public: } return __old & __phase_bit; } - _LIBCUDACXX_INLINE_VISIBILITY + inline _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token&& __phase) const { - __libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase))); + __libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase))); } - _LIBCUDACXX_INLINE_VISIBILITY - void wait_parity(bool __parity) const - { - __libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); - } - _LIBCUDACXX_INLINE_VISIBILITY + inline _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_wait() { wait(arrive()); } - _LIBCUDACXX_INLINE_VISIBILITY + inline _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_drop() { __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);