NVIDIA · wmaxey · Aug 6, 2021 · Aug 5, 2021 · Aug 5, 2021
diff --git a/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp b/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp
diff --git a/docs/releases/changelog.md b/docs/releases/changelog.md
@@ -22,11 +22,6 @@ Supported ABI Versions: 4 (default), 3, and 2.
 
 Included in: CUDA Toolkit 11.5.
 
-### New Features
-
-- #111: Adds two APIs to `cuda::std::barrier` for waiting on the parity of the barrier.
-  - Thanks to Olivier Giroux for this contribution.
-
 ### Issues Fixed
 
 - #179: Refactors the atomic layer to allow for layering the host device/host abstractions.

diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier
@@ -40,6 +40,8 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> {
     template<thread_scope>
     friend class pipeline;
 
+    using std::__barrier_base<_CompletionF, _Sco>::__try_wait;
+
 public:
     barrier() = default;
 
@@ -86,6 +88,24 @@ class barrier<thread_scope_block, std::__empty_completion> : public __block_scop
 public:
     using arrival_token = typename __barrier_base::arrival_token;
 
+private:
+    struct __poll_tester {
+        barrier const* __this;
+        arrival_token __phase;
+
+        _LIBCUDACXX_INLINE_VISIBILITY
+        __poll_tester(barrier const* __this_, arrival_token&& __phase_)
+          : __this(__this_)
+          , __phase(_CUDA_VSTD::move(__phase_))
+        {}
+
+        inline _LIBCUDACXX_INLINE_VISIBILITY
+        bool operator()() const
+        {
+            return __this->__try_wait(__phase);
+        }
+    };
+
     _LIBCUDACXX_INLINE_VISIBILITY
     bool __try_wait(arrival_token __phase) const {
 #if __CUDA_ARCH__ >= 800
@@ -104,35 +124,14 @@ public:
         else
 #endif
         {
-            return __barrier.try_wait(std::move(__phase));
+            return __barrier.__try_wait(std::move(__phase));
         }
     }
 
     template<thread_scope>
     friend class pipeline;
 
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_parity(bool __parity) const {
-#if __CUDA_ARCH__ >= 800
-        if (__isShared(&__barrier)) {
-            int __ready = 0;
-            asm volatile ("{\n\t"
-                    ".reg .pred p;\n\t"
-                    "mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t"
-                    "selp.b32 %0, 1, 0, p;\n\t"
-                    "}"
-                : "=r"(__ready)
-                : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "r"(static_cast<std::uint32_t>(__parity))
-                : "memory");
-            return bool(__ready);
-        }
-        else
-#endif
-        {
-            return __barrier.try_wait_parity(__parity);
-        }
-    }
-
+public:
     barrier() = default;
 
     barrier(const barrier &) = delete;
@@ -217,24 +216,7 @@ public:
     _LIBCUDACXX_INLINE_VISIBILITY
     void wait(arrival_token && __phase) const
     {
-        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester<barrier>(this, _CUDA_VSTD::move(__phase)));
-    }
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait(arrival_token __phase) const {
-        return __try_wait(_CUDA_VSTD::move(__phase));
-    }
-
-    inline _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait_parity(bool __parity) const
-    {
-        return __try_wait_parity(__parity);
-    }
-
-    inline _LIBCUDACXX_INLINE_VISIBILITY
-    void wait_parity(bool __parity) const
-    {
-        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester_parity<barrier>(this, __parity));
+        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
     }
 
     inline _LIBCUDACXX_INLINE_VISIBILITY

diff --git a/libcxx/include/barrier b/libcxx/include/barrier
@@ -209,12 +209,6 @@ class __barrier_base {
     _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<ptrdiff_t, _Sco> __expected, __arrived;
     _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF                   __completion;
     _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<bool, _Sco>      __phase;
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_phase(bool __old_phase) const
-    {
-        return __phase.load(memory_order_acquire) != __old_phase;
-    }
 public:
     using arrival_token = bool;
 
@@ -247,15 +241,11 @@ public:
         return __old_phase;
     }
     _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait(arrival_token __old) const
-    {
-        return __try_wait_phase(__old);
-    }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_parity(bool __parity) const
+    bool __try_wait(arrival_token __old_phase) const
     {
-        return __try_wait_phase(__parity);
+        return __phase != __old_phase;
     }
+
     _LIBCUDACXX_INLINE_VISIBILITY
     void wait(arrival_token&& __old_phase) const
     {
@@ -280,42 +270,6 @@ public:
     }
 };
 
-template<class __Barrier>
-struct __barrier_poll_tester {
-    __Barrier const* __this;
-    typename __Barrier::arrival_token __phase;
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    __barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_)
-        : __this(__this_)
-        , __phase(_CUDA_VSTD::move(__phase_))
-    {}
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool operator()() const
-    {
-        return __this->try_wait(__phase);
-    }
-};
-
-template<class __Barrier>
-struct __barrier_poll_tester_parity {
-    __Barrier const* __this;
-    bool __parity;
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    __barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_)
-        : __this(__this_)
-        , __parity(__parity_)
-    {}
-
-    inline _LIBCUDACXX_INLINE_VISIBILITY
-    bool operator()() const
-    {
-        return __this->try_wait_parity(__parity);
-    }
-};
-
 template<int _Sco>
 class __barrier_base<__empty_completion, _Sco> {
 
@@ -331,23 +285,29 @@ public:
     using arrival_token = uint64_t;
 
 private:
-    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
+    struct __poll_tester {
+        __barrier_base const* __this;
+        arrival_token __phase;
+
+        _LIBCUDACXX_INLINE_VISIBILITY
+        __poll_tester(__barrier_base const* __this_, arrival_token&& __phase_)
+          : __this(__this_)
+          , __phase(_CUDA_VSTD::move(__phase_))
+        {}
+
+        inline _LIBCUDACXX_INLINE_VISIBILITY
+        bool operator()() const
+        {
+            return __this->__try_wait(__phase);
+        }
+    };
+
+    static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
     uint64_t __init(ptrdiff_t __count) _NOEXCEPT
     {
         return (((1u << 31) - __count) << 32)
               | ((1u << 31) - __count);
     }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_phase(uint64_t __phase) const
-    {
-        uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
-        return ((__current & __phase_bit) != __phase);
-    }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_parity(bool __parity) const
-    {
-        return __try_wait_phase(__parity ? __phase_bit : 0);
-    }
 
 public:
     __barrier_base() = default;
@@ -363,20 +323,10 @@ public:
     __barrier_base& operator=(__barrier_base const&) = delete;
 
     _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait(arrival_token __old) const
-    {
-        return __try_wait_phase(__old & __phase_bit);
-    }
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait_parity(bool __parity) const
-    {
-        return __try_wait_parity(__parity);
-    }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait(arrival_token __old) const
+    bool __try_wait(arrival_token __phase) const
     {
-        return __try_wait(__old);
+        uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
+        return ((__current & __phase_bit) != __phase);
     }
 
     _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY
@@ -390,22 +340,17 @@ public:
         }
         return __old & __phase_bit;
     }
-    _LIBCUDACXX_INLINE_VISIBILITY
+    inline _LIBCUDACXX_INLINE_VISIBILITY
     void wait(arrival_token&& __phase) const
     {
-        __libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase)));
+        __libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
     }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    void wait_parity(bool __parity) const
-    {
-        __libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity));
-    }
-    _LIBCUDACXX_INLINE_VISIBILITY
+    inline _LIBCUDACXX_INLINE_VISIBILITY
     void arrive_and_wait()
     {
         wait(arrive());
     }
-    _LIBCUDACXX_INLINE_VISIBILITY
+    inline _LIBCUDACXX_INLINE_VISIBILITY
     void arrive_and_drop()
     {
         __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);