diff --git a/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp b/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp
deleted file mode 100644
index 2c4460995b..0000000000
--- a/.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: nvrtc, pre-sm-70
-
-// uncomment for a really verbose output detailing what test steps are being launched
-// #define DEBUG_TESTERS
-
-#include "helpers.h"
-
-#include <atomic>
-#include <cuda/barrier>
-
-template<typename Barrier>
-struct barrier_and_token
-{
-    using barrier_t = Barrier;
-    using token_t = typename barrier_t::arrival_token;
-
-    barrier_t barrier;
-    cuda::std::atomic<bool> parity_waiting{false};
-
-    template<typename ...Args>
-    __host__ __device__
-    barrier_and_token(Args && ...args) : barrier{ cuda::std::forward<Args>(args)... }
-    {
-    }
-};
-
-struct barrier_arrive_and_wait
-{
-    using async = cuda::std::true_type;
-
-    template<typename Data>
-    __host__ __device__
-    static void perform(Data & data)
-    {
-        while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false)
-        {
-            data.parity_waiting.wait(false);
-        }
-        data.barrier.arrive_and_wait();
-    }
-};
-
-template <bool Phase>
-struct barrier_parity_wait
-{
-    using async = cuda::std::true_type;
-
-    template<typename Data>
-    __host__ __device__
-    static void perform(Data & data)
-    {
-        data.parity_waiting.store(true, cuda::std::memory_order_release);
-        data.parity_waiting.notify_all();
-        data.barrier.wait_parity(Phase);
-    }
-};
-
-struct clear_token
-{
-    template<typename Data>
-    __host__ __device__
-    static void perform(Data & data)
-    {
-        data.parity_waiting.store(false, cuda::std::memory_order_release);
-    }
-};
-
-using aw_aw_pw = performer_list<
-    barrier_parity_wait<false>,
-    barrier_arrive_and_wait,
-    barrier_arrive_and_wait,
-    async_tester_fence,
-    clear_token,
-    barrier_parity_wait<true>,
-    barrier_arrive_and_wait,
-    barrier_arrive_and_wait,
-    async_tester_fence,
-    clear_token
->;
-
-void kernel_invoker()
-{
-    validate_not_movable<
-        barrier_and_token<cuda::std::barrier<>>,
-        aw_aw_pw
-      >(2);
-    validate_not_movable<
-        barrier_and_token<cuda::barrier<cuda::thread_scope_system>>,
-        aw_aw_pw
-      >(2);
-}
-
-int main(int arg, char ** argv)
-{
-#ifndef __CUDA_ARCH__
-    kernel_invoker();
-#endif
-
-    return 0;
-}
-
diff --git a/docs/releases/changelog.md b/docs/releases/changelog.md
index cde1871e83..77ff291c15 100644
--- a/docs/releases/changelog.md
+++ b/docs/releases/changelog.md
@@ -22,11 +22,6 @@ Supported ABI Versions: 4 (default), 3, and 2.
 
 Included in: CUDA Toolkit 11.5.
 
-### New Features
-
-- #111: Adds two APIs to `cuda::std::barrier` for waiting on the parity of the barrier.
-  - Thanks to Olivier Giroux for this contribution.
-
 ### Issues Fixed
 
 - #179: Refactors the atomic layer to allow for layering the host device/host abstractions.
diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier
index e7af6f138c..b35e927e34 100644
--- a/include/cuda/std/barrier
+++ b/include/cuda/std/barrier
@@ -40,6 +40,8 @@ class barrier : public std::__barrier_base<_CompletionF, _Sco> {
     template<thread_scope>
     friend class pipeline;
 
+    using std::__barrier_base<_CompletionF, _Sco>::__try_wait;
+
 public:
     barrier() = default;
 
@@ -86,6 +88,24 @@ class barrier<thread_scope_block, std::__empty_completion> : public __block_scop
 public:
     using arrival_token = typename __barrier_base::arrival_token;
 
+private:
+    struct __poll_tester {
+        barrier const* __this;
+        arrival_token __phase;
+
+        _LIBCUDACXX_INLINE_VISIBILITY
+        __poll_tester(barrier const* __this_, arrival_token&& __phase_)
+          : __this(__this_)
+          , __phase(_CUDA_VSTD::move(__phase_))
+        {}
+
+        inline _LIBCUDACXX_INLINE_VISIBILITY
+        bool operator()() const
+        {
+            return __this->__try_wait(__phase);
+        }
+    };
+
     _LIBCUDACXX_INLINE_VISIBILITY
     bool __try_wait(arrival_token __phase) const {
 #if __CUDA_ARCH__ >= 800
@@ -104,35 +124,14 @@ public:
         else
 #endif
         {
-            return __barrier.try_wait(std::move(__phase));
+            return __barrier.__try_wait(std::move(__phase));
         }
     }
 
     template<thread_scope>
     friend class pipeline;
 
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_parity(bool __parity) const {
-#if __CUDA_ARCH__ >= 800
-        if (__isShared(&__barrier)) {
-            int __ready = 0;
-            asm volatile ("{\n\t"
-                    ".reg .pred p;\n\t"
-                    "mbarrier.test_wait.parity.shared.b64 p, [%1], %2;\n\t"
-                    "selp.b32 %0, 1, 0, p;\n\t"
-                    "}"
-                : "=r"(__ready)
-                : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "r"(static_cast<std::uint32_t>(__parity))
-                : "memory");
-            return bool(__ready);
-        }
-        else
-#endif
-        {
-            return __barrier.try_wait_parity(__parity);
-        }
-    }
-
+public:
     barrier() = default;
 
     barrier(const barrier &) = delete;
@@ -217,24 +216,7 @@ public:
     _LIBCUDACXX_INLINE_VISIBILITY
     void wait(arrival_token && __phase) const
     {
-        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester<barrier>(this, _CUDA_VSTD::move(__phase)));
-    }
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait(arrival_token __phase) const {
-        return __try_wait(_CUDA_VSTD::move(__phase));
-    }
-
-    inline _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait_parity(bool __parity) const
-    {
-        return __try_wait_parity(__parity);
-    }
-
-    inline _LIBCUDACXX_INLINE_VISIBILITY
-    void wait_parity(bool __parity) const
-    {
-        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(std::__barrier_poll_tester_parity<barrier>(this, __parity));
+        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
     }
 
     inline _LIBCUDACXX_INLINE_VISIBILITY
diff --git a/libcxx/include/barrier b/libcxx/include/barrier
index d8be55452c..762df1934d 100644
--- a/libcxx/include/barrier
+++ b/libcxx/include/barrier
@@ -209,12 +209,6 @@ class __barrier_base {
     _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<ptrdiff_t, _Sco> __expected, __arrived;
     _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF                   __completion;
     _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base<bool, _Sco>      __phase;
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_phase(bool __old_phase) const
-    {
-        return __phase.load(memory_order_acquire) != __old_phase;
-    }
 public:
     using arrival_token = bool;
 
@@ -247,15 +241,11 @@ public:
         return __old_phase;
     }
     _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait(arrival_token __old) const
-    {
-        return __try_wait_phase(__old);
-    }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_parity(bool __parity) const
+    bool __try_wait(arrival_token __old_phase) const
     {
-        return __try_wait_phase(__parity);
+        return __phase != __old_phase;
     }
+
     _LIBCUDACXX_INLINE_VISIBILITY
     void wait(arrival_token&& __old_phase) const
     {
@@ -280,42 +270,6 @@ public:
     }
 };
 
-template<class __Barrier>
-struct __barrier_poll_tester {
-    __Barrier const* __this;
-    typename __Barrier::arrival_token __phase;
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    __barrier_poll_tester(__Barrier const* __this_, typename __Barrier::arrival_token&& __phase_)
-        : __this(__this_)
-        , __phase(_CUDA_VSTD::move(__phase_))
-    {}
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool operator()() const
-    {
-        return __this->try_wait(__phase);
-    }
-};
-
-template<class __Barrier>
-struct __barrier_poll_tester_parity {
-    __Barrier const* __this;
-    bool __parity;
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    __barrier_poll_tester_parity(__Barrier const* __this_, bool __parity_)
-        : __this(__this_)
-        , __parity(__parity_)
-    {}
-
-    inline _LIBCUDACXX_INLINE_VISIBILITY
-    bool operator()() const
-    {
-        return __this->try_wait_parity(__parity);
-    }
-};
-
 template<int _Sco>
 class __barrier_base<__empty_completion, _Sco> {
 
@@ -331,23 +285,29 @@ public:
     using arrival_token = uint64_t;
 
 private:
-    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
+    struct __poll_tester {
+        __barrier_base const* __this;
+        arrival_token __phase;
+
+        _LIBCUDACXX_INLINE_VISIBILITY
+        __poll_tester(__barrier_base const* __this_, arrival_token&& __phase_)
+          : __this(__this_)
+          , __phase(_CUDA_VSTD::move(__phase_))
+        {}
+
+        inline _LIBCUDACXX_INLINE_VISIBILITY
+        bool operator()() const
+        {
+            return __this->__try_wait(__phase);
+        }
+    };
+
+    static inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
     uint64_t __init(ptrdiff_t __count) _NOEXCEPT
     {
         return (((1u << 31) - __count) << 32)
               | ((1u << 31) - __count);
     }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_phase(uint64_t __phase) const
-    {
-        uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
-        return ((__current & __phase_bit) != __phase);
-    }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait_parity(bool __parity) const
-    {
-        return __try_wait_phase(__parity ? __phase_bit : 0);
-    }
 
 public:
     __barrier_base() = default;
@@ -363,20 +323,10 @@ public:
     __barrier_base& operator=(__barrier_base const&) = delete;
 
     _LIBCUDACXX_INLINE_VISIBILITY
-    bool __try_wait(arrival_token __old) const
-    {
-        return __try_wait_phase(__old & __phase_bit);
-    }
-
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait_parity(bool __parity) const
-    {
-        return __try_wait_parity(__parity);
-    }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    bool try_wait(arrival_token __old) const
+    bool __try_wait(arrival_token __phase) const
     {
-        return __try_wait(__old);
+        uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
+        return ((__current & __phase_bit) != __phase);
     }
 
     _LIBCUDACXX_NODISCARD_ATTRIBUTE inline _LIBCUDACXX_INLINE_VISIBILITY
@@ -390,22 +340,17 @@ public:
         }
         return __old & __phase_bit;
     }
-    _LIBCUDACXX_INLINE_VISIBILITY
+    inline _LIBCUDACXX_INLINE_VISIBILITY
     void wait(arrival_token&& __phase) const
     {
-        __libcpp_thread_poll_with_backoff(__barrier_poll_tester<__barrier_base>(this, _CUDA_VSTD::move(__phase)));
+        __libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
     }
-    _LIBCUDACXX_INLINE_VISIBILITY
-    void wait_parity(bool __parity) const
-    {
-        __libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity));
-    }
-    _LIBCUDACXX_INLINE_VISIBILITY
+    inline _LIBCUDACXX_INLINE_VISIBILITY
     void arrive_and_wait()
     {
         wait(arrive());
     }
-    _LIBCUDACXX_INLINE_VISIBILITY
+    inline _LIBCUDACXX_INLINE_VISIBILITY
     void arrive_and_drop()
     {
         __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);