Merge remote-tracking branch 'origin/main' into release-0.31.X

msimberg · msimberg · commit be0f6e0ff195 · 2024-12-18T15:24:34.000+01:00
diff --git a/.gitlab/scripts/run_performance_benchmarks.sh b/.gitlab/scripts/run_performance_benchmarks.sh
@@ -37,6 +37,7 @@ pika_targets=(
     "task_yield_test"
     "task_yield_test"
     "condition_variable_overhead_test"
+    "async_rw_mutex_scheduling_test"
 )
 pika_test_options=(
     "--pika:ini=pika.thread_queue.init_threads_count=100 \
@@ -107,6 +108,12 @@ pika_test_options=(
 --pika:threads=2
 --perftest-json"
 
+    "--num-iterations=1000
+--num-rw-accesses=5
+--num-ro-accesses=5
+--repetitions=100
+--pika:threads=4
+--perftest-json"
 )
 
 index=0
diff --git a/libs/pika/synchronization/include/pika/synchronization/async_rw_mutex.hpp b/libs/pika/synchronization/include/pika/synchronization/async_rw_mutex.hpp
@@ -17,6 +17,7 @@
 #include <pika/functional/unique_function.hpp>
 
 #include <atomic>
+#include <cstddef>
 #include <exception>
 #include <memory>
 #include <mutex>
@@ -72,7 +73,15 @@ namespace pika::execution::experimental {
                     // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
                     next_state->set_value(std::move(*value));
 
-                    for (auto& continuation : continuations) { continuation(next_state); }
+                    if (!continuations.empty())
+                    {
+                        auto const size = continuations.size();
+                        for (std::size_t i = 0; i < size - 1; ++i) { continuations[i](next_state); }
+
+                        // Move shared state into the last continuation to ensure that the
+                        // continuations release the last reference and not this destructor.
+                        continuations[size - 1](std::move(next_state));
+                    }
                 }
             }
 
@@ -131,7 +140,15 @@ namespace pika::execution::experimental {
                 // If there is no next state the continuations must be empty.
                 PIKA_ASSERT(next_state || continuations.empty());
 
-                for (auto& continuation : continuations) { continuation(next_state); }
+                if (!continuations.empty())
+                {
+                    auto const size = continuations.size();
+                    for (std::size_t i = 0; i < size - 1; ++i) { continuations[i](next_state); }
+
+                    // Move shared state into the last continuation to ensure that the continuations
+                    // release the last reference and not this destructor.
+                    continuations[size - 1](std::move(next_state));
+                }
             }
 
             void set_next_state(std::shared_ptr<async_rw_mutex_shared_state> state)
diff --git a/libs/pika/synchronization/tests/performance/CMakeLists.txt b/libs/pika/synchronization/tests/performance/CMakeLists.txt
@@ -4,23 +4,19 @@
 # Distributed under the Boost Software License, Version 1.0. (See accompanying
 # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
-set(benchmarks)
+set(benchmarks async_rw_mutex_scheduling)
 
 foreach(benchmark ${benchmarks})
-
   set(sources ${benchmark}.cpp)
 
   source_group("Source Files" FILES ${sources})
 
-  # add benchmark executable
   pika_add_executable(
     ${benchmark}_test INTERNAL_FLAGS
     SOURCES ${sources}
     EXCLUDE_FROM_ALL ${${benchmark}_FLAGS}
     FOLDER "Benchmarks/Modules/Synchronization"
   )
 
-  # add a custom target for this benchmark
   pika_add_performance_test("modules.synchronization" ${benchmark} ${${benchmark}_PARAMETERS})
-
 endforeach()
diff --git a/libs/pika/synchronization/tests/performance/async_rw_mutex_scheduling.cpp b/libs/pika/synchronization/tests/performance/async_rw_mutex_scheduling.cpp
@@ -0,0 +1,133 @@
+//  Copyright (c) 2024 ETH Zurich
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// This test measures the performance of accessing values through async_rw_mutex. Accesses are
+// scheduled on new tasks to test the performance with concurrency. This means that the benchmark
+// includes the overhead of creating new tasks, but it represents a more realistic scenario.
+
+#include <pika/config.hpp>
+#include <pika/async_rw_mutex.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+#include <pika/runtime.hpp>
+#include <pika/testing/performance.hpp>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <fmt/printf.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <utility>
+
+using pika::program_options::bool_switch;
+using pika::program_options::options_description;
+using pika::program_options::value;
+using pika::program_options::variables_map;
+
+using pika::chrono::detail::high_resolution_timer;
+
+namespace ex = pika::execution::experimental;
+namespace tt = pika::this_thread::experimental;
+
+template <typename T>
+double test_async_rw_mutex(
+    std::uint64_t num_iterations, std::uint64_t num_rw_accesses, std::uint64_t num_ro_accesses)
+{
+    high_resolution_timer timer;
+    // elapsed() is read after this scope closes, so the mutex and scheduler are destroyed first.
+    {
+        ex::async_rw_mutex<T> mutex;
+        ex::thread_pool_scheduler scheduler;
+
+        for (std::uint64_t iteration = 0; iteration < num_iterations; ++iteration)
+        {
+            for (std::uint64_t access = 0; access < num_rw_accesses; ++access)
+            {
+                ex::start_detached(mutex.readwrite() | ex::continues_on(scheduler));
+            }
+
+            for (std::uint64_t access = 0; access < num_ro_accesses; ++access)
+            {
+                ex::start_detached(mutex.read() | ex::continues_on(scheduler));
+            }
+        }
+        // NOTE(review): presumably returns only once all accesses queued above are done -- confirm.
+        tt::sync_wait(mutex.readwrite());
+    }
+
+    return timer.elapsed();
+}
+
+int pika_main(variables_map& vm)
+{
+    auto const num_iterations = vm["num-iterations"].as<std::uint64_t>();
+    auto const num_rw_accesses = vm["num-rw-accesses"].as<std::uint64_t>();
+    auto const num_ro_accesses = vm["num-ro-accesses"].as<std::uint64_t>();
+    auto const repetitions = vm["repetitions"].as<std::uint64_t>();
+    auto const perftest_json = vm["perftest-json"].as<bool>();
+    // Seed min/max with max()/lowest(); numeric_limits<double>::min() is positive, not lowest.
+    double time_avg_s = 0.0;
+    double time_min_s = std::numeric_limits<double>::max();
+    double time_max_s = std::numeric_limits<double>::lowest();
+
+    for (std::uint64_t i = 0; i < repetitions; ++i)
+    {
+        double time_s = test_async_rw_mutex<void>(num_iterations, num_rw_accesses, num_ro_accesses);
+
+        time_avg_s += time_s;
+        time_max_s = (std::max)(time_max_s, time_s);
+        time_min_s = (std::min)(time_min_s, time_s);
+    }
+
+    time_avg_s /= repetitions;
+    // Convert seconds per repetition into microseconds per iteration.
+    double const time_avg_us = time_avg_s * 1e6 / num_iterations;
+    double const time_min_us = time_min_s * 1e6 / num_iterations;
+    double const time_max_us = time_max_s * 1e6 / num_iterations;
+
+    if (perftest_json)
+    {
+        pika::util::detail::json_perf_times t;
+        t.add(fmt::format("async_rw_mutex - {} threads - {}:{}", pika::get_num_worker_threads(),
+                  num_rw_accesses, num_ro_accesses),
+            time_avg_us);
+        std::cout << t;
+    }
+    else
+    {
+        fmt::print(
+            "repetitions,iterations,rw_accesses,ro_accesses,time_avg_us,time_min_us,time_max_us\n");
+        fmt::print("{},{},{},{},{},{},{}\n", repetitions, num_iterations, num_rw_accesses,
+            num_ro_accesses, time_avg_us, time_min_us, time_max_us);
+    }
+
+    pika::finalize();
+    return EXIT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+int main(int argc, char* argv[])
+{
+    options_description cmdline("usage: " PIKA_APPLICATION_STRING " [options]");
+    // clang-format off
+    cmdline.add_options()
+        ("num-iterations", value<std::uint64_t>()->default_value(100), "number of times to cycle through read-write and read-only accesses in one test")
+        ("num-rw-accesses", value<std::uint64_t>()->default_value(5), "number of consecutive read-write accesses")
+        ("num-ro-accesses", value<std::uint64_t>()->default_value(5), "number of consecutive read-only accesses")
+        ("repetitions", value<std::uint64_t>()->default_value(1), "number of repetitions of the full benchmark")
+        ("perftest-json", bool_switch(), "print average time per iteration in json format for use with performance CI.")
+        // clang-format on
+        ;
+    // Pass the benchmark options to the runtime and start it with pika_main.
+    pika::init_params init_args;
+    init_args.desc_cmdline = cmdline;
+    return pika::init(pika_main, argc, argv, init_args);
+}
diff --git a/libs/pika/synchronization/tests/unit/CMakeLists.txt b/libs/pika/synchronization/tests/unit/CMakeLists.txt
@@ -6,6 +6,7 @@
 
 set(tests
     async_rw_mutex
+    async_rw_mutex_yielding
     barrier
     binary_semaphore
     condition_variable
@@ -19,6 +20,7 @@ set(tests
 )
 
 set(async_rw_mutex_PARAMETERS THREADS 4)
+set(async_rw_mutex_yielding_PARAMETERS THREADS 4)
 set(barrier_cpp20_PARAMETERS THREADS 4)
 set(binary_semaphore_cpp20_PARAMETERS THREADS 4)
 
diff --git a/libs/pika/synchronization/tests/unit/async_rw_mutex_yielding.cpp b/libs/pika/synchronization/tests/unit/async_rw_mutex_yielding.cpp
@@ -0,0 +1,82 @@
+//  Copyright (c) 2024 ETH Zurich
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// This test checks for a desirable property in async_rw_mutex: if a previous access is guaranteed
+// to have completed, e.g. via sync_wait, the next access is guaranteed to start inline. This makes
+// it slightly easier to reason about whether waiting for a sender from async_rw_mutex may yield or
+// not.
+//
+// Note that while we test the property here, we don't guarantee that it won't change. We simply
+// want to preserve the property as long as it's reasonable with the current implementation.
+
+#include <pika/async_rw_mutex.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+#include <pika/modules/threading_base.hpp>
+#include <pika/testing.hpp>
+
+#include <cstddef>
+#include <cstdlib>
+
+namespace ex = pika::execution::experimental;
+namespace tt = pika::this_thread::experimental;
+
+template <typename M>
+void test(M&& m)
+{
+    ex::thread_pool_scheduler sched{};
+
+    // We first access the mutex in a way such that the wrapper will be released in another task.
+    ex::start_detached(m.readwrite() | ex::continues_on(sched) | ex::then([](auto&&) {}));
+
+    // Then we access the mutex again, but block to wait for the result. We discard the result so
+    // the wrapper is released immediately.
+    {
+        [[maybe_unused]] auto wrapper = tt::sync_wait(m.readwrite());
+    }
+
+    // Finally, since we blockingly waited for the result above, we expect the below sync_wait to
+    // never cause the task to yield, or change worker thread. To achieve this, the async_rw_mutex
+    // implementation must guarantee that in a situation like this, the wrapper returned by
+    // sync_wait holds the last reference to the shared state of that particular access. This would
+    // not happen if e.g. the async_rw_mutex_shared_state destructor released the next shared state
+    // only once all the continuations have been triggered.
+    //
+    // We check that neither the thread phase (how many invocations of the tasks, or in other words:
+    // did the task yield?) nor worker thread change across the sync_wait. The thread phase is a
+    // more reliable check, but is not always available. The worker thread can change if the task
+    // yields whenever work stealing is enabled, but with much lower probability.
+    auto phase_before = pika::threads::detail::get_self_id_data()->get_thread_phase();
+    auto thread_before = pika::get_worker_thread_num();
+
+    {
+        [[maybe_unused]] auto wrapper = tt::sync_wait(m.read());
+    }
+
+    auto phase_after = pika::threads::detail::get_self_id_data()->get_thread_phase();
+    auto thread_after = pika::get_worker_thread_num();
+
+    PIKA_TEST_EQ(phase_before, phase_after);
+    PIKA_TEST_EQ(thread_before, thread_after);
+}
+
+int pika_main()
+{
+    pika::scoped_finalize sf{};    // presumably finalizes the runtime on scope exit -- confirm
+
+    // This whole test fails only with low probability, so repeat it some reasonable number of
+    // times. 100 does not guarantee failure in a single run, but hopefully across multiple CI
+    // configurations at least one run will fail.
+    for (std::size_t iteration = 0; iteration < 100; ++iteration)
+    {
+        test(ex::async_rw_mutex<int>{42});
+        test(ex::async_rw_mutex<void>{});
+    }
+
+    return EXIT_SUCCESS;
+}
+// Program entry point: hands pika_main and the command line to pika::init.
+int main(int argc, char* argv[]) { return pika::init(pika_main, argc, argv); }