[libc++][test] add benchmarks for std::atomic::wait (llvm#70571)

huixie90 · web-flow · commit 69279a8413e0 · 2024-02-21T13:43:35.000Z
For the mutex vs atomic test:

Old: `unique_lock<mutex>`
New: a lock implemented with `atomic::wait`

On a 10-year-old Intel MacBook, `atomic::wait` is 50% slower than `mutex`:

```
Benchmark                                      Time             CPU      Time Old      Time New       CPU Old       CPU New
----------------------------------------------------------------------------------------------------------------------------------
BM_multi_thread_lock_unlock/1024            +0.3735         +2.4497       1724726       2368935        153159        528354
BM_multi_thread_lock_unlock/2048            +0.4174         +1.2487       3410538       4834012        435062        978311
BM_multi_thread_lock_unlock/4096            +0.5256         +1.9824       6903783      10532681        590266       1760405
BM_multi_thread_lock_unlock/8192            +0.5415         +0.4578      14536391      22408399       1456328       2123075
BM_multi_thread_lock_unlock/16384           +0.5663         +0.0513      30181991      47275023       3316850       3486950
BM_multi_thread_lock_unlock/32768           +0.5635         -0.2081      62027663      96977726       6477076       5129190
BM_multi_thread_lock_unlock/65536           +0.5228         -0.3273     129637761     197408739      11341630       7628955
BM_multi_thread_lock_unlock/131072          +0.4825         -0.1070     266256295     394712193      10379800       9269200
BM_multi_thread_lock_unlock/262144          +0.4793         +0.2795     539732340     798409253      10802200      13821100
BM_multi_thread_lock_unlock/524288          +0.5272         +0.2847    1070035132    1634124353      14523000      18657800
BM_multi_thread_lock_unlock/1048576         +0.4799         +0.3353    2125510441    3145636119      13404200      17899000
OVERALL_GEOMEAN                             +0.4970         +0.3886             0             0             0             0
```

On Apple Arm, `atomic::wait` is 200% slower than `mutex`.
And `atomic::wait` on Apple Arm is even slower than on my 10-year-old Intel MacBook:

```
Benchmark                                      Time             CPU      Time Old      Time New       CPU Old       CPU New
----------------------------------------------------------------------------------------------------------------------------------
BM_multi_thread_lock_unlock/1024            +2.1811         +3.9854       2036726       6478993        119817        597334
BM_multi_thread_lock_unlock/2048            +1.6736         +1.4301       3162161       8454415        426201       1035727
BM_multi_thread_lock_unlock/4096            +1.1017         +0.6456       6620503      13914159        893019       1469578
BM_multi_thread_lock_unlock/8192            +0.6688         +0.2148      12089392      20174635       1489000       1808799
BM_multi_thread_lock_unlock/16384           +1.4217         -0.2436      19365999      46899345       2068266       1564530
BM_multi_thread_lock_unlock/32768           +2.6161         -0.4927      31371052     113440165       3715100       1884540
BM_multi_thread_lock_unlock/65536           +2.6286         -0.3967      54314581     197086847       5912764       3567410
BM_multi_thread_lock_unlock/131072          +2.3554         +0.4990     103176565     346201425       9260407      13880900
BM_multi_thread_lock_unlock/262144          +2.8780         +0.4995     182355400     707170733      16335852      24496000
BM_multi_thread_lock_unlock/524288          +3.0280         +0.3001     360953079    1453902595      32548700      42316364
BM_multi_thread_lock_unlock/1048576         +3.7480         +1.2374     714500462    3392470417      48603455     108747000
OVERALL_GEOMEAN                             +2.0791         +0.3874             0             0             0             0
```

For the atomic_wait test:

On my 2013 MacBook with an Intel CPU:

```
Run on (8 X 2300 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 256 KiB (x4)
  L3 Unified 6144 KiB (x1)
Load Average: 1.95, 3.77, 4.13
-----------------------------------------------------------------------------------------------------
Benchmark                                                           Time           CPU Iterations
-----------------------------------------------------------------------------------------------------
BM_atomic_wait_one_thread_one_atomic_wait/1024                 184455 ns     183979 ns       3760
BM_atomic_wait_one_thread_one_atomic_wait/2048                 361607 ns     360917 ns       1912
BM_atomic_wait_one_thread_one_atomic_wait/4096                 709055 ns     708326 ns        929
BM_atomic_wait_one_thread_one_atomic_wait/8192                1469063 ns    1467430 ns        488
BM_atomic_wait_one_thread_one_atomic_wait/16384               2865332 ns    2863473 ns        237
BM_atomic_wait_one_thread_one_atomic_wait/32768               5839429 ns    5834708 ns        113
BM_atomic_wait_one_thread_one_atomic_wait/65536              11460822 ns   11453183 ns         60
BM_atomic_wait_one_thread_one_atomic_wait/131072             23052804 ns   23035000 ns         30
BM_atomic_wait_one_thread_one_atomic_wait/262144             46958743 ns   46712733 ns         15
BM_atomic_wait_one_thread_one_atomic_wait/524288             93151904 ns   92977429 ns          7
BM_atomic_wait_one_thread_one_atomic_wait/1048576           186100011 ns  185888500 ns          4
BM_atomic_wait_one_thread_one_atomic_wait/2097152           364548135 ns  364280000 ns          2
BM_atomic_wait_one_thread_one_atomic_wait/4194304           747181672 ns  745056000 ns          1
BM_atomic_wait_one_thread_one_atomic_wait/8388608          1473070400 ns 1471165000 ns          1
BM_atomic_wait_one_thread_one_atomic_wait/16777216         2950352547 ns 2947373000 ns          1
BM_atomic_wait_multi_thread_one_atomic_wait/1024               668544 ns     167233 ns       4496
BM_atomic_wait_multi_thread_one_atomic_wait/2048              1384668 ns     369750 ns       1941
BM_atomic_wait_multi_thread_one_atomic_wait/4096              2851627 ns     768559 ns        995
BM_atomic_wait_multi_thread_one_atomic_wait/8192              5797669 ns    1476876 ns        526
BM_atomic_wait_multi_thread_one_atomic_wait/16384            11597952 ns    2692792 ns        260
BM_atomic_wait_multi_thread_one_atomic_wait/32768            23528028 ns    5291465 ns        142
BM_atomic_wait_multi_thread_one_atomic_wait/65536            46287247 ns    8547713 ns         87
BM_atomic_wait_multi_thread_one_atomic_wait/131072           90315848 ns   13294492 ns         61
BM_atomic_wait_multi_thread_one_atomic_wait/262144          190722393 ns   16193917 ns         36
BM_atomic_wait_multi_thread_one_atomic_wait/524288          408456684 ns   23641600 ns         10
BM_atomic_wait_multi_thread_one_atomic_wait/1048576         708809670 ns   36361900 ns         10
BM_atomic_wait_multi_thread_wait_different_atomics/1024       2116444 ns      11669 ns      10000
BM_atomic_wait_multi_thread_wait_different_atomics/2048      12435259 ns      21905 ns       1000
BM_atomic_wait_multi_thread_wait_different_atomics/4096       6393816 ns      17819 ns       1000
BM_atomic_wait_multi_thread_wait_different_atomics/8192      11930400 ns      28637 ns       1000
BM_atomic_wait_multi_thread_wait_different_atomics/16384     20987224 ns      35272 ns       1000
BM_atomic_wait_multi_thread_wait_different_atomics/32768     44335820 ns      66660 ns        100
BM_atomic_wait_multi_thread_wait_different_atomics/65536     91395912 ns     129030 ns        100
BM_atomic_wait_multi_thread_wait_different_atomics/131072   145440007 ns     165960 ns        100
BM_atomic_wait_multi_thread_wait_different_atomics/262144   368219935 ns     420800 ns         10
BM_atomic_wait_multi_thread_wait_different_atomics/524288   630106863 ns     809500 ns         10
BM_atomic_wait_multi_thread_wait_different_atomics/1048576 1138174673 ns    1093000 ns         10
```

On Apple Arm:

```
Run on (8 X 24.1208 MHz CPU s)
CPU Caches:
  L1 Data 64 KiB (x8)
  L1 Instruction 128 KiB (x8)
  L2 Unified 4096 KiB (x2)
Load Average: 1.34, 1.58, 1.66
-----------------------------------------------------------------------------------------------------
Benchmark                                                           Time           CPU Iterations
-----------------------------------------------------------------------------------------------------
BM_atomic_wait_one_thread_one_atomic_wait/1024                  61602 ns      61602 ns       8701
BM_atomic_wait_one_thread_one_atomic_wait/2048                 123148 ns     123146 ns       5688
BM_atomic_wait_one_thread_one_atomic_wait/4096                 246248 ns     246249 ns       2888
BM_atomic_wait_one_thread_one_atomic_wait/8192                 480373 ns     480359 ns       1455
BM_atomic_wait_one_thread_one_atomic_wait/16384                974725 ns     974721 ns        724
BM_atomic_wait_one_thread_one_atomic_wait/32768               1922185 ns    1922115 ns        355
BM_atomic_wait_one_thread_one_atomic_wait/65536               3940632 ns    3940608 ns        181
BM_atomic_wait_one_thread_one_atomic_wait/131072              7886302 ns    7886102 ns         88
BM_atomic_wait_one_thread_one_atomic_wait/262144             15393156 ns   15393000 ns         45
BM_atomic_wait_one_thread_one_atomic_wait/524288             30833221 ns   30832174 ns         23
BM_atomic_wait_one_thread_one_atomic_wait/1048576            62551936 ns   62551909 ns         11
BM_atomic_wait_one_thread_one_atomic_wait/2097152           123155625 ns  123155667 ns          6
BM_atomic_wait_one_thread_one_atomic_wait/4194304           252468180 ns  252458667 ns          3
BM_atomic_wait_one_thread_one_atomic_wait/8388608           505075604 ns  505075500 ns          2
BM_atomic_wait_one_thread_one_atomic_wait/16777216          992977209 ns  992935000 ns          1
BM_atomic_wait_multi_thread_one_atomic_wait/1024               531411 ns     239695 ns       2783
BM_atomic_wait_multi_thread_one_atomic_wait/2048              1030592 ns     484868 ns       1413
BM_atomic_wait_multi_thread_one_atomic_wait/4096              1951896 ns     922357 ns        631
BM_atomic_wait_multi_thread_one_atomic_wait/8192              3759893 ns    1952074 ns        390
BM_atomic_wait_multi_thread_one_atomic_wait/16384             7417929 ns    3458309 ns        233
BM_atomic_wait_multi_thread_one_atomic_wait/32768            14386361 ns    5590830 ns        100
BM_atomic_wait_multi_thread_one_atomic_wait/65536            29725536 ns    6521887 ns        115
BM_atomic_wait_multi_thread_one_atomic_wait/131072           60023797 ns   10766795 ns         73
BM_atomic_wait_multi_thread_one_atomic_wait/262144          120782267 ns   17532091 ns         44
BM_atomic_wait_multi_thread_one_atomic_wait/524288          242539333 ns   27506920 ns         25
BM_atomic_wait_multi_thread_one_atomic_wait/1048576         482833787 ns   53721600 ns         10
BM_atomic_wait_multi_thread_wait_different_atomics/1024       2230048 ns     626042 ns       1000
BM_atomic_wait_multi_thread_wait_different_atomics/2048       3931958 ns     837540 ns        884
BM_atomic_wait_multi_thread_wait_different_atomics/4096       6506887 ns    1127922 ns        586
BM_atomic_wait_multi_thread_wait_different_atomics/8192      10528008 ns    1651254 ns        456
BM_atomic_wait_multi_thread_wait_different_atomics/16384     18055829 ns    2066379 ns        317
BM_atomic_wait_multi_thread_wait_different_atomics/32768     29878496 ns    2875600 ns        100
BM_atomic_wait_multi_thread_wait_different_atomics/65536     50523799 ns    3193170 ns        100
BM_atomic_wait_multi_thread_wait_different_atomics/131072    85926943 ns    4121950 ns        100
BM_atomic_wait_multi_thread_wait_different_atomics/262144   154602296 ns    5879050 ns        100
BM_atomic_wait_multi_thread_wait_different_atomics/524288   279121754 ns   10063400 ns         10
BM_atomic_wait_multi_thread_wait_different_atomics/1048576  522796900 ns    1237030 ns         10
```
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
@@ -197,6 +197,8 @@ set(BENCHMARK_TESTS
     algorithms/sort.bench.cpp
     algorithms/sort_heap.bench.cpp
     algorithms/stable_sort.bench.cpp
+    atomic_wait.bench.cpp
+    atomic_wait_vs_mutex_lock.bench.cpp
     libcxxabi/dynamic_cast.bench.cpp
     libcxxabi/dynamic_cast_old_stress.bench.cpp
     allocation.bench.cpp
diff --git a/libcxx/benchmarks/atomic_wait.bench.cpp b/libcxx/benchmarks/atomic_wait.bench.cpp
@@ -0,0 +1,154 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <atomic>
+#include <numeric>
+#include <thread>
+
+#include "benchmark/benchmark.h"
+#include "make_test_thread.h"
+
+using namespace std::chrono_literals;
+
+// Single-waiter benchmark for std::atomic::wait: a background thread keeps
+// incrementing and notifying a counter while the benchmark thread performs
+// state.range(0) snapshot-then-wait rounds per benchmark iteration.
+void BM_atomic_wait_one_thread_one_atomic_wait(benchmark::State& state) {
+  std::atomic<std::uint64_t> counter;
+
+  // Body of the background thread: bump and notify until stop is requested.
+  auto keep_notifying = [&counter](std::stop_token stop) {
+    while (!stop.stop_requested()) {
+      counter.fetch_add(1, std::memory_order_relaxed);
+      counter.notify_all();
+    }
+  };
+
+  const std::uint64_t waits_per_iteration = state.range(0);
+  auto notifier = support::make_test_jthread(keep_notifying);
+
+  for (auto _ : state) {
+    for (std::uint64_t remaining = waits_per_iteration; remaining != 0; --remaining) {
+      // Blocks until the counter no longer holds the value just read.
+      counter.wait(counter.load(std::memory_order_relaxed));
+    }
+  }
+}
+BENCHMARK(BM_atomic_wait_one_thread_one_atomic_wait)->RangeMultiplier(2)->Range(1 << 10, 1 << 24);
+
+// Benchmarks many threads blocking in std::atomic::wait on the same atomic:
+// one notifier thread keeps bumping and notifying `a` while 15 waiter threads
+// each perform state.range(0) load-then-wait rounds per benchmark iteration.
+void BM_atomic_wait_multi_thread_one_atomic_wait(benchmark::State& state) {
+  std::atomic<std::uint64_t> a;
+  auto notify_func = [&](std::stop_token st) {
+    while (!st.stop_requested()) {
+      a.fetch_add(1, std::memory_order_relaxed);
+      a.notify_all();
+    }
+  };
+
+  std::uint64_t total_loop_test_param = state.range(0);
+  // Unsigned so that comparisons against size_t indices and done_count do not
+  // mix signedness.
+  constexpr std::uint64_t num_waiting_threads = 15;
+  std::vector<std::jthread> wait_threads;
+  wait_threads.reserve(num_waiting_threads);
+
+  auto notify_thread = support::make_test_jthread(notify_func);
+
+  // start_flag is bumped to release the waiters for one round; done_count is
+  // the number of waiters that have finished the current round.
+  std::atomic<std::uint64_t> start_flag = 0;
+  std::atomic<std::uint64_t> done_count = 0;
+  auto wait_func = [&a, &start_flag, &done_count, total_loop_test_param](std::stop_token st) {
+    // Same type as start_flag's value so the snapshot is never narrowed.
+    std::uint64_t old_start = 0;
+    while (!st.stop_requested()) {
+      start_flag.wait(old_start);
+      // Teardown bumps start_flag only to wake this thread up; do not run
+      // another full round of waits once stop has been requested.
+      if (st.stop_requested())
+        break;
+      old_start = start_flag.load();
+      for (std::uint64_t i = 0; i < total_loop_test_param; ++i) {
+        auto old = a.load(std::memory_order_relaxed);
+        a.wait(old);
+      }
+      done_count.fetch_add(1);
+    }
+  };
+
+  for (size_t i = 0; i < num_waiting_threads; ++i) {
+    wait_threads.emplace_back(support::make_test_jthread(wait_func));
+  }
+
+  for (auto _ : state) {
+    // One timed round: release every waiter, then spin until all report done.
+    done_count = 0;
+    start_flag.fetch_add(1);
+    start_flag.notify_all();
+    while (done_count < num_waiting_threads) {
+      std::this_thread::yield();
+    }
+  }
+  // Teardown: request stop first, then bump start_flag so that waiters blocked
+  // in start_flag.wait() wake up and observe the stop request.
+  for (auto& t : wait_threads) {
+    t.request_stop();
+  }
+  start_flag.fetch_add(1);
+  start_flag.notify_all();
+  for (auto& t : wait_threads) {
+    t.join();
+  }
+}
+BENCHMARK(BM_atomic_wait_multi_thread_one_atomic_wait)->RangeMultiplier(2)->Range(1 << 10, 1 << 20);
+
+// Benchmarks std::atomic::wait when every waiter blocks on its own atomic:
+// 7 notifier threads each keep bumping and notifying atomics[idx], and 7
+// waiter threads each perform state.range(0) load-then-wait rounds on
+// atomics[idx] per benchmark iteration.
+void BM_atomic_wait_multi_thread_wait_different_atomics(benchmark::State& state) {
+  const std::uint64_t total_loop_test_param = state.range(0);
+  constexpr std::uint64_t num_atomics       = 7;
+  std::vector<std::atomic<std::uint64_t>> atomics(num_atomics);
+
+  auto notify_func = [&](std::stop_token st, size_t idx) {
+    while (!st.stop_requested()) {
+      atomics[idx].fetch_add(1, std::memory_order_relaxed);
+      atomics[idx].notify_all();
+    }
+  };
+
+  // start_flag is bumped to release the waiters for one round; done_count is
+  // the number of waiters that have finished the current round.
+  std::atomic<std::uint64_t> start_flag = 0;
+  std::atomic<std::uint64_t> done_count = 0;
+
+  auto wait_func = [&, total_loop_test_param](std::stop_token st, size_t idx) {
+    // Same type as start_flag's value so the snapshot is never narrowed.
+    std::uint64_t old_start = 0;
+    while (!st.stop_requested()) {
+      start_flag.wait(old_start);
+      // Teardown bumps start_flag only to wake this thread up; do not run
+      // another full round of waits once stop has been requested.
+      if (st.stop_requested())
+        break;
+      old_start = start_flag.load();
+      for (std::uint64_t i = 0; i < total_loop_test_param; ++i) {
+        auto old = atomics[idx].load(std::memory_order_relaxed);
+        atomics[idx].wait(old);
+      }
+      done_count.fetch_add(1);
+    }
+  };
+
+  std::vector<std::jthread> notify_threads;
+  notify_threads.reserve(num_atomics);
+
+  std::vector<std::jthread> wait_threads;
+  wait_threads.reserve(num_atomics);
+
+  for (size_t i = 0; i < num_atomics; ++i) {
+    notify_threads.emplace_back(support::make_test_jthread(notify_func, i));
+  }
+
+  for (size_t i = 0; i < num_atomics; ++i) {
+    wait_threads.emplace_back(support::make_test_jthread(wait_func, i));
+  }
+
+  for (auto _ : state) {
+    // One timed round: release every waiter, then spin until all report done.
+    done_count = 0;
+    start_flag.fetch_add(1);
+    start_flag.notify_all();
+    while (done_count < num_atomics) {
+      std::this_thread::yield();
+    }
+  }
+  // Teardown: request stop first, then bump start_flag so that waiters blocked
+  // in start_flag.wait() wake up and observe the stop request.
+  for (auto& t : wait_threads) {
+    t.request_stop();
+  }
+  start_flag.fetch_add(1);
+  start_flag.notify_all();
+  for (auto& t : wait_threads) {
+    t.join();
+  }
+}
+BENCHMARK(BM_atomic_wait_multi_thread_wait_different_atomics)->RangeMultiplier(2)->Range(1 << 10, 1 << 20);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp b/libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
@@ -0,0 +1,109 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// To run this test, build libcxx and cxx-benchmarks targets
+// cd third-party/benchmark/tools
+// ./compare.py filters ../../../build/libcxx/benchmarks/atomic_wait_vs_mutex_lock.libcxx.out BM_atomic_wait BM_mutex
+
+#include <atomic>
+#include <mutex>
+#include <numeric>
+#include <thread>
+
+#include "benchmark/benchmark.h"
+#include "make_test_thread.h"
+
+using namespace std::chrono_literals;
+
+// Minimal scoped lock built on std::atomic<bool>::wait / notify_all; it is the
+// `atomic::wait` contender measured against std::unique_lock<std::mutex>.
+// The constructor acquires the lock and the destructor releases it.
+struct AtomicLock {
+  // Lock word supplied by the caller: true while the lock is held.
+  std::atomic<bool>& locked_;
+
+  AtomicLock(const AtomicLock&)            = delete;
+  AtomicLock& operator=(const AtomicLock&) = delete;
+
+  // explicit: construction blocks until the lock is acquired, so it must never
+  // happen through an implicit conversion from std::atomic<bool>&.
+  explicit AtomicLock(std::atomic<bool>& l) : locked_(l) { lock(); }
+  ~AtomicLock() { unlock(); }
+
+  // Blocks until this thread flips locked_ from false to true.
+  void lock() {
+    while (true) {
+      // Sleep for as long as the lock word reads true.
+      locked_.wait(true, std::memory_order_relaxed);
+      bool expected = false;
+      // Retry on failure: another thread may have taken the lock first, or the
+      // weak compare-exchange may have failed spuriously.
+      if (locked_.compare_exchange_weak(expected, true, std::memory_order_acquire, std::memory_order_relaxed))
+        break;
+    }
+  }
+
+  // Releases the lock and wakes every thread blocked in lock().
+  void unlock() {
+    locked_.store(false, std::memory_order_release);
+    locked_.notify_all();
+  }
+};
+
+// using LockState = std::atomic<bool>;
+// using Lock      = AtomicLock;
+
+// using LockState = std::mutex;
+// using Lock = std::unique_lock<std::mutex>;
+
+// Shared driver for the lock/unlock benchmarks: 15 threads each construct and
+// destroy a `Lock` on one shared `LockState` state.range(0) times per
+// benchmark iteration.
+//
+// LockState: the shared lock object; it is value-initialized once.
+// Lock:      a scoped guard constructible from `LockState&`; its construction
+//            and destruction are the operations under test.
+template <class LockState, class Lock>
+void test_multi_thread_lock_unlock(benchmark::State& state) {
+  std::uint64_t total_loop_test_param = state.range(0);
+  // Unsigned so that comparisons against size_t indices and done_count do not
+  // mix signedness.
+  constexpr std::uint64_t num_threads = 15;
+  std::vector<std::jthread> threads;
+  threads.reserve(num_threads);
+
+  // start_flag is bumped to release the workers for one round; done_count is
+  // the number of workers that have finished the current round.
+  std::atomic<std::uint64_t> start_flag = 0;
+  std::atomic<std::uint64_t> done_count = 0;
+
+  LockState lock_state{};
+
+  auto func = [&start_flag, &done_count, &lock_state, total_loop_test_param](std::stop_token st) {
+    // Same type as start_flag's value so the snapshot is never narrowed.
+    std::uint64_t old_start = 0;
+    while (!st.stop_requested()) {
+      start_flag.wait(old_start);
+      // Teardown bumps start_flag only to wake this thread up; do not run
+      // another full round of contended locking once stop has been requested.
+      if (st.stop_requested())
+        break;
+      old_start = start_flag.load();
+
+      // main things under test: locking and unlocking in the loop
+      for (std::uint64_t i = 0; i < total_loop_test_param; ++i) {
+        Lock l{lock_state};
+      }
+
+      done_count.fetch_add(1);
+    }
+  };
+
+  for (size_t i = 0; i < num_threads; ++i) {
+    threads.emplace_back(support::make_test_jthread(func));
+  }
+
+  for (auto _ : state) {
+    // One timed round: release every worker, then spin until all report done.
+    done_count = 0;
+    start_flag.fetch_add(1);
+    start_flag.notify_all();
+    while (done_count < num_threads) {
+      std::this_thread::yield();
+    }
+  }
+  // Teardown: request stop first, then bump start_flag so that workers blocked
+  // in start_flag.wait() wake up and observe the stop request.
+  for (auto& t : threads) {
+    t.request_stop();
+  }
+  start_flag.fetch_add(1);
+  start_flag.notify_all();
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
+// Runs the shared lock/unlock driver with a std::atomic<bool> lock word guarded
+// by AtomicLock, i.e. the atomic::wait-based lock.
+void BM_atomic_wait(benchmark::State& state) { test_multi_thread_lock_unlock<std::atomic<bool>, AtomicLock>(state); }
+BENCHMARK(BM_atomic_wait)->RangeMultiplier(2)->Range(1 << 10, 1 << 20);
+
+// Runs the shared lock/unlock driver with a std::mutex guarded by
+// std::unique_lock, as the comparison point for BM_atomic_wait.
+void BM_mutex(benchmark::State& state) {
+  test_multi_thread_lock_unlock<std::mutex, std::unique_lock<std::mutex>>(state);
+}
+BENCHMARK(BM_mutex)->RangeMultiplier(2)->Range(1 << 10, 1 << 20);
+
+BENCHMARK_MAIN();