Fix throughput overhead

leandrolcampos · leandrolcampos · commit 6e60a3d946f2 · 2025-08-13T15:38:44.000-03:00
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -176,21 +176,14 @@ benchmark(const BenchmarkOptions &options,
   uint64_t min = UINT64_MAX;
   uint64_t max = 0;
 
-  uint64_t overhead = UINT64_MAX;
-  int overhead_iterations = 10;
-  for (int i = 0; i < overhead_iterations; i++)
-    overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
-
   uint32_t call_index = 0;
 
   for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
     RefinableRuntimeEstimator sample_estimator;
 
     const clock_t start = clock();
     while (sample_estimator.get_iterations() < iterations) {
-      auto wrapper_intermediate = wrapper_func(call_index++);
-      uint64_t current_result =
-          wrapper_intermediate < overhead ? 0 : wrapper_intermediate - overhead;
+      auto current_result = wrapper_func(call_index++);
       max = cpp::max(max, current_result);
       min = cpp::min(min, current_result);
       sample_estimator.update(current_result);
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -248,6 +248,7 @@ template <typename T> class MathPerf {
   }
 
 public:
+  // Returns cycles-per-call (lower is better)
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
                                           uint32_t call_index) {
@@ -265,6 +266,7 @@ template <typename T> class MathPerf {
     return total_time / N;
   }
 
+  // Returns cycles-per-call (lower is better)
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                           int arg1_max_exp, int arg2_min_exp,
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
     libc.src.__support.common
     libc.src.__support.macros.config
     libc.src.__support.macros.attributes
-    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.array
+    libc.src.__support.CPP.type_traits
 )
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 
 #include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
@@ -105,54 +106,131 @@ template <typename F, typename T1, typename T2>
   return stop - start;
 }
 
-// Provides throughput benchmarking.
-template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t
-throughput(F f, const cpp::array<T, N> &inputs) {
+// Provides the *baseline* for throughput: times the same loop and clock reads
+// as throughput(), but without calling the function f.
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t
+throughput_baseline(const cpp::array<T, N> &inputs) {
   asm("" ::"v"(&inputs));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"s"(start));
 
+  T result{};
   for (auto input : inputs) {
-    auto result = f(input);
+    asm("" ::"v"(input));
+    result = input;
+    asm("" ::"v"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"s"(stop));
+
+  volatile auto output = result;
+  (void)output;
+
+  return stop - start;
+}
+
+// Provides throughput benchmarking
+template <typename F, typename T, size_t N>
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
+
+  asm("" ::"v"(&inputs));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
 
+  asm("" ::"s"(start));
+
+  T result{};
+  for (auto input : inputs) {
+    asm("" ::"v"(input));
+    result = f(input);
     asm("" ::"v"(result));
   }
 
   uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"s"(stop));
+
+  volatile auto output = result;
+  (void)output;
+
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
+}
+
+// Provides the *baseline* for throughput with 2 arguments: times the same loop
+// and clock reads as throughput(), but without calling the function f.
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t throughput_baseline(
+    const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"v"(&inputs1), "v"(&inputs2));
+
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"s"(start));
+
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"v"(x), "v"(y));
+    result = x;
+    asm("" ::"v"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"s"(stop));
+
+  volatile auto output = result;
+  (void)output;
 
-  // Return the time elapsed.
   return stop - start;
 }
 
 // Provides throughput benchmarking for 2 arguments (e.g. atan2())
 template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
-    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
+                                       const cpp::array<T, N> &inputs2) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
+
   asm("" ::"v"(&inputs1), "v"(&inputs2));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"s"(start));
 
-  for (size_t i = 0; i < inputs1.size(); i++) {
-    auto result = f(inputs1[i], inputs2[i]);
-
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"v"(x), "v"(y));
+    result = f(x, y);
     asm("" ::"v"(result));
   }
 
   uint64_t stop = gpu::processor_clock();
-  asm("" ::"s"(stop));
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"s"(stop));
 
-  // Return the time elapsed.
-  return stop - start;
+  volatile auto output = result;
+  (void)output;
+
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
     libc.src.__support.common
     libc.src.__support.macros.config
     libc.src.__support.macros.attributes
-    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.array
+    libc.src.__support.CPP.type_traits
 )
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
 
 #include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
@@ -95,18 +96,50 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   return stop - start;
 }
 
-// Provides throughput benchmarking.
+// Provides the *baseline* for throughput: times the same loop and clock reads
+// as throughput(), but without calling the function f.
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t
+throughput_baseline(const cpp::array<T, N> &inputs) {
+  asm("" ::"r"(&inputs));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"llr"(start));
+
+  T result{};
+  for (auto input : inputs) {
+    asm("" ::"r"(input));
+    result = input;
+    asm("" ::"r"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"r"(stop));
+
+  volatile auto output = result;
+  (void)output;
+
+  return stop - start;
+}
+
+// Provides throughput benchmarking
 template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t
-throughput(F f, const cpp::array<T, N> &inputs) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
+
   asm("" ::"r"(&inputs));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"llr"(start));
 
-  uint64_t result;
+  T result{};
   for (auto input : inputs) {
     asm("" ::"r"(input));
     result = f(input);
@@ -116,37 +149,80 @@ throughput(F f, const cpp::array<T, N> &inputs) {
   uint64_t stop = gpu::processor_clock();
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
+
   volatile auto output = result;
+  (void)output;
+
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
+}
+
+// Provides the *baseline* for throughput with 2 arguments: times the same loop
+// and clock reads as throughput(), but without calling the function f.
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t throughput_baseline(
+    const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"r"(&inputs1), "r"(&inputs2));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"llr"(start));
+
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"r"(x), "r"(y));
+    result = x;
+    asm("" ::"r"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"r"(stop));
+
+  volatile auto output = result;
+  (void)output;
 
-  // Return the time elapsed.
   return stop - start;
 }
 
 // Provides throughput benchmarking for 2 arguments (e.g. atan2())
 template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
-    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
+                                       const cpp::array<T, N> &inputs2) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
+
   asm("" ::"r"(&inputs1), "r"(&inputs2));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"llr"(start));
 
-  uint64_t result;
-  for (size_t i = 0; i < inputs1.size(); i++) {
-    result = f(inputs1[i], inputs2[i]);
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"r"(x), "r"(y));
+    result = f(x, y);
     asm("" ::"r"(result));
   }
 
   uint64_t stop = gpu::processor_clock();
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
+
   volatile auto output = result;
+  (void)output;
 
-  // Return the time elapsed.
-  return stop - start;
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
 }
+
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

Original file line number	Diff line number	Diff line change
`@@ -248,6 +248,7 @@ template <typename T> class MathPerf {`
`248`	`248`	`}`
`249`	`249`
`250`	`250`	`public:`
	`251`	`+ // Returns cycles-per-call (lower is better)`
`251`	`252`	`template <size_t N = 1>`
`252`	`253`	`static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,`
`253`	`254`	`uint32_t call_index) {`
`@@ -265,6 +266,7 @@ template <typename T> class MathPerf {`
`265`	`266`	`return total_time / N;`
`266`	`267`	`}`
`267`	`268`
	`269`	`+ // Returns cycles-per-call (lower is better)`
`268`	`270`	`template <size_t N = 1>`
`269`	`271`	`static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,`
`270`	`272`	`int arg1_max_exp, int arg2_min_exp,`
Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@ add_header_library(`
`7`	`7`	`libc.src.__support.common`
`8`	`8`	`libc.src.__support.macros.config`
`9`	`9`	`libc.src.__support.macros.attributes`
`10`		`- libc.src.__support.CPP.type_traits`
	`10`	`+ libc.src.__support.CPP.algorithm`
`11`	`11`	`libc.src.__support.CPP.array`
	`12`	`+ libc.src.__support.CPP.type_traits`
`12`	`13`	`)`