[libc] Fix GPU benchmarking

jhuber6 · jhuber6 · commit de59e7b86cd3 · 2025-07-18T14:36:23.000-05:00
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -7,9 +7,9 @@
 #include "src/__support/GPU/utils.h"
 #include "src/__support/fixedvector.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/time/gpu/time_utils.h"
 #include "src/stdio/printf.h"
 #include "src/stdlib/srand.h"
-#include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace benchmarks {
diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
@@ -3,12 +3,8 @@
 #include "src/math/atan2.h"
 #include "src/stdlib/rand.h"
 
-#ifdef NVPTX_MATH_FOUND
-#include "src/math/nvptx/declarations.h"
-#endif
-
-#ifdef AMDGPU_MATH_FOUND
-#include "src/math/amdgpu/declarations.h"
+#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
+#include "platform.h"
 #endif
 
 #define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N)                      \
@@ -33,15 +29,15 @@ BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
 BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);
 
 #ifdef NVPTX_MATH_FOUND
-BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
-BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
-BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
-BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
+BENCH(double, NvAtan2, __nv_atan2, -1023, 1023);
+BENCH(double, NvAtan2TwoPi, __nv_atan2, -10, 3);
+BENCH(double, NvAtan2TwoPow30, __nv_atan2, 0, 30);
+BENCH(double, NvAtan2Large, __nv_atan2, 30, 1000);
 #endif
 
 #ifdef AMDGPU_MATH_FOUND
-BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
-BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
-BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
-BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
+BENCH(double, AmdAtan2, __ocml_atan2_f64, -1023, 1023);
+BENCH(double, AmdAtan2TwoPi, __ocml_atan2_f64, -10, 3);
+BENCH(double, AmdAtan2TwoPow30, __ocml_atan2_f64, 0, 30);
+BENCH(double, AmdAtan2Large, __ocml_atan2_f64, 30, 1000);
 #endif
diff --git a/libc/benchmarks/gpu/src/math/platform.h b/libc/benchmarks/gpu/src/math/platform.h
@@ -0,0 +1,57 @@
+//===-- GPU platform definitions for vendor math benchmarks ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
+#define LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+#ifdef LIBC_TARGET_ARCH_IS_AMDGPU
+// The ROCm device library uses control globals to alter codegen for the
+// different targets. To avoid needing to link them in manually we simply
+// define them here.
+extern "C" {
+extern const LIBC_INLINE_VAR uint8_t __oclc_unsafe_math_opt = 0;
+extern const LIBC_INLINE_VAR uint8_t __oclc_daz_opt = 0;
+extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1;
+extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0;
+extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000;
+}
+
+// These aliases cause clang to emit the control constants with ODR linkage.
+// This allows us to link against the symbols without preventing them from being
+// optimized out or causing symbol collisions.
+[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__;
+[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__;
+[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t
+    __oclc_correctly_rounded_sqrt32__;
+[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__;
+[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__;
+#endif
+} // namespace LIBC_NAMESPACE_DECL
+
+// Forward declarations for the vendor math libraries.
+extern "C" {
+#ifdef AMDGPU_MATH_FOUND
+double __ocml_sin_f64(double);
+float __ocml_sin_f32(float);
+double __ocml_atan2_f64(double, double);
+float __ocml_atan2_f32(float, float);
+#endif
+
+#ifdef NVPTX_MATH_FOUND
+double __nv_sin(double);
+float __nv_sinf(float);
+double __nv_atan2(double, double);
+float __nv_atan2f(float, float);
+#endif
+}
+
+#endif // LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -8,12 +8,8 @@
 #include "src/math/sinf.h"
 #include "src/stdlib/rand.h"
 
-#ifdef NVPTX_MATH_FOUND
-#include "src/math/nvptx/declarations.h"
-#endif
-
-#ifdef AMDGPU_MATH_FOUND
-#include "src/math/amdgpu/declarations.h"
+#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
+#include "platform.h"
 #endif
 
 // BENCHMARK() expects a function that with no parameters that returns a
@@ -42,17 +38,17 @@ BENCH(double, SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
 BENCH(double, SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
 
 #ifdef NVPTX_MATH_FOUND
-BENCH(double, NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
-BENCH(double, NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
-BENCH(double, NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
-BENCH(double, NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
+BENCH(double, NvSin, __nv_sin, -1023, 1023);
+BENCH(double, NvSinTwoPi, __nv_sin, -10, 3);
+BENCH(double, NvSinTwoPow30, __nv_sin, 0, 30);
+BENCH(double, NvSinVeryLarge, __nv_sin, 30, 1000);
 #endif
 
 #ifdef AMDGPU_MATH_FOUND
-BENCH(double, AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
-BENCH(double, AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
-BENCH(double, AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
-BENCH(double, AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
+BENCH(double, AmdSin, __ocml_sin_f64, -1023, 1023);
+BENCH(double, AmdSinTwoPi, __ocml_sin_f64, -10, 3);
+BENCH(double, AmdSinTwoPow30, __ocml_sin_f64, 0, 30);
+BENCH(double, AmdSinVeryLarge, __ocml_sin_f64, 30, 1000);
 #endif
 
 BENCH(float, Sinf, LIBC_NAMESPACE::sinf, -127, 128);
@@ -61,15 +57,15 @@ BENCH(float, SinfTwoPow30, LIBC_NAMESPACE::sinf, 0, 30);
 BENCH(float, SinfVeryLarge, LIBC_NAMESPACE::sinf, 30, 120);
 
 #ifdef NVPTX_MATH_FOUND
-BENCH(float, NvSinf, LIBC_NAMESPACE::__nv_sinf, -127, 128);
-BENCH(float, NvSinfTwoPi, LIBC_NAMESPACE::__nv_sinf, -10, 3);
-BENCH(float, NvSinfTwoPow30, LIBC_NAMESPACE::__nv_sinf, 0, 30);
-BENCH(float, NvSinfVeryLarge, LIBC_NAMESPACE::__nv_sinf, 30, 120);
+BENCH(float, NvSinf, __nv_sinf, -127, 128);
+BENCH(float, NvSinfTwoPi, __nv_sinf, -10, 3);
+BENCH(float, NvSinfTwoPow30, __nv_sinf, 0, 30);
+BENCH(float, NvSinfVeryLarge, __nv_sinf, 30, 120);
 #endif
 
 #ifdef AMDGPU_MATH_FOUND
-BENCH(float, AmdSinf, LIBC_NAMESPACE::__ocml_sin_f32, -127, 128);
-BENCH(float, AmdSinfTwoPi, LIBC_NAMESPACE::__ocml_sin_f32, -10, 3);
-BENCH(float, AmdSinfTwoPow30, LIBC_NAMESPACE::__ocml_sin_f32, 0, 30);
-BENCH(float, AmdSinfVeryLarge, LIBC_NAMESPACE::__ocml_sin_f32, 30, 120);
+BENCH(float, AmdSinf, __ocml_sin_f32, -127, 128);
+BENCH(float, AmdSinfTwoPi, __ocml_sin_f32, -10, 3);
+BENCH(float, AmdSinfTwoPow30, __ocml_sin_f32, 0, 30);
+BENCH(float, AmdSinfVeryLarge, __ocml_sin_f32, 30, 120);
 #endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 
 #include "src/__support/CPP/array.h"
+#include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/common.h"
@@ -24,7 +25,7 @@ namespace LIBC_NAMESPACE_DECL {
 // allows us to substract the constant-time overhead from the latency to
 // obtain a true result. This can vary with system load.
 [[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
   uint32_t result = 0.0;
   asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
@@ -44,13 +45,13 @@ template <typename F, typename T>
   T arg = storage;
 
   // The AMDGPU architecture needs to wait on pending results.
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   // Get the current timestamp from the clock.
   uint64_t start = gpu::processor_clock();
 
   // This forces the compiler to load the input argument and run the clock
   // cycle counter before the profiling region.
-  asm("" ::"s"(start));
+  asm("" : "+v"(arg) : "s"(start));
 
   // Run the function under test and return its value.
   auto result = f(arg);
@@ -71,7 +72,7 @@ template <typename F, typename T>
   // ordering.
   uint64_t stop = gpu::processor_clock();
   asm("" ::"s"(stop));
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   // Return the time elapsed.
   return stop - start;
@@ -84,7 +85,7 @@ template <typename F, typename T1, typename T2>
   T1 arg1 = storage1;
   T2 arg2 = storage2;
 
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"s"(start));
@@ -100,7 +101,7 @@ template <typename F, typename T1, typename T2>
 
   uint64_t stop = gpu::processor_clock();
   asm("" ::"s"(stop));
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   return stop - start;
 }
@@ -111,7 +112,7 @@ template <typename F, typename T, size_t N>
 throughput(F f, const cpp::array<T, N> &inputs) {
   asm("" ::"v"(&inputs));
 
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"s"(start));
@@ -124,7 +125,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
 
   uint64_t stop = gpu::processor_clock();
   asm("" ::"s"(stop));
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   // Return the time elapsed.
   return stop - start;
@@ -136,7 +137,7 @@ template <typename F, typename T, size_t N>
     F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
   asm("" ::"v"(&inputs1), "v"(&inputs2));
 
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"s"(start));
@@ -149,7 +150,7 @@ template <typename F, typename T, size_t N>
 
   uint64_t stop = gpu::processor_clock();
   asm("" ::"s"(stop));
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   // Return the time elapsed.
   return stop - start;
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
 
 #include "src/__support/CPP/array.h"
+#include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/common.h"
@@ -46,7 +47,7 @@ template <typename F, typename T>
   T arg = storage;
 
   // Get the current timestamp from the clock.
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   // This forces the compiler to load the input argument and run the clock cycle
@@ -63,7 +64,7 @@ template <typename F, typename T>
   // Obtain the current timestamp after running the calculation and force
   // ordering.
   uint64_t stop = gpu::processor_clock();
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
   volatile T output = result;
 
@@ -78,7 +79,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   T1 arg = storage;
   T2 arg2 = storage2;
 
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"llr"(start));
@@ -88,7 +89,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
 
   uint64_t stop = gpu::processor_clock();
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
   volatile auto output = result;
 
@@ -101,7 +102,7 @@ template <typename F, typename T, size_t N>
 throughput(F f, const cpp::array<T, N> &inputs) {
   asm("" ::"r"(&inputs));
 
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"llr"(start));
@@ -114,7 +115,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
   }
 
   uint64_t stop = gpu::processor_clock();
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
   volatile auto output = result;
 
@@ -128,7 +129,7 @@ template <typename F, typename T, size_t N>
     F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
   asm("" ::"r"(&inputs1), "r"(&inputs2));
 
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
 
   asm("" ::"llr"(start));
@@ -140,7 +141,7 @@ template <typename F, typename T, size_t N>
   }
 
   uint64_t stop = gpu::processor_clock();
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
   volatile auto output = result;