llvm · jhuber6 · Aug 28, 2025 · Aug 27, 2025
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
@@ -40,6 +40,7 @@ add_unittest_framework_library(
     LibcGpuBenchmarkMain.cpp
   HDRS
     LibcGpuBenchmark.h
+    Random.h
   DEPENDS
     libc.benchmarks.gpu.timing.timing
     libc.hdr.stdint_proxy
@@ -49,12 +50,17 @@ add_unittest_framework_library(
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.atomic
     libc.src.__support.CPP.array
+    libc.src.__support.CPP.optional
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.nearest_integer_operations
     libc.src.__support.FPUtil.sqrt
+    libc.src.__support.sign
     libc.src.__support.fixedvector
     libc.src.__support.GPU.utils
     libc.src.__support.time.gpu.time_utils
+    libc.src.__support.macros.attributes
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
     libc.src.stdio.printf
     libc.src.time.clock
 )

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -1,6 +1,8 @@
 #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
 #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
 
+#include "benchmarks/gpu/Random.h"
+
 #include "benchmarks/gpu/timing/timing.h"
 
 #include "hdr/stdint_proxy.h"
@@ -175,94 +177,6 @@ class Benchmark {
   }
 };
 
-class RandomGenerator {
-  uint64_t state;
-
-  static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept {
-    x += 0x9E3779B97F4A7C15ULL;
-    x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
-    x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
-    x = (x ^ (x >> 31));
-    return x ? x : 0x9E3779B97F4A7C15ULL;
-  }
-
-public:
-  explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept
-      : state(splitmix64(seed)) {}
-
-  LIBC_INLINE uint64_t next64() noexcept {
-    uint64_t x = state;
-    x ^= x >> 12;
-    x ^= x << 25;
-    x ^= x >> 27;
-    state = x;
-    return x * 0x2545F4914F6CDD1DULL;
-  }
-
-  LIBC_INLINE uint32_t next32() noexcept {
-    return static_cast<uint32_t>(next64() >> 32);
-  }
-};
-
-// We want random floating-point values whose *unbiased* exponent e is
-// approximately uniform in [min_exp, max_exp]. That is,
-//   2^min_exp <= |value| < 2^(max_exp + 1).
-// Caveats / boundaries:
-// - e = -EXP_BIAS  ==> subnormal range (biased exponent = 0). We ensure a
-//                      non-zero mantissa so we don't accidentally produce 0.
-// - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers.
-// - e = EXP_BIAS + 1 ==> Inf/NaN. We do not include it by default; max_exp
-//                        defaults to EXP_BIAS.
-template <typename T>
-static T
-get_rand_input(RandomGenerator &rng,
-               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
-               int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
-  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
-  using Storage = typename FPBits::StorageType;
-
-  // Sanitize and clamp requested range to what the format supports
-  if (min_exp > max_exp) {
-    auto tmp = min_exp;
-    min_exp = max_exp;
-    max_exp = tmp;
-  };
-  min_exp = cpp::max(min_exp, -FPBits::EXP_BIAS);
-  max_exp = cpp::min(max_exp, FPBits::EXP_BIAS);
-
-  // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo
-  // bias
-  auto sample_in_range = [&](uint64_t r) -> int32_t {
-    const uint64_t range = static_cast<uint64_t>(
-        static_cast<int64_t>(max_exp) - static_cast<int64_t>(min_exp) + 1);
-    const uint64_t threshold = (-range) % range;
-    while (r < threshold)
-      r = rng.next64();
-    return static_cast<int32_t>(min_exp + static_cast<int64_t>(r % range));
-  };
-  const int32_t e = sample_in_range(rng.next64());
-
-  // Start from random bits to get random sign and mantissa
-  FPBits xbits([&] {
-    if constexpr (cpp::is_same_v<T, double>)
-      return FPBits(rng.next64());
-    else
-      return FPBits(rng.next32());
-  }());
-
-  if (e == -FPBits::EXP_BIAS) {
-    // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0
-    xbits.set_biased_exponent(Storage(0));
-    if (xbits.get_mantissa() == Storage(0))
-      xbits.set_mantissa(Storage(1));
-  } else {
-    // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS]
-    const int32_t biased = e + FPBits::EXP_BIAS;
-    xbits.set_biased_exponent(static_cast<Storage>(biased));
-  }
-  return xbits.get_val();
-}
-
 template <typename T> class MathPerf {
   static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) {
     const uint64_t tid = gpu::get_thread_id();
@@ -271,29 +185,27 @@ template <typename T> class MathPerf {
 
 public:
   // Returns cycles-per-call (lower is better)
-  template <size_t N = 1>
-  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
-                                          uint32_t call_index) {
+  template <size_t N = 1, typename Dist>
+  static uint64_t run_throughput(T (*f)(T), const Dist &dist,
+                                 uint32_t call_index) {
     cpp::array<T, N> inputs;
 
     uint64_t base_seed = static_cast<uint64_t>(call_index);
     uint64_t salt = static_cast<uint64_t>(N);
     RandomGenerator rng(make_seed(base_seed, salt));
 
     for (size_t i = 0; i < N; ++i)
-      inputs[i] = get_rand_input<T>(rng, min_exp, max_exp);
+      inputs[i] = dist(rng);
 
     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
 
     return total_time / N;
   }
 
   // Returns cycles-per-call (lower is better)
-  template <size_t N = 1>
-  static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
-                                          int arg1_max_exp, int arg2_min_exp,
-                                          int arg2_max_exp,
-                                          uint32_t call_index) {
+  template <size_t N = 1, typename Dist1, typename Dist2>
+  static uint64_t run_throughput(T (*f)(T, T), const Dist1 &dist1,
+                                 const Dist2 &dist2, uint32_t call_index) {
     cpp::array<T, N> inputs1;
     cpp::array<T, N> inputs2;
 
@@ -302,8 +214,8 @@ template <typename T> class MathPerf {
     RandomGenerator rng(make_seed(base_seed, salt));
 
     for (size_t i = 0; i < N; ++i) {
-      inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp);
-      inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp);
+      inputs1[i] = dist1(rng);
+      inputs2[i] = dist2(rng);
     }
 
     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);

diff --git a/libc/benchmarks/gpu/Random.h b/libc/benchmarks/gpu/Random.h
@@ -0,0 +1,190 @@
+//===-- Pseudo-random number generation utilities ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_BENCHMARKS_GPU_RANDOM_H
+#define LLVM_LIBC_BENCHMARKS_GPU_RANDOM_H
+
+#include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+#include "src/__support/sign.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace benchmarks {
+
+// Pseudo-random number generator (PRNG) producing unsigned 64-, 32-, and 16-bit
+// integers: a 64-bit xorshift* generator seeded by one SplitMix64 step (a zero
+// result is remapped so the initial state is nonzero). next32()/next16() return
+// the high bits of next64(). See https://en.wikipedia.org/wiki/Xorshift
+class RandomGenerator {
+  uint64_t state;
+
+  static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept {
+    x += 0x9E3779B97F4A7C15ULL;
+    x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
+    x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
+    x = (x ^ (x >> 31));
+    return x ? x : 0x9E3779B97F4A7C15ULL;
+  }
+
+public:
+  explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept
+      : state(splitmix64(seed)) {}
+
+  LIBC_INLINE uint64_t next64() noexcept {
+    uint64_t x = state;
+    x ^= x >> 12;
+    x ^= x << 25;
+    x ^= x >> 27;
+    state = x;
+    return x * 0x2545F4914F6CDD1DULL;
+  }
+
+  LIBC_INLINE uint32_t next32() noexcept {
+    return static_cast<uint32_t>(next64() >> 32);
+  }
+
+  LIBC_INLINE uint16_t next16() noexcept {
+    return static_cast<uint16_t>(next64() >> 48);
+  }
+};
+
+// Generates random floating-point numbers whose unbiased binary exponent is
+// sampled uniformly in `[min_exp, max_exp]` (bounds are reordered if swapped
+// and clamped to `[-EXP_BIAS, EXP_BIAS]`, so Inf/NaN are never produced).
+// Significand bits are random; the sign is random unless `forced_sign` is set.
+template <typename T> class UniformExponent {
+  static_assert(cpp::is_same_v<T, float16> || cpp::is_same_v<T, float> ||
+                    cpp::is_same_v<T, double>,
+                "UniformExponent supports float16, float, and double");
+
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
+  using Storage = typename FPBits::StorageType;
+
+public:
+  explicit UniformExponent(int min_exp = -FPBits::EXP_BIAS,
+                           int max_exp = FPBits::EXP_BIAS,
+                           cpp::optional<Sign> forced_sign = cpp::nullopt)
+      : min_exp(clamp_exponent(cpp::min(min_exp, max_exp))),
+        max_exp(clamp_exponent(cpp::max(min_exp, max_exp))),
+        forced_sign(forced_sign) {}
+
+  LIBC_INLINE T operator()(RandomGenerator &rng) const noexcept {
+    // Draw e uniformly from [min_exp, max_exp] by rejection sampling: redraw
+    // while r < 2^64 mod range so that `r % range` carries no modulo bias
+    auto sample_in_range = [&](uint64_t r) -> int32_t {
+      const uint64_t range = static_cast<uint64_t>(
+          static_cast<int64_t>(max_exp) - static_cast<int64_t>(min_exp) + 1);
+      const uint64_t threshold = (-range) % range;
+      while (r < threshold)
+        r = rng.next64();
+      return static_cast<int32_t>(min_exp + static_cast<int64_t>(r % range));
+    };
+    const int32_t e = sample_in_range(rng.next64());
+
+    // Random bits supply sign and mantissa; the exponent is set below
+    FPBits xbits([&] {
+      if constexpr (cpp::is_same_v<T, double>)
+        return FPBits(rng.next64());
+      else if constexpr (cpp::is_same_v<T, float>)
+        return FPBits(rng.next32());
+      else
+        return FPBits(rng.next16());
+    }());
+
+    if (e == -FPBits::EXP_BIAS) {
+      // Subnormal: biased exponent is 0; force a nonzero mantissa to avoid +-0
+      xbits.set_biased_exponent(Storage(0));
+      if (xbits.get_mantissa() == Storage(0))
+        xbits.set_mantissa(Storage(1));
+    } else {
+      // Normal: biased exponent lands in [1, 2 * FPBits::EXP_BIAS]
+      const int32_t biased = e + FPBits::EXP_BIAS;
+      xbits.set_biased_exponent(static_cast<Storage>(biased));
+    }
+
+    if (forced_sign)
+      xbits.set_sign(*forced_sign);
+
+    return xbits.get_val();
+  }
+
+private:
+  static LIBC_INLINE int clamp_exponent(int val) noexcept {
+    if (val < -FPBits::EXP_BIAS)
+      return -FPBits::EXP_BIAS;
+
+    if (val > FPBits::EXP_BIAS)
+      return FPBits::EXP_BIAS;
+
+    return val;
+  }
+
+  const int min_exp;
+  const int max_exp;
+  const cpp::optional<Sign> forced_sign;
+};
+
+// Generates random floating-point numbers that are uniformly distributed on
+// a linear scale over `[min_val, max_val]`. Bounds are reordered if swapped and
+// clamped to the finite range; rounding to `T` can return `max_val` itself.
+template <typename T> class UniformLinear {
+  static_assert(cpp::is_same_v<T, float16> || cpp::is_same_v<T, float> ||
+                    cpp::is_same_v<T, double>,
+                "UniformLinear supports float16, float, and double");
+
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
+  static constexpr T MAX_NORMAL = FPBits::max_normal().get_val();
+
+public:
+  explicit UniformLinear(T min_val = -MAX_NORMAL, T max_val = MAX_NORMAL)
+      : min_val(clamp_val(cpp::min(min_val, max_val))),
+        max_val(clamp_val(cpp::max(min_val, max_val))) {}
+
+  LIBC_INLINE T operator()(RandomGenerator &rng) const noexcept {
+    const double u = standard_uniform(rng.next64());
+    const double a = static_cast<double>(min_val);
+    const double b = static_cast<double>(max_val);
+    // Convex combination instead of `a + (b - a) * u`: `b - a` overflows to
+    // infinity for wide double ranges (the defaults). Clamp rounding excess.
+    const double y = a * (1.0 - u) + b * u;
+    return static_cast<T>(y < a ? a : (y > b ? b : y));
+  }
+
+private:
+  // Limits `val` to the finite range `[-MAX_NORMAL, MAX_NORMAL]`.
+  static LIBC_INLINE T clamp_val(T val) noexcept {
+    if (val < -MAX_NORMAL)
+      return -MAX_NORMAL;
+    if (val > MAX_NORMAL)
+      return MAX_NORMAL;
+    return val;
+  }
+
+  // Scales the high `64 - EXP_LEN` bits of `x` into a double in `[0, 1)`.
+  static LIBC_INLINE double standard_uniform(uint64_t x) noexcept {
+    constexpr int PREC_BITS =
+        LIBC_NAMESPACE::fputil::FPBits<double>::SIG_LEN + 1;
+    constexpr int SHIFT_BITS = LIBC_NAMESPACE::fputil::FPBits<double>::EXP_LEN;
+    constexpr double INV = 1.0 / static_cast<double>(1ULL << PREC_BITS);
+    return static_cast<double>(x >> SHIFT_BITS) * INV;
+  }
+
+  const T min_val;
+  const T max_val;
+};
+
+} // namespace benchmarks
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif