Fix random input generation

leandrolcampos · leandrolcampos · commit f36f86fd807b · 2025-08-12T13:04:55.000-03:00
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,4 +1,5 @@
 #include "LibcGpuBenchmark.h"
+#include "hdr/stdint_proxy.h"
 #include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
@@ -160,8 +161,9 @@ void Benchmark::run_benchmarks() {
   gpu::sync_threads();
 }
 
-BenchmarkResult benchmark(const BenchmarkOptions &options,
-                          cpp::function<uint64_t(void)> wrapper_func) {
+BenchmarkResult
+benchmark(const BenchmarkOptions &options,
+          const cpp::function<uint64_t(uint32_t)> &wrapper_func) {
   BenchmarkResult result;
   RuntimeEstimationProgression rep;
   uint32_t total_iterations = 0;
@@ -181,11 +183,13 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
   for (int i = 0; i < overhead_iterations; i++)
     overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
 
+  uint32_t call_index = 0;
+
   for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
     uint64_t sample_cycles = 0;
     const clock_t start = static_cast<double>(clock());
     for (uint32_t i = 0; i < iterations; i++) {
-      auto wrapper_intermediate = wrapper_func();
+      auto wrapper_intermediate = wrapper_func(call_index++);
       uint64_t current_result = wrapper_intermediate - overhead;
       max = cpp::max(max, current_result);
       min = cpp::min(min, current_result);
@@ -223,7 +227,7 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
   result.total_iterations = total_iterations;
   result.total_time = total_time / total_iterations;
   return result;
-};
+}
 
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -4,6 +4,7 @@
 #include "benchmarks/gpu/BenchmarkLogger.h"
 #include "benchmarks/gpu/timing/timing.h"
 #include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
@@ -77,17 +78,18 @@ struct BenchmarkResult {
   clock_t total_time = 0;
 };
 
-BenchmarkResult benchmark(const BenchmarkOptions &options,
-                          cpp::function<uint64_t(void)> wrapper_func);
+BenchmarkResult
+benchmark(const BenchmarkOptions &options,
+          const cpp::function<uint64_t(uint32_t)> &wrapper_func);
 
 class Benchmark {
-  const cpp::function<uint64_t(void)> func;
+  const cpp::function<uint64_t(uint32_t)> func;
   const cpp::string_view suite_name;
   const cpp::string_view test_name;
   const uint32_t num_threads;
 
 public:
-  Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
+  Benchmark(cpp::function<uint64_t(uint32_t)> func, char const *suite_name,
             char const *test_name, uint32_t num_threads)
       : func(func), suite_name(suite_name), test_name(test_name),
         num_threads(num_threads) {
@@ -111,7 +113,7 @@ class Benchmark {
 class RandomGenerator {
   uint64_t state;
 
-  static inline uint64_t splitmix64(uint64_t x) noexcept {
+  static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept {
     x += 0x9E3779B97F4A7C15ULL;
     x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
     x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
@@ -120,10 +122,10 @@ class RandomGenerator {
   }
 
 public:
-  explicit inline RandomGenerator(uint64_t seed) noexcept
+  explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept
       : state(splitmix64(seed)) {}
 
-  inline uint64_t next64() noexcept {
+  LIBC_INLINE uint64_t next64() noexcept {
     uint64_t x = state;
     x ^= x >> 12;
     x ^= x << 25;
@@ -132,52 +134,86 @@ class RandomGenerator {
     return x * 0x2545F4914F6CDD1DULL;
   }
 
-  inline uint32_t next32() noexcept {
+  LIBC_INLINE uint32_t next32() noexcept {
     return static_cast<uint32_t>(next64() >> 32);
   }
 };
 
-// We want our random values to be approximately
-// Output: a random number with the exponent field between min_exp and max_exp,
-// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
-// Caveats:
-//   -EXP_BIAS corresponding to denormal values,
-//   EXP_BIAS + 1 corresponding to inf or nan.
+// We want random floating-point values whose *unbiased* exponent e is
+// approximately uniform in [min_exp, max_exp]. That is,
+//   2^min_exp <= |value| < 2^(max_exp + 1).
+// Caveats / boundaries:
+// - e = -EXP_BIAS  ==> subnormal range (biased exponent = 0). We ensure a
+//                      non-zero mantissa so we don't accidentally produce 0.
+// - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers.
+// - e = EXP_BIAS + 1 ==> Inf/NaN. We do not include it by default; max_exp
+//                        defaults to EXP_BIAS.
 template <typename T>
 static T
 get_rand_input(RandomGenerator &rng,
-               int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
-               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
+               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+               int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
-
-  // Required to correctly instantiate FPBits for floats and doubles.
-  using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
-                                               uint64_t, uint32_t>;
-  RandType bits;
-  if constexpr (cpp::is_same_v<T, uint64_t>)
-    bits = rng.next64();
-  else
-    bits = rng.next32();
-  double scale =
-      static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
-  FPBits fp(bits);
-  fp.set_biased_exponent(
-      static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
-  return fp.get_val();
+  using Storage = typename FPBits::StorageType;
+
+  // Sanitize and clamp requested range to what the format supports
+  if (min_exp > max_exp) {
+    auto tmp = min_exp;
+    min_exp = max_exp;
+    max_exp = tmp;
+  };
+  min_exp = cpp::max(min_exp, -FPBits::EXP_BIAS);
+  max_exp = cpp::min(max_exp, FPBits::EXP_BIAS);
+
+  // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo
+  // bias
+  auto sample_in_range = [&](uint64_t r) -> int32_t {
+    const uint64_t range = static_cast<uint64_t>(
+        static_cast<int64_t>(max_exp) - static_cast<int64_t>(min_exp) + 1);
+    const uint64_t threshold = (-range) % range;
+    while (r < threshold)
+      r = rng.next64();
+    return static_cast<int32_t>(min_exp + static_cast<int64_t>(r % range));
+  };
+  const int32_t e = sample_in_range(rng.next64());
+
+  // Start from random bits to get random sign and mantissa
+  FPBits xbits([&] {
+    if constexpr (cpp::is_same_v<T, double>)
+      return FPBits(rng.next64());
+    else
+      return FPBits(rng.next32());
+  }());
+
+  if (e == -FPBits::EXP_BIAS) {
+    // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0
+    xbits.set_biased_exponent(Storage(0));
+    if (xbits.get_mantissa() == Storage(0))
+      xbits.set_mantissa(Storage(1));
+  } else {
+    // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS]
+    const int32_t biased = e + FPBits::EXP_BIAS;
+    xbits.set_biased_exponent(static_cast<Storage>(biased));
+  }
+  return xbits.get_val();
 }
 
 template <typename T> class MathPerf {
-  using FPBits = fputil::FPBits<T>;
-  using StorageType = typename FPBits::StorageType;
-  static constexpr StorageType UIntMax =
-      cpp::numeric_limits<StorageType>::max();
+  static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) {
+    const uint64_t tid = gpu::get_thread_id();
+    return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL);
+  }
 
 public:
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
-                                          uint64_t seed = N) {
+                                          uint32_t call_index) {
     cpp::array<T, N> inputs;
-    RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
+
+    uint64_t base_seed = static_cast<uint64_t>(call_index);
+    uint64_t salt = static_cast<uint64_t>(N);
+    RandomGenerator rng(make_seed(base_seed, salt));
+
     for (size_t i = 0; i < N; ++i)
       inputs[i] = get_rand_input<T>(rng, min_exp, max_exp);
 
@@ -186,14 +222,18 @@ template <typename T> class MathPerf {
     return total_time / N;
   }
 
-  // Throughput benchmarking for functions that take 2 inputs.
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                           int arg1_max_exp, int arg2_min_exp,
-                                          int arg2_max_exp, uint64_t seed = N) {
+                                          int arg2_max_exp,
+                                          uint32_t call_index) {
     cpp::array<T, N> inputs1;
     cpp::array<T, N> inputs2;
-    RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
+
+    uint64_t base_seed = static_cast<uint64_t>(call_index);
+    uint64_t salt = static_cast<uint64_t>(N);
+    RandomGenerator rng(make_seed(base_seed, salt));
+
     for (size_t i = 0; i < N; ++i) {
       inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp);
       inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp);
@@ -224,4 +264,5 @@ template <typename T> class MathPerf {
 #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
   BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                       LIBC_NAMESPACE::gpu::get_lane_size())
-#endif
+
+#endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -7,6 +7,7 @@ add_benchmark(
   SRCS
     isalnum_benchmark.cpp
   DEPENDS
+    libc.hdr.stdint_proxy
     libc.src.ctype.isalnum
   LOADER_ARGS
     --threads 64
@@ -19,5 +20,6 @@ add_benchmark(
   SRCS
     isalpha_benchmark.cpp
   DEPENDS
+    libc.hdr.stdint_proxy
     libc.src.ctype.isalpha
 )
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -1,8 +1,9 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
+#include "hdr/stdint_proxy.h"
 #include "src/ctype/isalnum.h"
 
-uint64_t BM_IsAlnum() {
+uint64_t BM_IsAlnum(uint32_t /*call_index*/) {
   char x = 'c';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
@@ -12,13 +13,13 @@ SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread,
 SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave,
                       BM_IsAlnum);
 
-uint64_t BM_IsAlnumCapital() {
+uint64_t BM_IsAlnumCapital(uint32_t /*call_index*/) {
   char x = 'A';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
 BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumCapital, BM_IsAlnumCapital);
 
-uint64_t BM_IsAlnumNotAlnum() {
+uint64_t BM_IsAlnumNotAlnum(uint32_t /*call_index*/) {
   char x = '{';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -1,8 +1,9 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
+#include "hdr/stdint_proxy.h"
 #include "src/ctype/isalpha.h"
 
-uint64_t BM_IsAlpha() {
+uint64_t BM_IsAlpha(uint32_t /*call_index*/) {
   char x = 'c';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
 }

Original file line number	Diff line number	Diff line change
`@@ -1,8 +1,9 @@`
`1`	`1`	`#include "benchmarks/gpu/LibcGpuBenchmark.h"`
`2`	`2`
	`3`	`+#include "hdr/stdint_proxy.h"`
`3`	`4`	`#include "src/ctype/isalpha.h"`
`4`	`5`
`5`		`-uint64_t BM_IsAlpha() {`
	`6`	`+uint64_t BM_IsAlpha(uint32_t /call_index/) {`
`6`	`7`	`char x = 'c';`
`7`	`8`	`return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);`
`8`	`9`	`}`