44#include " benchmarks/gpu/BenchmarkLogger.h"
55#include " benchmarks/gpu/timing/timing.h"
66#include " hdr/stdint_proxy.h"
7+ #include " src/__support/CPP/algorithm.h"
78#include " src/__support/CPP/array.h"
89#include " src/__support/CPP/functional.h"
910#include " src/__support/CPP/limits.h"
@@ -77,17 +78,18 @@ struct BenchmarkResult {
7778 clock_t total_time = 0 ;
7879};
7980
80- BenchmarkResult benchmark (const BenchmarkOptions &options,
81- cpp::function<uint64_t (void )> wrapper_func);
81+ BenchmarkResult
82+ benchmark (const BenchmarkOptions &options,
83+ const cpp::function<uint64_t (uint32_t )> &wrapper_func);
8284
8385class Benchmark {
84- const cpp::function<uint64_t (void )> func;
86+ const cpp::function<uint64_t (uint32_t )> func;
8587 const cpp::string_view suite_name;
8688 const cpp::string_view test_name;
8789 const uint32_t num_threads;
8890
8991public:
90- Benchmark (cpp::function<uint64_t (void )> func, char const *suite_name,
92+ Benchmark (cpp::function<uint64_t (uint32_t )> func, char const *suite_name,
9193 char const *test_name, uint32_t num_threads)
9294 : func(func), suite_name(suite_name), test_name(test_name),
9395 num_threads (num_threads) {
@@ -111,7 +113,7 @@ class Benchmark {
111113class RandomGenerator {
112114 uint64_t state;
113115
114- static inline uint64_t splitmix64 (uint64_t x) noexcept {
116+ static LIBC_INLINE uint64_t splitmix64 (uint64_t x) noexcept {
115117 x += 0x9E3779B97F4A7C15ULL ;
116118 x = (x ^ (x >> 30 )) * 0xBF58476D1CE4E5B9ULL ;
117119 x = (x ^ (x >> 27 )) * 0x94D049BB133111EBULL ;
@@ -120,10 +122,10 @@ class RandomGenerator {
120122 }
121123
122124public:
123- explicit inline RandomGenerator (uint64_t seed) noexcept
125+ explicit LIBC_INLINE RandomGenerator (uint64_t seed) noexcept
124126 : state(splitmix64(seed)) {}
125127
126- inline uint64_t next64 () noexcept {
128+ LIBC_INLINE uint64_t next64 () noexcept {
127129 uint64_t x = state;
128130 x ^= x >> 12 ;
129131 x ^= x << 25 ;
@@ -132,52 +134,86 @@ class RandomGenerator {
132134 return x * 0x2545F4914F6CDD1DULL ;
133135 }
134136
135- inline uint32_t next32 () noexcept {
137+ LIBC_INLINE uint32_t next32 () noexcept {
136138 return static_cast <uint32_t >(next64 () >> 32 );
137139 }
138140};
139141
140- // We want our random values to be approximately
141- // Output: a random number with the exponent field between min_exp and max_exp,
142- // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
143- // Caveats:
144- // -EXP_BIAS corresponding to denormal values,
145- // EXP_BIAS + 1 corresponding to inf or nan.
142+ // We want random floating-point values whose *unbiased* exponent e is
143+ // approximately uniform in [min_exp, max_exp]. That is,
144+ // 2^min_exp <= |value| < 2^(max_exp + 1).
145+ // Caveats / boundaries:
146+ // - e = -EXP_BIAS ==> subnormal range (biased exponent = 0). We ensure a
147+ // non-zero mantissa so we don't accidentally produce 0.
148+ // - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers.
149+ // - e = EXP_BIAS + 1 ==> Inf/NaN. We do not include it by default; max_exp
150+ // defaults to EXP_BIAS.
146151template <typename T>
147152static T
148153get_rand_input (RandomGenerator &rng,
149- int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
150- int min_exp = - LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
154+ int min_exp = - LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
155+ int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
151156 using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
152-
153- // Required to correctly instantiate FPBits for floats and doubles.
154- using RandType = typename cpp::conditional_t <(cpp::is_same_v<T, double >),
155- uint64_t , uint32_t >;
156- RandType bits;
157- if constexpr (cpp::is_same_v<T, uint64_t >)
158- bits = rng.next64 ();
159- else
160- bits = rng.next32 ();
161- double scale =
162- static_cast <double >(max_exp - min_exp + 1 ) / (2 * FPBits::EXP_BIAS + 1 );
163- FPBits fp (bits);
164- fp.set_biased_exponent (
165- static_cast <uint32_t >(fp.get_biased_exponent () * scale + min_exp));
166- return fp.get_val ();
157+ using Storage = typename FPBits::StorageType;
158+
159+ // Sanitize and clamp requested range to what the format supports
160+ if (min_exp > max_exp) {
161+ auto tmp = min_exp;
162+ min_exp = max_exp;
163+ max_exp = tmp;
164+ };
165+ min_exp = cpp::max (min_exp, -FPBits::EXP_BIAS);
166+ max_exp = cpp::min (max_exp, FPBits::EXP_BIAS);
167+
168+ // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo
169+ // bias
170+ auto sample_in_range = [&](uint64_t r) -> int32_t {
171+ const uint64_t range = static_cast <uint64_t >(
172+ static_cast <int64_t >(max_exp) - static_cast <int64_t >(min_exp) + 1 );
173+ const uint64_t threshold = (-range) % range;
174+ while (r < threshold)
175+ r = rng.next64 ();
176+ return static_cast <int32_t >(min_exp + static_cast <int64_t >(r % range));
177+ };
178+ const int32_t e = sample_in_range (rng.next64 ());
179+
180+ // Start from random bits to get random sign and mantissa
181+ FPBits xbits ([&] {
182+ if constexpr (cpp::is_same_v<T, double >)
183+ return FPBits (rng.next64 ());
184+ else
185+ return FPBits (rng.next32 ());
186+ }());
187+
188+ if (e == -FPBits::EXP_BIAS) {
189+ // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0
190+ xbits.set_biased_exponent (Storage (0 ));
191+ if (xbits.get_mantissa () == Storage (0 ))
192+ xbits.set_mantissa (Storage (1 ));
193+ } else {
194+ // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS]
195+ const int32_t biased = e + FPBits::EXP_BIAS;
196+ xbits.set_biased_exponent (static_cast <Storage>(biased));
197+ }
198+ return xbits.get_val ();
167199}
168200
169201template <typename T> class MathPerf {
170- using FPBits = fputil::FPBits<T>;
171- using StorageType = typename FPBits::StorageType ;
172- static constexpr StorageType UIntMax =
173- cpp::numeric_limits<StorageType>::max();
202+ static LIBC_INLINE uint64_t make_seed ( uint64_t base_seed, uint64_t salt) {
203+ const uint64_t tid = gpu::get_thread_id () ;
204+ return base_seed ^ (salt << 32 ) ^ (tid * 0x9E3779B97F4A7C15ULL );
205+ }
174206
175207public:
176208 template <size_t N = 1 >
177209 static uint64_t run_throughput_in_range (T f (T), int min_exp, int max_exp,
178- uint64_t seed = N ) {
210+ uint32_t call_index ) {
179211 cpp::array<T, N> inputs;
180- RandomGenerator rng ((seed << 32 ) ^ gpu::get_thread_id ());
212+
213+ uint64_t base_seed = static_cast <uint64_t >(call_index);
214+ uint64_t salt = static_cast <uint64_t >(N);
215+ RandomGenerator rng (make_seed (base_seed, salt));
216+
181217 for (size_t i = 0 ; i < N; ++i)
182218 inputs[i] = get_rand_input<T>(rng, min_exp, max_exp);
183219
@@ -186,14 +222,18 @@ template <typename T> class MathPerf {
186222 return total_time / N;
187223 }
188224
189- // Throughput benchmarking for functions that take 2 inputs.
190225 template <size_t N = 1 >
191226 static uint64_t run_throughput_in_range (T f (T, T), int arg1_min_exp,
192227 int arg1_max_exp, int arg2_min_exp,
193- int arg2_max_exp, uint64_t seed = N) {
228+ int arg2_max_exp,
229+ uint32_t call_index) {
194230 cpp::array<T, N> inputs1;
195231 cpp::array<T, N> inputs2;
196- RandomGenerator rng ((seed << 32 ) ^ gpu::get_thread_id ());
232+
233+ uint64_t base_seed = static_cast <uint64_t >(call_index);
234+ uint64_t salt = static_cast <uint64_t >(N);
235+ RandomGenerator rng (make_seed (base_seed, salt));
236+
197237 for (size_t i = 0 ; i < N; ++i) {
198238 inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp);
199239 inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp);
@@ -224,4 +264,5 @@ template <typename T> class MathPerf {
224264#define SINGLE_WAVE_BENCHMARK (SuiteName, TestName, Func ) \
225265 BENCHMARK_N_THREADS (SuiteName, TestName, Func, \
226266 LIBC_NAMESPACE::gpu::get_lane_size ())
227- #endif
267+
268+ #endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
0 commit comments