Skip to content

Commit f36f86f

Browse files
Fix random input generation
1 parent 6ccc76e commit f36f86f

File tree

5 files changed

+98
-49
lines changed

5 files changed

+98
-49
lines changed

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "LibcGpuBenchmark.h"
2+
#include "hdr/stdint_proxy.h"
23
#include "src/__support/CPP/algorithm.h"
34
#include "src/__support/CPP/array.h"
45
#include "src/__support/CPP/atomic.h"
@@ -160,8 +161,9 @@ void Benchmark::run_benchmarks() {
160161
gpu::sync_threads();
161162
}
162163

163-
BenchmarkResult benchmark(const BenchmarkOptions &options,
164-
cpp::function<uint64_t(void)> wrapper_func) {
164+
BenchmarkResult
165+
benchmark(const BenchmarkOptions &options,
166+
const cpp::function<uint64_t(uint32_t)> &wrapper_func) {
165167
BenchmarkResult result;
166168
RuntimeEstimationProgression rep;
167169
uint32_t total_iterations = 0;
@@ -181,11 +183,13 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
181183
for (int i = 0; i < overhead_iterations; i++)
182184
overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
183185

186+
uint32_t call_index = 0;
187+
184188
for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
185189
uint64_t sample_cycles = 0;
186190
const clock_t start = static_cast<double>(clock());
187191
for (uint32_t i = 0; i < iterations; i++) {
188-
auto wrapper_intermediate = wrapper_func();
192+
auto wrapper_intermediate = wrapper_func(call_index++);
189193
uint64_t current_result = wrapper_intermediate - overhead;
190194
max = cpp::max(max, current_result);
191195
min = cpp::min(min, current_result);
@@ -223,7 +227,7 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
223227
result.total_iterations = total_iterations;
224228
result.total_time = total_time / total_iterations;
225229
return result;
226-
};
230+
}
227231

228232
} // namespace benchmarks
229233
} // namespace LIBC_NAMESPACE_DECL

libc/benchmarks/gpu/LibcGpuBenchmark.h

Lines changed: 82 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "benchmarks/gpu/BenchmarkLogger.h"
55
#include "benchmarks/gpu/timing/timing.h"
66
#include "hdr/stdint_proxy.h"
7+
#include "src/__support/CPP/algorithm.h"
78
#include "src/__support/CPP/array.h"
89
#include "src/__support/CPP/functional.h"
910
#include "src/__support/CPP/limits.h"
@@ -77,17 +78,18 @@ struct BenchmarkResult {
7778
clock_t total_time = 0;
7879
};
7980

80-
BenchmarkResult benchmark(const BenchmarkOptions &options,
81-
cpp::function<uint64_t(void)> wrapper_func);
81+
BenchmarkResult
82+
benchmark(const BenchmarkOptions &options,
83+
const cpp::function<uint64_t(uint32_t)> &wrapper_func);
8284

8385
class Benchmark {
84-
const cpp::function<uint64_t(void)> func;
86+
const cpp::function<uint64_t(uint32_t)> func;
8587
const cpp::string_view suite_name;
8688
const cpp::string_view test_name;
8789
const uint32_t num_threads;
8890

8991
public:
90-
Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
92+
Benchmark(cpp::function<uint64_t(uint32_t)> func, char const *suite_name,
9193
char const *test_name, uint32_t num_threads)
9294
: func(func), suite_name(suite_name), test_name(test_name),
9395
num_threads(num_threads) {
@@ -111,7 +113,7 @@ class Benchmark {
111113
class RandomGenerator {
112114
uint64_t state;
113115

114-
static inline uint64_t splitmix64(uint64_t x) noexcept {
116+
static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept {
115117
x += 0x9E3779B97F4A7C15ULL;
116118
x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
117119
x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
@@ -120,10 +122,10 @@ class RandomGenerator {
120122
}
121123

122124
public:
123-
explicit inline RandomGenerator(uint64_t seed) noexcept
125+
explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept
124126
: state(splitmix64(seed)) {}
125127

126-
inline uint64_t next64() noexcept {
128+
LIBC_INLINE uint64_t next64() noexcept {
127129
uint64_t x = state;
128130
x ^= x >> 12;
129131
x ^= x << 25;
@@ -132,52 +134,86 @@ class RandomGenerator {
132134
return x * 0x2545F4914F6CDD1DULL;
133135
}
134136

135-
inline uint32_t next32() noexcept {
137+
LIBC_INLINE uint32_t next32() noexcept {
136138
return static_cast<uint32_t>(next64() >> 32);
137139
}
138140
};
139141

140-
// We want our random values to be approximately
141-
// Output: a random number with the exponent field between min_exp and max_exp,
142-
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
143-
// Caveats:
144-
// -EXP_BIAS corresponding to denormal values,
145-
// EXP_BIAS + 1 corresponding to inf or nan.
142+
// We want random floating-point values whose *unbiased* exponent e is
143+
// approximately uniform in [min_exp, max_exp]. That is,
144+
// 2^min_exp <= |value| < 2^(max_exp + 1).
145+
// Caveats / boundaries:
146+
// - e = -EXP_BIAS ==> subnormal range (biased exponent = 0). We ensure a
147+
// non-zero mantissa so we don't accidentally produce 0.
148+
// - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers.
149+
// - e = EXP_BIAS + 1 ==> Inf/NaN. We do not include it by default; max_exp
150+
// defaults to EXP_BIAS.
146151
template <typename T>
147152
static T
148153
get_rand_input(RandomGenerator &rng,
149-
int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
150-
int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
154+
int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
155+
int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
151156
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
152-
153-
// Required to correctly instantiate FPBits for floats and doubles.
154-
using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
155-
uint64_t, uint32_t>;
156-
RandType bits;
157-
if constexpr (cpp::is_same_v<T, uint64_t>)
158-
bits = rng.next64();
159-
else
160-
bits = rng.next32();
161-
double scale =
162-
static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
163-
FPBits fp(bits);
164-
fp.set_biased_exponent(
165-
static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
166-
return fp.get_val();
157+
using Storage = typename FPBits::StorageType;
158+
159+
// Sanitize and clamp requested range to what the format supports
160+
if (min_exp > max_exp) {
161+
auto tmp = min_exp;
162+
min_exp = max_exp;
163+
max_exp = tmp;
164+
};
165+
min_exp = cpp::max(min_exp, -FPBits::EXP_BIAS);
166+
max_exp = cpp::min(max_exp, FPBits::EXP_BIAS);
167+
168+
// Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo
169+
// bias
170+
auto sample_in_range = [&](uint64_t r) -> int32_t {
171+
const uint64_t range = static_cast<uint64_t>(
172+
static_cast<int64_t>(max_exp) - static_cast<int64_t>(min_exp) + 1);
173+
const uint64_t threshold = (-range) % range;
174+
while (r < threshold)
175+
r = rng.next64();
176+
return static_cast<int32_t>(min_exp + static_cast<int64_t>(r % range));
177+
};
178+
const int32_t e = sample_in_range(rng.next64());
179+
180+
// Start from random bits to get random sign and mantissa
181+
FPBits xbits([&] {
182+
if constexpr (cpp::is_same_v<T, double>)
183+
return FPBits(rng.next64());
184+
else
185+
return FPBits(rng.next32());
186+
}());
187+
188+
if (e == -FPBits::EXP_BIAS) {
189+
// Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0
190+
xbits.set_biased_exponent(Storage(0));
191+
if (xbits.get_mantissa() == Storage(0))
192+
xbits.set_mantissa(Storage(1));
193+
} else {
194+
// Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS]
195+
const int32_t biased = e + FPBits::EXP_BIAS;
196+
xbits.set_biased_exponent(static_cast<Storage>(biased));
197+
}
198+
return xbits.get_val();
167199
}
168200

169201
template <typename T> class MathPerf {
170-
using FPBits = fputil::FPBits<T>;
171-
using StorageType = typename FPBits::StorageType;
172-
static constexpr StorageType UIntMax =
173-
cpp::numeric_limits<StorageType>::max();
202+
static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) {
203+
const uint64_t tid = gpu::get_thread_id();
204+
return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL);
205+
}
174206

175207
public:
176208
template <size_t N = 1>
177209
static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
178-
uint64_t seed = N) {
210+
uint32_t call_index) {
179211
cpp::array<T, N> inputs;
180-
RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
212+
213+
uint64_t base_seed = static_cast<uint64_t>(call_index);
214+
uint64_t salt = static_cast<uint64_t>(N);
215+
RandomGenerator rng(make_seed(base_seed, salt));
216+
181217
for (size_t i = 0; i < N; ++i)
182218
inputs[i] = get_rand_input<T>(rng, min_exp, max_exp);
183219

@@ -186,14 +222,18 @@ template <typename T> class MathPerf {
186222
return total_time / N;
187223
}
188224

189-
// Throughput benchmarking for functions that take 2 inputs.
190225
template <size_t N = 1>
191226
static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
192227
int arg1_max_exp, int arg2_min_exp,
193-
int arg2_max_exp, uint64_t seed = N) {
228+
int arg2_max_exp,
229+
uint32_t call_index) {
194230
cpp::array<T, N> inputs1;
195231
cpp::array<T, N> inputs2;
196-
RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
232+
233+
uint64_t base_seed = static_cast<uint64_t>(call_index);
234+
uint64_t salt = static_cast<uint64_t>(N);
235+
RandomGenerator rng(make_seed(base_seed, salt));
236+
197237
for (size_t i = 0; i < N; ++i) {
198238
inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp);
199239
inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp);
@@ -224,4 +264,5 @@ template <typename T> class MathPerf {
224264
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
225265
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
226266
LIBC_NAMESPACE::gpu::get_lane_size())
227-
#endif
267+
268+
#endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H

libc/benchmarks/gpu/src/ctype/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ add_benchmark(
77
SRCS
88
isalnum_benchmark.cpp
99
DEPENDS
10+
libc.hdr.stdint_proxy
1011
libc.src.ctype.isalnum
1112
LOADER_ARGS
1213
--threads 64
@@ -19,5 +20,6 @@ add_benchmark(
1920
SRCS
2021
isalpha_benchmark.cpp
2122
DEPENDS
23+
libc.hdr.stdint_proxy
2224
libc.src.ctype.isalpha
2325
)

libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
#include "benchmarks/gpu/LibcGpuBenchmark.h"
22

3+
#include "hdr/stdint_proxy.h"
34
#include "src/ctype/isalnum.h"
45

5-
uint64_t BM_IsAlnum() {
6+
uint64_t BM_IsAlnum(uint32_t /*call_index*/) {
67
char x = 'c';
78
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
89
}
@@ -12,13 +13,13 @@ SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread,
1213
SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave,
1314
BM_IsAlnum);
1415

15-
uint64_t BM_IsAlnumCapital() {
16+
uint64_t BM_IsAlnumCapital(uint32_t /*call_index*/) {
1617
char x = 'A';
1718
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
1819
}
1920
BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumCapital, BM_IsAlnumCapital);
2021

21-
uint64_t BM_IsAlnumNotAlnum() {
22+
uint64_t BM_IsAlnumNotAlnum(uint32_t /*call_index*/) {
2223
char x = '{';
2324
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
2425
}

libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
#include "benchmarks/gpu/LibcGpuBenchmark.h"
22

3+
#include "hdr/stdint_proxy.h"
34
#include "src/ctype/isalpha.h"
45

5-
uint64_t BM_IsAlpha() {
6+
uint64_t BM_IsAlpha(uint32_t /*call_index*/) {
67
char x = 'c';
78
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
89
}

0 commit comments

Comments
 (0)