diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 6ec64bf270b53..beedac78d4826 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -22,8 +22,6 @@ function(add_benchmark benchmark_name) ${BENCHMARK_LINK_LIBRARIES} DEPENDS libc.src.stdio.printf - libc.src.stdlib.srand - libc.src.stdlib.rand ${BENCHMARK_DEPENDS} ${BENCHMARK_UNPARSED_ARGUMENTS} COMPILE_OPTIONS @@ -51,7 +49,6 @@ add_unittest_framework_library( libc.src.__support.CPP.string libc.src.__support.CPP.string_view libc.src.__support.CPP.type_traits - libc.src.__support.CPP.functional libc.src.__support.CPP.limits libc.src.__support.CPP.algorithm libc.src.__support.CPP.atomic @@ -64,8 +61,6 @@ add_unittest_framework_library( libc.src.__support.FPUtil.sqrt libc.src.__support.fixedvector libc.src.time.clock - libc.src.stdlib.rand - libc.src.stdlib.srand libc.benchmarks.gpu.timing.timing libc.src.stdio.printf ) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 57ff5b9fdb846..ef816c51a87d7 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -1,15 +1,17 @@ #include "LibcGpuBenchmark.h" + +#include "hdr/stdint_proxy.h" #include "src/__support/CPP/algorithm.h" #include "src/__support/CPP/array.h" #include "src/__support/CPP/atomic.h" #include "src/__support/CPP/string.h" +#include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/sqrt.h" #include "src/__support/GPU/utils.h" #include "src/__support/fixedvector.h" #include "src/__support/macros/config.h" #include "src/__support/time/gpu/time_utils.h" #include "src/stdio/printf.h" -#include "src/stdlib/srand.h" namespace LIBC_NAMESPACE_DECL { namespace benchmarks { @@ -20,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) { benchmarks.push_back(benchmark); } +static void atomic_add_double(cpp::Atomic &atomic_bits, + double value) { + using FPBits = LIBC_NAMESPACE::fputil::FPBits; + + uint64_t expected_bits = atomic_bits.load(cpp::MemoryOrder::RELAXED); + + while (true) { + double current_value = FPBits(expected_bits).get_val(); + double next_value = current_value + value; + + uint64_t desired_bits = FPBits(next_value).uintval(); + if (atomic_bits.compare_exchange_strong(expected_bits, desired_bits, + cpp::MemoryOrder::ACQUIRE, + cpp::MemoryOrder::RELAXED)) + break; + } +} + struct AtomicBenchmarkSums { - cpp::Atomic cycles_sum = 0; - cpp::Atomic standard_deviation_sum = 0; + cpp::Atomic active_threads = 0; + cpp::Atomic iterations_sum = 0; + cpp::Atomic weighted_cycles_sum_bits = 0; + cpp::Atomic weighted_squared_cycles_sum_bits = 0; cpp::Atomic min = UINT64_MAX; cpp::Atomic max = 0; - cpp::Atomic samples_sum = 0; - cpp::Atomic iterations_sum = 0; - cpp::Atomic time_sum = 0; - cpp::Atomic active_threads = 0; void reset() { cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); active_threads.store(0, cpp::MemoryOrder::RELAXED); - cycles_sum.store(0, cpp::MemoryOrder::RELAXED); - standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED); + iterations_sum.store(0, cpp::MemoryOrder::RELAXED); + weighted_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED); + weighted_squared_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED); min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED); max.store(0, cpp::MemoryOrder::RELAXED); - samples_sum.store(0, cpp::MemoryOrder::RELAXED); - iterations_sum.store(0, cpp::MemoryOrder::RELAXED); - time_sum.store(0, cpp::MemoryOrder::RELAXED); cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); } void update(const BenchmarkResult &result) { cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED); + iterations_sum.fetch_add(result.total_iterations, + cpp::MemoryOrder::RELAXED); - cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED); - standard_deviation_sum.fetch_add( - static_cast(result.standard_deviation), - cpp::MemoryOrder::RELAXED); + const double n_i = static_cast(result.total_iterations); + const double mean_i = result.cycles; + const double stddev_i = result.standard_deviation; + const double variance_i = stddev_i * stddev_i; + atomic_add_double(weighted_cycles_sum_bits, n_i * mean_i); + atomic_add_double(weighted_squared_cycles_sum_bits, + n_i * (variance_i + mean_i * mean_i)); // Perform a CAS loop to atomically update the min uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED); @@ -66,10 +87,6 @@ struct AtomicBenchmarkSums { cpp::MemoryOrder::RELAXED)) ; - samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED); - iterations_sum.fetch_add(result.total_iterations, - cpp::MemoryOrder::RELAXED); - time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED); cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); } }; @@ -79,46 +96,49 @@ constexpr auto GREEN = "\033[32m"; constexpr auto RESET = "\033[0m"; void print_results(Benchmark *b) { - BenchmarkResult result; + using FPBits = LIBC_NAMESPACE::fputil::FPBits; + + BenchmarkResult final_result; cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); - int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED); - result.cycles = - all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; - result.standard_deviation = - all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) / - num_threads; - result.min = all_results.min.load(cpp::MemoryOrder::RELAXED); - result.max = all_results.max.load(cpp::MemoryOrder::RELAXED); - result.samples = - all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; - result.total_iterations = - all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; - const uint64_t duration_ns = - all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; - const uint64_t duration_us = duration_ns / 1000; - const uint64_t duration_ms = duration_ns / (1000 * 1000); - uint64_t converted_duration = duration_ns; - const char *time_unit; - if (duration_ms != 0) { - converted_duration = duration_ms; - time_unit = "ms"; - } else if (duration_us != 0) { - converted_duration = duration_us; - time_unit = "us"; + + const uint32_t num_threads = + all_results.active_threads.load(cpp::MemoryOrder::RELAXED); + final_result.total_iterations = + all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED); + + if (final_result.total_iterations > 0) { + const uint64_t s1_bits = + all_results.weighted_cycles_sum_bits.load(cpp::MemoryOrder::RELAXED); + const uint64_t s2_bits = all_results.weighted_squared_cycles_sum_bits.load( + cpp::MemoryOrder::RELAXED); + + const double S1 = FPBits(s1_bits).get_val(); + const double S2 = FPBits(s2_bits).get_val(); + const double N = static_cast(final_result.total_iterations); + + const double global_mean = S1 / N; + const double global_mean_of_squares = S2 / N; + const double global_variance = + global_mean_of_squares - (global_mean * global_mean); + + final_result.cycles = global_mean; + final_result.standard_deviation = + fputil::sqrt(global_variance < 0.0 ? 0.0 : global_variance); } else { - converted_duration = duration_ns; - time_unit = "ns"; + final_result.cycles = 0.0; + final_result.standard_deviation = 0.0; } - result.total_time = converted_duration; - // result.total_time = - // all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; + + final_result.min = all_results.min.load(cpp::MemoryOrder::RELAXED); + final_result.max = all_results.max.load(cpp::MemoryOrder::RELAXED); cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); LIBC_NAMESPACE::printf( - "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n", - b->get_test_name().data(), result.cycles, result.min, result.max, - result.total_iterations, result.total_time, time_unit, - static_cast(result.standard_deviation), num_threads); + "%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n", + b->get_test_name().data(), final_result.cycles, + final_result.standard_deviation, (unsigned long long)final_result.min, + (unsigned long long)final_result.max, + (unsigned long long)final_result.total_iterations, (unsigned)num_threads); } void print_header() { @@ -126,9 +146,8 @@ void print_header() { LIBC_NAMESPACE::printf("Running Suite: %-10s\n", benchmarks[0]->get_suite_name().data()); LIBC_NAMESPACE::printf("%s", RESET); - cpp::string titles = - "Benchmark | Cycles | Min | Max | " - "Iterations | Time / Iteration | Stddev | Threads |\n"; + cpp::string titles = "Benchmark | Cycles (Mean) | Stddev | " + " Min | Max | Iterations | Threads |\n"; LIBC_NAMESPACE::printf(titles.data()); cpp::string separator(titles.size(), '-'); @@ -139,10 +158,8 @@ void print_header() { void Benchmark::run_benchmarks() { uint64_t id = gpu::get_thread_id(); - if (id == 0) { + if (id == 0) print_header(); - LIBC_NAMESPACE::srand(gpu::processor_clock()); - } gpu::sync_threads(); @@ -164,69 +181,63 @@ void Benchmark::run_benchmarks() { } BenchmarkResult benchmark(const BenchmarkOptions &options, - cpp::function wrapper_func) { + const BenchmarkTarget &target) { BenchmarkResult result; RuntimeEstimationProgression rep; - uint32_t total_iterations = 0; uint32_t iterations = options.initial_iterations; + if (iterations < 1u) iterations = 1; uint32_t samples = 0; uint64_t total_time = 0; - uint64_t best_guess = 0; - uint64_t cycles_squared = 0; uint64_t min = UINT64_MAX; uint64_t max = 0; - uint64_t overhead = UINT64_MAX; - int overhead_iterations = 10; - for (int i = 0; i < overhead_iterations; i++) - overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead()); + uint32_t call_index = 0; for (int64_t time_budget = options.max_duration; time_budget >= 0;) { - uint64_t sample_cycles = 0; - const clock_t start = static_cast(clock()); - for (uint32_t i = 0; i < iterations; i++) { - auto wrapper_intermediate = wrapper_func(); - uint64_t current_result = wrapper_intermediate - overhead; + RefinableRuntimeEstimator sample_estimator; + + const clock_t start = clock(); + while (sample_estimator.get_iterations() < iterations) { + auto current_result = target(call_index++); max = cpp::max(max, current_result); min = cpp::min(min, current_result); - sample_cycles += current_result; + sample_estimator.update(current_result); } const clock_t end = clock(); + const clock_t duration_ns = ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC; total_time += duration_ns; time_budget -= duration_ns; samples++; - cycles_squared += sample_cycles * sample_cycles; - total_iterations += iterations; - const double change_ratio = - rep.compute_improvement({iterations, sample_cycles}); - best_guess = rep.current_estimation; + const double change_ratio = rep.compute_improvement(sample_estimator); if (samples >= options.max_samples || iterations >= options.max_iterations) break; + + const auto total_iterations = rep.get_estimator().get_iterations(); + if (total_time >= options.min_duration && samples >= options.min_samples && total_iterations >= options.min_iterations && change_ratio < options.epsilon) break; - iterations *= options.scaling_factor; + iterations = static_cast(iterations * options.scaling_factor); } - result.cycles = best_guess; - result.standard_deviation = fputil::sqrt( - static_cast(cycles_squared) / total_iterations - - static_cast(best_guess * best_guess)); + + const auto &estimator = rep.get_estimator(); + result.total_iterations = estimator.get_iterations(); + result.cycles = estimator.get_mean(); + result.standard_deviation = estimator.get_stddev(); result.min = min; result.max = max; - result.samples = samples; - result.total_iterations = total_iterations; - result.total_time = total_time / total_iterations; + return result; -}; +} } // namespace benchmarks } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index a6cf62dd30ce5..60f69edf86556 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -4,14 +4,14 @@ #include "benchmarks/gpu/BenchmarkLogger.h" #include "benchmarks/gpu/timing/timing.h" #include "hdr/stdint_proxy.h" +#include "src/__support/CPP/algorithm.h" #include "src/__support/CPP/array.h" -#include "src/__support/CPP/functional.h" #include "src/__support/CPP/limits.h" #include "src/__support/CPP/string_view.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/sqrt.h" #include "src/__support/macros/config.h" -#include "src/stdlib/rand.h" #include "src/time/clock.h" namespace LIBC_NAMESPACE_DECL { @@ -30,68 +30,136 @@ struct BenchmarkOptions { double scaling_factor = 1.4; }; -struct Measurement { +class RefinableRuntimeEstimator { uint32_t iterations = 0; - uint64_t elapsed_cycles = 0; -}; - -class RefinableRuntimeEstimation { - uint64_t total_cycles = 0; - uint32_t total_iterations = 0; + uint64_t sum_of_cycles = 0; + uint64_t sum_of_squared_cycles = 0; public: - uint64_t update(const Measurement &M) { - total_cycles += M.elapsed_cycles; - total_iterations += M.iterations; - return total_cycles / total_iterations; + void update(uint64_t cycles) noexcept { + iterations += 1; + sum_of_cycles += cycles; + sum_of_squared_cycles += cycles * cycles; + } + + void update(const RefinableRuntimeEstimator &other) noexcept { + iterations += other.iterations; + sum_of_cycles += other.sum_of_cycles; + sum_of_squared_cycles += other.sum_of_squared_cycles; + } + + double get_mean() const noexcept { + if (iterations == 0) + return 0.0; + + return static_cast(sum_of_cycles) / iterations; + } + + double get_variance() const noexcept { + if (iterations == 0) + return 0.0; + + const double num = static_cast(iterations); + const double sum_x = static_cast(sum_of_cycles); + const double sum_x2 = static_cast(sum_of_squared_cycles); + + const double mean_of_squares = sum_x2 / num; + const double mean = sum_x / num; + const double mean_squared = mean * mean; + const double variance = mean_of_squares - mean_squared; + + return variance < 0.0 ? 0.0 : variance; } + + double get_stddev() const noexcept { + return fputil::sqrt(get_variance()); + } + + uint32_t get_iterations() const noexcept { return iterations; } }; // Tracks the progression of the runtime estimation class RuntimeEstimationProgression { - RefinableRuntimeEstimation rre; + RefinableRuntimeEstimator estimator; + double current_mean = 0.0; public: - uint64_t current_estimation = 0; + const RefinableRuntimeEstimator &get_estimator() const noexcept { + return estimator; + } + + double + compute_improvement(const RefinableRuntimeEstimator &sample_estimator) { + if (sample_estimator.get_iterations() == 0) + return 1.0; - double compute_improvement(const Measurement &M) { - const uint64_t new_estimation = rre.update(M); - double ratio = - (static_cast(current_estimation) / new_estimation) - 1.0; + estimator.update(sample_estimator); - // Get absolute value + const double new_mean = estimator.get_mean(); + if (current_mean == 0.0 || new_mean == 0.0) { + current_mean = new_mean; + return 1.0; + } + + double ratio = (current_mean / new_mean) - 1.0; if (ratio < 0) - ratio *= -1; + ratio = -ratio; - current_estimation = new_estimation; + current_mean = new_mean; return ratio; } }; struct BenchmarkResult { - uint64_t cycles = 0; + uint64_t total_iterations = 0; + double cycles = 0; double standard_deviation = 0; uint64_t min = UINT64_MAX; uint64_t max = 0; - uint32_t samples = 0; - uint32_t total_iterations = 0; - clock_t total_time = 0; +}; + +struct BenchmarkTarget { + using IndexedFnPtr = uint64_t (*)(uint32_t); + using IndexlessFnPtr = uint64_t (*)(); + + enum class Kind : uint8_t { Indexed, Indexless } kind; + union { + IndexedFnPtr indexed_fn_ptr; + IndexlessFnPtr indexless_fn_ptr; + }; + + LIBC_INLINE BenchmarkTarget(IndexedFnPtr func) + : kind(Kind::Indexed), indexed_fn_ptr(func) {} + LIBC_INLINE BenchmarkTarget(IndexlessFnPtr func) + : kind(Kind::Indexless), indexless_fn_ptr(func) {} + + LIBC_INLINE uint64_t operator()([[maybe_unused]] uint32_t call_index) const { + return kind == Kind::Indexed ? indexed_fn_ptr(call_index) + : indexless_fn_ptr(); + } }; BenchmarkResult benchmark(const BenchmarkOptions &options, - cpp::function wrapper_func); + const BenchmarkTarget &target); class Benchmark { - const cpp::function func; + const BenchmarkTarget target; const cpp::string_view suite_name; const cpp::string_view test_name; const uint32_t num_threads; public: - Benchmark(cpp::function func, char const *suite_name, + Benchmark(uint64_t (*f)(), const char *suite, const char *test, + uint32_t threads) + : target(BenchmarkTarget(f)), suite_name(suite), test_name(test), + num_threads(threads) { + add_benchmark(this); + } + + Benchmark(uint64_t (*f)(uint32_t), char const *suite_name, char const *test_name, uint32_t num_threads) - : func(func), suite_name(suite_name), test_name(test_name), - num_threads(num_threads) { + : target(BenchmarkTarget(f)), suite_name(suite_name), + test_name(test_name), num_threads(num_threads) { add_benchmark(this); } @@ -105,67 +173,139 @@ class Benchmark { private: BenchmarkResult run() { BenchmarkOptions options; - return benchmark(options, func); + return benchmark(options, target); + } +}; + +class RandomGenerator { + uint64_t state; + + static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept { + x += 0x9E3779B97F4A7C15ULL; + x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL; + x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL; + x = (x ^ (x >> 31)); + return x ? x : 0x9E3779B97F4A7C15ULL; + } + +public: + explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept + : state(splitmix64(seed)) {} + + LIBC_INLINE uint64_t next64() noexcept { + uint64_t x = state; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + state = x; + return x * 0x2545F4914F6CDD1DULL; + } + + LIBC_INLINE uint32_t next32() noexcept { + return static_cast(next64() >> 32); } }; -// We want our random values to be approximately -// Output: a random number with the exponent field between min_exp and max_exp, -// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1), -// Caveats: -// -EXP_BIAS corresponding to denormal values, -// EXP_BIAS + 1 corresponding to inf or nan. +// We want random floating-point values whose *unbiased* exponent e is +// approximately uniform in [min_exp, max_exp]. That is, +// 2^min_exp <= |value| < 2^(max_exp + 1). +// Caveats / boundaries: +// - e = -EXP_BIAS ==> subnormal range (biased exponent = 0). We ensure a +// non-zero mantissa so we don't accidentally produce 0. +// - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers. +// - e = EXP_BIAS + 1 ==> Inf/NaN. We do not include it by default; max_exp +// defaults to EXP_BIAS. template static T -get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS, - int min_exp = -LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS) { +get_rand_input(RandomGenerator &rng, + int min_exp = -LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS, + int max_exp = LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; - - // Required to correctly instantiate FPBits for floats and doubles. - using RandType = typename cpp::conditional_t<(cpp::is_same_v), - uint64_t, uint32_t>; - RandType bits; - if constexpr (cpp::is_same_v) - bits = (static_cast(LIBC_NAMESPACE::rand()) << 32) | - static_cast(LIBC_NAMESPACE::rand()); - else - bits = LIBC_NAMESPACE::rand(); - double scale = - static_cast(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1); - FPBits fp(bits); - fp.set_biased_exponent( - static_cast(fp.get_biased_exponent() * scale + min_exp)); - return fp.get_val(); + using Storage = typename FPBits::StorageType; + + // Sanitize and clamp requested range to what the format supports + if (min_exp > max_exp) { + auto tmp = min_exp; + min_exp = max_exp; + max_exp = tmp; + }; + min_exp = cpp::max(min_exp, -FPBits::EXP_BIAS); + max_exp = cpp::min(max_exp, FPBits::EXP_BIAS); + + // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo + // bias + auto sample_in_range = [&](uint64_t r) -> int32_t { + const uint64_t range = static_cast( + static_cast(max_exp) - static_cast(min_exp) + 1); + const uint64_t threshold = (-range) % range; + while (r < threshold) + r = rng.next64(); + return static_cast(min_exp + static_cast(r % range)); + }; + const int32_t e = sample_in_range(rng.next64()); + + // Start from random bits to get random sign and mantissa + FPBits xbits([&] { + if constexpr (cpp::is_same_v) + return FPBits(rng.next64()); + else + return FPBits(rng.next32()); + }()); + + if (e == -FPBits::EXP_BIAS) { + // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0 + xbits.set_biased_exponent(Storage(0)); + if (xbits.get_mantissa() == Storage(0)) + xbits.set_mantissa(Storage(1)); + } else { + // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS] + const int32_t biased = e + FPBits::EXP_BIAS; + xbits.set_biased_exponent(static_cast(biased)); + } + return xbits.get_val(); } template class MathPerf { - using FPBits = fputil::FPBits; - using StorageType = typename FPBits::StorageType; - static constexpr StorageType UIntMax = - cpp::numeric_limits::max(); + static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) { + const uint64_t tid = gpu::get_thread_id(); + return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL); + } public: + // Returns cycles-per-call (lower is better) template - static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) { + static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp, + uint32_t call_index) { cpp::array inputs; + + uint64_t base_seed = static_cast(call_index); + uint64_t salt = static_cast(N); + RandomGenerator rng(make_seed(base_seed, salt)); + for (size_t i = 0; i < N; ++i) - inputs[i] = get_rand_input(min_exp, max_exp); + inputs[i] = get_rand_input(rng, min_exp, max_exp); uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); return total_time / N; } - // Throughput benchmarking for functions that take 2 inputs. + // Returns cycles-per-call (lower is better) template static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp, int arg1_max_exp, int arg2_min_exp, - int arg2_max_exp) { + int arg2_max_exp, + uint32_t call_index) { cpp::array inputs1; cpp::array inputs2; + + uint64_t base_seed = static_cast(call_index); + uint64_t salt = static_cast(N); + RandomGenerator rng(make_seed(base_seed, salt)); + for (size_t i = 0; i < N; ++i) { - inputs1[i] = get_rand_input(arg1_min_exp, arg1_max_exp); - inputs2[i] = get_rand_input(arg2_min_exp, arg2_max_exp); + inputs1[i] = get_rand_input(rng, arg1_min_exp, arg1_max_exp); + inputs2[i] = get_rand_input(rng, arg2_min_exp, arg2_max_exp); } uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); @@ -193,4 +333,5 @@ template class MathPerf { #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \ BENCHMARK_N_THREADS(SuiteName, TestName, Func, \ LIBC_NAMESPACE::gpu::get_lane_size()) -#endif + +#endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt index 7a12ce4e61c9e..8417f23c124a0 100644 --- a/libc/benchmarks/gpu/src/math/CMakeLists.txt +++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt @@ -34,11 +34,6 @@ add_benchmark( libc.hdr.stdint_proxy libc.src.math.sin libc.src.math.sinf - libc.src.stdlib.srand - libc.src.stdlib.rand - libc.src.__support.FPUtil.fp_bits - libc.src.__support.CPP.bit - libc.src.__support.CPP.array COMPILE_OPTIONS ${math_benchmark_flags} LOADER_ARGS @@ -54,11 +49,6 @@ add_benchmark( DEPENDS libc.hdr.stdint_proxy libc.src.math.atan2 - libc.src.stdlib.srand - libc.src.stdlib.rand - libc.src.__support.FPUtil.fp_bits - libc.src.__support.CPP.bit - libc.src.__support.CPP.array COMPILE_OPTIONS ${math_benchmark_flags} LOADER_ARGS diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp index 1f91a9a35c373..82bb0c5d7de49 100644 --- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp @@ -1,27 +1,27 @@ #include "benchmarks/gpu/LibcGpuBenchmark.h" +#include "hdr/stdint_proxy.h" #include "src/math/atan2.h" -#include "src/stdlib/rand.h" #if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) #include "platform.h" #endif -#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \ - []() { \ +#define BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, N) \ + [](uint32_t call_index) { \ return LIBC_NAMESPACE::benchmarks::MathPerf::run_throughput_in_range< \ - N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP); \ + N>(Func, MinExp, MaxExp, MinExp, MaxExp, call_index); \ } -#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \ +#define BENCH(T, Name, Func, MinExp, MaxExp) \ SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1, \ - BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \ + BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 1)); \ SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128, \ - BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \ + BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 128)); \ SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024, \ - BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \ + BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 1024)); \ SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096, \ - BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096)) + BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 4096)) BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023); BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3); diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp index a759db2e9d33f..5fe95c3f3b268 100644 --- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp @@ -1,36 +1,28 @@ #include "benchmarks/gpu/LibcGpuBenchmark.h" -#include "src/__support/CPP/array.h" -#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/functional.h" -#include "src/__support/FPUtil/FPBits.h" +#include "hdr/stdint_proxy.h" #include "src/math/sin.h" #include "src/math/sinf.h" -#include "src/stdlib/rand.h" #if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) #include "platform.h" #endif -// BENCHMARK() expects a function that with no parameters that returns a -// uint64_t representing the latency. Defining each benchmark using macro that -// expands to a lambda to allow us to switch the implementation of `sin()` to -// easily register NVPTX benchmarks. -#define BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \ - []() { \ +#define BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, N) \ + [](uint32_t call_index) { \ return LIBC_NAMESPACE::benchmarks::MathPerf::run_throughput_in_range< \ - N>(Func, MIN_EXP, MAX_EXP); \ + N>(Func, MinExp, MaxExp, call_index); \ } -#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \ +#define BENCH(T, Name, Func, MinExp, MaxExp) \ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \ - BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \ + BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 1)); \ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \ - BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \ + BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 128)); \ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \ - BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \ + BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 1024)); \ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \ - BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096)) + BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 4096)) BENCH(double, Sin, LIBC_NAMESPACE::sin, -1023, 1023); BENCH(double, SinTwoPi, LIBC_NAMESPACE::sin, -10, 3); diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt index dd7c2d342f70f..d6a89d04dab97 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt @@ -7,6 +7,7 @@ add_header_library( libc.src.__support.common libc.src.__support.macros.config libc.src.__support.macros.attributes - libc.src.__support.CPP.type_traits + libc.src.__support.CPP.algorithm libc.src.__support.CPP.array + libc.src.__support.CPP.type_traits ) diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h index 37dbb9af5976b..de721a2d6ce6b 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/timing.h +++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU #include "hdr/stdint_proxy.h" +#include "src/__support/CPP/algorithm.h" #include "src/__support/CPP/array.h" #include "src/__support/CPP/atomic.h" #include "src/__support/CPP/type_traits.h" @@ -105,20 +106,80 @@ template return stop - start; } -// Provides throughput benchmarking. -template -[[gnu::noinline]] static LIBC_INLINE uint64_t -throughput(F f, const cpp::array &inputs) { +// Provides the *baseline* for throughput: measures loop and measurement costs +// without calling the f function +template +static LIBC_INLINE uint64_t +throughput_baseline(const cpp::array &inputs) { asm("" ::"v"(&inputs)); cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); + asm("" ::"s"(start)); + + T result{}; + for (auto input : inputs) { + asm("" ::"v"(input)); + result = input; + asm("" ::"v"(result)); + } + + uint64_t stop = gpu::processor_clock(); + asm("" ::"s"(stop)); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + + volatile auto output = result; + return stop - start; +} + +// Provides throughput benchmarking +template +static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs) { + uint64_t baseline = UINT64_MAX; + for (int i = 0; i < 5; ++i) + baseline = cpp::min(baseline, throughput_baseline(inputs)); + + asm("" ::"v"(&inputs)); + + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + uint64_t start = gpu::processor_clock(); asm("" ::"s"(start)); + T result{}; for (auto input : inputs) { - auto result = f(input); + asm("" ::"v"(input)); + result = f(input); + asm("" ::"v"(result)); + } + uint64_t stop = gpu::processor_clock(); + asm("" ::"s"(stop)); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + + volatile auto output = result; + + const uint64_t measured = stop - start; + return measured > baseline ? (measured - baseline) : 0; +} + +// Provides the *baseline* for throughput with 2 arguments: measures loop and +// measurement costs without calling the f function +template +static LIBC_INLINE uint64_t throughput_baseline( + const cpp::array &inputs1, const cpp::array &inputs2) { + asm("" ::"v"(&inputs1), "v"(&inputs2)); + + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + uint64_t start = gpu::processor_clock(); + asm("" ::"s"(start)); + + T result{}; + for (size_t i = 0; i < N; i++) { + T x = inputs1[i]; + T y = inputs2[i]; + asm("" ::"v"(x), "v"(y)); + result = x; asm("" ::"v"(result)); } @@ -126,24 +187,31 @@ throughput(F f, const cpp::array &inputs) { asm("" ::"s"(stop)); cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); - // Return the time elapsed. + volatile auto output = result; + return stop - start; } // Provides throughput benchmarking for 2 arguments (e.g. atan2()) template -[[gnu::noinline]] static LIBC_INLINE uint64_t throughput( - F f, const cpp::array &inputs1, const cpp::array &inputs2) { +static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs1, + const cpp::array &inputs2) { + uint64_t baseline = UINT64_MAX; + for (int i = 0; i < 5; ++i) + baseline = cpp::min(baseline, throughput_baseline(inputs1, inputs2)); + asm("" ::"v"(&inputs1), "v"(&inputs2)); cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); - asm("" ::"s"(start)); - for (size_t i = 0; i < inputs1.size(); i++) { - auto result = f(inputs1[i], inputs2[i]); - + T result{}; + for (size_t i = 0; i < N; i++) { + T x = inputs1[i]; + T y = inputs2[i]; + asm("" ::"v"(x), "v"(y)); + result = f(x, y); asm("" ::"v"(result)); } @@ -151,8 +219,10 @@ template asm("" ::"s"(stop)); cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); - // Return the time elapsed. - return stop - start; + volatile auto output = result; + + const uint64_t measured = stop - start; + return measured > baseline ? (measured - baseline) : 0; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt index a19c16ee4e44d..801080e7a6e98 100644 --- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt +++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt @@ -7,6 +7,7 @@ add_header_library( libc.src.__support.common libc.src.__support.macros.config libc.src.__support.macros.attributes - libc.src.__support.CPP.type_traits + libc.src.__support.CPP.algorithm libc.src.__support.CPP.array + libc.src.__support.CPP.type_traits ) diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index 3c729636367aa..133032ca08423 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX #include "hdr/stdint_proxy.h" +#include "src/__support/CPP/algorithm.h" #include "src/__support/CPP/array.h" #include "src/__support/CPP/atomic.h" #include "src/__support/CPP/type_traits.h" @@ -95,18 +96,47 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { return stop - start; } -// Provides throughput benchmarking. -template -[[gnu::noinline]] static LIBC_INLINE uint64_t -throughput(F f, const cpp::array &inputs) { +// Provides the *baseline* for throughput: measures loop and measurement costs +// without calling the f function +template +static LIBC_INLINE uint64_t +throughput_baseline(const cpp::array &inputs) { asm("" ::"r"(&inputs)); cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); + asm("" ::"llr"(start)); + + T result{}; + for (auto input : inputs) { + asm("" ::"r"(input)); + result = input; + asm("" ::"r"(result)); + } + + uint64_t stop = gpu::processor_clock(); + asm("" ::"r"(stop)); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + + volatile auto output = result; + return stop - start; +} + +// Provides throughput benchmarking +template +static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs) { + uint64_t baseline = UINT64_MAX; + for (int i = 0; i < 5; ++i) + baseline = cpp::min(baseline, throughput_baseline(inputs)); + + asm("" ::"r"(&inputs)); + + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + uint64_t start = gpu::processor_clock(); asm("" ::"llr"(start)); - uint64_t result; + T result{}; for (auto input : inputs) { asm("" ::"r"(input)); result = f(input); @@ -114,39 +144,77 @@ throughput(F f, const cpp::array &inputs) { } uint64_t stop = gpu::processor_clock(); + asm("" ::"r"(stop)); cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + + volatile auto output = result; + + const uint64_t measured = stop - start; + return measured > baseline ? (measured - baseline) : 0; +} + +// Provides the *baseline* for throughput with 2 arguments: measures loop and +// measurement costs without calling the f function +template +static LIBC_INLINE uint64_t throughput_baseline( + const cpp::array &inputs1, const cpp::array &inputs2) { + asm("" ::"r"(&inputs1), "r"(&inputs2)); + + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + uint64_t start = gpu::processor_clock(); + asm("" ::"llr"(start)); + + T result{}; + for (size_t i = 0; i < N; i++) { + T x = inputs1[i]; + T y = inputs2[i]; + asm("" ::"r"(x), "r"(y)); + result = x; + asm("" ::"r"(result)); + } + + uint64_t stop = gpu::processor_clock(); asm("" ::"r"(stop)); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + volatile auto output = result; - // Return the time elapsed. return stop - start; } // Provides throughput benchmarking for 2 arguments (e.g. atan2()) template -[[gnu::noinline]] static LIBC_INLINE uint64_t throughput( - F f, const cpp::array &inputs1, const cpp::array &inputs2) { +static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs1, + const cpp::array &inputs2) { + uint64_t baseline = UINT64_MAX; + for (int i = 0; i < 5; ++i) + baseline = cpp::min(baseline, throughput_baseline(inputs1, inputs2)); + asm("" ::"r"(&inputs1), "r"(&inputs2)); cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); - asm("" ::"llr"(start)); - uint64_t result; - for (size_t i = 0; i < inputs1.size(); i++) { - result = f(inputs1[i], inputs2[i]); + T result{}; + for (size_t i = 0; i < N; i++) { + T x = inputs1[i]; + T y = inputs2[i]; + asm("" ::"r"(x), "r"(y)); + result = f(x, y); asm("" ::"r"(result)); } uint64_t stop = gpu::processor_clock(); - cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); + volatile auto output = result; - // Return the time elapsed. - return stop - start; + const uint64_t measured = stop - start; + return measured > baseline ? (measured - baseline) : 0; } + } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX