Skip to content
5 changes: 0 additions & 5 deletions libc/benchmarks/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ function(add_benchmark benchmark_name)
${BENCHMARK_LINK_LIBRARIES}
DEPENDS
libc.src.stdio.printf
libc.src.stdlib.srand
libc.src.stdlib.rand
${BENCHMARK_DEPENDS}
${BENCHMARK_UNPARSED_ARGUMENTS}
COMPILE_OPTIONS
Expand Down Expand Up @@ -51,7 +49,6 @@ add_unittest_framework_library(
libc.src.__support.CPP.string
libc.src.__support.CPP.string_view
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.functional
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
Expand All @@ -64,8 +61,6 @@ add_unittest_framework_library(
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
libc.src.stdlib.rand
libc.src.stdlib.srand
libc.benchmarks.gpu.timing.timing
libc.src.stdio.printf
)
Expand Down
187 changes: 99 additions & 88 deletions libc/benchmarks/gpu/LibcGpuBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
#include "LibcGpuBenchmark.h"

#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/fixedvector.h"
#include "src/__support/macros/config.h"
#include "src/__support/time/gpu/time_utils.h"
#include "src/stdio/printf.h"
#include "src/stdlib/srand.h"

namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
Expand All @@ -20,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
benchmarks.push_back(benchmark);
}

static void atomic_add_double(cpp::Atomic<uint64_t> &atomic_bits,
double value) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;

uint64_t expected_bits = atomic_bits.load(cpp::MemoryOrder::RELAXED);

while (true) {
double current_value = FPBits(expected_bits).get_val();
double next_value = current_value + value;

uint64_t desired_bits = FPBits(next_value).uintval();
if (atomic_bits.compare_exchange_strong(expected_bits, desired_bits,
cpp::MemoryOrder::ACQUIRE,
cpp::MemoryOrder::RELAXED))
break;
}
}

struct AtomicBenchmarkSums {
cpp::Atomic<uint64_t> cycles_sum = 0;
cpp::Atomic<uint64_t> standard_deviation_sum = 0;
cpp::Atomic<uint32_t> active_threads = 0;
cpp::Atomic<uint64_t> iterations_sum = 0;
cpp::Atomic<uint64_t> weighted_cycles_sum_bits = 0;
cpp::Atomic<uint64_t> weighted_squared_cycles_sum_bits = 0;
cpp::Atomic<uint64_t> min = UINT64_MAX;
cpp::Atomic<uint64_t> max = 0;
cpp::Atomic<uint32_t> samples_sum = 0;
cpp::Atomic<uint32_t> iterations_sum = 0;
cpp::Atomic<clock_t> time_sum = 0;
cpp::Atomic<uint64_t> active_threads = 0;

void reset() {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
active_threads.store(0, cpp::MemoryOrder::RELAXED);
cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
weighted_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
weighted_squared_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
max.store(0, cpp::MemoryOrder::RELAXED);
samples_sum.store(0, cpp::MemoryOrder::RELAXED);
iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
time_sum.store(0, cpp::MemoryOrder::RELAXED);
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
}

void update(const BenchmarkResult &result) {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
iterations_sum.fetch_add(result.total_iterations,
cpp::MemoryOrder::RELAXED);

cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED);
standard_deviation_sum.fetch_add(
static_cast<uint64_t>(result.standard_deviation),
cpp::MemoryOrder::RELAXED);
const double n_i = static_cast<double>(result.total_iterations);
const double mean_i = result.cycles;
const double stddev_i = result.standard_deviation;
const double variance_i = stddev_i * stddev_i;
atomic_add_double(weighted_cycles_sum_bits, n_i * mean_i);
atomic_add_double(weighted_squared_cycles_sum_bits,
n_i * (variance_i + mean_i * mean_i));

// Perform a CAS loop to atomically update the min
uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
Expand All @@ -66,10 +87,6 @@ struct AtomicBenchmarkSums {
cpp::MemoryOrder::RELAXED))
;

samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED);
iterations_sum.fetch_add(result.total_iterations,
cpp::MemoryOrder::RELAXED);
time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
}
};
Expand All @@ -79,56 +96,58 @@ constexpr auto GREEN = "\033[32m";
constexpr auto RESET = "\033[0m";

void print_results(Benchmark *b) {
BenchmarkResult result;
using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;

BenchmarkResult final_result;
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
result.cycles =
all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
result.standard_deviation =
all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) /
num_threads;
result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
result.samples =
all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
result.total_iterations =
all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
const uint64_t duration_ns =
all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
const uint64_t duration_us = duration_ns / 1000;
const uint64_t duration_ms = duration_ns / (1000 * 1000);
uint64_t converted_duration = duration_ns;
const char *time_unit;
if (duration_ms != 0) {
converted_duration = duration_ms;
time_unit = "ms";
} else if (duration_us != 0) {
converted_duration = duration_us;
time_unit = "us";

const uint32_t num_threads =
all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
final_result.total_iterations =
all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED);

if (final_result.total_iterations > 0) {
const uint64_t s1_bits =
all_results.weighted_cycles_sum_bits.load(cpp::MemoryOrder::RELAXED);
const uint64_t s2_bits = all_results.weighted_squared_cycles_sum_bits.load(
cpp::MemoryOrder::RELAXED);

const double S1 = FPBits(s1_bits).get_val();
const double S2 = FPBits(s2_bits).get_val();
const double N = static_cast<double>(final_result.total_iterations);

const double global_mean = S1 / N;
const double global_mean_of_squares = S2 / N;
const double global_variance =
global_mean_of_squares - (global_mean * global_mean);

final_result.cycles = global_mean;
final_result.standard_deviation =
fputil::sqrt<double>(global_variance < 0.0 ? 0.0 : global_variance);
} else {
converted_duration = duration_ns;
time_unit = "ns";
final_result.cycles = 0.0;
final_result.standard_deviation = 0.0;
}
result.total_time = converted_duration;
// result.total_time =
// all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;

final_result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
final_result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

LIBC_NAMESPACE::printf(
"%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
b->get_test_name().data(), result.cycles, result.min, result.max,
result.total_iterations, result.total_time, time_unit,
static_cast<uint64_t>(result.standard_deviation), num_threads);
"%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
b->get_test_name().data(), final_result.cycles,
final_result.standard_deviation, (unsigned long long)final_result.min,
(unsigned long long)final_result.max,
(unsigned long long)final_result.total_iterations, (unsigned)num_threads);
}

void print_header() {
LIBC_NAMESPACE::printf("%s", GREEN);
LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
benchmarks[0]->get_suite_name().data());
LIBC_NAMESPACE::printf("%s", RESET);
cpp::string titles =
"Benchmark | Cycles | Min | Max | "
"Iterations | Time / Iteration | Stddev | Threads |\n";
cpp::string titles = "Benchmark | Cycles (Mean) | Stddev | "
" Min | Max | Iterations | Threads |\n";
LIBC_NAMESPACE::printf(titles.data());

cpp::string separator(titles.size(), '-');
Expand All @@ -139,10 +158,8 @@ void print_header() {
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();

if (id == 0) {
if (id == 0)
print_header();
LIBC_NAMESPACE::srand(gpu::processor_clock());
}

gpu::sync_threads();

Expand All @@ -164,69 +181,63 @@ void Benchmark::run_benchmarks() {
}

BenchmarkResult benchmark(const BenchmarkOptions &options,
cpp::function<uint64_t(void)> wrapper_func) {
const BenchmarkTarget &target) {
BenchmarkResult result;
RuntimeEstimationProgression rep;
uint32_t total_iterations = 0;
uint32_t iterations = options.initial_iterations;

if (iterations < 1u)
iterations = 1;

uint32_t samples = 0;
uint64_t total_time = 0;
uint64_t best_guess = 0;
uint64_t cycles_squared = 0;
uint64_t min = UINT64_MAX;
uint64_t max = 0;

uint64_t overhead = UINT64_MAX;
int overhead_iterations = 10;
for (int i = 0; i < overhead_iterations; i++)
overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
uint32_t call_index = 0;

for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
uint64_t sample_cycles = 0;
const clock_t start = static_cast<double>(clock());
for (uint32_t i = 0; i < iterations; i++) {
auto wrapper_intermediate = wrapper_func();
uint64_t current_result = wrapper_intermediate - overhead;
RefinableRuntimeEstimator sample_estimator;

const clock_t start = clock();
while (sample_estimator.get_iterations() < iterations) {
auto current_result = target(call_index++);
max = cpp::max(max, current_result);
min = cpp::min(min, current_result);
sample_cycles += current_result;
sample_estimator.update(current_result);
}
const clock_t end = clock();

const clock_t duration_ns =
((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
total_time += duration_ns;
time_budget -= duration_ns;
samples++;
cycles_squared += sample_cycles * sample_cycles;

total_iterations += iterations;
const double change_ratio =
rep.compute_improvement({iterations, sample_cycles});
best_guess = rep.current_estimation;
const double change_ratio = rep.compute_improvement(sample_estimator);

if (samples >= options.max_samples || iterations >= options.max_iterations)
break;

const auto total_iterations = rep.get_estimator().get_iterations();

if (total_time >= options.min_duration && samples >= options.min_samples &&
total_iterations >= options.min_iterations &&
change_ratio < options.epsilon)
break;

iterations *= options.scaling_factor;
iterations = static_cast<uint32_t>(iterations * options.scaling_factor);
}
result.cycles = best_guess;
result.standard_deviation = fputil::sqrt<double>(
static_cast<double>(cycles_squared) / total_iterations -
static_cast<double>(best_guess * best_guess));

const auto &estimator = rep.get_estimator();
result.total_iterations = estimator.get_iterations();
result.cycles = estimator.get_mean();
result.standard_deviation = estimator.get_stddev();
result.min = min;
result.max = max;
result.samples = samples;
result.total_iterations = total_iterations;
result.total_time = total_time / total_iterations;

return result;
};
}

} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
Loading
Loading