Skip to content

Commit b686082

Browse files
Correct statistics aggregation and reporting
1 parent a11e775 commit b686082

File tree

2 files changed

+78
-69
lines changed

2 files changed

+78
-69
lines changed

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

Lines changed: 76 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
#include "src/__support/CPP/array.h"
66
#include "src/__support/CPP/atomic.h"
77
#include "src/__support/CPP/string.h"
8+
#include "src/__support/FPUtil/FPBits.h"
89
#include "src/__support/FPUtil/sqrt.h"
910
#include "src/__support/GPU/utils.h"
1011
#include "src/__support/fixedvector.h"
@@ -21,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
2122
benchmarks.push_back(benchmark);
2223
}
2324

25+
static void atomic_add_double(cpp::Atomic<uint64_t> &atomic_bits,
26+
double value) {
27+
using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
28+
29+
uint64_t expected_bits = atomic_bits.load(cpp::MemoryOrder::RELAXED);
30+
31+
while (true) {
32+
double current_value = FPBits(expected_bits).get_val();
33+
double next_value = current_value + value;
34+
35+
uint64_t desired_bits = FPBits(next_value).uintval();
36+
if (atomic_bits.compare_exchange_strong(expected_bits, desired_bits,
37+
cpp::MemoryOrder::ACQUIRE,
38+
cpp::MemoryOrder::RELAXED))
39+
break;
40+
}
41+
}
42+
2443
struct AtomicBenchmarkSums {
25-
cpp::Atomic<uint64_t> cycles_sum = 0;
26-
cpp::Atomic<uint64_t> standard_deviation_sum = 0;
44+
cpp::Atomic<uint32_t> active_threads = 0;
45+
cpp::Atomic<uint64_t> iterations_sum = 0;
46+
cpp::Atomic<uint64_t> weighted_cycles_sum_bits = 0;
47+
cpp::Atomic<uint64_t> weighted_squared_cycles_sum_bits = 0;
2748
cpp::Atomic<uint64_t> min = UINT64_MAX;
2849
cpp::Atomic<uint64_t> max = 0;
29-
cpp::Atomic<uint32_t> samples_sum = 0;
30-
cpp::Atomic<uint32_t> iterations_sum = 0;
31-
cpp::Atomic<clock_t> time_sum = 0;
32-
cpp::Atomic<uint64_t> active_threads = 0;
3350

3451
void reset() {
3552
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
3653
active_threads.store(0, cpp::MemoryOrder::RELAXED);
37-
cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
38-
standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
54+
iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
55+
weighted_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
56+
weighted_squared_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
3957
min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
4058
max.store(0, cpp::MemoryOrder::RELAXED);
41-
samples_sum.store(0, cpp::MemoryOrder::RELAXED);
42-
iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
43-
time_sum.store(0, cpp::MemoryOrder::RELAXED);
4459
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
4560
}
4661

4762
void update(const BenchmarkResult &result) {
4863
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
4964
active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
65+
iterations_sum.fetch_add(result.total_iterations,
66+
cpp::MemoryOrder::RELAXED);
5067

51-
cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED);
52-
standard_deviation_sum.fetch_add(
53-
static_cast<uint64_t>(result.standard_deviation),
54-
cpp::MemoryOrder::RELAXED);
68+
const double n_i = static_cast<double>(result.total_iterations);
69+
const double mean_i = result.cycles;
70+
const double stddev_i = result.standard_deviation;
71+
const double variance_i = stddev_i * stddev_i;
72+
atomic_add_double(weighted_cycles_sum_bits, n_i * mean_i);
73+
atomic_add_double(weighted_squared_cycles_sum_bits,
74+
n_i * (variance_i + mean_i * mean_i));
5575

5676
// Perform a CAS loop to atomically update the min
5777
uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
@@ -67,10 +87,6 @@ struct AtomicBenchmarkSums {
6787
cpp::MemoryOrder::RELAXED))
6888
;
6989

70-
samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED);
71-
iterations_sum.fetch_add(result.total_iterations,
72-
cpp::MemoryOrder::RELAXED);
73-
time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
7490
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
7591
}
7692
};
@@ -80,56 +96,58 @@ constexpr auto GREEN = "\033[32m";
8096
constexpr auto RESET = "\033[0m";
8197

8298
void print_results(Benchmark *b) {
83-
BenchmarkResult result;
99+
using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
100+
101+
BenchmarkResult final_result;
84102
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
85-
int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
86-
result.cycles =
87-
all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
88-
result.standard_deviation =
89-
all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) /
90-
num_threads;
91-
result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
92-
result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
93-
result.samples =
94-
all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
95-
result.total_iterations =
96-
all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
97-
const uint64_t duration_ns =
98-
all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
99-
const uint64_t duration_us = duration_ns / 1000;
100-
const uint64_t duration_ms = duration_ns / (1000 * 1000);
101-
uint64_t converted_duration = duration_ns;
102-
const char *time_unit;
103-
if (duration_ms != 0) {
104-
converted_duration = duration_ms;
105-
time_unit = "ms";
106-
} else if (duration_us != 0) {
107-
converted_duration = duration_us;
108-
time_unit = "us";
103+
104+
const uint32_t num_threads =
105+
all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
106+
final_result.total_iterations =
107+
all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED);
108+
109+
if (final_result.total_iterations > 0) {
110+
const uint64_t s1_bits =
111+
all_results.weighted_cycles_sum_bits.load(cpp::MemoryOrder::RELAXED);
112+
const uint64_t s2_bits = all_results.weighted_squared_cycles_sum_bits.load(
113+
cpp::MemoryOrder::RELAXED);
114+
115+
const double S1 = FPBits(s1_bits).get_val();
116+
const double S2 = FPBits(s2_bits).get_val();
117+
const double N = static_cast<double>(final_result.total_iterations);
118+
119+
const double global_mean = S1 / N;
120+
const double global_mean_of_squares = S2 / N;
121+
const double global_variance =
122+
global_mean_of_squares - (global_mean * global_mean);
123+
124+
final_result.cycles = global_mean;
125+
final_result.standard_deviation =
126+
fputil::sqrt<double>(global_variance < 0.0 ? 0.0 : global_variance);
109127
} else {
110-
converted_duration = duration_ns;
111-
time_unit = "ns";
128+
final_result.cycles = 0.0;
129+
final_result.standard_deviation = 0.0;
112130
}
113-
result.total_time = converted_duration;
114-
// result.total_time =
115-
// all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
131+
132+
final_result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
133+
final_result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
116134
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
117135

118136
LIBC_NAMESPACE::printf(
119-
"%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
120-
b->get_test_name().data(), result.cycles, result.min, result.max,
121-
result.total_iterations, result.total_time, time_unit,
122-
static_cast<uint64_t>(result.standard_deviation), num_threads);
137+
"%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
138+
b->get_test_name().data(), final_result.cycles,
139+
final_result.standard_deviation, (unsigned long long)final_result.min,
140+
(unsigned long long)final_result.max,
141+
(unsigned long long)final_result.total_iterations, (unsigned)num_threads);
123142
}
124143

125144
void print_header() {
126145
LIBC_NAMESPACE::printf("%s", GREEN);
127146
LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
128147
benchmarks[0]->get_suite_name().data());
129148
LIBC_NAMESPACE::printf("%s", RESET);
130-
cpp::string titles =
131-
"Benchmark | Cycles | Min | Max | "
132-
"Iterations | Time / Iteration | Stddev | Threads |\n";
149+
cpp::string titles = "Benchmark | Cycles (Mean) | Stddev | "
150+
" Min | Max | Iterations | Threads |\n";
133151
LIBC_NAMESPACE::printf(titles.data());
134152

135153
cpp::string separator(titles.size(), '-');
@@ -212,18 +230,11 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
212230
}
213231

214232
const auto &estimator = rep.get_estimator();
215-
result.cycles = static_cast<uint64_t>(estimator.get_mean());
233+
result.total_iterations = estimator.get_iterations();
234+
result.cycles = estimator.get_mean();
216235
result.standard_deviation = estimator.get_stddev();
217-
218236
result.min = min;
219237
result.max = max;
220-
result.samples = samples;
221-
222-
result.total_iterations = estimator.get_iterations();
223-
if (result.total_iterations > 0)
224-
result.total_time = total_time / result.total_iterations;
225-
else
226-
result.total_time = 0;
227238

228239
return result;
229240
}

libc/benchmarks/gpu/LibcGpuBenchmark.h

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -111,13 +111,11 @@ class RuntimeEstimationProgression {
111111
};
112112

113113
struct BenchmarkResult {
114-
uint64_t cycles = 0;
114+
uint64_t total_iterations = 0;
115+
double cycles = 0;
115116
double standard_deviation = 0;
116117
uint64_t min = UINT64_MAX;
117118
uint64_t max = 0;
118-
uint32_t samples = 0;
119-
uint32_t total_iterations = 0;
120-
clock_t total_time = 0;
121119
};
122120

123121
struct BenchmarkTarget {

0 commit comments

Comments
 (0)