55#include " src/__support/CPP/array.h"
66#include " src/__support/CPP/atomic.h"
77#include " src/__support/CPP/string.h"
8+ #include " src/__support/FPUtil/FPBits.h"
89#include " src/__support/FPUtil/sqrt.h"
910#include " src/__support/GPU/utils.h"
1011#include " src/__support/fixedvector.h"
@@ -21,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
2122 benchmarks.push_back (benchmark);
2223}
2324
25+ static void atomic_add_double (cpp::Atomic<uint64_t > &atomic_bits,
26+ double value) {
27+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double >;
28+
29+ uint64_t expected_bits = atomic_bits.load (cpp::MemoryOrder::RELAXED);
30+
31+ while (true ) {
32+ double current_value = FPBits (expected_bits).get_val ();
33+ double next_value = current_value + value;
34+
35+ uint64_t desired_bits = FPBits (next_value).uintval ();
36+ if (atomic_bits.compare_exchange_strong (expected_bits, desired_bits,
37+ cpp::MemoryOrder::ACQUIRE,
38+ cpp::MemoryOrder::RELAXED))
39+ break ;
40+ }
41+ }
42+
2443struct AtomicBenchmarkSums {
25- cpp::Atomic<uint64_t > cycles_sum = 0 ;
26- cpp::Atomic<uint64_t > standard_deviation_sum = 0 ;
44+ cpp::Atomic<uint32_t > active_threads = 0 ;
45+ cpp::Atomic<uint64_t > iterations_sum = 0 ;
46+ cpp::Atomic<uint64_t > weighted_cycles_sum_bits = 0 ;
47+ cpp::Atomic<uint64_t > weighted_squared_cycles_sum_bits = 0 ;
2748 cpp::Atomic<uint64_t > min = UINT64_MAX;
2849 cpp::Atomic<uint64_t > max = 0 ;
29- cpp::Atomic<uint32_t > samples_sum = 0 ;
30- cpp::Atomic<uint32_t > iterations_sum = 0 ;
31- cpp::Atomic<clock_t > time_sum = 0 ;
32- cpp::Atomic<uint64_t > active_threads = 0 ;
3350
3451 void reset () {
3552 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
3653 active_threads.store (0 , cpp::MemoryOrder::RELAXED);
37- cycles_sum.store (0 , cpp::MemoryOrder::RELAXED);
38- standard_deviation_sum.store (0 , cpp::MemoryOrder::RELAXED);
54+ iterations_sum.store (0 , cpp::MemoryOrder::RELAXED);
55+ weighted_cycles_sum_bits.store (0 , cpp::MemoryOrder::RELAXED);
56+ weighted_squared_cycles_sum_bits.store (0 , cpp::MemoryOrder::RELAXED);
3957 min.store (UINT64_MAX, cpp::MemoryOrder::RELAXED);
4058 max.store (0 , cpp::MemoryOrder::RELAXED);
41- samples_sum.store (0 , cpp::MemoryOrder::RELAXED);
42- iterations_sum.store (0 , cpp::MemoryOrder::RELAXED);
43- time_sum.store (0 , cpp::MemoryOrder::RELAXED);
4459 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
4560 }
4661
4762 void update (const BenchmarkResult &result) {
4863 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
4964 active_threads.fetch_add (1 , cpp::MemoryOrder::RELAXED);
65+ iterations_sum.fetch_add (result.total_iterations ,
66+ cpp::MemoryOrder::RELAXED);
5067
51- cycles_sum.fetch_add (result.cycles , cpp::MemoryOrder::RELAXED);
52- standard_deviation_sum.fetch_add (
53- static_cast <uint64_t >(result.standard_deviation ),
54- cpp::MemoryOrder::RELAXED);
68+ const double n_i = static_cast <double >(result.total_iterations );
69+ const double mean_i = result.cycles ;
70+ const double stddev_i = result.standard_deviation ;
71+ const double variance_i = stddev_i * stddev_i;
72+ atomic_add_double (weighted_cycles_sum_bits, n_i * mean_i);
73+ atomic_add_double (weighted_squared_cycles_sum_bits,
74+ n_i * (variance_i + mean_i * mean_i));
5575
5676 // Perform a CAS loop to atomically update the min
5777 uint64_t orig_min = min.load (cpp::MemoryOrder::RELAXED);
@@ -67,10 +87,6 @@ struct AtomicBenchmarkSums {
6787 cpp::MemoryOrder::RELAXED))
6888 ;
6989
70- samples_sum.fetch_add (result.samples , cpp::MemoryOrder::RELAXED);
71- iterations_sum.fetch_add (result.total_iterations ,
72- cpp::MemoryOrder::RELAXED);
73- time_sum.fetch_add (result.total_time , cpp::MemoryOrder::RELAXED);
7490 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
7591 }
7692};
@@ -80,56 +96,58 @@ constexpr auto GREEN = "\033[32m";
8096constexpr auto RESET = " \033 [0m" ;
8197
8298void print_results (Benchmark *b) {
83- BenchmarkResult result;
99+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double >;
100+
101+ BenchmarkResult final_result;
84102 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
85- int num_threads = all_results. active_threads . load (cpp::MemoryOrder::RELAXED);
86- result. cycles =
87- all_results.cycles_sum .load (cpp::MemoryOrder::RELAXED) / num_threads ;
88- result. standard_deviation =
89- all_results.standard_deviation_sum .load (cpp::MemoryOrder::RELAXED) /
90- num_threads;
91- result. min = all_results. min . load (cpp::MemoryOrder::RELAXED);
92- result. max = all_results. max . load (cpp::MemoryOrder::RELAXED);
93- result. samples =
94- all_results.samples_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
95- result. total_iterations =
96- all_results. iterations_sum . load (cpp::MemoryOrder::RELAXED) / num_threads;
97- const uint64_t duration_ns =
98- all_results. time_sum . load (cpp::MemoryOrder::RELAXED) / num_threads ;
99- const uint64_t duration_us = duration_ns / 1000 ;
100- const uint64_t duration_ms = duration_ns / ( 1000 * 1000 );
101- uint64_t converted_duration = duration_ns ;
102- const char *time_unit ;
103- if (duration_ms != 0 ) {
104- converted_duration = duration_ms ;
105- time_unit = " ms " ;
106- } else if (duration_us != 0 ) {
107- converted_duration = duration_us;
108- time_unit = " us " ;
103+
104+ const uint32_t num_threads =
105+ all_results.active_threads .load (cpp::MemoryOrder::RELAXED);
106+ final_result. total_iterations =
107+ all_results.iterations_sum .load (cpp::MemoryOrder::RELAXED);
108+
109+ if (final_result. total_iterations > 0 ) {
110+ const uint64_t s1_bits =
111+ all_results. weighted_cycles_sum_bits . load (cpp::MemoryOrder::RELAXED);
112+ const uint64_t s2_bits = all_results.weighted_squared_cycles_sum_bits .load (
113+ cpp::MemoryOrder::RELAXED);
114+
115+ const double S1 = FPBits (s1_bits). get_val ();
116+ const double S2 = FPBits (s2_bits). get_val () ;
117+ const double N = static_cast < double >(final_result. total_iterations ) ;
118+
119+ const double global_mean = S1 / N ;
120+ const double global_mean_of_squares = S2 / N ;
121+ const double global_variance =
122+ global_mean_of_squares - (global_mean * global_mean) ;
123+
124+ final_result. cycles = global_mean;
125+ final_result. standard_deviation =
126+ fputil::sqrt< double >(global_variance < 0.0 ? 0.0 : global_variance) ;
109127 } else {
110- converted_duration = duration_ns ;
111- time_unit = " ns " ;
128+ final_result. cycles = 0.0 ;
129+ final_result. standard_deviation = 0.0 ;
112130 }
113- result. total_time = converted_duration;
114- // result.total_time =
115- // all_results.time_sum .load(cpp::MemoryOrder::RELAXED) / num_threads ;
131+
132+ final_result. min = all_results. min . load (cpp::MemoryOrder::RELAXED);
133+ final_result. max = all_results.max .load (cpp::MemoryOrder::RELAXED);
116134 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
117135
118136 LIBC_NAMESPACE::printf (
119- " %-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n " ,
120- b->get_test_name ().data (), result.cycles , result.min , result.max ,
121- result.total_iterations , result.total_time , time_unit,
122- static_cast <uint64_t >(result.standard_deviation ), num_threads);
137+ " %-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n " ,
138+ b->get_test_name ().data (), final_result.cycles ,
139+ final_result.standard_deviation , (unsigned long long )final_result.min ,
140+ (unsigned long long )final_result.max ,
141+ (unsigned long long )final_result.total_iterations , (unsigned )num_threads);
123142}
124143
125144void print_header () {
126145 LIBC_NAMESPACE::printf (" %s" , GREEN);
127146 LIBC_NAMESPACE::printf (" Running Suite: %-10s\n " ,
128147 benchmarks[0 ]->get_suite_name ().data ());
129148 LIBC_NAMESPACE::printf (" %s" , RESET);
130- cpp::string titles =
131- " Benchmark | Cycles | Min | Max | "
132- " Iterations | Time / Iteration | Stddev | Threads |\n " ;
149+ cpp::string titles = " Benchmark | Cycles (Mean) | Stddev | "
150+ " Min | Max | Iterations | Threads |\n " ;
133151 LIBC_NAMESPACE::printf (titles.data ());
134152
135153 cpp::string separator (titles.size (), ' -' );
@@ -212,18 +230,11 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
212230 }
213231
214232 const auto &estimator = rep.get_estimator ();
215- result.cycles = static_cast <uint64_t >(estimator.get_mean ());
233+ result.total_iterations = estimator.get_iterations ();
234+ result.cycles = estimator.get_mean ();
216235 result.standard_deviation = estimator.get_stddev ();
217-
218236 result.min = min;
219237 result.max = max;
220- result.samples = samples;
221-
222- result.total_iterations = estimator.get_iterations ();
223- if (result.total_iterations > 0 )
224- result.total_time = total_time / result.total_iterations ;
225- else
226- result.total_time = 0 ;
227238
228239 return result;
229240}
0 commit comments