11#include " LibcGpuBenchmark.h"
2+
3+ #include " hdr/stdint_proxy.h"
24#include " src/__support/CPP/algorithm.h"
35#include " src/__support/CPP/array.h"
46#include " src/__support/CPP/atomic.h"
57#include " src/__support/CPP/string.h"
8+ #include " src/__support/FPUtil/FPBits.h"
69#include " src/__support/FPUtil/sqrt.h"
710#include " src/__support/GPU/utils.h"
811#include " src/__support/fixedvector.h"
912#include " src/__support/macros/config.h"
1013#include " src/__support/time/gpu/time_utils.h"
1114#include " src/stdio/printf.h"
12- #include " src/stdlib/srand.h"
1315
1416namespace LIBC_NAMESPACE_DECL {
1517namespace benchmarks {
@@ -20,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
2022 benchmarks.push_back (benchmark);
2123}
2224
25+ static void atomic_add_double (cpp::Atomic<uint64_t > &atomic_bits,
26+ double value) {
27+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double >;
28+
29+ uint64_t expected_bits = atomic_bits.load (cpp::MemoryOrder::RELAXED);
30+
31+ while (true ) {
32+ double current_value = FPBits (expected_bits).get_val ();
33+ double next_value = current_value + value;
34+
35+ uint64_t desired_bits = FPBits (next_value).uintval ();
36+ if (atomic_bits.compare_exchange_strong (expected_bits, desired_bits,
37+ cpp::MemoryOrder::ACQUIRE,
38+ cpp::MemoryOrder::RELAXED))
39+ break ;
40+ }
41+ }
42+
2343struct AtomicBenchmarkSums {
24- cpp::Atomic<uint64_t > cycles_sum = 0 ;
25- cpp::Atomic<uint64_t > standard_deviation_sum = 0 ;
44+ cpp::Atomic<uint32_t > active_threads = 0 ;
45+ cpp::Atomic<uint64_t > iterations_sum = 0 ;
46+ cpp::Atomic<uint64_t > weighted_cycles_sum_bits = 0 ;
47+ cpp::Atomic<uint64_t > weighted_squared_cycles_sum_bits = 0 ;
2648 cpp::Atomic<uint64_t > min = UINT64_MAX;
2749 cpp::Atomic<uint64_t > max = 0 ;
28- cpp::Atomic<uint32_t > samples_sum = 0 ;
29- cpp::Atomic<uint32_t > iterations_sum = 0 ;
30- cpp::Atomic<clock_t > time_sum = 0 ;
31- cpp::Atomic<uint64_t > active_threads = 0 ;
3250
3351 void reset () {
3452 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
3553 active_threads.store (0 , cpp::MemoryOrder::RELAXED);
36- cycles_sum.store (0 , cpp::MemoryOrder::RELAXED);
37- standard_deviation_sum.store (0 , cpp::MemoryOrder::RELAXED);
54+ iterations_sum.store (0 , cpp::MemoryOrder::RELAXED);
55+ weighted_cycles_sum_bits.store (0 , cpp::MemoryOrder::RELAXED);
56+ weighted_squared_cycles_sum_bits.store (0 , cpp::MemoryOrder::RELAXED);
3857 min.store (UINT64_MAX, cpp::MemoryOrder::RELAXED);
3958 max.store (0 , cpp::MemoryOrder::RELAXED);
40- samples_sum.store (0 , cpp::MemoryOrder::RELAXED);
41- iterations_sum.store (0 , cpp::MemoryOrder::RELAXED);
42- time_sum.store (0 , cpp::MemoryOrder::RELAXED);
4359 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
4460 }
4561
4662 void update (const BenchmarkResult &result) {
4763 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
4864 active_threads.fetch_add (1 , cpp::MemoryOrder::RELAXED);
65+ iterations_sum.fetch_add (result.total_iterations ,
66+ cpp::MemoryOrder::RELAXED);
4967
50- cycles_sum.fetch_add (result.cycles , cpp::MemoryOrder::RELAXED);
51- standard_deviation_sum.fetch_add (
52- static_cast <uint64_t >(result.standard_deviation ),
53- cpp::MemoryOrder::RELAXED);
68+ const double n_i = static_cast <double >(result.total_iterations );
69+ const double mean_i = result.cycles ;
70+ const double stddev_i = result.standard_deviation ;
71+ const double variance_i = stddev_i * stddev_i;
72+ atomic_add_double (weighted_cycles_sum_bits, n_i * mean_i);
73+ atomic_add_double (weighted_squared_cycles_sum_bits,
74+ n_i * (variance_i + mean_i * mean_i));
5475
5576 // Perform a CAS loop to atomically update the min
5677 uint64_t orig_min = min.load (cpp::MemoryOrder::RELAXED);
@@ -66,10 +87,6 @@ struct AtomicBenchmarkSums {
6687 cpp::MemoryOrder::RELAXED))
6788 ;
6889
69- samples_sum.fetch_add (result.samples , cpp::MemoryOrder::RELAXED);
70- iterations_sum.fetch_add (result.total_iterations ,
71- cpp::MemoryOrder::RELAXED);
72- time_sum.fetch_add (result.total_time , cpp::MemoryOrder::RELAXED);
7390 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
7491 }
7592};
@@ -79,56 +96,58 @@ constexpr auto GREEN = "\033[32m";
// ANSI escape sequence that restores the terminal's default text attributes.
constexpr auto RESET = "\033[0m";
8097
8198void print_results (Benchmark *b) {
82- BenchmarkResult result;
99+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double >;
100+
101+ BenchmarkResult final_result;
83102 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
84- int num_threads = all_results. active_threads . load (cpp::MemoryOrder::RELAXED);
85- result. cycles =
86- all_results.cycles_sum .load (cpp::MemoryOrder::RELAXED) / num_threads ;
87- result. standard_deviation =
88- all_results.standard_deviation_sum .load (cpp::MemoryOrder::RELAXED) /
89- num_threads;
90- result. min = all_results. min . load (cpp::MemoryOrder::RELAXED);
91- result. max = all_results. max . load (cpp::MemoryOrder::RELAXED);
92- result. samples =
93- all_results.samples_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
94- result. total_iterations =
95- all_results. iterations_sum . load (cpp::MemoryOrder::RELAXED) / num_threads;
96- const uint64_t duration_ns =
97- all_results. time_sum . load (cpp::MemoryOrder::RELAXED) / num_threads ;
98- const uint64_t duration_us = duration_ns / 1000 ;
99- const uint64_t duration_ms = duration_ns / ( 1000 * 1000 );
100- uint64_t converted_duration = duration_ns ;
101- const char *time_unit ;
102- if (duration_ms != 0 ) {
103- converted_duration = duration_ms ;
104- time_unit = " ms " ;
105- } else if (duration_us != 0 ) {
106- converted_duration = duration_us;
107- time_unit = " us " ;
103+
104+ const uint32_t num_threads =
105+ all_results.active_threads .load (cpp::MemoryOrder::RELAXED);
106+ final_result. total_iterations =
107+ all_results.iterations_sum .load (cpp::MemoryOrder::RELAXED);
108+
109+ if (final_result. total_iterations > 0 ) {
110+ const uint64_t s1_bits =
111+ all_results. weighted_cycles_sum_bits . load (cpp::MemoryOrder::RELAXED);
112+ const uint64_t s2_bits = all_results.weighted_squared_cycles_sum_bits .load (
113+ cpp::MemoryOrder::RELAXED);
114+
115+ const double S1 = FPBits (s1_bits). get_val ();
116+ const double S2 = FPBits (s2_bits). get_val () ;
117+ const double N = static_cast < double >(final_result. total_iterations ) ;
118+
119+ const double global_mean = S1 / N ;
120+ const double global_mean_of_squares = S2 / N ;
121+ const double global_variance =
122+ global_mean_of_squares - (global_mean * global_mean) ;
123+
124+ final_result. cycles = global_mean;
125+ final_result. standard_deviation =
126+ fputil::sqrt< double >(global_variance < 0.0 ? 0.0 : global_variance) ;
108127 } else {
109- converted_duration = duration_ns ;
110- time_unit = " ns " ;
128+ final_result. cycles = 0.0 ;
129+ final_result. standard_deviation = 0.0 ;
111130 }
112- result. total_time = converted_duration;
113- // result.total_time =
114- // all_results.time_sum .load(cpp::MemoryOrder::RELAXED) / num_threads ;
131+
132+ final_result. min = all_results. min . load (cpp::MemoryOrder::RELAXED);
133+ final_result. max = all_results.max .load (cpp::MemoryOrder::RELAXED);
115134 cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
116135
117136 LIBC_NAMESPACE::printf (
118- " %-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n " ,
119- b->get_test_name ().data (), result.cycles , result.min , result.max ,
120- result.total_iterations , result.total_time , time_unit,
121- static_cast <uint64_t >(result.standard_deviation ), num_threads);
137+ " %-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n " ,
138+ b->get_test_name ().data (), final_result.cycles ,
139+ final_result.standard_deviation , (unsigned long long )final_result.min ,
140+ (unsigned long long )final_result.max ,
141+ (unsigned long long )final_result.total_iterations , (unsigned )num_threads);
122142}
123143
124144void print_header () {
125145 LIBC_NAMESPACE::printf (" %s" , GREEN);
126146 LIBC_NAMESPACE::printf (" Running Suite: %-10s\n " ,
127147 benchmarks[0 ]->get_suite_name ().data ());
128148 LIBC_NAMESPACE::printf (" %s" , RESET);
129- cpp::string titles =
130- " Benchmark | Cycles | Min | Max | "
131- " Iterations | Time / Iteration | Stddev | Threads |\n " ;
149+ cpp::string titles = " Benchmark | Cycles (Mean) | Stddev | "
150+ " Min | Max | Iterations | Threads |\n " ;
132151 LIBC_NAMESPACE::printf (titles.data ());
133152
134153 cpp::string separator (titles.size (), ' -' );
@@ -139,10 +158,8 @@ void print_header() {
139158void Benchmark::run_benchmarks () {
140159 uint64_t id = gpu::get_thread_id ();
141160
142- if (id == 0 ) {
161+ if (id == 0 )
143162 print_header ();
144- LIBC_NAMESPACE::srand (gpu::processor_clock ());
145- }
146163
147164 gpu::sync_threads ();
148165
@@ -164,69 +181,63 @@ void Benchmark::run_benchmarks() {
164181}
165182
166183BenchmarkResult benchmark (const BenchmarkOptions &options,
167- cpp::function< uint64_t ( void )> wrapper_func ) {
184+ const BenchmarkTarget &target ) {
168185 BenchmarkResult result;
169186 RuntimeEstimationProgression rep;
170- uint32_t total_iterations = 0 ;
171187 uint32_t iterations = options.initial_iterations ;
188+
172189 if (iterations < 1u )
173190 iterations = 1 ;
174191
175192 uint32_t samples = 0 ;
176193 uint64_t total_time = 0 ;
177- uint64_t best_guess = 0 ;
178- uint64_t cycles_squared = 0 ;
179194 uint64_t min = UINT64_MAX;
180195 uint64_t max = 0 ;
181196
182- uint64_t overhead = UINT64_MAX;
183- int overhead_iterations = 10 ;
184- for (int i = 0 ; i < overhead_iterations; i++)
185- overhead = cpp::min (overhead, LIBC_NAMESPACE::overhead ());
197+ uint32_t call_index = 0 ;
186198
187199 for (int64_t time_budget = options.max_duration ; time_budget >= 0 ;) {
188- uint64_t sample_cycles = 0 ;
189- const clock_t start = static_cast < double >( clock ());
190- for ( uint32_t i = 0 ; i < iterations; i++) {
191- auto wrapper_intermediate = wrapper_func ();
192- uint64_t current_result = wrapper_intermediate - overhead ;
200+ RefinableRuntimeEstimator sample_estimator ;
201+
202+ const clock_t start = clock ();
203+ while (sample_estimator. get_iterations () < iterations) {
204+ auto current_result = target (call_index++) ;
193205 max = cpp::max (max, current_result);
194206 min = cpp::min (min, current_result);
195- sample_cycles += current_result;
207+ sample_estimator. update ( current_result) ;
196208 }
197209 const clock_t end = clock ();
210+
198211 const clock_t duration_ns =
199212 ((end - start) * 1000 * 1000 * 1000 ) / CLOCKS_PER_SEC;
200213 total_time += duration_ns;
201214 time_budget -= duration_ns;
202215 samples++;
203- cycles_squared += sample_cycles * sample_cycles;
204216
205- total_iterations += iterations;
206- const double change_ratio =
207- rep.compute_improvement ({iterations, sample_cycles});
208- best_guess = rep.current_estimation ;
217+ const double change_ratio = rep.compute_improvement (sample_estimator);
209218
210219 if (samples >= options.max_samples || iterations >= options.max_iterations )
211220 break ;
221+
222+ const auto total_iterations = rep.get_estimator ().get_iterations ();
223+
212224 if (total_time >= options.min_duration && samples >= options.min_samples &&
213225 total_iterations >= options.min_iterations &&
214226 change_ratio < options.epsilon )
215227 break ;
216228
217- iterations *= options.scaling_factor ;
229+ iterations = static_cast < uint32_t >(iterations * options.scaling_factor ) ;
218230 }
219- result.cycles = best_guess;
220- result.standard_deviation = fputil::sqrt<double >(
221- static_cast <double >(cycles_squared) / total_iterations -
222- static_cast <double >(best_guess * best_guess));
231+
232+ const auto &estimator = rep.get_estimator ();
233+ result.total_iterations = estimator.get_iterations ();
234+ result.cycles = estimator.get_mean ();
235+ result.standard_deviation = estimator.get_stddev ();
223236 result.min = min;
224237 result.max = max;
225- result.samples = samples;
226- result.total_iterations = total_iterations;
227- result.total_time = total_time / total_iterations;
238+
228239 return result;
229- };
240+ }
230241
231242} // namespace benchmarks
232243} // namespace LIBC_NAMESPACE_DECL
0 commit comments