|
1 | 1 | #ifndef BENCHUTIL_H |
2 | 2 | #define BENCHUTIL_H |
3 | 3 |
|
| 4 | +#include "counters/event_counter.h" |
4 | 5 | #include <cfloat> |
5 | 6 | #include <cstdio> |
6 | 7 |
|
7 | | -#if defined(__linux__) || (__APPLE__ && __aarch64__) |
8 | | - #define USING_COUNTERS |
9 | | - #include "counters/event_counter.h" |
10 | | -#else |
11 | | - #include <chrono> |
12 | | -#endif |
| 8 | +#include <atomic> |
| 9 | +event_collector collector; |
13 | 10 |
|
14 | | -#ifdef USING_COUNTERS |
15 | | -template <class T, class Func> |
16 | | -std::vector<event_count> time_it_ns(const std::vector<T> &lines, |
17 | | - Func&& function, size_t repeat) { |
18 | | - std::vector<event_count> aggregate; |
19 | | - event_collector collector; |
20 | | - bool printed_bug = false; |
21 | | - for (size_t i = 0; i < repeat; i++) { |
| 11 | +template <class function_type> |
| 12 | +event_aggregate bench(const function_type &&function, size_t min_repeat = 10, |
| 13 | + size_t min_time_ns = 400'000'000, |
| 14 | + size_t max_repeat = 1000000) { |
| 15 | + size_t N = min_repeat; |
| 16 | + if (N == 0) { |
| 17 | + N = 1; |
| 18 | + } |
| 19 | + volatile double dontoptimize = 0.0; |
| 20 | + // We warm up first. We warm up for at least 0.4s (by default). This makes |
| 21 | + // sure that the processor is in a consistent state. |
| 22 | + event_aggregate warm_aggregate{}; |
| 23 | + for (size_t i = 0; i < N; i++) { |
| 24 | + std::atomic_thread_fence(std::memory_order_acquire); |
| 25 | + collector.start(); |
| 26 | + dontoptimize = double(function()); |
| 27 | + std::atomic_thread_fence(std::memory_order_release); |
| 28 | + event_count allocate_count = collector.end(); |
| 29 | + warm_aggregate << allocate_count; |
| 30 | + if ((i + 1 == N) && (warm_aggregate.total_elapsed_ns() < min_time_ns) && |
| 31 | + (N < max_repeat)) { |
| 32 | + N *= 10; |
| 33 | + } |
| 34 | + } |
| 35 | + // Actual measure, another 0.4s (by default), this time with a processor |
| 36 | + // warmed up. |
| 37 | + event_aggregate aggregate{}; |
| 38 | + for (size_t i = 0; i < N; i++) { |
| 39 | + std::atomic_thread_fence(std::memory_order_acquire); |
22 | 40 | collector.start(); |
23 | | - if (function(lines) == 0 && !printed_bug) { |
24 | | - printf("bug\n"); |
25 | | - printed_bug = true; |
| 41 | + dontoptimize = double(function()); |
| 42 | + std::atomic_thread_fence(std::memory_order_release); |
| 43 | + event_count allocate_count = collector.end(); |
| 44 | + aggregate << allocate_count; |
| 45 | + if ((i + 1 == N) && (aggregate.total_elapsed_ns() < min_time_ns) && |
| 46 | + (N < max_repeat)) { |
| 47 | + N *= 10; |
26 | 48 | } |
27 | | - aggregate.push_back(collector.end()); |
28 | 49 | } |
29 | 50 | return aggregate; |
30 | 51 | } |
31 | 52 |
|
32 | 53 | template <class T, class Func> |
33 | 54 | void pretty_print(const std::vector<T> &lines, const std::string &name, |
34 | | - Func&& function, size_t repeat = 100) { |
| 55 | + Func &&function, size_t repeat = 100) { |
35 | 56 | const size_t number_of_floats = lines.size(); |
36 | 57 | const double volume = static_cast<double>(function(lines)); |
37 | | - const double volumeMB = volume / (1024. * 1024.); |
38 | | - const std::vector<event_count> events = time_it_ns(lines, function, repeat); |
39 | | - double average_ns{0}; |
40 | | - double min_ns{DBL_MAX}; |
41 | | - double cycles_min{DBL_MAX}; |
42 | | - double instructions_min{DBL_MAX}; |
43 | | - double cycles_avg{0}; |
44 | | - double instructions_avg{0}; |
45 | | - double branches_min{0}; |
46 | | - double branches_avg{0}; |
47 | | - double branch_misses_min{0}; |
48 | | - double branch_misses_avg{0}; |
49 | | - for (event_count e : events) { |
50 | | - const double ns = e.elapsed_ns(); |
51 | | - average_ns += ns; |
52 | | - min_ns = std::min(min_ns, ns); |
53 | | - |
54 | | - const double cycles = e.cycles(); |
55 | | - cycles_avg += cycles; |
56 | | - cycles_min = std::min(cycles_min, cycles); |
57 | | - |
58 | | - const double instructions = e.instructions(); |
59 | | - instructions_avg += instructions; |
60 | | - instructions_min = std::min(instructions_min, instructions); |
61 | | - |
62 | | - const double branches = e.branches(); |
63 | | - branches_avg += branches; |
64 | | - branches_min = std::min(branches_min, branches); |
65 | | - |
66 | | - const double branch_misses = e.missed_branches(); |
67 | | - branch_misses_avg += branch_misses; |
68 | | - branch_misses_min = std::min(branch_misses_min, branch_misses); |
69 | | - } |
70 | | - cycles_avg /= events.size(); |
71 | | - instructions_avg /= events.size(); |
72 | | - average_ns /= events.size(); |
73 | | - branches_avg /= events.size(); |
| 58 | + const double volumeMB = volume / 1'000'000; |
| 59 | + auto agg = bench([&function, &lines]() { return function(lines); }, repeat); |
74 | 60 |
|
75 | 61 | printf("%-30s: %8.2f MB/s (+/- %.1f %%) ", name.data(), |
76 | | - volumeMB * 1000000000 / min_ns, |
77 | | - (average_ns - min_ns) * 100.0 / average_ns); |
| 62 | + volumeMB * 1000'000'000 / agg.fastest_elapsed_ns(), |
| 63 | + (agg.elapsed_ns() - agg.fastest_elapsed_ns()) * 100.0 / |
| 64 | + agg.elapsed_ns()); |
78 | 65 | printf("%8.2f MB ", volumeMB); |
79 | | - printf("%8.2f Mfloat/s ", number_of_floats * 1000 / min_ns); |
80 | | - if (instructions_min > 0) { |
81 | | - printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ", instructions_min / volume, |
82 | | - instructions_min / number_of_floats, |
83 | | - (instructions_avg - instructions_min) * 100.0 / instructions_avg); |
| 66 | + printf(" %8.2f ns/f ", agg.fastest_elapsed_ns() / number_of_floats); |
| 67 | + printf("%8.2f Mfloat/s\n", |
| 68 | + number_of_floats * 1000 / agg.fastest_elapsed_ns()); |
| 69 | + // We only print out performance counters if they are available. |
| 70 | + if (collector.has_events()) { |
| 71 | + // Somewhat arbitrarily, we use two new lines for the counters. |
| 72 | + printf(" "); |
| 73 | + printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ", |
| 74 | + agg.fastest_instructions() / volume, |
| 75 | + agg.fastest_instructions() / number_of_floats, |
| 76 | + (agg.instructions() - agg.fastest_instructions()) * 100.0 / |
| 77 | + agg.instructions()); |
84 | 78 |
|
85 | | - printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ", cycles_min / volume, |
86 | | - cycles_min / number_of_floats, |
87 | | - (cycles_avg - cycles_min) * 100.0 / cycles_avg); |
88 | | - printf(" %8.2f i/c ", instructions_min / cycles_min); |
89 | | - printf(" %8.2f b/f ", branches_avg / number_of_floats); |
90 | | - printf(" %8.2f bm/f ", branch_misses_avg / number_of_floats); |
91 | | - printf(" %8.2f GHz ", cycles_min / min_ns); |
92 | | - } |
93 | | - printf("\n"); |
94 | | -} |
95 | | -#else |
96 | | -template <class T, class Func> |
97 | | -std::pair<double, double> time_it_ns(const std::vector<T> &lines, |
98 | | - Func&& function, size_t repeat) { |
99 | | - typename std::chrono::high_resolution_clock::time_point t1, t2; |
100 | | - double average = 0; |
101 | | - double min_value = DBL_MAX; |
102 | | - bool printed_bug = false; |
103 | | - for (size_t i = 0; i < repeat; i++) { |
104 | | - t1 = std::chrono::high_resolution_clock::now(); |
105 | | - if (function(lines) == 0 && !printed_bug) { |
106 | | - printf("bug\n"); |
107 | | - printed_bug = true; |
108 | | - } |
109 | | - t2 = std::chrono::high_resolution_clock::now(); |
110 | | - const double dif = |
111 | | - std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count(); |
112 | | - average += dif; |
113 | | - min_value = std::min(min_value, dif); |
| 79 | + printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%)\n", |
| 80 | + agg.fastest_cycles() / volume, |
| 81 | + agg.fastest_cycles() / number_of_floats, |
| 82 | + (agg.cycles() - agg.fastest_cycles()) * 100.0 / agg.cycles()); |
| 83 | + printf(" "); |
| 84 | + printf(" %8.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles()); |
| 85 | + printf(" %8.2f b/f ", agg.branches() / number_of_floats); |
| 86 | + printf(" "); |
| 87 | + printf(" %8.2f bm/f ", agg.branch_misses() / number_of_floats); |
| 88 | + printf(" %8.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns()); |
| 89 | + printf("\n"); |
114 | 90 | } |
115 | | - average /= repeat; |
116 | | - return std::make_pair(min_value, average); |
117 | | -} |
118 | | - |
119 | | -template <class T, class Func> |
120 | | -void pretty_print(const std::vector<T> &lines, const std::string &name, |
121 | | - Func&& function, size_t repeat = 100) { |
122 | | - const size_t number_of_floats = lines.size(); |
123 | | - const double volume = static_cast<double>(function(lines)); |
124 | | - const double volumeMB = volume / (1024. * 1024.); |
125 | | - const std::pair<double, double> result = time_it_ns(lines, function, repeat); |
126 | | - |
127 | | - printf("%-30s: %8.2f MB/s (+/- %.1f %%) ", name.data(), |
128 | | - volumeMB * 1000000000 / result.first, |
129 | | - (result.second - result.first) * 100.0 / result.second); |
130 | | - printf("%8.2f MB ", volumeMB); |
131 | | - printf("%8.2f Mfloat/s ", number_of_floats * 1000 / result.first); |
132 | | - printf(" %8.2f ns/f \n", double(result.first) / number_of_floats); |
133 | 91 | } |
134 | | - |
135 | | -#endif |
136 | 92 | #endif //// BENCHUTIL_H |
0 commit comments