Skip to content

Commit 2b01973

Browse files
author
Daniel Lemire
committed
even more careful
1 parent 4e798d5 commit 2b01973

File tree

1 file changed

+25
-5
lines changed

1 file changed

+25
-5
lines changed

benchmarks/benchutil.h

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,35 @@ event_collector collector;
1010

1111
template <class function_type>
1212
event_aggregate bench(const function_type &&function, size_t min_repeat = 10,
13-
size_t min_time_ns = 100000000,
13+
size_t min_time_ns = 400'000'000,
1414
size_t max_repeat = 1000000) {
15-
event_aggregate aggregate{};
1615
size_t N = min_repeat;
1716
if (N == 0) {
1817
N = 1;
1918
}
19+
volatile double dontoptimize = 0.0;
20+
// We warm up first. We warm up for at least 0.4s (by default). This makes
21+
// sure that the processor is in a consistent state.
22+
event_aggregate warm_aggregate{};
23+
for (size_t i = 0; i < N; i++) {
24+
std::atomic_thread_fence(std::memory_order_acquire);
25+
collector.start();
26+
dontoptimize = double(function());
27+
std::atomic_thread_fence(std::memory_order_release);
28+
event_count allocate_count = collector.end();
29+
warm_aggregate << allocate_count;
30+
if ((i + 1 == N) && (warm_aggregate.total_elapsed_ns() < min_time_ns) &&
31+
(N < max_repeat)) {
32+
N *= 10;
33+
}
34+
}
35+
// Actual measurement, another 0.4s (by default), this time with a processor
36+
// warmed up.
37+
event_aggregate aggregate{};
2038
for (size_t i = 0; i < N; i++) {
2139
std::atomic_thread_fence(std::memory_order_acquire);
2240
collector.start();
23-
function();
41+
dontoptimize = double(function());
2442
std::atomic_thread_fence(std::memory_order_release);
2543
event_count allocate_count = collector.end();
2644
aggregate << allocate_count;
@@ -48,15 +66,18 @@ void pretty_print(const std::vector<T> &lines, const std::string &name,
4866
printf(" %8.2f ns/f ", agg.fastest_elapsed_ns() / number_of_floats);
4967
printf("%8.2f Mfloat/s\n",
5068
number_of_floats * 1000 / agg.fastest_elapsed_ns());
69+
// We only print out performance counters if they are available.
5170
if (collector.has_events()) {
71+
// Somewhat arbitrarily, we use two new lines for the counters.
5272
printf(" ");
5373
printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ",
5474
agg.fastest_instructions() / volume,
5575
agg.fastest_instructions() / number_of_floats,
5676
(agg.instructions() - agg.fastest_instructions()) * 100.0 /
5777
agg.instructions());
5878

59-
printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%)\n", agg.fastest_cycles() / volume,
79+
printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%)\n",
80+
agg.fastest_cycles() / volume,
6081
agg.fastest_cycles() / number_of_floats,
6182
(agg.cycles() - agg.fastest_cycles()) * 100.0 / agg.cycles());
6283
printf(" ");
@@ -67,6 +88,5 @@ void pretty_print(const std::vector<T> &lines, const std::string &name,
6788
printf(" %8.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns());
6889
printf("\n");
6990
}
70-
7191
}
7292
#endif //// BENCHUTIL_H

0 commit comments

Comments
 (0)