@@ -10,17 +10,35 @@ event_collector collector;
1010
1111template <class function_type >
1212event_aggregate bench (const function_type &&function, size_t min_repeat = 10 ,
13- size_t min_time_ns = 100000000 ,
13+ size_t min_time_ns = 400'000'000 ,
1414 size_t max_repeat = 1000000 ) {
15- event_aggregate aggregate{};
1615 size_t N = min_repeat;
1716 if (N == 0 ) {
1817 N = 1 ;
1918 }
19+ volatile double dontoptimize = 0.0 ;
20+ // We warm up first. We warm up for at least 0.4s (by default). This makes
21+ // sure that the processor is in a consistent state.
22+ event_aggregate warm_aggregate{};
23+ for (size_t i = 0 ; i < N; i++) {
24+ std::atomic_thread_fence (std::memory_order_acquire);
25+ collector.start ();
26+ dontoptimize = double (function ());
27+ std::atomic_thread_fence (std::memory_order_release);
28+ event_count allocate_count = collector.end ();
29+ warm_aggregate << allocate_count;
30+ if ((i + 1 == N) && (warm_aggregate.total_elapsed_ns () < min_time_ns) &&
31+ (N < max_repeat)) {
32+ N *= 10 ;
33+ }
34+ }
35+ // Actual measurement: another 0.4s (by default), this time with a
36+ // warmed-up processor.
37+ event_aggregate aggregate{};
2038 for (size_t i = 0 ; i < N; i++) {
2139 std::atomic_thread_fence (std::memory_order_acquire);
2240 collector.start ();
23- function ();
41+ dontoptimize = double ( function () );
2442 std::atomic_thread_fence (std::memory_order_release);
2543 event_count allocate_count = collector.end ();
2644 aggregate << allocate_count;
@@ -48,15 +66,18 @@ void pretty_print(const std::vector<T> &lines, const std::string &name,
4866 printf (" %8.2f ns/f " , agg.fastest_elapsed_ns () / number_of_floats);
4967 printf (" %8.2f Mfloat/s\n " ,
5068 number_of_floats * 1000 / agg.fastest_elapsed_ns ());
69+ // We only print out performance counters if they are available.
5170 if (collector.has_events ()) {
71+ // Somewhat arbitrarily, we use two new lines for the counters.
5272 printf (" " );
5373 printf (" %8.2f i/B %8.2f i/f (+/- %.1f %%) " ,
5474 agg.fastest_instructions () / volume,
5575 agg.fastest_instructions () / number_of_floats,
5676 (agg.instructions () - agg.fastest_instructions ()) * 100.0 /
5777 agg.instructions ());
5878
59- printf (" %8.2f c/B %8.2f c/f (+/- %.1f %%)\n " , agg.fastest_cycles () / volume,
79+ printf (" %8.2f c/B %8.2f c/f (+/- %.1f %%)\n " ,
80+ agg.fastest_cycles () / volume,
6081 agg.fastest_cycles () / number_of_floats,
6182 (agg.cycles () - agg.fastest_cycles ()) * 100.0 / agg.cycles ());
6283 printf (" " );
@@ -67,6 +88,5 @@ void pretty_print(const std::vector<T> &lines, const std::string &name,
6788 printf (" %8.2f GHz " , agg.fastest_cycles () / agg.fastest_elapsed_ns ());
6889 printf (" \n " );
6990 }
70-
7191}
7292#endif // BENCHUTIL_H
0 commit comments