|
| 1 | +#include <chrono> |
| 2 | +#include <cmath> |
| 3 | +#include <ctime> |
| 4 | +#include <iomanip> |
| 5 | +#include <iostream> |
| 6 | +#include <sstream> |
| 7 | + |
| 8 | +#include "ast/ast.h" |
| 9 | +#include "ast/context.h" |
| 10 | +#include "ast/passes/printer.h" |
| 11 | +#include "benchmark.h" |
| 12 | + |
| 13 | +namespace bpftrace { |
| 14 | + |
| 15 | +char TimerError::ID; |
| 16 | +void TimerError::log(llvm::raw_ostream &OS) const |
| 17 | +{ |
| 18 | + OS << "timer error: " << strerror(err_); |
| 19 | +} |
| 20 | + |
| 21 | +using time_point = std::chrono::time_point<std::chrono::steady_clock, |
| 22 | + std::chrono::nanoseconds>; |
| 23 | + |
| 24 | +static Result<time_point> processor_time() |
| 25 | +{ |
| 26 | + struct timespec ts = {}; |
| 27 | + int rc = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); |
| 28 | + if (rc < 0) { |
| 29 | + return make_error<TimerError>(errno); |
| 30 | + } |
| 31 | + return time_point(std::chrono::seconds(ts.tv_sec) + |
| 32 | + std::chrono::nanoseconds(ts.tv_nsec)); |
| 33 | +} |
| 34 | + |
| 35 | +static int64_t delta(time_point start, time_point end) |
| 36 | +{ |
| 37 | + return std::chrono::duration_cast<std::chrono::nanoseconds>(end - start) |
| 38 | + .count(); |
| 39 | +} |
| 40 | + |
| 41 | +Result<> benchmark(std::ostream &out, ast::PassManager &mgr) |
| 42 | +{ |
| 43 | + ast::PassContext ctx; |
| 44 | + |
| 45 | + // See below; we aggregate at the end. |
| 46 | + int64_t full_mean = 0; |
| 47 | + double full_variance = 0; |
| 48 | + size_t full_count = 0; |
| 49 | + |
| 50 | + // We print out the confidence interval at p95, which corresponds to a |
| 51 | + // z-score of 1.96 (see the `err` value below). |
| 52 | + auto emit = [&](const std::string &name, |
| 53 | + int64_t total, |
| 54 | + int64_t count, |
| 55 | + double variance) { |
| 56 | + size_t mean = total / count; |
| 57 | + auto stddev = std::sqrt(variance); |
| 58 | + auto err = static_cast<int64_t>(1.96 * stddev / |
| 59 | + std::sqrt(static_cast<double>(count))); |
| 60 | + std::string unit = "ns"; |
| 61 | + if (mean > 10000000) { |
| 62 | + unit = "ms"; |
| 63 | + mean /= 1000000; |
| 64 | + err /= 1000000; |
| 65 | + } else if (mean > 10000) { |
| 66 | + unit = "μs"; |
| 67 | + mean /= 1000; |
| 68 | + err /= 1000; |
| 69 | + } |
| 70 | + out << std::left << std::setw(30) << name; |
| 71 | + out << std::left << std::setw(8) << count; |
| 72 | + out << std::left << std::setw(14) << total; |
| 73 | + out << mean << " ± " << err << " " << unit << std::endl; |
| 74 | + }; |
| 75 | + |
| 76 | + auto ok = mgr.foreach([&](auto &pass) -> Result<> { |
| 77 | + // Copy out the AST. We allow passes to mutate the AST, and therefore we |
| 78 | + // copy this out and reset it each time. |
| 79 | + ast::ASTContext saved; |
| 80 | + if (ctx.has<ast::ASTContext>()) { |
| 81 | + auto &ast = ctx.get<ast::ASTContext>(); |
| 82 | + saved.root = saved.clone_node(ast.root, ast::Location()); |
| 83 | + } |
| 84 | + |
| 85 | + // We run the function until we are able to accumulate at least three |
| 86 | + // iterations, and 100 milliseconds (but we never bother doing more than |
| 87 | + // 10,000). This should provide reasonable data for the below. The times |
| 88 | + // are all recorded in process CPU time, only while the pass itself is |
| 89 | + // running. We may accumulate additional time rebuilding the AST, etc. |
| 90 | + int64_t goal = std::chrono::duration_cast<std::chrono::nanoseconds>( |
| 91 | + std::chrono::milliseconds(100)) |
| 92 | + .count(); |
| 93 | + std::vector<int64_t> samples; |
| 94 | + int64_t total = 0; |
| 95 | + while (true) { |
| 96 | + auto start = processor_time(); |
| 97 | + if (!start) { |
| 98 | + return start.takeError(); |
| 99 | + } |
| 100 | + auto ok = pass.run(ctx); |
| 101 | + if (!ok) { |
| 102 | + return ok.takeError(); |
| 103 | + } |
| 104 | + auto end = processor_time(); |
| 105 | + if (!end) { |
| 106 | + return end.takeError(); |
| 107 | + } |
| 108 | + int64_t current = delta(*start, *end); |
| 109 | + samples.push_back(current); |
| 110 | + total += current; |
| 111 | + |
| 112 | + // Do we have enough (or too much)? |
| 113 | + if (samples.size() >= 10000 || (samples.size() > 3 && total >= goal)) { |
| 114 | + break; |
| 115 | + } |
| 116 | + |
| 117 | + // Restore the original tree. |
| 118 | + auto &ast = ctx.get<ast::ASTContext>(); |
| 119 | + ast.clear(); |
| 120 | + ast.root = clone(ast, saved.root, ast::Location()); |
| 121 | + } |
| 122 | + |
| 123 | + // Compute the variance of the samples. |
| 124 | + int64_t mean = total / samples.size(); |
| 125 | + double variance = 0; |
| 126 | + for (const auto &sample : samples) { |
| 127 | + variance += std::pow(static_cast<double>(sample - mean), 2); |
| 128 | + } |
| 129 | + emit(pass.name(), total, samples.size(), variance); |
| 130 | + |
| 131 | + // Aggregate for printing the final stats. Note that we treat each pass as |
| 132 | + // independent, therefore the final variance is the sum of the variances. |
| 133 | + full_mean += mean; |
| 134 | + full_variance += variance; |
| 135 | + full_count++; |
| 136 | + return OK(); |
| 137 | + }); |
| 138 | + if (!ok) { |
| 139 | + out << "FAIL\n"; // See below. |
| 140 | + return ok.takeError(); |
| 141 | + } |
| 142 | + |
| 143 | + // The final `PASS` is emitted when all passes have finished correctly. This |
| 144 | + // makes the output format compatible with `gobench` or other aggregation |
| 145 | + // tools that can compare benchmarks. |
| 146 | + emit("total", full_mean * full_count, full_count, full_variance); |
| 147 | + out << "PASS\n"; |
| 148 | + return OK(); |
| 149 | +} |
| 150 | + |
| 151 | +} // namespace bpftrace |
0 commit comments