Add basic benchmark as a --test option

amscanne · danobi · commit b516f5480ac8 · 2025-05-14T16:08:26.000-04:00
Standardized benchmarks are hard! Since we most often want to use benchmarks in an ad-hoc fashion, we can build them directly into the main binary as a `--test` option (a new test mode). The benchmark mechanism is implemented totally transparently to all the internals, and operates exclusively on the `PassManager`. It executes a benchmark for each pass. This shows a structured output with every pass listed: ``` System OS: Linux 6.9.0-0_fbk3_1265_g43ac291a024d #1 SMP Wed Dec 4 07:06:17 PST 2024 Arch: x86_64 Build version: v0.21.0-713-g4969-dirty LLVM: 18.1.8 bfd: yes libdw (DWARF support): yes libsystemd (systemd notify support): no blazesym (advanced symbolization): yes Kernel helpers probe_read: no probe_read_str: no probe_read_user: no probe_read_user_str: no probe_read_kernel: no probe_read_kernel_str: no get_current_cgroup_id: no send_signal: no override_return: no get_boot_ns: no dpath: no skboutput: no get_tai_ns: no get_func_ip: no jiffies64: no for_each_map_elem: no get_ns_current_pid_tgid: no lookup_percpu_elem: no Kernel features Instruction limit: -1 btf: yes module btf: no map batch: yes uprobe refcount: yes Map types hash: yes array: yes percpu array: yes stack_trace: no perf_event_array: yes ringbuf: yes Probe types kprobe: no tracepoint: no perf_event: no fentry: no kprobe_multi: no uprobe_multi: no kprobe_session: no iter: no ast 10000 7205047 720 ± 582 ns bpftrace 10000 7724379 772 ± 576 ns parse 2103 100040389 47 ± 5 μs ConfigAnalyser 3355 100029020 29 ± 3 μs ResolveImports 1478 100015641 67 ± 9 μs ImportScripts 10000 15044124 1504 ± 878 ns UnstableFeature 10000 18684501 1868 ± 972 ns MacroExpansion 10000 19686928 1968 ± 910 ns Deprecated 10000 14804521 1480 ± 921 ns attachpoints 10000 52022211 5202 ± 1792 ns btf 10000 31688754 3168 ± 178502 ns tracepoint 10000 23252392 2325 ± 871 ns FieldAnalyser 10000 30305116 3030 ± 1106 ns ClangParser 4 105528964 26 ± 3 ms CMacroExpansion 10000 17427514 1742 ± 815 ns MapSugar 10000 36848365 3684 ± 1439 ns 
FoldLiterals 10000 19861081 1986 ± 833 ns PidFilter 10000 16219805 1621 ± 911 ns Semantic 10000 37571204 3757 ± 2062 ns ResourceAnalyser 10000 56294908 5629 ± 1611 ns RecursionCheck 10000 21742022 2174 ± 1016 ns ReturnPath 10000 10208386 1020 ± 581 ns Probe 10000 24246794 2424 ± 1238 ns llvm-init 10000 58527480 5852 ± 2056 ns compile 1974 100048050 50 ± 30 μs optimize 567 100030093 176 ± 58 μs object 115 100232684 871 ± 167 μs extern 10000 14664920 1466 ± 806 ns link 4511 100005119 22 ± 7 μs total 29 803344776 27 ± 3 ms PASS ``` Additional docs have been added for how to use this functionality, with appropriate caveats. Signed-off-by: Adin Scannell <amscanne@meta.com> stack-info: PR: bpftrace#3998, branch: user/amscanne/map_pipelines2/3
diff --git a/docs/developers.md b/docs/developers.md
@@ -40,6 +40,15 @@ The distro build is documented in [INSTALL.md](../INSTALL.md#generic-build-proce
 Every contribution should (1) not break the existing tests and (2) introduce new
 tests if relevant. See existing tests for inspiration on how to write new ones. [Read more on the different kinds and how to run them](../tests/README.md).
 
+## Performance
+
+We aim to not be wasteful, but always keep in mind that the performance of the
+BPF programs and the runtime is what sits on the critical path. On non-critical
+paths, simplicity and understandability are often more important than
+performance. That said, occasionally it is useful to measure the performance of
+different parts of the pipeline. You may run bpftrace using `--test benchmark`
+in order to see the performance of the various passes during compilation.
+
 ## Continuous integration
 
 CI executes the above tests in a matrix of different LLVM versions on NixOS.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -55,6 +55,7 @@ set_target_properties(libbpftrace PROPERTIES PREFIX "")
 
 add_executable(bpftrace
   main.cpp
+  benchmark.cpp
 )
 
 # TODO: Honor `STATIC_LINKING` properly.
diff --git a/src/ast/context.cpp b/src/ast/context.cpp
@@ -31,4 +31,11 @@ ASTContext::ASTContext() : ASTContext("", "")
 {
 }
 
+// Resets this context to an empty tree: the root pointer is dropped, the node
+// container is emptied and the diagnostics object is cleared. Nothing else is
+// touched here.
+void ASTContext::clear()
+{
+  // The root is reset before the node container is emptied.
+  // NOTE(review): presumably `nodes_` owns the nodes that `root` points into,
+  // so this ordering avoids a dangling root -- confirm.
+  root = nullptr;
+  nodes_.clear();
+  diagnostics_->clear();
+}
+
 } // namespace bpftrace::ast
diff --git a/src/ast/context.h b/src/ast/context.h
@@ -87,6 +87,11 @@ class ASTContext : public ast::State<"ast"> {
     return source_;
   }
 
+  // clears all the nodes and diagnostics, but does not affect the underlying
+  // `ASTSource` object. This is useful if you want to e.g. reparse the full
+  // syntax tree in place.
+  void clear();
+
   Program *root = nullptr;
 
 private:
diff --git a/src/ast/pass_manager.h b/src/ast/pass_manager.h
@@ -30,7 +30,9 @@ class PassContext {
     char value[N];
     std::string str() const
     {
-      return std::string(value, sizeof(value));
+      // N.B. `value` is a `char[N]` whose last element is the trailing zero,
+      // so the string is built from `sizeof(value) - 1` characters in order
+      // to leave that terminator out of the result.
+      return std::string(value, sizeof(value) - 1);
     }
   };
 
@@ -87,6 +89,14 @@ class PassContext {
     no_object_failure(type_id);
   }
 
+  // Reports whether a state object of type `T` is available, looking in
+  // `state_` first and then in `extern_state_`.
+  template <typename T>
+  bool has()
+  {
+    const int id = TypeId<T>::type_id();
+    if (state_.contains(id)) {
+      return true;
+    }
+    return extern_state_.contains(id);
+  }
+
 private:
   // for the failed lookup path, see above.
   [[noreturn]] static void no_object_failure(int type_id);
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
@@ -0,0 +1,151 @@
+#include <cerrno>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "ast/ast.h"
+#include "ast/context.h"
+#include "ast/passes/printer.h"
+#include "benchmark.h"
+
+namespace bpftrace {
+
+// Storage for the static `ID` member declared in `TimerError`.
+char TimerError::ID;
+
+// Writes "timer error: " followed by the strerror() text for the stored
+// error code.
+void TimerError::log(llvm::raw_ostream &OS) const
+{
+  const char *reason = strerror(err_);
+  OS << "timer error: " << reason;
+}
+
+// Nanosecond-resolution time point used for all measurements in this file.
+using time_point = std::chrono::time_point<std::chrono::steady_clock,
+                                           std::chrono::nanoseconds>;
+
+// Reads CLOCK_PROCESS_CPUTIME_ID and returns it as a `time_point`. If
+// clock_gettime() fails, a `TimerError` carrying `errno` is returned instead.
+static Result<time_point> processor_time()
+{
+  struct timespec now = {};
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &now) < 0) {
+    return make_error<TimerError>(errno);
+  }
+  const auto since_epoch = std::chrono::seconds(now.tv_sec) +
+                           std::chrono::nanoseconds(now.tv_nsec);
+  return time_point(since_epoch);
+}
+
+// Returns the number of nanoseconds elapsed from `start` to `end`.
+static int64_t delta(time_point start, time_point end)
+{
+  using std::chrono::nanoseconds;
+  const auto elapsed = end - start;
+  return std::chrono::duration_cast<nanoseconds>(elapsed).count();
+}
+
+// Benchmarks every pass visited by `mgr.foreach` and writes one row per pass
+// to `out`, followed by an aggregate `total` row.
+//
+// Each row has the form `<name> <count> <total ns> <mean> ± <err> <unit>`,
+// where `err` is the half-width of the 95% confidence interval of the mean.
+// Times are process CPU time measured around `pass.run()` only; the work of
+// saving and restoring the AST between iterations is not included.
+//
+// If the iteration reports an error (from a pass or from the timer), `FAIL`
+// is written to `out` and the error is returned. Otherwise `PASS` is written
+// and OK is returned.
+Result<> benchmark(std::ostream &out, ast::PassManager &mgr)
+{
+  // z-score for a two-sided 95% confidence interval.
+  constexpr double kZ95 = 1.96;
+  // Never collect more than this many samples for a single pass.
+  constexpr size_t kMaxSamples = 10000;
+  // Always collect more than this many samples, however slow the pass is.
+  // This also guarantees that the sample variance below is well defined.
+  constexpr size_t kMinSamples = 3;
+  // Keep sampling until this much CPU time has been accumulated.
+  constexpr int64_t kGoalNs =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(
+          std::chrono::milliseconds(100))
+          .count();
+
+  ast::PassContext ctx;
+
+  // See below; we aggregate at the end. `full_mean_variance` is the variance
+  // of the estimated total, i.e. the sum of the per-pass variances of the
+  // mean.
+  int64_t full_mean = 0;
+  double full_mean_variance = 0;
+  int64_t full_count = 0;
+
+  // Prints a single row. `std_error` is the standard error of the mean in
+  // nanoseconds; it is scaled by the z-score to give the printed interval.
+  auto emit = [&](const std::string &name,
+                  int64_t total,
+                  int64_t count,
+                  double std_error) {
+    // `count` is only zero for the aggregate row when no pass was run; avoid
+    // dividing by zero in that case.
+    int64_t mean = count > 0 ? total / count : 0;
+    auto err = static_cast<int64_t>(kZ95 * std_error);
+    std::string unit = "ns";
+    if (mean > 10000000) {
+      unit = "ms";
+      mean /= 1000000;
+      err /= 1000000;
+    } else if (mean > 10000) {
+      unit = "μs";
+      mean /= 1000;
+      err /= 1000;
+    }
+    out << std::left << std::setw(30) << name;
+    out << std::left << std::setw(8) << count;
+    out << std::left << std::setw(14) << total;
+    // std::endl flushes, so every row is visible as soon as it is produced.
+    out << mean << " ± " << err << " " << unit << std::endl;
+  };
+
+  auto ok = mgr.foreach([&](auto &pass) -> Result<> {
+    // Copy out the AST. We allow passes to mutate the AST, and therefore we
+    // copy this out and reset it each time.
+    ast::ASTContext saved;
+    if (ctx.has<ast::ASTContext>()) {
+      auto &ast = ctx.get<ast::ASTContext>();
+      saved.root = saved.clone_node(ast.root, ast::Location());
+    }
+
+    // We run the pass until we have accumulated more than three iterations
+    // and at least 100 milliseconds (but we never bother doing more than
+    // 10,000 iterations). The times are all recorded in process CPU time,
+    // only while the pass itself is running.
+    std::vector<int64_t> samples;
+    int64_t total = 0;
+    while (true) {
+      auto start = processor_time();
+      if (!start) {
+        return start.takeError();
+      }
+      auto result = pass.run(ctx);
+      if (!result) {
+        return result.takeError();
+      }
+      auto end = processor_time();
+      if (!end) {
+        return end.takeError();
+      }
+      int64_t current = delta(*start, *end);
+      samples.push_back(current);
+      total += current;
+
+      // Do we have enough (or too much)?
+      if (samples.size() >= kMaxSamples ||
+          (samples.size() > kMinSamples && total >= kGoalNs)) {
+        break;
+      }
+
+      // Restore the original tree before the next iteration. This is skipped
+      // when there is no AST state, since looking it up would otherwise take
+      // the fatal missing-object path.
+      if (ctx.has<ast::ASTContext>()) {
+        auto &ast = ctx.get<ast::ASTContext>();
+        ast.clear();
+        ast.root = clone(ast, saved.root, ast::Location());
+      }
+    }
+
+    // Compute the sample variance. The loop above only exits with more than
+    // `kMinSamples` samples, so `n - 1` is never zero.
+    const auto n = static_cast<int64_t>(samples.size());
+    const int64_t mean = total / n;
+    double sum_sq = 0;
+    for (const int64_t sample : samples) {
+      const auto diff = static_cast<double>(sample - mean);
+      sum_sq += diff * diff;
+    }
+    const double variance = sum_sq / static_cast<double>(n - 1);
+
+    // The variance of the mean is the sample variance divided by the number
+    // of samples; its square root is the standard error.
+    const double mean_variance = variance / static_cast<double>(n);
+    emit(pass.name(), total, n, std::sqrt(mean_variance));
+
+    // Aggregate for printing the final stats. Note that we treat each pass as
+    // independent, therefore the variance of the summed means is the sum of
+    // the per-pass variances of the mean.
+    full_mean += mean;
+    full_mean_variance += mean_variance;
+    full_count++;
+    return OK();
+  });
+  if (!ok) {
+    out << "FAIL\n"; // See below.
+    return ok.takeError();
+  }
+
+  // The final `PASS` is emitted when all passes have finished correctly. This
+  // makes the output format compatible with `gobench` or other aggregation
+  // tools that can compare benchmarks. The `total` row reports the sum of the
+  // per-pass means as its mean, with the number of passes as its count.
+  emit("total",
+       full_mean * full_count,
+       full_count,
+       std::sqrt(full_mean_variance));
+  out << "PASS\n";
+  return OK();
+}
+
+} // namespace bpftrace
diff --git a/src/benchmark.h b/src/benchmark.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <iostream>
+
+#include "ast/pass_manager.h"
+#include "util/result.h"
+
+namespace bpftrace {
+
+class TimerError : public ErrorInfo<TimerError> {
+public:
+  TimerError(int err) : err_(err) {};
+  static char ID;
+  void log(llvm::raw_ostream &OS) const override;
+
+private:
+  int err_;
+};
+
+// Runs each pass held by `mgr` repeatedly, timing it in process CPU time, and
+// writes one result row per pass plus a final `total` row to `out`. The output
+// ends with `PASS` on success or `FAIL` if an error was reported, in which
+// case that error is returned.
+Result<OK> benchmark(std::ostream &out, ast::PassManager &mgr);
+
+} // namespace bpftrace
diff --git a/src/main.cpp b/src/main.cpp
@@ -32,6 +32,7 @@
 #include "ast/passes/resource_analyser.h"
 #include "ast/passes/return_path_analyser.h"
 #include "ast/passes/semantic_analyser.h"
+#include "benchmark.h"
 #include "bpffeature.h"
 #include "bpftrace.h"
 #include "btf.h"
@@ -62,8 +63,9 @@ enum class OutputBufferConfig {
 };
 
 enum class TestMode {
-  UNSET = 0,
+  // No test mode requested; this is the default for `Args::test_mode`.
+  NONE = 0,
+  // Selected with `--test codegen`.
   CODEGEN,
+  // Selected with `--test benchmark`.
+  BENCHMARK,
 };
 
 enum class BuildMode {
@@ -335,7 +337,7 @@ struct Args {
   bool usdt_file_activation = false;
   int helper_check_level = 1;
   bool no_warnings = false;
-  TestMode test_mode = TestMode::UNSET;
+  TestMode test_mode = TestMode::NONE;
   std::string script;
   std::string search;
   std::string filename;
@@ -465,8 +467,10 @@ Args parse_args(int argc, char* argv[])
         args.helper_check_level = 0;
         break;
       case Options::TEST: // --test
-        if (std::strcmp(optarg, "codegen") == 0)
+        if (std::strcmp(optarg, "codegen") == 0) {
           args.test_mode = TestMode::CODEGEN;
+        } else if (std::strcmp(optarg, "benchmark") == 0)
+          args.test_mode = TestMode::BENCHMARK;
         else {
           LOG(ERROR) << "USAGE: --test can only be 'codegen'.";
           exit(1);
@@ -829,7 +833,7 @@ int main(int argc, char* argv[])
   }
 
   // If we are not running anything, then we don't require root.
-  if (args.test_mode != TestMode::CODEGEN) {
+  if (args.test_mode == TestMode::NONE) {
     check_is_root();
 
     auto lockdown_state = lockdown::detect();
@@ -949,6 +953,16 @@ int main(int argc, char* argv[])
   pm.add(ast::CreateExternObjectPass());
   pm.add(ast::CreateLinkPass());
 
+  if (args.test_mode == TestMode::BENCHMARK) {
+    info(args.no_feature);
+    auto ok = benchmark(std::cout, pm);
+    if (!ok) {
+      std::cerr << "Benchmark error: " << ok.takeError();
+      return 1;
+    }
+    return 0;
+  }
+
   auto pmresult = pm.run();
   if (!pmresult) {
     std::cerr << pmresult.takeError() << "\n";

Original file line number	Diff line number	Diff line change
`@@ -55,6 +55,7 @@ set_target_properties(libbpftrace PROPERTIES PREFIX "")`
`55`	`55`
`56`	`56`	`add_executable(bpftrace`
`57`	`57`	`main.cpp`
	`58`	`+ benchmark.cpp`
`58`	`59`	`)`
`59`	`60`
`60`	`61`	# TODO: Honor `STATIC_LINKING` properly.
Original file line number	Diff line number	Diff line change
`@@ -31,4 +31,11 @@ ASTContext::ASTContext() : ASTContext("", "")`
`31`	`31`	`{`
`32`	`32`	`}`
`33`	`33`
	`34`	`+void ASTContext::clear()`
	`35`	`+{`
	`36`	`+ root = nullptr;`
	`37`	`+ nodes_.clear();`
	`38`	`+ diagnostics_->clear();`
	`39`	`+}`
	`40`	`+`
`34`	`41`	`} // namespace bpftrace::ast`