Commit c1db1e6

DilumAluthge authored and Zentrik committed
Add experimental support for perf (via LinuxPerf.jl)
1 parent 255854d commit c1db1e6
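
The diff below wires LinuxPerf.jl into the sample function that BenchmarkTools generates. Pulled out of the generated code, the measurement pattern it uses looks roughly like the following sketch; it only reuses the LinuxPerf.jl calls that appear in the diff (parse_pstats_options, set_default_spaces, make_bench_threaded, enable!, disable!, Stats), and everything else (the helper name, argument handling) is an assumption for illustration, not part of this commit.

    import LinuxPerf

    # Hedged sketch of the per-sample perf measurement the generated code performs.
    function measure_once(f)
        opts = LinuxPerf.parse_pstats_options([])   # default events/spaces/threads, as in the diff
        groups = LinuxPerf.set_default_spaces(opts.events, opts.spaces)
        bench = LinuxPerf.make_bench_threaded(groups; threads = opts.threads)
        LinuxPerf.enable!(bench)
        ret = f()                                   # the workload runs exactly once under perf
        LinuxPerf.disable!(bench)
        return ret, LinuxPerf.Stats(bench)
    end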

4 files changed: +84 additions, -43 deletions


Project.toml

Lines changed: 3 additions & 0 deletions
@@ -4,9 +4,11 @@ version = "1.4.0"
 
 [deps]
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+LinuxPerf = "b4c46c6c-4fb0-484d-a11a-41bc3392d094"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
@@ -20,6 +22,7 @@ Profile = "<0.0.1, 1"
 Statistics = "<0.0.1, 1"
 Test = "<0.0.1, 1"
 UUIDs = "<0.0.1, 1"
+LinuxPerf = "= 0.3.4"
 julia = "1"
 
 [extras]
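
For anyone reproducing this environment by hand, the keyword form of Pkg.add can install the exact pin from the [compat] block above; the commands below are illustrative only and are not part of the commit.

    using Pkg
    Pkg.add(name = "LinuxPerf", version = v"0.3.4")  # matches the `LinuxPerf = "= 0.3.4"` compat pin
    Pkg.add("Random")                                # stdlib also added to [deps]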

src/BenchmarkTools.jl

Lines changed: 7 additions & 0 deletions
@@ -9,6 +9,13 @@ using UUIDs: uuid4
 using Printf
 using Profile
 
+import LinuxPerf
+import Random
+
+# TODO: delete the following line once https://github.com/JuliaPerf/LinuxPerf.jl/pull/32
+# is merged and a new release of LinuxPerf has been made.
+const parse_groups = LinuxPerf.parse_groups
+
 const BENCHMARKTOOLS_VERSION = v"1.0.0"
 
 ##############

src/execution.jl

Lines changed: 58 additions & 42 deletions
@@ -107,8 +107,8 @@ function _run(b::Benchmark, p::Parameters; verbose=false, pad="", kwargs...)
     trial = Trial(params)
     params.gcsample && gcscrub()
     s = b.samplefunc(b.quote_vals, params)
-    push!(trial, s[1:(end - 1)]...)
-    return_val = s[end]
+    push!(trial, s[1:end-1]...)
+    return_val = s.__return_val
     iters = 2
     while (Base.time() - start_time) < params.seconds && iters ≤ params.samples
         params.gcsample && gcscrub()
@@ -533,50 +533,66 @@ function generate_benchmark_definition(
         core_body = :($(core); $(returns))
     end
     @static if isdefined(Base, :donotdelete)
-        invocation = :(
-            let x = $invocation
-                Base.donotdelete(x)
-                x
-            end
-        )
+        invocation = :(let x = $invocation
+            Base.donotdelete(x)
+            x
+        end)
     end
-    return Core.eval(
-        eval_module,
-        quote
-            @noinline $(signature_def) = begin
-                $(core_body)
+    experimental_enable_linux_perf = true # TODO: take this as input from the user
+    # TODO: let the user actually provide these options.
+    linux_perf_opts = LinuxPerf.parse_pstats_options([])
+    return Core.eval(eval_module, quote
+        @noinline $(signature_def) = begin $(core_body) end
+        @noinline function $(samplefunc)($(Expr(:tuple, quote_vars...)), __params::$BenchmarkTools.Parameters)
+            $(setup)
+            __evals = __params.evals
+            __gc_start = Base.gc_num()
+            __start_time = time_ns()
+            __return_val = $(invocation)
+            for __iter in 2:__evals
+                $(invocation)
             end
-            @noinline function $(samplefunc)(
-                $(Expr(:tuple, quote_vars...)), __params::$BenchmarkTools.Parameters
-            )
-                $(setup)
-                __evals = __params.evals
-                __gc_start = Base.gc_num()
-                __start_time = time_ns()
-                __return_val = $(invocation)
-                for __iter in 2:__evals
-                    $(invocation)
-                end
-                __sample_time = time_ns() - __start_time
-                __gcdiff = Base.GC_Diff(Base.gc_num(), __gc_start)
-                $(teardown)
-                __time = max((__sample_time / __evals) - __params.overhead, 0.001)
-                __gctime = max((__gcdiff.total_time / __evals) - __params.overhead, 0.0)
-                __memory = Int(Base.fld(__gcdiff.allocd, __evals))
-                __allocs = Int(
-                    Base.fld(
-                        __gcdiff.malloc +
-                        __gcdiff.realloc +
-                        __gcdiff.poolalloc +
-                        __gcdiff.bigalloc,
-                        __evals,
-                    ),
+            __sample_time = time_ns() - __start_time
+            __gcdiff = Base.GC_Diff(Base.gc_num(), __gc_start)
+            $(teardown)
+            __time = max((__sample_time / __evals) - __params.overhead, 0.001)
+            __gctime = max((__gcdiff.total_time / __evals) - __params.overhead, 0.0)
+            __memory = Int(Base.fld(__gcdiff.allocd, __evals))
+            __allocs = Int(Base.fld(__gcdiff.malloc + __gcdiff.realloc +
+                                    __gcdiff.poolalloc + __gcdiff.bigalloc,
+                                    __evals))
+            if $(experimental_enable_linux_perf)
+                # Based on https://github.com/JuliaPerf/LinuxPerf.jl/blob/a7fee0ff261a5b5ce7a903af7b38d1b5c27dd931/src/LinuxPerf.jl#L1043-L1061
+                __linux_perf_groups = LinuxPerf.set_default_spaces(
+                    $(linux_perf_opts.events),
+                    $(linux_perf_opts.spaces),
+                )
+                __linux_perf_bench = LinuxPerf.make_bench_threaded(
+                    __linux_perf_groups;
+                    threads = $(linux_perf_opts.threads),
                 )
-                return __time, __gctime, __memory, __allocs, __return_val
+                LinuxPerf.enable!(__linux_perf_bench)
+                # We'll just run it one time.
+                __return_val_2 = $(invocation)
+                LinuxPerf.disable!(__linux_perf_bench)
+                # trick the compiler not to eliminate the code
+                if rand() < 0
+                    __linux_perf_stats = __return_val_2
+                else
+                    __linux_perf_stats = LinuxPerf.Stats(__linux_perf_bench)
+                end
             end
-            $BenchmarkTools.Benchmark($(samplefunc), $(quote_vals), $(params))
-        end,
-    )
+            return (;
+                __time,
+                __gctime,
+                __memory,
+                __allocs,
+                __return_val,
+                __linux_perf_stats,
+            )
+        end
+        $BenchmarkTools.Benchmark($(samplefunc), $(quote_vals), $(params))
+    end)
 end
 
 ######################
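
After this hunk, each call to the generated sample function yields a NamedTuple rather than a positional tuple. A hedged sketch of its shape; the field names are taken from the `return (; ...)` expression above, while the concrete values are placeholders invented for illustration.

    # Illustrative only: field names come from the diff, values are made up.
    sample = (;
        __time = 1234.5,               # ns per evaluation, overhead-corrected
        __gctime = 0.0,
        __memory = 0,
        __allocs = 0,
        __return_val = nothing,        # result of the benchmarked expression
        __linux_perf_stats = nothing,  # LinuxPerf.Stats once the perf block has run
    )
    sample.__return_val                # how _run reads the return value (see the first hunk)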

src/trials.jl

Lines changed: 16 additions & 1 deletion
@@ -8,6 +8,7 @@ mutable struct Trial
     gctimes::Vector{Float64}
     memory::Int
     allocs::Int
+    linux_perf_stats::Union{LinuxPerf.Stats, Nothing}
 end
 
 Trial(params::Parameters) = Trial(params, Float64[], Float64[], typemax(Int), typemax(Int))
@@ -24,11 +25,25 @@ function Base.copy(t::Trial)
     return Trial(copy(t.params), copy(t.times), copy(t.gctimes), t.memory, t.allocs)
 end
 
-function Base.push!(t::Trial, time, gctime, memory, allocs)
+const TrialContents = NamedTuple{(
+    :__time,
+    :__gctime,
+    :__memory,
+    :__allocs,
+    :__return_val,
+    :__linux_perf_stats,
+)}
+
+function Base.push!(t::Trial, trial_contents::TrialContents)
+    time = trial_contents.__time
+    gctime = trial_contents.__gctime
+    memory = trial_contents.__memory
+    allocs = trial_contents.__allocs
     push!(t.times, time)
    push!(t.gctimes, gctime)
     memory < t.memory && (t.memory = memory)
     allocs < t.allocs && (t.allocs = allocs)
+    trial.linux_perf_stats = trial_contents.__linux_perf_stats
     return t
 end
 
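
The push! method above is keyed on a NamedTuple alias rather than positional arguments, so extra per-sample data such as perf stats can travel alongside time and memory without growing the positional API. A self-contained, distilled illustration of that pattern follows; MiniTrial and MiniContents are invented for the example and are not BenchmarkTools' actual types.

    # Minimal stand-ins for Trial / TrialContents, invented for illustration.
    mutable struct MiniTrial
        times::Vector{Float64}
        linux_perf_stats::Any   # stands in for Union{LinuxPerf.Stats, Nothing}
    end

    # Dispatch on the field names, mirroring the TrialContents alias above.
    const MiniContents = NamedTuple{(:__time, :__linux_perf_stats)}

    function Base.push!(t::MiniTrial, c::MiniContents)
        push!(t.times, c.__time)
        t.linux_perf_stats = c.__linux_perf_stats
        return t
    end

    t = MiniTrial(Float64[], nothing)
    push!(t, (; __time = 100.0, __linux_perf_stats = nothing))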
