Skip to content

Commit 59d31c6

Browse files
authored
perf: benchmarking CI (#136)
* ci(buildkite): add benchmark runners * perf: initial ViT benchmarking * fix: path names * ci: run CPU benchmarks on larger machine * ci: try fixing CUDA bench * fix: aggregation script * ci: run GC to rule out allocations
1 parent d786975 commit 59d31c6

File tree

8 files changed

+319
-2
lines changed

8 files changed

+319
-2
lines changed

.buildkite/pipeline.yml

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,77 @@ steps:
2020
if: build.message !~ /\[skip tests\]/
2121
timeout_in_minutes: 60
2222

23+
- group: ":racehorse: Benchmarks"
24+
steps:
25+
- label: "CPU: Run Benchmarks"
26+
plugins:
27+
- JuliaCI/julia#v1:
28+
version: "1"
29+
command: |
30+
julia --project=benchmark -e 'println("--- :julia: Instantiating project")
31+
using Pkg
32+
Pkg.develop([PackageSpec(path=pwd())])'
33+
34+
julia --project=benchmark -e 'println("--- :julia: Run Benchmarks")
35+
include("benchmark/runbenchmarks.jl")'
36+
artifact_paths:
37+
- "benchmark/results/*"
38+
agents:
39+
# Models are quite large so we need a decent sized machine. Don't tell Chris we
40+
# are stealing SciMLBenchmarks machine :P
41+
queue: "juliaecosystem"
42+
sandbox_capable: true
43+
exclusive: true
44+
arch: "x86_64"
45+
env:
46+
BENCHMARK_GROUP: CPU
47+
JULIA_NUM_THREADS: "auto"
48+
timeout_in_minutes: 120
49+
50+
- label: "CUDA: Run Benchmarks"
51+
plugins:
52+
- JuliaCI/julia#v1:
53+
version: "1"
54+
command: |
55+
julia --project=benchmark -e 'println("--- :julia: Instantiating project")
56+
using Pkg
57+
Pkg.develop([PackageSpec(path=pwd())])'
58+
59+
julia --project=benchmark -e 'println("--- :julia: Run Benchmarks")
60+
include("benchmark/runbenchmarks.jl")'
61+
artifact_paths:
62+
- "benchmark/results/*"
63+
agents:
64+
queue: "benchmark"
65+
gpu: "rtx4070"
66+
cuda: "*"
67+
env:
68+
BENCHMARK_GROUP: CUDA
69+
JULIA_NUM_THREADS: "auto"
70+
timeout_in_minutes: 120
71+
72+
- wait: ~
73+
continue_on_failure: true
74+
75+
- label: "Combine benchmarks"
76+
plugins:
77+
- JuliaCI/julia#v1:
78+
version: "1"
79+
command: |
80+
buildkite-agent artifact download "benchmark/results/*" .
81+
82+
julia -e 'println("--- :julia: Instantiating project")
83+
using Pkg
84+
Pkg.add("BenchmarkTools")
85+
86+
println("--- :julia: Combining Benchmarks")
87+
include("benchmark/aggregate.jl")'
88+
artifact_paths:
89+
- "benchmark/results/combinedbenchmarks.json"
90+
agents:
91+
queue: "juliagpu"
92+
timeout_in_minutes: 10
93+
2394
# - label: "AMDGPU Julia v{{matrix.version}}"
2495
# matrix:
2596
# setup:
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
name: Benchmarks
2+
permissions:
3+
contents: write # contents permission to update benchmark contents in gh-pages branch
4+
statuses: read
5+
deployments: write # deployments permission to deploy GitHub pages website
6+
pull-requests: write
7+
8+
on:
9+
pull_request:
10+
11+
push:
12+
branches:
13+
- main
14+
15+
jobs:
16+
benchmark:
17+
if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }}
18+
runs-on: ubuntu-latest
19+
steps:
20+
- uses: actions/checkout@v4
21+
- name: Download Buildkite Artifacts
22+
id: download
23+
uses: EnricoMi/download-buildkite-artifact-action@v1
24+
with:
25+
buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
26+
output_path: artifacts
27+
28+
- name: Locate Benchmarks Artifact
29+
id: locate
30+
if: ${{ steps.download.outputs.download-state == 'success' }}
31+
run: echo "path=$(find artifacts -type f -name combinedbenchmarks.json 2>/dev/null)" >> $GITHUB_OUTPUT
32+
33+
- name: Upload Benchmark Results
34+
if: ${{ steps.locate.outputs.path != '' }}
35+
uses: benchmark-action/github-action-benchmark@v1
36+
with:
37+
name: Reactant.jl Benchmarks
38+
tool: "julia"
39+
output-file-path: ${{ steps.locate.outputs.path }}
40+
benchmark-data-dir-path: "benchmarks"
41+
github-token: ${{ secrets.GITHUB_TOKEN }}
42+
comment-always: true
43+
summary-always: true
44+
alert-threshold: "150%"
45+
fail-on-alert: false
46+
auto-push: ${{ github.event_name != 'pull_request' }}

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,3 +273,5 @@ deps/ReactantExtra/MODULE.bazel.lock
273273
external
274274

275275
archive/
276+
277+
benchmark/results/*

benchmark/Project.toml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,19 @@
11
[deps]
2+
AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
23
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
34
Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
5+
CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
46
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
7+
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
8+
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
59
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
10+
LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
11+
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
12+
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
613
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
714
Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
15+
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
16+
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
817

918
[compat]
1019
BenchmarkTools = "1.5"
@@ -13,3 +22,9 @@ Enzyme = "0.13"
1322
Lux = "1.1"
1423
Random = "1.10"
1524
julia = "1.10"
25+
26+
[extras]
27+
CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc"
28+
29+
[preferences.CUDA_Driver_jll]
30+
compat = false

benchmark/aggregate.jl

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
using BenchmarkTools
2+
3+
# Merge the per-backend benchmark result files found in `results/` (next to this file)
# into a single `results/combinedbenchmarks.json`.
#
# The CPU results are loaded first and serve as the base group. The results of every
# other backend listed in `BACKENDS` are merged into that group when their file exists;
# a missing or malformed file for a non-CPU backend only produces a warning.
const BACKENDS = ["CPU", "CUDA"]

const CPU_Results = joinpath(dirname(@__FILE__), "results", "CPUbenchmarks.json")
# Explicit check instead of `@assert`: a missing artifact is an input problem, and the
# failure should say which file was expected.
ispath(CPU_Results) || error("No CPU benchmark results found at path: $(CPU_Results)")

const RESULTS = BenchmarkTools.load(CPU_Results)[1]
RESULTS isa BenchmarkTools.BenchmarkGroup ||
    error("Unexpected file format for file at path: $(CPU_Results)")

for backend in BACKENDS[2:end]
    @info "Aggregating results for $(backend)"
    filename = string(backend, "benchmarks.json")
    filepath = joinpath(dirname(@__FILE__), "results", filename)
    if !ispath(filepath)
        @warn "No file found at path: $(filepath)"
        continue
    end

    backend_results = BenchmarkTools.load(filepath)[1]
    if !(backend_results isa BenchmarkTools.BenchmarkGroup)
        @warn "Unexpected file format for file at path: $(filepath)"
        continue
    end

    # <benchmark name>/<forward or reverse>/<backend>/<reactant or package>
    # Walk the tree of the backend being merged (not the CPU tree) so that benchmarks
    # or passes that exist only for this backend are not silently dropped.
    for benchmark in keys(backend_results)
        for pass in keys(backend_results[benchmark])
            source = backend_results[benchmark][pass][backend]
            for pkg in keys(source)
                RESULTS[benchmark][pass][backend][pkg] = source[pkg]
            end
        end
    end
end

BenchmarkTools.save(
    joinpath(dirname(@__FILE__), "results", "combinedbenchmarks.json"), RESULTS
)

benchmark/runbenchmarks.jl

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Accelerator Support for testing non-Reactant performance
2+
using LuxCUDA
3+
4+
using BenchmarkTools: BenchmarkTools, BenchmarkGroup, @btime, @benchmarkable
5+
using CpuId: CpuId
6+
using InteractiveUtils: versioninfo
7+
using LinearAlgebra: BLAS
8+
using Reactant: Reactant
9+
using Statistics: median
10+
11+
# To run benchmarks on a specific GPU backend, add AMDGPU / CUDA / Metal / oneAPI
12+
# to benchmark/Project.toml and set the BENCHMARK_GROUP environment variable to the backend name
13+
# Benchmark driver. The backend is taken from the `BENCHMARK_GROUP` environment variable
# (falling back to "CPU"). The suite defined in `setup.jl` is built and run, and the
# median estimates are written to `results/<BENCHMARK_GROUP>benchmarks.json` next to
# this file.
const BENCHMARK_GROUP = get(ENV, "BENCHMARK_GROUP", "CPU")
@info "Running benchmarks for $BENCHMARK_GROUP"

BenchmarkTools.DEFAULT_PARAMETERS.seconds = 20

# Platform-specific BLAS packages are only loaded for CPU runs.
if BENCHMARK_GROUP == "CPU" && Sys.isapple() && Sys.ARCH in (:aarch64, :arm64)
    @info "Running benchmarks on Apple with ARM CPUs. Using `AppleAccelerate.jl`."
    using AppleAccelerate: AppleAccelerate
end

if BENCHMARK_GROUP == "CPU" &&
    Sys.ARCH == :x86_64 &&
    occursin("intel", lowercase(CpuId.cpubrand()))
    @info "Running benchmarks on Intel CPUs. Loading `MKL.jl`."
    using MKL: MKL
end

# Give BLAS the same number of threads as Julia itself.
const BENCHMARK_CPU_THREADS = Threads.nthreads()
BLAS.set_num_threads(BENCHMARK_CPU_THREADS)

@info sprint(versioninfo)
@info "BLAS threads: $(BLAS.get_num_threads())"

const SUITE = BenchmarkGroup()

if BENCHMARK_GROUP != "CUDA"
    @info "Running CPU benchmarks with $(BENCHMARK_CPU_THREADS) thread(s)" maxlog = 1
else
    Reactant.set_default_backend("gpu")
    @info "Running CUDA benchmarks" maxlog = 1
    CUDA.versioninfo()
end

# Main benchmark files
include("setup.jl")
setup_benchmarks!(SUITE, BENCHMARK_GROUP)

results = BenchmarkTools.run(SUITE; verbose=true)

# Store the median estimates in a per-backend JSON file under `results/`.
filepath = joinpath(dirname(@__FILE__), "results")
mkpath(filepath)
filename = string(BENCHMARK_GROUP, "benchmarks.json")
BenchmarkTools.save(joinpath(filepath, filename), median(results))

@info "Saved results to $(joinpath(filepath, filename))"

benchmark/setup.jl

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
using Boltz: Vision
2+
using Lux: Lux
3+
using MLDataDevices: AbstractDevice, CPUDevice, CUDADevice
4+
using Random: Random
5+
using Reactant: Reactant, @compile
6+
7+
using Enzyme: Enzyme
8+
using Zygote: Zygote
9+
10+
# Helper Functions
11+
# Device synchronization: nothing to do on the CPU; CUDA defers to `CUDA.synchronize`.
@inline function synchronize(::CPUDevice)
    return nothing
end
@inline function synchronize(::CUDADevice)
    return CUDA.synchronize()
end

# Memory reclamation: a GC run on the CPU; CUDA defers to `CUDA.reclaim`.
@inline function reclaim(::CPUDevice)
    return GC.gc()
end
@inline function reclaim(::CUDADevice)
    return CUDA.reclaim()
end
16+
17+
# Sum of squared absolute values of a model's output.
# Four-argument form: `model` is applied through `Lux.apply(model, x, p, st)` and only
# the first element of the returned value is reduced.
@inline function sumabs2(model, x, p, st)
    output = first(Lux.apply(model, x, p, st))
    return sum(abs2, output)
end

# Two-argument form: `model` is called directly as `model(x)`.
@inline function sumabs2(model, x)
    return sum(abs2, model(x))
end
19+
20+
# Map a benchmark group name to its device object: "CPU" gives `CPUDevice()`, "CUDA"
# gives `CUDADevice()`. Any other name raises an error naming the unknown backend.
function benchmark_group_to_backend(benchmark_group::String)
    if benchmark_group == "CPU"
        return CPUDevice()
    elseif benchmark_group == "CUDA"
        return CUDADevice()
    else
        return error("Unknown backend: $(benchmark_group)")
    end
end
25+
26+
# Initialise `model` with `Lux.setup` using the default RNG.
# Returns `(ps, st)` when `x_dims === nothing`; otherwise also draws a `Float32` input
# of size `x_dims` with `randn` from the same RNG and returns `(x, ps, st)`.
function general_lux_setup(model, x_dims)
    rng = Random.default_rng() # don't use any other rng
    params, states = Lux.setup(rng, model)
    if x_dims === nothing
        return params, states
    end
    input = randn(rng, Float32, x_dims)
    return input, params, states
end
33+
34+
# Populate `suite` with every benchmark for `backend`. The backend name is first
# translated to a device object via `benchmark_group_to_backend`. Returns `nothing`.
function setup_benchmarks!(suite::BenchmarkGroup, backend::String)
    device = benchmark_group_to_backend(backend)
    setup_vit_benchmark!(suite, backend, device)
    return nothing
end
41+
42+
# Lux Benchmarks
43+
# Register forward-pass benchmarks for `Vision.ViT` in the `:tiny`, `:small` and `:base`
# modes, each at batch sizes 4, 16 and 32, with an input of size (256, 256, 3, bsize).
function setup_vit_benchmark!(suite::BenchmarkGroup, backend, dev::AbstractDevice)
    for mode in (:tiny, :small, :base)
        for bsize in (4, 16, 32)
            benchmark_name = "ViT $(mode) (256 x 256 x 3 x $(bsize))"
            model = Vision.ViT(mode)
            input_dims = (256, 256, 3, bsize)

            setup_lux_forward_pass_benchmark!(
                suite, benchmark_name, backend, model, input_dims, dev
            )
        end
    end
    return nothing
end
52+
53+
# Register two forward-pass benchmarks for `model` under
# `suite[benchmark_name]["forward"][backend]`:
#   * "Lux"      - `Lux.apply` on data moved with `dev`, followed by `synchronize(dev)`
#   * "Reactant" - the same call compiled with `@compile` on `Reactant.to_rarray` inputs,
#                  followed by `Reactant.synchronize` on the output
# Each `setup` block builds fresh inputs via `general_lux_setup(model, x_dims)`, puts the
# state in test mode, and runs `GC.gc()` / `reclaim(dev)` before and after doing so.
function setup_lux_forward_pass_benchmark!(
    suite::BenchmarkGroup,
    benchmark_name::String,
    backend::String,
    model,
    x_dims,
    dev::AbstractDevice,
)
    # Both entries live in the same sub-group, so look it up once.
    forward_group = suite[benchmark_name]["forward"][backend]

    forward_group["Lux"] = @benchmarkable begin
        Lux.apply($model, x, ps, st_infer)
        synchronize($dev)
    end setup = begin
        GC.gc()
        reclaim($dev)
        x, ps, st_train = $dev(general_lux_setup($model, $x_dims))
        st_infer = Lux.testmode(st_train)
        GC.gc()
        reclaim($dev)
    end

    forward_group["Reactant"] = @benchmarkable begin
        y, _ = compiled_apply($model, rx, rps, rst)
        Reactant.synchronize(y)
    end setup = begin
        GC.gc()
        reclaim($dev)
        x, ps, st_train = general_lux_setup($model, $x_dims)
        st_infer = Lux.testmode(st_train)
        rx = Reactant.to_rarray(x)
        rps = Reactant.to_rarray(ps)
        rst = Reactant.to_rarray(st_infer)
        compiled_apply = @compile Lux.apply($model, rx, rps, rst)
        GC.gc()
        reclaim($dev)
    end

    return nothing
end

src/XLA.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ end
446446
const AsyncEmptyBuffer = AsyncBuffer(Buffer(C_NULL), nothing)
447447

448448
@inline function await(buffer::AsyncBuffer)::Nothing
449-
if buffer.future == nothing
449+
if buffer.future === nothing
450450
return nothing
451451
else
452452
future = buffer.future
@@ -457,7 +457,7 @@ const AsyncEmptyBuffer = AsyncBuffer(Buffer(C_NULL), nothing)
457457
end
458458

459459
@inline function synced_buffer(buffer::AsyncBuffer)
460-
if buffer.future != nothing
460+
if buffer.future !== nothing
461461
future = buffer.future
462462
buffer.future = nothing
463463
await(future::Future)

0 commit comments

Comments
 (0)