perf: benchmarking CI (#136)

avik-pal · web-flow · commit 59d31c6ac271 · 2024-10-01T14:58:53.000-04:00
* ci(buildkite): add benchmark runners

* perf: initial ViT benchmarking

* fix: path names

* ci: run CPU benchmarks on larger machine

* ci: try fixing CUDA bench

* fix: aggregation script

* ci: run GC to rule out allocations
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -20,6 +20,77 @@ steps:
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 60
 
+  - group: ":racehorse: Benchmarks"
+    steps:
+      - label: "CPU: Run Benchmarks"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1"
+        command: |
+          julia --project=benchmark -e 'println("--- :julia: Instantiating project")
+              using Pkg
+              Pkg.develop([PackageSpec(path=pwd())])'
+
+          julia --project=benchmark -e 'println("--- :julia: Run Benchmarks")
+              include("benchmark/runbenchmarks.jl")'
+        artifact_paths:
+          - "benchmark/results/*"
+        agents:
+          # Models are quite large, so we need a decent-sized machine. Don't tell Chris we
+          # are stealing the SciMLBenchmarks machine :P
+          queue: "juliaecosystem"
+          sandbox_capable: true
+          exclusive: true
+          arch: "x86_64"
+        env:
+          BENCHMARK_GROUP: CPU
+          JULIA_NUM_THREADS: "auto"
+        timeout_in_minutes: 120
+
+      - label: "CUDA: Run Benchmarks"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1"
+        command: |
+          julia --project=benchmark -e 'println("--- :julia: Instantiating project")
+              using Pkg
+              Pkg.develop([PackageSpec(path=pwd())])'
+
+          julia --project=benchmark -e 'println("--- :julia: Run Benchmarks")
+              include("benchmark/runbenchmarks.jl")'
+        artifact_paths:
+          - "benchmark/results/*"
+        agents:
+          queue: "benchmark"
+          gpu: "rtx4070"
+          cuda: "*"
+        env:
+          BENCHMARK_GROUP: CUDA
+          JULIA_NUM_THREADS: "auto"
+        timeout_in_minutes: 120
+
+      - wait: ~
+        continue_on_failure: true
+
+      - label: "Combine benchmarks"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1"
+        command: |
+          buildkite-agent artifact download "benchmark/results/*" .
+
+          julia -e 'println("--- :julia: Instantiating project")
+              using Pkg
+              Pkg.add("BenchmarkTools")
+
+              println("--- :julia: Combining Benchmarks")
+              include("benchmark/aggregate.jl")'
+        artifact_paths:
+          - "benchmark/results/combinedbenchmarks.json"
+        agents:
+          queue: "juliagpu"
+        timeout_in_minutes: 10
+
   # - label: "AMDGPU Julia v{{matrix.version}}"
   #   matrix:
   #     setup:
diff --git a/.github/workflows/benchmark_aggregate.yml b/.github/workflows/benchmark_aggregate.yml
@@ -0,0 +1,46 @@
+name: Benchmarks
+permissions:
+  contents: write # contents permission to update benchmark contents in gh-pages branch
+  statuses: read
+  deployments: write # deployments permission to deploy GitHub pages website
+  pull-requests: write
+
+on:
+  pull_request:
+
+  push:
+    branches:
+      - main
+
+jobs:
+  benchmark:
+    if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }}
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Download Buildkite Artifacts
+      id: download
+      uses: EnricoMi/download-buildkite-artifact-action@v1
+      with:
+        buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
+        output_path: artifacts
+
+    - name: Locate Benchmarks Artifact
+      id: locate
+      if: ${{ steps.download.outputs.download-state == 'success' }}
+      # Keep only the first match: several matches would make the value multi-line and
+      # corrupt the `path=` entry. The output file path is quoted to survive spaces.
+      run: echo "path=$(find artifacts -type f -name combinedbenchmarks.json 2>/dev/null | head -n 1)" >> "$GITHUB_OUTPUT"
+
+    - name: Upload Benchmark Results
+      if: ${{ steps.locate.outputs.path != '' }}
+      uses: benchmark-action/github-action-benchmark@v1
+      with:
+        name: Reactant.jl Benchmarks
+        tool: "julia"
+        output-file-path: ${{ steps.locate.outputs.path }}
+        benchmark-data-dir-path: "benchmarks"
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        comment-always: true
+        summary-always: true
+        alert-threshold: "150%"
+        fail-on-alert: false
+        auto-push: ${{ github.event_name != 'pull_request' }}
diff --git a/.gitignore b/.gitignore
@@ -273,3 +273,5 @@ deps/ReactantExtra/MODULE.bazel.lock
 external
 
 archive/
+
+benchmark/results/*
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
@@ -1,10 +1,19 @@
 [deps]
+AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
+CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
+InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
+MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
+MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
 BenchmarkTools = "1.5"
@@ -13,3 +22,9 @@ Enzyme = "0.13"
 Lux = "1.1"
 Random = "1.10"
 julia = "1.10"
+
+[extras]
+CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc"
+
+[preferences.CUDA_Driver_jll]
+compat = false
diff --git a/benchmark/aggregate.jl b/benchmark/aggregate.jl
@@ -0,0 +1,36 @@
+using BenchmarkTools
+
+const BACKENDS = ["CPU", "CUDA"]
+
+# The CPU results are the base group that all other backends are merged into, so this
+# file is mandatory. Validate with an explicit error instead of `@assert`: assertions may
+# be disabled at some optimization levels and do not say which path was missing.
+const CPU_Results = joinpath(@__DIR__, "results", "CPUbenchmarks.json")
+ispath(CPU_Results) || error("No CPU benchmark results found at path: $(CPU_Results)")
+
+const RESULTS = BenchmarkTools.load(CPU_Results)[1]
+RESULTS isa BenchmarkTools.BenchmarkGroup ||
+    error("Unexpected file format for file at path: $(CPU_Results)")
+
+# Merge every remaining backend into RESULTS. A missing or malformed file only produces
+# a warning so that the other backends are still combined.
+for backend in BACKENDS[2:end]
+    @info "Aggregating results for $(backend)"
+    filepath = joinpath(@__DIR__, "results", string(backend, "benchmarks.json"))
+    if !ispath(filepath)
+        @warn "No file found at path: $(filepath)"
+        continue
+    end
+
+    backend_results = BenchmarkTools.load(filepath)[1]
+    if !(backend_results isa BenchmarkTools.BenchmarkGroup)
+        @warn "Unexpected file format for file at path: $(filepath)"
+        continue
+    end
+
+    # Layout: <benchmark name>/<forward or reverse>/<backend>/<reactant or package>
+    # Walk the keys of this backend's own results (not those of the CPU base) so that
+    # benchmarks or passes recorded only for this backend are not silently dropped.
+    for benchmark in keys(backend_results), pass in keys(backend_results[benchmark])
+        entries = backend_results[benchmark][pass][backend]
+        for pkg in keys(entries)
+            RESULTS[benchmark][pass][backend][pkg] = entries[pkg]
+        end
+    end
+end
+
+BenchmarkTools.save(joinpath(@__DIR__, "results", "combinedbenchmarks.json"), RESULTS)
diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl
@@ -0,0 +1,57 @@
+# Accelerator Support for testing non-Reactant performance
+using LuxCUDA
+
+using BenchmarkTools: BenchmarkTools, BenchmarkGroup, @btime, @benchmarkable
+using CpuId: CpuId
+using InteractiveUtils: versioninfo
+using LinearAlgebra: BLAS
+using Reactant: Reactant
+using Statistics: median
+
+# To run benchmarks on a specific GPU backend, add AMDGPU / CUDA / Metal / oneAPI
+# to benchmark/Project.toml and set the BENCHMARK_GROUP env var to the backend name
+# Benchmark group to run; read from the environment, falling back to "CPU".
+const BENCHMARK_GROUP = get(ENV, "BENCHMARK_GROUP", "CPU")
+@info "Running benchmarks for $BENCHMARK_GROUP"
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 20
+
+# For CPU runs, load AppleAccelerate / MKL when the host matches.
+if BENCHMARK_GROUP == "CPU" && Sys.isapple() && Sys.ARCH in (:aarch64, :arm64)
+    @info "Running benchmarks on Apple with ARM CPUs. Using `AppleAccelerate.jl`."
+    using AppleAccelerate: AppleAccelerate
+end
+
+if BENCHMARK_GROUP == "CPU" &&
+    Sys.ARCH == :x86_64 &&
+    occursin("intel", lowercase(CpuId.cpubrand()))
+    @info "Running benchmarks on Intel CPUs. Loading `MKL.jl`."
+    using MKL: MKL
+end
+
+# BLAS uses as many threads as Julia was started with.
+const BENCHMARK_CPU_THREADS = Threads.nthreads()
+BLAS.set_num_threads(BENCHMARK_CPU_THREADS)
+
+@info sprint(versioninfo)
+@info "BLAS threads: $(BLAS.get_num_threads())"
+
+if BENCHMARK_GROUP != "CUDA"
+    @info "Running CPU benchmarks with $(BENCHMARK_CPU_THREADS) thread(s)" maxlog = 1
+else
+    Reactant.set_default_backend("gpu")
+    @info "Running CUDA benchmarks" maxlog = 1
+    CUDA.versioninfo()
+end
+
+# Main benchmark files
+const SUITE = BenchmarkGroup()
+include("setup.jl")
+setup_benchmarks!(SUITE, BENCHMARK_GROUP)
+
+results = BenchmarkTools.run(SUITE; verbose=true)
+
+# Save the per-benchmark medians into the `results` directory next to this file.
+filepath = joinpath(dirname(@__FILE__), "results")
+mkpath(filepath)
+filename = BENCHMARK_GROUP * "benchmarks.json"
+BenchmarkTools.save(joinpath(filepath, filename), median(results))
+
+@info "Saved results to $(joinpath(filepath, filename))"
diff --git a/benchmark/setup.jl b/benchmark/setup.jl
@@ -0,0 +1,90 @@
+using Boltz: Vision
+using Lux: Lux
+using MLDataDevices: AbstractDevice, CPUDevice, CUDADevice
+using Random: Random
+using Reactant: Reactant, @compile
+
+using Enzyme: Enzyme
+using Zygote: Zygote
+
+# Helper Functions
+
+# Device synchronization hook: a no-op for the CPU, `CUDA.synchronize()` for CUDA.
+@inline function synchronize(::CPUDevice)
+    return nothing
+end
+@inline function synchronize(::CUDADevice)
+    return CUDA.synchronize()
+end
+
+# Memory clean-up hook: `GC.gc()` for the CPU, `CUDA.reclaim()` for CUDA.
+@inline function reclaim(::CPUDevice)
+    return GC.gc()
+end
+@inline function reclaim(::CUDADevice)
+    return CUDA.reclaim()
+end
+
+# Sum of squared outputs, either of a Lux model applied with explicit parameters and
+# state, or of a plain callable `model`.
+@inline function sumabs2(model, x, p, st)
+    y = first(Lux.apply(model, x, p, st))
+    return sum(abs2, y)
+end
+@inline function sumabs2(model, x)
+    return sum(abs2, model(x))
+end
+
+# Map a benchmark group name ("CPU" or "CUDA") to its device object; any other name
+# raises an error.
+function benchmark_group_to_backend(benchmark_group::String)
+    if benchmark_group == "CPU"
+        return CPUDevice()
+    elseif benchmark_group == "CUDA"
+        return CUDADevice()
+    end
+    return error("Unknown backend: $(benchmark_group)")
+end
+
+# Initialise `model` with the default RNG and return `(ps, st)` when `x_dims` is
+# `nothing`, otherwise `(x, ps, st)` with `x` a random Float32 array of size `x_dims`.
+function general_lux_setup(model, x_dims)
+    rng = Random.default_rng()  # don't use any other rng
+    ps, st = Lux.setup(rng, model)
+    if x_dims === nothing
+        return ps, st
+    end
+    # The input is drawn after the parameters, from the same RNG.
+    return randn(rng, Float32, x_dims), ps, st
+end
+
+# Populate `suite` with all benchmarks for the group named by `backend`.
+function setup_benchmarks!(suite::BenchmarkGroup, backend::String)
+    setup_vit_benchmark!(suite, backend, benchmark_group_to_backend(backend))
+    return nothing
+end
+
+# Lux Benchmarks
+# Register forward-pass benchmarks for each ViT variant (:tiny, :small, :base) at batch
+# sizes 4, 16 and 32, using 256 x 256 x 3 inputs.
+function setup_vit_benchmark!(suite::BenchmarkGroup, backend, dev::AbstractDevice)
+    for mode in (:tiny, :small, :base)
+        for bsize in (4, 16, 32)
+            benchmark_name = "ViT $(mode) (256 x 256 x 3 x $(bsize))"
+            model = Vision.ViT(mode)
+            x_dims = (256, 256, 3, bsize)
+
+            setup_lux_forward_pass_benchmark!(
+                suite, benchmark_name, backend, model, x_dims, dev
+            )
+        end
+    end
+    return nothing
+end
+
+# Register two forward-pass benchmarks for `model` in `suite`, stored under
+# `suite[benchmark_name]["forward"][backend]` with the keys "Lux" and "Reactant".
+#
+# Arguments:
+#   - `suite`: benchmark group that receives the new entries.
+#   - `benchmark_name`: top-level key for this model / input configuration.
+#   - `backend`: backend name used as the third-level key.
+#   - `model`: the Lux model to apply.
+#   - `x_dims`: size of the random input created by `general_lux_setup`.
+#   - `dev`: device used to move the data for the "Lux" entry and passed to
+#     `synchronize` / `reclaim`.
+#
+# Returns `nothing`.
+function setup_lux_forward_pass_benchmark!(
+    suite::BenchmarkGroup,
+    benchmark_name::String,
+    backend::String,
+    model,
+    x_dims,
+    dev::AbstractDevice,
+)
+    # "Lux" entry: the benchmarked expression applies the model and then synchronizes the
+    # device. Data creation, the transfer via `dev`, and the GC / reclaim calls all live in
+    # `setup`, outside the benchmarked expression itself.
+    suite[benchmark_name]["forward"][backend]["Lux"] = @benchmarkable begin
+        Lux.apply($model, x, ps, st_test)
+        synchronize($dev)
+    end setup = begin
+        GC.gc()
+        reclaim($dev)
+        x, ps, st = $dev(general_lux_setup($model, $x_dims))
+        st_test = Lux.testmode(st)
+        GC.gc()
+        reclaim($dev)
+    end
+
+    # "Reactant" entry: `setup` converts the input, parameters and test-mode state with
+    # `Reactant.to_rarray` (they are not moved with `dev` first) and compiles the forward
+    # pass with `@compile`. The benchmarked expression calls the compiled function and
+    # synchronizes on its output.
+    suite[benchmark_name]["forward"][backend]["Reactant"] = @benchmarkable begin
+        y, _ = apply_compiled($model, x_ra, ps_ra, st_test_ra)
+        Reactant.synchronize(y)
+    end setup = begin
+        GC.gc()
+        reclaim($dev)
+        x, ps, st = general_lux_setup($model, $x_dims)
+        st_test = Lux.testmode(st)
+        x_ra = Reactant.to_rarray(x)
+        ps_ra = Reactant.to_rarray(ps)
+        st_test_ra = Reactant.to_rarray(st_test)
+        apply_compiled = @compile Lux.apply($model, x_ra, ps_ra, st_test_ra)
+        GC.gc()
+        reclaim($dev)
+    end
+
+    return nothing
+end
diff --git a/src/XLA.jl b/src/XLA.jl
@@ -446,7 +446,7 @@ end
 const AsyncEmptyBuffer = AsyncBuffer(Buffer(C_NULL), nothing)
 
 @inline function await(buffer::AsyncBuffer)::Nothing
-    if buffer.future == nothing
+    if buffer.future === nothing
         return nothing
     else
         future = buffer.future
@@ -457,7 +457,7 @@ const AsyncEmptyBuffer = AsyncBuffer(Buffer(C_NULL), nothing)
 end
 
 @inline function synced_buffer(buffer::AsyncBuffer)
-    if buffer.future != nothing
+    if buffer.future !== nothing
         future = buffer.future
         buffer.future = nothing
         await(future::Future)