EnzymeAD
diff --git a/‎perf/common.jl
Lines changed: 324 additions & 0 deletions b/‎perf/common.jl
Lines changed: 324 additions & 0 deletions
diff --git a/‎perf/neuraloperators/Project.toml
Lines changed: 25 additions & 0 deletions b/‎perf/neuraloperators/Project.toml
Lines changed: 25 additions & 0 deletions
diff --git a/‎perf/neuraloperators/deeponet_nvidia_geforce_rtx_5090_2025_07_12_10_58_09.pdf
18.7 KB b/‎perf/neuraloperators/deeponet_nvidia_geforce_rtx_5090_2025_07_12_10_58_09.pdf
18.7 KB
diff --git a/‎perf/neuraloperators/fno_nvidia_geforce_rtx_5090_2025_07_12_11_20_45.pdf
18.7 KB b/‎perf/neuraloperators/fno_nvidia_geforce_rtx_5090_2025_07_12_11_20_45.pdf
18.7 KB
@@ -0,0 +1,324 @@
+using BenchmarkTools: @benchmark
+using Reactant, Enzyme, PrettyTables, Statistics
+using CairoMakie, AlgebraOfGraphics, CSV, DataFrames, Dates
+const AoG = AlgebraOfGraphics
+
+AoG.set_aog_theme!()
+
+"""
+    simple_mse_loss(model, x, z, ps, st)
+
+Apply `model` to `x` with parameters `ps` and state `st` through `Lux.apply`, keep only
+the first returned value, and return `MSELoss()` evaluated on it and the target `z`.
+"""
+function simple_mse_loss(model, x, z, ps, st)
+    # `Lux.apply` returns more than one value; only the first one is needed here.
+    prediction = first(Lux.apply(model, x, ps, st))
+    return MSELoss()(prediction, z)
+end
+
+"""
+    simple_mse_loss_gradient(model, x, z, ps, st)
+
+Return the result of `Enzyme.gradient` in reverse mode applied to `simple_mse_loss`.
+
+`model`, `x`, `z` and `st` are wrapped in `Const`; `ps` is the only argument passed
+unwrapped.
+"""
+function simple_mse_loss_gradient(model, x, z, ps, st)
+    fixed_model, fixed_x, fixed_z, fixed_st = Const(model), Const(x), Const(z), Const(st)
+    return Enzyme.gradient(
+        Enzyme.Reverse, simple_mse_loss, fixed_model, fixed_x, fixed_z, ps, fixed_st
+    )
+end
+
+"""
+    benchmark_nn_primal(model, x, z, ps, st; disable_scatter_gather_bench=true,
+                        disable_pad_bench=true)
+
+Benchmark several compiled variants of `simple_mse_loss`.
+
+The function is compiled once with `Reactant.DefaultXLACompileOptions` ("Only XLA", the
+baseline) and once with `CompileOptions` ("All"). When `disable_scatter_gather_bench`
+and/or `disable_pad_bench` are `true`, variants with the matching optimization passes
+disabled are benchmarked too; the combined variant only runs when both are `true`.
+
+Returns a vector of `(mode, passes, mean time, std time, mean time / baseline mean)`
+tuples, sorted by mean time.
+"""
+function benchmark_nn_primal(
+    model, x, z, ps, st; disable_scatter_gather_bench=true, disable_pad_bench=true
+)
+    timings = Vector{Tuple{String,String,Float64,Float64,Float64}}()
+
+    # Baseline: XLA-only compilation. Its relative timing is fixed at 1.0.
+    xla_fn = @compile compile_options = Reactant.DefaultXLACompileOptions(;
+        sync=true
+    ) simple_mse_loss(model, x, z, ps, st)
+    trial = @benchmark $xla_fn($model, $x, $z, $ps, $st) setup = (GC.gc(true))
+    xla_mean = mean(trial).time
+    push!(timings, ("Primal", "Only XLA", xla_mean, std(trial).time, 1.0))
+
+    # Build the result row of a non-baseline variant, relative to the XLA-only mean.
+    function primal_row(label, t)
+        t_mean = mean(t).time
+        return ("Primal", label, t_mean, std(t).time, t_mean / xla_mean)
+    end
+
+    # All optimization passes
+    full_fn = @compile compile_options = CompileOptions(;
+        sync=true, no_nan=true, all_finite=true
+    ) simple_mse_loss(model, x, z, ps, st)
+    trial = @benchmark $full_fn($model, $x, $z, $ps, $st) setup = (GC.gc(true))
+    push!(timings, primal_row("All", trial))
+
+    # Scatter/gather passes disabled
+    if disable_scatter_gather_bench
+        no_scatter_fn = @compile compile_options = CompileOptions(;
+            disable_scatter_gather_optimization_passes=true,
+            sync=true,
+            no_nan=true,
+            all_finite=true,
+        ) simple_mse_loss(model, x, z, ps, st)
+        trial = @benchmark $no_scatter_fn($model, $x, $z, $ps, $st) setup = (GC.gc(true))
+        push!(timings, primal_row("No Scatter/Gather Optimizations", trial))
+    end
+
+    # Pad passes disabled
+    if disable_pad_bench
+        no_pad_fn = @compile compile_options = CompileOptions(;
+            disable_pad_optimization_passes=true, sync=true, no_nan=true, all_finite=true
+        ) simple_mse_loss(model, x, z, ps, st)
+        trial = @benchmark $no_pad_fn($model, $x, $z, $ps, $st) setup = (GC.gc(true))
+        push!(timings, primal_row("No Pad Optimizations", trial))
+    end
+
+    # Both scatter/gather and pad passes disabled
+    if disable_scatter_gather_bench && disable_pad_bench
+        no_scatter_pad_fn = @compile compile_options = CompileOptions(;
+            disable_scatter_gather_optimization_passes=true,
+            disable_pad_optimization_passes=true,
+            sync=true,
+            no_nan=true,
+            all_finite=true,
+        ) simple_mse_loss(model, x, z, ps, st)
+        trial = @benchmark $no_scatter_pad_fn($model, $x, $z, $ps, $st) setup = (GC.gc(
+            true
+        ))
+        push!(timings, primal_row("No Scatter/Gather/Pad Optimizations", trial))
+    end
+
+    # Fastest configuration first (third tuple entry is the mean time).
+    sort!(timings; by=row -> row[3])
+    return timings
+end
+
+"""
+    benchmark_nn_gradient(model, x, z, ps, st; kwargs...)
+
+Run `benchmark_nn_gradient_internal` for the modes `:all`, `:before_enzyme` and
+`:after_enzyme` (in that order) and concatenate the per-mode result vectors.
+
+`kwargs` are forwarded unchanged to `benchmark_nn_gradient_internal`.
+"""
+function benchmark_nn_gradient(model, x, z, ps, st; kwargs...)
+    modes = [:all, :before_enzyme, :after_enzyme]
+    per_mode = map(modes) do mode
+        benchmark_nn_gradient_internal(model, x, z, ps, st, mode; kwargs...)
+    end
+    return reduce(vcat, per_mode)
+end
+
+"""
+    benchmark_nn_gradient_internal(model, x, z, ps, st, mode;
+                                   disable_scatter_gather_bench=true,
+                                   disable_pad_bench=true)
+
+Benchmark several compiled variants of `simple_mse_loss_gradient` for one `mode`.
+
+`mode` is passed as `optimization_passes` to every `CompileOptions` variant. The
+"Only XLA" baseline is compiled with `Reactant.DefaultXLACompileOptions` and does not
+use `mode`. When `disable_scatter_gather_bench` and/or `disable_pad_bench` are `true`,
+variants with the matching optimization passes disabled are benchmarked too; the
+combined variant only runs when both are `true`.
+
+Returns a vector of `(mode label, passes, mean time, std time, mean time / baseline
+mean)` tuples, sorted by mean time.
+"""
+function benchmark_nn_gradient_internal(
+    model, x, z, ps, st, mode; disable_scatter_gather_bench=true, disable_pad_bench=true
+)
+    @info "Benchmarking gradient with mode: $(Meta.quot(mode))"
+
+    timings = Vector{Tuple{String,String,Float64,Float64,Float64}}()
+    mode_label = "Gradient ($mode)"
+
+    # Baseline: XLA-only compilation. Its relative timing is fixed at 1.0.
+    xla_fn = @compile compile_options = Reactant.DefaultXLACompileOptions(;
+        sync=true
+    ) simple_mse_loss_gradient(model, x, z, ps, st)
+    trial = @benchmark $xla_fn($model, $x, $z, $ps, $st) setup = (GC.gc(true))
+    xla_mean = mean(trial).time
+    push!(timings, (mode_label, "Only XLA", xla_mean, std(trial).time, 1.0))
+
+    # Build the result row of a non-baseline variant, relative to the XLA-only mean.
+    function gradient_row(label, t)
+        t_mean = mean(t).time
+        return (mode_label, label, t_mean, std(t).time, t_mean / xla_mean)
+    end
+
+    # All optimization passes
+    full_fn = @compile compile_options = CompileOptions(;
+        sync=true, no_nan=true, all_finite=true, optimization_passes=mode
+    ) simple_mse_loss_gradient(model, x, z, ps, st)
+    trial = @benchmark $full_fn($model, $x, $z, $ps, $st) setup = (GC.gc(true))
+    push!(timings, gradient_row("All", trial))
+
+    # Scatter/gather passes disabled
+    if disable_scatter_gather_bench
+        no_scatter_fn = @compile compile_options = CompileOptions(;
+            disable_scatter_gather_optimization_passes=true,
+            optimization_passes=mode,
+            sync=true,
+            no_nan=true,
+            all_finite=true,
+        ) simple_mse_loss_gradient(model, x, z, ps, st)
+        trial = @benchmark $no_scatter_fn($model, $x, $z, $ps, $st) setup = (GC.gc(true))
+        push!(timings, gradient_row("No Scatter/Gather Optimizations", trial))
+    end
+
+    # Pad passes disabled
+    if disable_pad_bench
+        no_pad_fn = @compile compile_options = CompileOptions(;
+            disable_pad_optimization_passes=true,
+            optimization_passes=mode,
+            sync=true,
+            no_nan=true,
+            all_finite=true,
+        ) simple_mse_loss_gradient(model, x, z, ps, st)
+        trial = @benchmark $no_pad_fn($model, $x, $z, $ps, $st) setup = (GC.gc(true))
+        push!(timings, gradient_row("No Pad Optimizations", trial))
+    end
+
+    # Both scatter/gather and pad passes disabled
+    if disable_scatter_gather_bench && disable_pad_bench
+        no_scatter_pad_fn = @compile compile_options = CompileOptions(;
+            disable_scatter_gather_optimization_passes=true,
+            disable_pad_optimization_passes=true,
+            optimization_passes=mode,
+            sync=true,
+            no_nan=true,
+            all_finite=true,
+        ) simple_mse_loss_gradient(model, x, z, ps, st)
+        trial = @benchmark $no_scatter_pad_fn($model, $x, $z, $ps, $st) setup = (GC.gc(
+            true
+        ))
+        push!(timings, gradient_row("No Scatter/Gather/Pad Optimizations", trial))
+    end
+
+    # Fastest configuration first (third tuple entry is the mean time).
+    sort!(timings; by=row -> row[3])
+    return timings
+end
+
+"""
+    pretty_print_table(results)
+
+Print `results` as a formatted table using `pretty_table`.
+
+`results` is indexed as a matrix whose columns are, in order: mode, optimization passes,
+mean time, standard deviation of the time, and relative timing. Columns 3 and 4 are
+divided by `1e9` on a copy (the caller's data is not modified) and shown under the unit
+`s`. Relative timings above `1.0` are highlighted bold red, those below `1.0` bold green.
+
+Returns `nothing`.
+"""
+function pretty_print_table(results)
+    header = (
+        ["Mode", "Optimization Passes", "Mean Time", "Std. Dev. Time", "Relative Timing"],
+        ["", "", "s", "s", "Time / XLA Time"],
+    )
+
+    # Rescale on a copy so the caller's timings are left untouched.
+    results = copy(results)
+    results[:, 3] ./= 1e9
+    results[:, 4] ./= 1e9
+
+    hl_r = Highlighter((data, i, j) -> j == 5 && data[i, j] > 1.0, crayon"bold red")
+    hl_g = Highlighter((data, i, j) -> j == 5 && data[i, j] < 1.0, crayon"bold green")
+    # `pretty_table` prints the table itself and returns `nothing`; wrapping the call in
+    # `display` would print a stray `nothing` after the table.
+    pretty_table(
+        results;
+        header,
+        header_crayon=crayon"yellow bold",
+        highlighters=(hl_r, hl_g),
+        tf=tf_unicode_rounded,
+    )
+    return nothing
+end
+
+"""
+    save_benchmark_results(results::Matrix, tag; savedir, device_tag, plot_title="")
+
+Write the benchmark `results` to a CSV file and a bar-plot PDF inside `savedir`.
+
+Both files share the base name `<tag>_<device_tag>_<timestamp>`. The plot shows the mean
+time per mode, dodged and coloured by optimization passes, with range bars spanning
+mean ± standard deviation. The figure is additionally displayed when `Main.VSCodeServer`
+is defined.
+
+# Keywords
+- `savedir`: output directory (created with `mkpath`); defaults to a new `tempname`.
+- `device_tag`: defaults to the lowercased kind of the first Reactant device, with spaces
+  replaced by underscores.
+- `plot_title`: title passed to the figure.
+
+Returns `nothing`.
+"""
+function save_benchmark_results(
+    results::Matrix,
+    tag;
+    savedir=tempname(; cleanup=false),
+    device_tag=lowercase(
+        replace(Reactant.XLA.device_kind(Reactant.devices()[1]), " " => "_")
+    ),
+    plot_title="",
+)
+    # Compact legend labels for each optimization-pass configuration.
+    legend_labels = Dict(
+        "All" => "All",
+        "Only XLA" => "Only XLA",
+        "No Pad Optimizations" => "- Pad Opt",
+        "No Scatter/Gather Optimizations" => "- S.G. Opt",
+        "No Scatter/Gather/Pad Optimizations" => "- S.G. + Pad Opt",
+        "No Scatter/Gather and Pad Optimizations" => "- S.G. + Pad Opt",
+    )
+    column_names = [
+        "Mode", "Optimization Passes", "Mean Time", "Std. Dev. Time", "Relative Timing"
+    ]
+
+    mkpath(savedir)
+    timestamp = Dates.format(now(), "yyyy_mm_dd_HH_MM_SS")
+    stem = "$(tag)_$(device_tag)_$(timestamp)"
+
+    df = DataFrame(results, column_names)
+
+    csv_path = joinpath(savedir, "$(stem).csv")
+    CSV.write(csv_path, df)
+
+    @info "Saving timings to $(csv_path)"
+
+    # Lower and upper ends of the range bars drawn on top of each bar.
+    mean_times = df[!, "Mean Time"]
+    std_times = df[!, "Std. Dev. Time"]
+    df[!, "μ - σ"] = mean_times .- std_times
+    df[!, "μ + σ"] = mean_times .+ std_times
+
+    bars = (
+        data(df) *
+        mapping(
+            "Mode",
+            "Mean Time";
+            dodge="Optimization Passes" => "",
+            color="Optimization Passes" => (label -> legend_labels[label]),
+        ) *
+        visual(BarPlot; strokewidth=2)
+    )
+    error_bars = (
+        data(df) *
+        mapping("Mode", "μ - σ", "μ + σ"; dodge_x="Optimization Passes" => "") *
+        visual(Rangebars; linewidth=2, whiskerwidth=10)
+    )
+
+    fig = draw(
+        bars + error_bars,
+        scales(; Color=(; palette=:tab10));
+        figure=(; size=(1000, 500), title=plot_title, titlealign=:center),
+        legend=(; position=:bottom),
+    )
+
+    # Only show the figure interactively when running inside VS Code.
+    if isdefined(Main, :VSCodeServer)
+        display(fig)
+    end
+
+    pdf_path = joinpath(savedir, "$(stem).pdf")
+    save(pdf_path, fig)
+
+    @info "Saving plots to $(pdf_path)"
+
+    return nothing
+end
@@ -0,0 +1,25 @@
+[deps]
+AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
+Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+NeuralOperators = "ea5c82af-86e5-48da-8ee1-382d6ad7af4b"
+PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+
+[sources]
+Reactant = {path = "../../"}
+
+[compat]
+BenchmarkTools = "1.6"
+CSV = "0.10.15"
+Lux = "1.13.4"
+NeuralOperators = "0.6"
+PrettyTables = "2.4.0"
+Random = "1.11"
+julia = "1.11"