Skip to content

Commit a93b09f — "perf: grad" (1 parent: c51a517)

File tree

4 files changed: +202 / −19 lines

perf/common.jl

Lines changed: 146 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,33 @@
11
using BenchmarkTools: @benchmark
22
using Reactant, Enzyme, PrettyTables, Statistics
33

4-
function simple_mse_loss(model, x, ps, st)
4+
function simple_mse_loss(model, x, z, ps, st)
55
y, _ = Lux.apply(model, x, ps, st)
6-
return sum(abs2, y)
6+
return MSELoss()(y, z)
7+
end
8+
9+
function simple_mse_loss_gradient(model, x, z, ps, st)
10+
return Enzyme.gradient(
11+
Reverse, simple_mse_loss, Const(model), Const(x), Const(z), ps, Const(st)
12+
)
713
end
814

915
function benchmark_nn_primal(
10-
model, x, ps, st; disable_scatter_gather_bench=true, disable_pad_bench=true
16+
model, x, z, ps, st; disable_scatter_gather_bench=true, disable_pad_bench=true
1117
)
1218
results = Vector{Tuple{String,String,Float64,Float64,Float64}}()
1319

1420
# Only XLA
1521
compiled_fwd_xla = @compile sync = true compile_options = Reactant.DefaultXLACompileOptions() simple_mse_loss(
16-
model, x, ps, st
22+
model, x, z, ps, st
1723
)
18-
bench = @benchmark $compiled_fwd_xla($model, $x, $ps, $st)
24+
bench = @benchmark $compiled_fwd_xla($model, $x, $z, $ps, $st) setup = (GC.gc(true))
1925
push!(results, ("Primal", "Only XLA", median(bench).time, std(bench).time, 1.0))
2026
baseline = median(bench).time
2127

2228
# Default
23-
compiled_fwd = @compile sync = true simple_mse_loss(model, x, ps, st)
24-
bench = @benchmark $compiled_fwd($model, $x, $ps, $st)
29+
compiled_fwd = @compile sync = true simple_mse_loss(model, x, z, ps, st)
30+
bench = @benchmark $compiled_fwd($model, $x, $z, $ps, $st) setup = (GC.gc(true))
2531
push!(
2632
results,
2733
(
@@ -37,8 +43,10 @@ function benchmark_nn_primal(
3743
if disable_scatter_gather_bench
3844
compiled_fwd_no_scatter = @compile sync = true compile_options = CompileOptions(;
3945
disable_scatter_gather_optimization_passes=true
40-
) simple_mse_loss(model, x, ps, st)
41-
bench = @benchmark $compiled_fwd_no_scatter($model, $x, $ps, $st)
46+
) simple_mse_loss(model, x, z, ps, st)
47+
bench = @benchmark $compiled_fwd_no_scatter($model, $x, $z, $ps, $st) setup = (GC.gc(
48+
true
49+
))
4250

4351
push!(
4452
results,
@@ -56,8 +64,10 @@ function benchmark_nn_primal(
5664
if disable_pad_bench
5765
compiled_fwd_no_pad = @compile sync = true compile_options = CompileOptions(;
5866
disable_pad_optimization_passes=true
59-
) simple_mse_loss(model, x, ps, st)
60-
bench = @benchmark $compiled_fwd_no_pad($model, $x, $ps, $st)
67+
) simple_mse_loss(model, x, z, ps, st)
68+
bench = @benchmark $compiled_fwd_no_pad($model, $x, $z, $ps, $st) setup = (GC.gc(
69+
true
70+
))
6171

6272
push!(
6373
results,
@@ -76,8 +86,10 @@ function benchmark_nn_primal(
7686
compiled_fwd_no_scatter_pad = @compile sync = true compile_options = CompileOptions(;
7787
disable_scatter_gather_optimization_passes=true,
7888
disable_pad_optimization_passes=true,
79-
) simple_mse_loss(model, x, ps, st)
80-
bench = @benchmark $compiled_fwd_no_scatter_pad($model, $x, $ps, $st)
89+
) simple_mse_loss(model, x, z, ps, st)
90+
bench = @benchmark $compiled_fwd_no_scatter_pad($model, $x, $z, $ps, $st) setup = (GC.gc(
91+
true
92+
))
8193

8294
push!(
8395
results,
@@ -95,6 +107,127 @@ function benchmark_nn_primal(
95107
return results
96108
end
97109

110+
function benchmark_nn_gradient(model, x, z, ps, st; kwargs...)
111+
return vcat(
112+
[
113+
benchmark_nn_gradient_internal(model, x, z, ps, st, mode; kwargs...) for
114+
mode in [:all, :before_enzyme, :after_enzyme]
115+
]...,
116+
)
117+
end
118+
119+
function benchmark_nn_gradient_internal(
120+
model, x, z, ps, st, mode; disable_scatter_gather_bench=true, disable_pad_bench=true
121+
)
122+
@info "Benchmarking gradient with mode: $(Meta.quot(mode))"
123+
124+
results = Vector{Tuple{String,String,Float64,Float64,Float64}}()
125+
126+
# Only XLA
127+
compiled_grad_xla = @compile sync = true compile_options = Reactant.DefaultXLACompileOptions() simple_mse_loss_gradient(
128+
model, x, z, ps, st
129+
)
130+
bench = @benchmark $compiled_grad_xla($model, $x, $z, $ps, $st) setup = (GC.gc(true))
131+
push!(
132+
results, ("Gradient ($mode)", "Only XLA", median(bench).time, std(bench).time, 1.0)
133+
)
134+
baseline = median(bench).time
135+
136+
display(results[end])
137+
138+
# Default
139+
compiled_grad = @compile sync = true optimize = mode simple_mse_loss_gradient(
140+
model, x, z, ps, st
141+
)
142+
bench = @benchmark $compiled_grad($model, $x, $z, $ps, $st) setup = (GC.gc(true))
143+
push!(
144+
results,
145+
(
146+
"Gradient ($mode)",
147+
"All",
148+
median(bench).time,
149+
std(bench).time,
150+
median(bench).time / baseline,
151+
),
152+
)
153+
154+
display(results[end])
155+
156+
# Disable Scatter
157+
if disable_scatter_gather_bench
158+
compiled_grad_no_scatter = @compile sync = true compile_options = CompileOptions(;
159+
disable_scatter_gather_optimization_passes=true, optimization_passes=mode
160+
) simple_mse_loss_gradient(model, x, z, ps, st)
161+
bench = @benchmark $compiled_grad_no_scatter($model, $x, $z, $ps, $st) setup = (GC.gc(
162+
true
163+
))
164+
165+
push!(
166+
results,
167+
(
168+
"Gradient ($mode)",
169+
"No Scatter/Gather Optimizations",
170+
median(bench).time,
171+
std(bench).time,
172+
median(bench).time / baseline,
173+
),
174+
)
175+
176+
display(results[end])
177+
end
178+
179+
# Disable Pad
180+
if disable_pad_bench
181+
compiled_grad_no_pad = @compile sync = true compile_options = CompileOptions(;
182+
disable_pad_optimization_passes=true, optimization_passes=mode
183+
) simple_mse_loss_gradient(model, x, z, ps, st)
184+
bench = @benchmark $compiled_grad_no_pad($model, $x, $z, $ps, $st) setup = (GC.gc(
185+
true
186+
))
187+
188+
push!(
189+
results,
190+
(
191+
"Gradient ($mode)",
192+
"No Pad Optimizations",
193+
median(bench).time,
194+
std(bench).time,
195+
median(bench).time / baseline,
196+
),
197+
)
198+
199+
display(results[end])
200+
end
201+
202+
# Disable Pad and Scatter
203+
if disable_scatter_gather_bench && disable_pad_bench
204+
compiled_grad_no_scatter_no_pad = @compile sync = true compile_options = CompileOptions(;
205+
disable_scatter_gather_optimization_passes=true,
206+
disable_pad_optimization_passes=true,
207+
optimization_passes=mode,
208+
) simple_mse_loss_gradient(model, x, z, ps, st)
209+
bench = @benchmark $compiled_grad_no_scatter_no_pad($model, $x, $z, $ps, $st) setup = (GC.gc(
210+
true
211+
))
212+
213+
push!(
214+
results,
215+
(
216+
"Gradient ($mode)",
217+
"No Scatter/Gather/Pad Optimizations",
218+
median(bench).time,
219+
std(bench).time,
220+
median(bench).time / baseline,
221+
),
222+
)
223+
224+
display(results[end])
225+
end
226+
227+
sort!(results; by=x -> x[3])
228+
return results
229+
end
230+
98231
function pretty_print_table(results)
99232
header = (
100233
["Mode", "Optimization Passes", "Median Time", "Std. Dev. Time", "Relative Timing"],

perf/neuraloperators/main.jl

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@ function run_deeponet_benchmarks()
88
@info "Running DeepONet benchmarks"
99

1010
model = DeepONet(;
11-
branch=(64, ntuple(Returns(256), 5)..., 16),
12-
trunk=(1, ntuple(Returns(256), 5)..., 16),
11+
branch=(64, ntuple(Returns(256), 4)..., 16),
12+
trunk=(1, ntuple(Returns(256), 4)..., 16),
1313
branch_activation=gelu,
1414
trunk_activation=gelu,
1515
)
1616
ps, st = xdev(Lux.setup(Random.default_rng(), model))
1717
u = xdev(rand(Float32, 64, 1024))
1818
y = xdev(rand(Float32, 1, 128))
19+
z = xdev(rand(Float32, 128, 1024))
1920

2021
primal_timings = Reactant.with_config(;
2122
dot_general_precision=PrecisionConfig.HIGH,
@@ -24,14 +25,31 @@ function run_deeponet_benchmarks()
2425
benchmark_nn_primal(
2526
model,
2627
(u, y),
28+
z,
2729
ps,
2830
st;
2931
disable_scatter_gather_bench=true,
3032
disable_pad_bench=true,
3133
)
3234
end
3335

34-
pretty_print_table(permutedims(hcat([[t...] for t in primal_timings]...), (2, 1)))
36+
gradient_timings = Reactant.with_config(;
37+
dot_general_precision=PrecisionConfig.HIGH,
38+
convolution_precision=PrecisionConfig.HIGH,
39+
) do
40+
benchmark_nn_gradient(
41+
model,
42+
(u, y),
43+
z,
44+
ps,
45+
st;
46+
disable_scatter_gather_bench=true,
47+
disable_pad_bench=true,
48+
)
49+
end
50+
51+
timings = vcat(primal_timings, gradient_timings)
52+
pretty_print_table(permutedims(hcat([[t...] for t in timings]...), (2, 1)))
3553

3654
return nothing
3755
end
@@ -42,20 +60,43 @@ function run_fno_benchmarks()
4260
model = FourierNeuralOperator((16, 16), 3, 8, 64)
4361
ps, st = xdev(Lux.setup(Random.default_rng(), model))
4462
x = xdev(rand(Float32, 64, 64, 1, 256))
63+
z = xdev(rand(Float32, 64, 64, 8, 256))
4564

4665
primal_timings = Reactant.with_config(;
4766
dot_general_precision=PrecisionConfig.HIGH,
4867
convolution_precision=PrecisionConfig.HIGH,
4968
) do
5069
benchmark_nn_primal(
51-
model, x, ps, st; disable_scatter_gather_bench=true, disable_pad_bench=true
70+
model,
71+
x,
72+
z,
73+
ps,
74+
st;
75+
disable_scatter_gather_bench=true,
76+
disable_pad_bench=true,
5277
)
5378
end
5479

55-
pretty_print_table(permutedims(hcat([[t...] for t in primal_timings]...), (2, 1)))
80+
gradient_timings = Reactant.with_config(;
81+
dot_general_precision=PrecisionConfig.HIGH,
82+
convolution_precision=PrecisionConfig.HIGH,
83+
) do
84+
benchmark_nn_gradient(
85+
model,
86+
x,
87+
z,
88+
ps,
89+
st;
90+
disable_scatter_gather_bench=true,
91+
disable_pad_bench=true,
92+
)
93+
end
94+
95+
timings = vcat(primal_timings, gradient_timings)
96+
pretty_print_table(permutedims(hcat([[t...] for t in timings]...), (2, 1)))
5697

5798
return nothing
5899
end
59100

60-
run_fno_benchmarks()
61101
run_deeponet_benchmarks()
102+
run_fno_benchmarks()

src/CompileOptions.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,9 @@ Fine-grained control over the compilation options for the Reactant compiler.
138138
- `assert_nonallocating`: If `true`, we make sure that no new buffers are
139139
returned by the function. Any buffer returned must be donated from the inputs. Defaults
140140
to `false`.
141+
- `sync`: Reactant computations are asynchronous by default. If `true`, the computation
142+
will be executed synchronously, blocking till the computation is complete. This is
143+
recommended when benchmarking.
141144
142145
# Extended Help
143146
@@ -175,6 +178,7 @@ struct CompileOptions
175178
# julia codegen options
176179
assert_nonallocating::Bool
177180
donated_args::Symbol
181+
sync::Bool
178182
## private options for ablation studies
179183
disable_scatter_gather_optimization_passes::Bool
180184
disable_pad_optimization_passes::Bool
@@ -197,6 +201,7 @@ function CompileOptions(;
197201
optimize_communications::Union{Bool,OptimizeCommunicationOptions}=true,
198202
assert_nonallocating::Bool=false,
199203
donated_args::Symbol=:auto,
204+
sync::Bool=false,
200205
disable_scatter_gather_optimization_passes::Bool=false,
201206
disable_pad_optimization_passes::Bool=false,
202207
)
@@ -243,6 +248,7 @@ function CompileOptions(;
243248
optimize_communications,
244249
assert_nonallocating,
245250
donated_args,
251+
sync,
246252
disable_scatter_gather_optimization_passes,
247253
disable_pad_optimization_passes,
248254
)
@@ -282,6 +288,7 @@ function __compile_options_with_reversed_propagation(compile_options::CompileOpt
282288
compile_options.optimize_communications,
283289
compile_options.assert_nonallocating,
284290
compile_options.donated_args,
291+
compile_options.sync,
285292
compile_options.disable_scatter_gather_optimization_passes,
286293
compile_options.disable_pad_optimization_passes,
287294
)

src/Compiler.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1260,6 +1260,7 @@ function __get_compile_options_and_kwargs(;
12601260
optimize_communications::Union{Bool,OptimizeCommunicationOptions}=true,
12611261
assert_nonallocating::Bool=false,
12621262
donated_args::Symbol=:auto,
1263+
sync::Bool=false,
12631264
kwargs...,
12641265
)
12651266
return (
@@ -1281,6 +1282,7 @@ function __get_compile_options_and_kwargs(;
12811282
optimize_communications,
12821283
assert_nonallocating,
12831284
donated_args,
1285+
sync,
12841286
),
12851287
kwargs,
12861288
)

0 commit comments

Comments (0)