Commit d245ae5

feat: batchnorm ops (#1336)
* feat: batchnorm ops
* fix: use jll with bn grad fix
* test: bn
* feat: add the grad op as well
1 parent 0139886 commit d245ae5

3 files changed: 228 additions, 1 deletion

Project.toml

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ PythonCall = "0.9"
 Random = "1.10"
 Random123 = "1.7"
 ReactantCore = "0.1.10"
-Reactant_jll = "0.0.188"
+Reactant_jll = "0.0.189"
 ScopedValues = "1.3.0"
 Scratch = "1.2"
 Sockets = "1.10"

src/Ops.jl

Lines changed: 126 additions & 0 deletions
@@ -2968,4 +2968,130 @@ end
     ]
 end
 
+@noinline function batch_norm_inference(
+    operand::TracedRArray{T,N},
+    scale::Union{TracedRArray{T,1},Nothing},
+    offset::Union{TracedRArray{T,1},Nothing},
+    mean::TracedRArray{T,1},
+    variance::TracedRArray{T,1};
+    epsilon,
+    feature_index::Int64,
+    location=mlir_stacktrace("batch_norm_inference", @__FILE__, @__LINE__),
+) where {T,N}
+    len = size(operand, feature_index)
+    @assert length(mean) == length(variance) == len
+
+    if scale === nothing
+        scale = fill(T(1), len; location)
+    else
+        @assert size(scale) == (len,)
+    end
+
+    if offset === nothing
+        offset = fill(T(0), len; location)
+    else
+        @assert size(offset) == (len,)
+    end
+
+    return TracedRArray{T,N}(
+        (),
+        MLIR.IR.result(
+            stablehlo.batch_norm_inference(
+                operand.mlir_data,
+                scale.mlir_data,
+                offset.mlir_data,
+                mean.mlir_data,
+                variance.mlir_data;
+                epsilon=Float32(epsilon),
+                feature_index=feature_index - 1,
+                location,
+            ),
+            1,
+        ),
+        size(operand),
+    )
+end
+
+@noinline function batch_norm_training(
+    operand::TracedRArray{T,N},
+    scale::Union{TracedRArray{T,1},Nothing},
+    offset::Union{TracedRArray{T,1},Nothing};
+    epsilon,
+    feature_index::Int64,
+    location=mlir_stacktrace("batch_norm_training", @__FILE__, @__LINE__),
+) where {T,N}
+    len = size(operand, feature_index)
+
+    if scale === nothing
+        scale = fill(T(1), len; location)
+    else
+        @assert size(scale) == (len,)
+    end
+
+    if offset === nothing
+        offset = fill(T(0), len; location)
+    else
+        @assert size(offset) == (len,)
+    end
+
+    batch_norm_train_op = stablehlo.batch_norm_training(
+        operand.mlir_data,
+        scale.mlir_data,
+        offset.mlir_data;
+        epsilon=Float32(epsilon),
+        feature_index=feature_index - 1,
+        location,
+    )
+
+    return (
+        TracedRArray{T,N}((), MLIR.IR.result(batch_norm_train_op, 1), size(operand)),
+        TracedRArray{T,1}((), MLIR.IR.result(batch_norm_train_op, 2), (len,)),
+        TracedRArray{T,1}((), MLIR.IR.result(batch_norm_train_op, 3), (len,)),
+    )
+end
+
+@noinline function batch_norm_grad(
+    operand::TracedRArray{T,N},
+    scale::Union{TracedRArray{T,1},Nothing},
+    mean::TracedRArray{T,1},
+    variance::TracedRArray{T,1},
+    grad_output::TracedRArray{T,N};
+    epsilon,
+    feature_index::Int64,
+    location=mlir_stacktrace("batch_norm_grad", @__FILE__, @__LINE__),
+) where {T,N}
+    len = size(operand, feature_index)
+    @assert length(mean) == length(variance) == len
+    @assert size(grad_output) == size(operand)
+
+    has_affine = scale !== nothing
+
+    if !has_affine
+        scale = fill(T(1), len; location)
+    else
+        @assert size(scale) == (len,)
+    end
+
+    batch_norm_grad_op = stablehlo.batch_norm_grad(
+        operand.mlir_data,
+        scale.mlir_data,
+        mean.mlir_data,
+        variance.mlir_data,
+        grad_output.mlir_data;
+        epsilon=Float32(epsilon),
+        feature_index=feature_index - 1,
+        location,
+    )
+
+    grad_operand = TracedRArray{T,N}(
+        (), MLIR.IR.result(batch_norm_grad_op, 1), size(operand)
+    )
+    grad_scale = TracedRArray{T,1}((), MLIR.IR.result(batch_norm_grad_op, 2), (len,))
+    grad_offset = TracedRArray{T,1}((), MLIR.IR.result(batch_norm_grad_op, 3), (len,))
+
+    return (
+        grad_operand, has_affine ? grad_scale : nothing, has_affine ? grad_offset : nothing
+    )
+end
+
 end # module Ops
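For orientation, stablehlo.batch_norm_inference normalizes each feature slice with the supplied statistics. A minimal plain-Julia sketch of those semantics, assuming a 1-based feature_index as in the wrappers above (the name ref_batch_norm_inference is hypothetical, not part of this commit; the real path is the StableHLO lowering):

# Reference semantics only; not the commit's code. feature_index is 1-based
# here, which is why the wrappers pass feature_index - 1 to StableHLO's
# 0-based attribute.
function ref_batch_norm_inference(x, scale, offset, mean, variance;
                                  epsilon, feature_index)
    # Shape that broadcasts the per-feature vectors along feature_index.
    shp = ntuple(i -> i == feature_index ? size(x, i) : 1, ndims(x))
    γ, β = reshape(scale, shp), reshape(offset, shp)
    μ, σ² = reshape(mean, shp), reshape(variance, shp)
    return γ .* (x .- μ) ./ sqrt.(σ² .+ epsilon) .+ β
end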

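The backward op follows the standard batch-norm gradient. A hedged sketch, under the usual training-mode assumption that mean and variance are the (biased) batch statistics over the non-feature dimensions; ref_batch_norm_grad is an illustrative name, not this commit's API:

# Standard batch-norm backward, for illustration only.
function ref_batch_norm_grad(x, scale, mean, variance, g; epsilon, feature_index)
    shp = ntuple(i -> i == feature_index ? size(x, i) : 1, ndims(x))
    dims = Tuple(i for i in 1:ndims(x) if i != feature_index)
    m = length(x) ÷ size(x, feature_index)        # elements reduced per feature
    x̂ = (x .- reshape(mean, shp)) ./ sqrt.(reshape(variance, shp) .+ epsilon)
    grad_scale  = dropdims(sum(g .* x̂; dims); dims)   # second result of the op
    grad_offset = dropdims(sum(g; dims); dims)        # third result of the op
    grad_x = reshape(scale, shp) ./ sqrt.(reshape(variance, shp) .+ epsilon) .*
             (g .- (reshape(grad_offset, shp) .+ x̂ .* reshape(grad_scale, shp)) ./ m)
    return grad_x, grad_scale, grad_offset
end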
test/ops.jl

Lines changed: 101 additions & 0 deletions
@@ -1192,3 +1192,104 @@ end
     @test @jit(recon_from_lu(lu_ra)) ≈ @jit(apply_permutation(x_ra, perm))
 end
 end
+
+@testset "batch norm" begin
+    @testset "training" begin
+        @testset for affine in [false, true]
+            x = Reactant.to_rarray(randn(2, 3, 4, 5))
+            if affine
+                scale = Reactant.to_rarray(randn(3))
+                offset = Reactant.to_rarray(randn(3))
+            else
+                scale, offset = nothing, nothing
+            end
+
+            hlo = @code_hlo Ops.batch_norm_training(
+                x, scale, offset; epsilon=1e-5, feature_index=2
+            )
+            @test occursin("stablehlo.batch_norm_training", repr(hlo))
+
+            if !affine
+                @test occursin(
+                    "stablehlo.constant dense<0.000000e+00> : tensor<3xf64>", repr(hlo)
+                )
+                @test occursin(
+                    "stablehlo.constant dense<1.000000e+00> : tensor<3xf64>", repr(hlo)
+                )
+            end
+
+            res, m, v = @jit Ops.batch_norm_training(
+                x, scale, offset; epsilon=1e-5, feature_index=2
+            )
+            @test size(res) == size(x)
+            @test size(m) == (3,)
+            @test size(v) == (3,)
+        end
+    end
+
+    @testset "inference" begin
+        @testset for affine in [false, true]
+            x = Reactant.to_rarray(randn(2, 3, 4, 5))
+            if affine
+                scale = Reactant.to_rarray(randn(3))
+                offset = Reactant.to_rarray(randn(3))
+            else
+                scale, offset = nothing, nothing
+            end
+
+            rm = Reactant.to_rarray(randn(3))
+            rv = Reactant.to_rarray(rand(3))
+
+            hlo = @code_hlo Ops.batch_norm_inference(
+                x, scale, offset, rm, rv; epsilon=1e-5, feature_index=2
+            )
+            @test occursin("stablehlo.batch_norm_inference", repr(hlo))
+            if !affine
+                @test occursin(
+                    "stablehlo.constant dense<0.000000e+00> : tensor<3xf64>", repr(hlo)
+                )
+                @test occursin(
+                    "stablehlo.constant dense<1.000000e+00> : tensor<3xf64>", repr(hlo)
+                )
+            end
+
+            res = @jit Ops.batch_norm_inference(
+                x, scale, offset, rm, rv; epsilon=1e-5, feature_index=2
+            )
+            @test size(res) == size(x)
+        end
+    end
+
+    @testset "batch_norm_grad" begin
+        @testset for affine in [false, true]
+            x = Reactant.to_rarray(randn(2, 3, 4, 5))
+            scale = affine ? Reactant.to_rarray(randn(3)) : nothing
+            rm = Reactant.to_rarray(randn(3))
+            rv = Reactant.to_rarray(rand(3))
+            gx = Reactant.to_rarray(randn(2, 3, 4, 5))
+
+            hlo = @code_hlo Ops.batch_norm_grad(
+                x, scale, rm, rv, gx; epsilon=1e-5, feature_index=2
+            )
+            @test occursin("stablehlo.batch_norm_grad", repr(hlo))
+
+            if !affine
+                @test occursin(
+                    "stablehlo.constant dense<1.000000e+00> : tensor<3xf64>", repr(hlo)
+                )
+            end
+
+            gres, gscale, goffset = @jit Ops.batch_norm_grad(
+                x, scale, rm, rv, gx; epsilon=1e-5, feature_index=2
+            )
+            @test size(gres) == size(x)
+            if !affine
+                @test gscale === nothing
+                @test goffset === nothing
+            else
+                @test size(gscale) == (3,)
+                @test size(goffset) == (3,)
+            end
+        end
+    end
+end
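Putting it together, a hypothetical end-to-end call mirroring the tests, assuming "using Reactant" and that Ops refers to Reactant.Ops as in the test file (feature_index=2 selects the second dimension as the channel axis):

using Reactant
using Reactant: Ops   # assumption: the Ops module, as the tests use it

x = Reactant.to_rarray(randn(2, 3, 4, 5))   # features along dimension 2
# Training mode: normalized output plus per-feature batch statistics.
y, batch_mean, batch_var = @jit Ops.batch_norm_training(
    x, nothing, nothing; epsilon=1e-5, feature_index=2
)
# Inference mode: reuse the statistics computed above.
y_inf = @jit Ops.batch_norm_inference(
    x, nothing, nothing, batch_mean, batch_var; epsilon=1e-5, feature_index=2
)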
