EnzymeAD
diff --git a/‎benchmark/Project.toml
Lines changed: 1 addition & 0 deletions b/‎benchmark/Project.toml
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmark/benchmarks.jl
Lines changed: 48 additions & 7 deletions b/‎benchmark/benchmarks.jl
Lines changed: 48 additions & 7 deletions
diff --git a/‎ext/ReactantNNlibExt.jl
Lines changed: 54 additions & 7 deletions b/‎ext/ReactantNNlibExt.jl
Lines changed: 54 additions & 7 deletions
diff --git a/‎src/ConcreteRArray.jl
Lines changed: 5 additions & 0 deletions b/‎src/ConcreteRArray.jl
Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,7 @@ Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
 
 [compat]
 BenchmarkTools = "1.5"
 
@@ -5,27 +5,68 @@ using Boltz, Lux, Random
 
 const SUITE = BenchmarkGroup()
 
+# Two top-level groups: "comptime" entries time `Reactant.compile` / `@compile`,
+# while "runtime" entries call a function that was compiled during `setup`.
+SUITE["runtime"] = BenchmarkGroup()
 SUITE["comptime"] = BenchmarkGroup()
 
 SUITE["comptime"]["basics"] = BenchmarkGroup()
+# Times compiling `sum` for a 2x10 `ConcreteRArray` of ones built in `setup`.
 SUITE["comptime"]["basics"]["2D sum"] = @benchmarkable Reactant.compile(sum, (a,)) setup = (
     a = Reactant.ConcreteRArray(ones(2, 10))
 )
-SUITE["comptime"]["basics"]["Basic cos"] = @benchmarkable Reactant.compile(cos, (a,)) setup = (
+
+# Elementwise cosine as a named function, so the benchmark below compiles a
+# broadcast over the whole array rather than `cos` itself.
+bcast_cos(x) = cos.(x)
+
+# Times compiling `bcast_cos` for a 2x10 `ConcreteRArray` of ones built in `setup`.
+SUITE["comptime"]["basics"]["cos.(x)"] = @benchmarkable begin
+    Reactant.compile(bcast_cos, (a,))
+end setup = begin
     a = Reactant.ConcreteRArray(ones(2, 10))
-)
+end
 
+SUITE["runtime"]["lux neural networks"] = BenchmarkGroup()
 SUITE["comptime"]["lux neural networks"] = BenchmarkGroup()
 
-for depth in [11, 13, 16, 19]
-    SUITE["comptime"]["lux neural networks"]["vgg$depth"] = @benchmarkable Reactant.compile(
-        vgg, (x, ps_concrete, st_concrete)
-    ) setup = begin
-        vgg = Vision.VGG($depth; pretrained=false, batchnorm=false)
+# VGG benchmarks: for every depth (and batchnorm setting) register one entry that
+# times `@compile` and one that times a call of the compiled model.
+# Only `batchnorm = false` is exercised; `true` is disabled because it does not
+# work yet (see the trailing note on the `for` line).
+for depth in [11, 13, 16, 19], batchnorm in [false]#  true] <-- not working yet
+    # Compile time: `setup` builds the model, its parameters/states (states put in
+    # test mode) and a random Float32 input of size 224x224x3x16, all converted
+    # with `Reactant.to_rarray`; only the `@compile` call is in the timed body.
+    SUITE["comptime"]["lux neural networks"]["vgg$(depth) bn=$(batchnorm)"] = @benchmarkable begin
+        @compile vgg(x, ps_concrete, st_concrete)
+    end setup = begin
+        vgg = Vision.VGG($depth; pretrained=false, batchnorm=$(batchnorm))
+        ps, st = Lux.setup(Random.default_rng(), vgg)
+        ps_concrete = Reactant.to_rarray(ps)
+        st_concrete = Reactant.to_rarray(Lux.testmode(st))
+        x = Reactant.to_rarray(rand(Float32, 224, 224, 3, 16))
+    end
+
+    # Run time: same inputs as above, but compilation happens inside `setup`, so
+    # the timed body is just the call to `vgg_compiled`.
+    SUITE["runtime"]["lux neural networks"]["vgg$(depth) bn=$(batchnorm) (compiled)"] = @benchmarkable begin
+        vgg_compiled(x, ps_concrete, st_concrete)
+    end setup = begin
+        vgg = Vision.VGG($depth; pretrained=false, batchnorm=$(batchnorm))
         ps, st = Lux.setup(Random.default_rng(), vgg)
         ps_concrete = Reactant.to_rarray(ps)
         st_concrete = Reactant.to_rarray(Lux.testmode(st))
         x = Reactant.to_rarray(rand(Float32, 224, 224, 3, 16))
+        vgg_compiled = @compile vgg(x, ps_concrete, st_concrete)
+    end
+end
+
+# ViT benchmarks for the `:tiny` and `:base` variants: one entry times `@compile`,
+# the other times a call of the model compiled during `setup`.
+# `Meta.quot(version)` interpolates the variant as a quoted symbol, so the setup
+# expression contains the literal `:tiny` / `:base`.
+for version in (:tiny, :base)
+    # Compile time: inputs are built in `setup` (random Float32 input of size
+    # 256x256x3x16, states in test mode); only `@compile` is in the timed body.
+    SUITE["comptime"]["lux neural networks"]["ViT $(version)"] = @benchmarkable begin
+        @compile vit(x, ps_concrete, st_concrete)
+    end setup = begin
+        vit = Vision.ViT($(Meta.quot(version)))
+        ps, st = Lux.setup(Random.default_rng(), vit)
+        ps_concrete = Reactant.to_rarray(ps)
+        st_concrete = Reactant.to_rarray(Lux.testmode(st))
+        x = Reactant.to_rarray(rand(Float32, 256, 256, 3, 16))
+    end
+
+    # Run time: compilation is moved into `setup`, so the timed body is just the
+    # call to `vit_compiled`.
+    SUITE["runtime"]["lux neural networks"]["ViT $(version) (compiled)"] = @benchmarkable begin
+        vit_compiled(x, ps_concrete, st_concrete)
+    end setup = begin
+        vit = Vision.ViT($(Meta.quot(version)))
+        ps, st = Lux.setup(Random.default_rng(), vit)
+        ps_concrete = Reactant.to_rarray(ps)
+        st_concrete = Reactant.to_rarray(Lux.testmode(st))
+        x = Reactant.to_rarray(rand(Float32, 256, 256, 3, 16))
+        vit_compiled = @compile vit(x, ps_concrete, st_concrete)
     end
 end
 
 
@@ -1,7 +1,7 @@
 module ReactantNNlibExt
 
 using NNlib
-using Reactant: Reactant, TracedRArray, AnyTracedRArray, materialize_traced_array
+using Reactant: Reactant, TracedRArray, AnyTracedRArray, materialize_traced_array, MLIR
 
 for (jlop, hloop) in (
     (:(NNlib.tanh_fast), :tanh),
@@ -19,12 +19,11 @@ for (jlop, hloop) in (
     end
 end
 
-NNlib.relu(x::TracedRArray{T,0}) where {T} = max(x, zero(T))
-
-function NNlib.gelu(x::TracedRArray{T,0}) where {T}
-    α = T(0.044715)
-    λλ = T(√(8 / π))
-    return x * sigmoid(λλ * x * muladd(x^2, α, one(T)))
+# Traced scalars are represented as 0-dimensional `TracedRArray`s, not as numbers.
+# For every NNlib activation (except the four skipped below) add a method for such
+# a 0-D value that forwards, via `invoke`, to the activation's `Tuple{Any}` method.
+for act in NNlib.ACTIVATIONS
+    # These four activations do not get a forwarding method from this loop.
+    act in (:tanh_fast, :sigmoid_fast, :sigmoid, :σ) && continue
+    @eval function NNlib.$(act)(x::TracedRArray{T,0}) where {T}
+        return invoke(NNlib.$(act), Tuple{Any}, x)
+    end
 end
 
 # TODO handle non finite cases
@@ -206,4 +205,52 @@ end
 NNlib.batched_transpose(x::AnyTracedRArray{T,3}) where {T} = permutedims(x, (2, 1, 3))
 NNlib.batched_adjoint(x::AnyTracedRArray{<:Real,3}) = NNlib.batched_transpose(x)
 
+# Batched matrix multiplication for traced rank-3 arrays, emitted as a single
+# `stablehlo.dot_general` op. Dimension 3 is the batch dimension; a batch of size
+# 1 on either operand is broadcast to the other operand's batch size.
+# Throws `DimensionMismatch` when the batch sizes are incompatible or when
+# `size(x, 2) != size(y, 1)`.
+function NNlib.batched_mul(x::AnyTracedRArray{T,3}, y::AnyTracedRArray{T,3}) where {T}
+    # Batch sizes must match unless one of them is 1; inner matrix dims must match.
+    batch_ok = size(x, 3) == size(y, 3) || size(x, 3) == 1 || size(y, 3) == 1
+    inner_ok = size(x, 2) == size(y, 1)
+    if !(batch_ok && inner_ok)
+        throw(
+            DimensionMismatch(
+                lazy"size(x) = $(size(x)), size(y) = $(size(y)) inconsistent for batched_matmul.",
+            ),
+        )
+    end
+
+    # Bring the batch dimension to the front: (rows, cols, batch) -> (batch, rows, cols).
+    lhs = permutedims(x, (3, 1, 2))
+    rhs = permutedims(y, (3, 1, 2))
+
+    nbatch = max(size(lhs, 1), size(rhs, 1))
+    result_type = MLIR.IR.TensorType(
+        (nbatch, size(lhs, 2), size(rhs, 3)), eltype(MLIR.IR.type(lhs.mlir_data))
+    )
+
+    # Expand a singleton batch so both operands carry the same batch size.
+    if size(lhs, 1) == 1 && size(rhs, 1) != 1
+        lhs = Reactant.broadcast_to_size(lhs, (nbatch, size(lhs, 2), size(lhs, 3)))
+    elseif size(rhs, 1) == 1 && size(lhs, 1) != 1
+        rhs = Reactant.broadcast_to_size(rhs, (nbatch, size(rhs, 2), size(rhs, 3)))
+    end
+
+    # 0-based dimension indices given as (count, indices) pairs; order per the
+    # StableHLO C API: lhs batch, rhs batch, lhs contracting, rhs contracting.
+    dot_dimension_numbers = MLIR.API.stablehloDotDimensionNumbersGet(
+        MLIR.IR.context(), 1, [0], 1, [0], 1, [2], 1, [1]
+    )
+
+    prec = MLIR.IR.Attribute(
+        MLIR.API.stablehloPrecisionAttrGet(MLIR.IR.context(), "DEFAULT")
+    )
+
+    dot_op = MLIR.Dialects.stablehlo.dot_general(
+        lhs.mlir_data,
+        rhs.mlir_data;
+        result_0=result_type,
+        dot_dimension_numbers=dot_dimension_numbers,
+        precision_config=prec,
+    )
+    res = TracedRArray{T,3}((), MLIR.IR.result(dot_op, 1), size(result_type))
+
+    # Restore the NNlib layout: (batch, rows, cols) -> (rows, cols, batch).
+    return permutedims(res, (2, 3, 1))
+end
+
 end # module ReactantNNlibExt
@@ -48,6 +48,11 @@ function Base.convert(::Type{T}, X::ConcreteRArray{ElType,N}) where {T<:Array,El
     # XLA.from_row_major(data)
 end
 
+# Call `XLA.synced_buffer` on the buffer backing `x` and discard its result;
+# always returns `nothing`.
+# NOTE(review): presumably this blocks until the buffer is ready — confirm in XLA.
+synchronize(x::ConcreteRArray) = (XLA.synced_buffer(x.data); nothing)
+
 # function Base.similar(x::ConcreteRArray{T,N}, ::Type{T2}) where {T,N,T2}
 #     return ConcreteRArray{T,N}(x.data)
 # end