EnzymeAD
diff --git a/‎.github/workflows/benchmark_pr.yml
Lines changed: 1 addition & 3 deletions b/‎.github/workflows/benchmark_pr.yml
Lines changed: 1 addition & 3 deletions
diff --git a/‎benchmark/Project.toml
Lines changed: 14 additions & 0 deletions b/‎benchmark/Project.toml
Lines changed: 14 additions & 0 deletions
diff --git a/‎benchmark/benchmarks.jl
Lines changed: 0 additions & 8 deletions b/‎benchmark/benchmarks.jl
Lines changed: 0 additions & 8 deletions
diff --git a/‎ext/ReactantNNlibExt.jl
Lines changed: 5 additions & 1 deletion b/‎ext/ReactantNNlibExt.jl
Lines changed: 5 additions & 1 deletion
diff --git a/‎src/TracedRArray.jl
Lines changed: 1 addition & 1 deletion b/‎src/TracedRArray.jl
Lines changed: 1 addition & 1 deletion
diff --git a/‎test/Project.toml
Lines changed: 3 additions & 0 deletions b/‎test/Project.toml
Lines changed: 3 additions & 0 deletions
diff --git a/‎test/bcast.jl
Lines changed: 0 additions & 31 deletions b/‎test/bcast.jl
Lines changed: 0 additions & 31 deletions
diff --git a/‎test/nn.jl
Lines changed: 0 additions & 116 deletions b/‎test/nn.jl
Lines changed: 0 additions & 116 deletions
diff --git a/‎test/nn/flux.jl
Lines changed: 25 additions & 0 deletions b/‎test/nn/flux.jl
Lines changed: 25 additions & 0 deletions
diff --git a/‎test/nn/lux.jl
Lines changed: 75 additions & 0 deletions b/‎test/nn/lux.jl
Lines changed: 75 additions & 0 deletions
@@ -36,9 +36,7 @@ jobs:
           echo $PATH
           ls -l ~/.julia/bin
           mkdir results
-          benchpkg ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --url=${{ github.event.repository.clone_url }} --bench-on="${{github.event.repository.default_branch}}" --output-dir=results/ -s="benchmark/benchmarks.jl" --tune --add="Enzyme,Lux,Boltz,Random"
-        env:
-          JULIA_PKG_SERVER: ""
+          benchpkg ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --url=${{ github.event.repository.clone_url }} --bench-on="${{github.event.pull_request.head.sha}}" --output-dir=results/ --tune --exeflags="-O3 --threads=auto"
       - name: Create plots from benchmarks
         run: |
           mkdir -p plots
 
@@ -0,0 +1,14 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
+Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
+Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[compat]
+BenchmarkTools = "1.5"
+Boltz = "1"
+Enzyme = "0.13"
+Lux = "1.1"
+Random = "1.10"
+julia = "1.10"
@@ -1,11 +1,3 @@
-# To run:
-# using PkgBenchmark, Reactant
-# result = benchmarkpkg(KernelAbstractions)
-# export_markdown("benchmark/perf.md", result)
-
-# Note: if you change this file you will need to delete an regenerate tune.json
-# Your "v1.x" environment needs to have BenchmarkTools and PkgBenchmark installed.
-
 using BenchmarkTools
 using Reactant
 using Enzyme
 
@@ -21,7 +21,11 @@ end
 
 NNlib.relu(x::Reactant.TracedRArray{T,0}) where {T} = max(x, zero(T))
 
-NNlib.gelu(x::Reactant.TracedRArray{T,0}) where {T} = x * sigmoid(T(1.702) * x)
+# GELU for a 0-dimensional traced array, evaluated as
+#     x * sigmoid(√(8/π) * x * (1 + 0.044715 * x^2)),
+# with both constants converted to the element type `T` before use.
+function NNlib.gelu(x::Reactant.TracedRArray{T,0}) where {T}
+    cubic_coeff = T(0.044715)
+    scale = T(√(8 / π))
+    poly = muladd(x^2, cubic_coeff, one(T))  # 1 + cubic_coeff * x^2
+    return x * sigmoid(scale * x * poly)
+end
 
 # TODO handle non finite cases
 function NNlib.softmax!(
 
@@ -89,7 +89,7 @@ end
 
 Base.size(x::TracedRArray) = x.shape
 
-Base.copy(A::TracedRArray{T,N}) where {T,N} = TracedRArray((), A.mlir_data, size(A))
+# Build a new `TracedRArray{T,N}` from `A`: an empty tuple as the first constructor
+# argument, the very same `A.mlir_data` (it is passed through, not duplicated here),
+# and `size(A)`. The element type and rank are spelled out explicitly on the constructor.
+function Base.copy(A::TracedRArray{T,N}) where {T,N}
+    return TracedRArray{T,N}((), A.mlir_data, size(A))
+end
 
 function Base.similar(x::TracedRArray{T,N}, ::Type{T2}) where {T,N,T2}
     return TracedRArray{T2,N}((), nothing, size(x))
 
@@ -2,13 +2,16 @@
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reactant_jll = "0192cb87-2b54-54ad-80e0-3be72ad8a3c0"
+SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
@@ -57,37 +57,6 @@ function test()
 end
 test()
 
-@testset "Activation Functions" begin
-    sumabs2(f, x) = sum(abs2, f.(x))
-
-    function ∇sumabs2(f, x)
-        dx = Enzyme.make_zero(x)
-        Enzyme.autodiff(Reverse, sumabs2, Active, Const(f), Duplicated(x, dx))
-        return dx
-    end
-
-    x_act = randn(Float32, 10, 10)
-    x_act_ca = Reactant.ConcreteRArray(x_act)
-
-    @testset "Activation: $act" for act in (
-        identity, relu, sigmoid, tanh, tanh_fast, sigmoid_fast, gelu, abs2
-    )
-        f_compile = @compile sumabs2(act, x_act)
-
-        y_simple = sumabs2(act, x_act)
-        y_compile = f_compile(act, x_act_ca)
-
-        ∂x_enz = Enzyme.make_zero(x_act)
-        Enzyme.autodiff(Reverse, sumabs2, Active, Const(act), Duplicated(x_act, ∂x_enz))
-
-        ∇sumabs2_compiled = @compile ∇sumabs2(act, x_act_ca)
-
-        ∂x_compile = ∇sumabs2_compiled(act, x_act_ca)
-
-        @test y_simple ≈ y_compile
-    end
-end
-
 @testset "ConcreteRArray broadcasting" begin
     x = ones(10, 10)
     y = ones(10, 10)
 
@@ -0,0 +1,25 @@
+using Reactant, Flux
+
+@testset "Flux.jl Integration" begin
+    # Random input batch: 1000 samples of length 2, stored as the columns of a matrix.
+    # Only a forward pass is compared below, so no labels are generated.
+    noisy = rand(Float32, 2, 1000)                                        # 2×1000 Matrix{Float32}
+
+    # Define our model, a multi-layer perceptron with one hidden layer of size 3:
+    model = Chain(
+        Dense(2 => 3, tanh),   # activation function inside layer
+        BatchNorm(3),
+        Dense(3 => 2),
+        softmax,
+    )
+
+    # Reference output from the plain Flux model.
+    origout = model(noisy)
+
+    # Reactant versions of the model and of the input batch.
+    cmodel = Reactant.to_rarray(model)
+    cnoisy = Reactant.ConcreteRArray(noisy)
+
+    # Compile "apply the model to the input" for these argument types, then run it.
+    f = Reactant.compile((a, b) -> a(b), (cmodel, cnoisy))
+
+    comp = f(cmodel, cnoisy)
+    @test origout ≈ comp
+end
@@ -0,0 +1,75 @@
+using Reactant, Lux, Random, Statistics, Enzyme, Functors, OneHotArrays
+
+# Cross-entropy between predictions `ŷ` and targets `y`: the negated sum of the
+# elementwise product of `y` with `log.(ŷ)`.
+function crossentropy(ŷ, y)
+    log_pred = log.(ŷ)
+    weighted = y .* log_pred
+    total = sum(weighted)
+    return -total
+end
+
+# Call `model(x, ps, st)`, keep the first of the values it returns, and score that
+# prediction against `y` with `crossentropy`.
+function loss_function(model, x, y, ps, st)
+    output = model(x, ps, st)
+    prediction, _ = output
+    # `CrossEntropyLoss()(y_hat, y)` is not used here: it needs handling of
+    # xlogx / xlogy from LuxOps.
+    return crossentropy(prediction, y)
+end
+
+# Differentiate `loss_function` with Enzyme in `ReverseWithPrimal` mode.
+# `model`, `x`, `y` and `st` are wrapped as `Const`; `ps` is wrapped as `Duplicated`
+# together with a zeroed shadow created by `Enzyme.make_zero(ps)`.
+# Returns a 2-tuple: the second element of `autodiff`'s result (presumably the primal
+# loss value in this mode — confirm against the Enzyme docs) and the shadow of `ps`.
+function gradient_loss_function(model, x, y, ps, st)
+    ∂ps = Enzyme.make_zero(ps)
+    ps_dup = Duplicated(ps, ∂ps)
+    outcome = Enzyme.autodiff(
+        ReverseWithPrimal,
+        loss_function,
+        Active,
+        Const(model),
+        Const(x),
+        Const(y),
+        ps_dup,
+        Const(st),
+    )
+    return outcome[2], ∂ps
+end
+
+@testset "Lux.jl Integration" begin
+    # XOR-style toy data: every column of `inputs` is one 2-element sample, and its label
+    # is the XOR of "entry > 0.5" over the two entries.
+    inputs = rand(Float32, 2, 1000)
+    labels = [xor(s[1] > 0.5, s[2] > 0.5) for s in eachcol(inputs)]
+
+    # Small MLP: Dense 2 => 3 (tanh), BatchNorm (sigmoid), Dense 3 => 2, softmax.
+    model = Lux.Chain(
+        Lux.Dense(2 => 3, tanh), Lux.BatchNorm(3, sigmoid), Lux.Dense(3 => 2), softmax
+    )
+    ps, st = Lux.setup(Xoshiro(123), model)
+
+    # Reference forward pass on plain arrays, with the state passed through `testmode`.
+    expected_out, _ = model(inputs, ps, Lux.testmode(st))
+
+    # Reactant counterparts of the model, the parameters, both state variants
+    # (through `testmode`, and exactly as returned by `Lux.setup`) and the inputs.
+    rmodel = Reactant.to_rarray(model)
+    rps = Reactant.to_rarray(ps)
+    rst_test = Reactant.to_rarray(Lux.testmode(st))
+    rst_setup = Reactant.to_rarray(st)
+    rinputs = Reactant.ConcreteRArray(inputs)
+
+    forward = Reactant.compile(
+        (a, b, c, d) -> first(a(b, c, d)), (rmodel, rinputs, rps, rst_test)
+    )
+    compiled_out = forward(rmodel, rinputs, rps, rst_test)
+
+    @test compiled_out ≈ expected_out atol = 1e-5 rtol = 1e-2
+
+    # One-hot targets (2×1000). For Reactant they are first materialised as a dense
+    # Float32 array instead of calling `Reactant.to_rarray(target)` on the one-hot matrix.
+    target = onehotbatch(labels, [true, false])
+    rtarget = Reactant.ConcreteRArray(Array{Float32}(target))
+
+    # Loss and parameter gradients: uncompiled reference first, then the compiled version.
+    loss_ref, grads_ref = gradient_loss_function(model, inputs, target, ps, st)
+
+    grad_args = (rmodel, rinputs, rtarget, rps, rst_setup)
+    compiled_gradient = Reactant.compile(gradient_loss_function, grad_args)
+    loss_rx, grads_rx = compiled_gradient(grad_args...)
+
+    @test loss_ref ≈ loss_rx atol = 1e-5 rtol = 1e-2
+    for (g_ref, g_rx) in zip(fleaves(grads_ref), fleaves(grads_rx))
+        @test g_ref ≈ g_rx atol = 1e-5 rtol = 1e-2
+    end
+end