wip debug mooncake on coupling layers

zuhengxu · zuhengxu · commit e39b8a8fe1c4 · 2025-07-23T16:37:58.000-07:00
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "NormalizingFlows"
 uuid = "50e4474d-9f12-44b7-af7a-91ab30ff6256"
-version = "0.2.1"
+version = "0.2.2"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -34,4 +34,4 @@ Functors = "0.5.2"
 Optimisers = "0.2.16, 0.3, 0.4"
 ProgressMeter = "1.0.0"
 StatsBase = "0.33, 0.34"
-julia = "1.10"
+julia = "1.11"
diff --git a/example/Project.toml b/example/Project.toml
@@ -3,8 +3,10 @@ ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Bijectors = "76274a88-744f-5084-9051-94815aaf08c4"
 DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
+DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
+Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
 IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl
@@ -48,7 +48,9 @@ sample_per_iter = 16
 # callback function to log training progress
 cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype)
 # TODO: now using AutoMooncake the example broke, but AutoZygote works, need to debug
-adtype = ADTypes.AutoMooncake(; config = Mooncake.Config())
+adtype = ADTypes.AutoMooncake(; config = nothing)
+# adtype = ADTypes.AutoZygote()
+
 checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000
 flow_trained, stats, _ = train_flow(
     rng, 
diff --git a/example/test_n.jl b/example/test_n.jl
@@ -0,0 +1,167 @@
+using Flux
+using Bijectors
+using Bijectors: partition, combine, PartitionMask
+
+using Random, Distributions, LinearAlgebra
+using Functors
+using Optimisers, ADTypes
+using Mooncake, Zygote, Enzyme, ADTypes
+import NormalizingFlows as NF
+
+import DifferentiationInterface as DI
+
+
+pt = Float64
+inputdim = 4
+outputdim = 3
+
+x = randn(pt, inputdim)
+
+bs = 64
+xs = randn(pt, inputdim, 64)
+
+# compose two fully connected networks
+m1 = NF.fnn(inputdim, [16, 16], outputdim; output_activation=nothing, paramtype=pt)
+m2 = NF.fnn(outputdim, [16, 16], inputdim; output_activation=Flux.tanh, paramtype=pt)
+mm = reduce(∘, (m2, m1))
+psm, stm = Optimisers.destructure(mm)
+
+function lsm(ps, st, x)
+    model = st(ps)
+    y = model(x)
+    return sum(y) # just a dummy loss
+end
+
+adtype = ADTypes.AutoMooncake(; config = Mooncake.Config())
+
+val, grad = DI.value_and_gradient(
+    lsm, adtype, 
+    psm, DI.Cache(stm), DI.Constant(xs)
+)
+
+
+acl = NF.AffineCoupling( inputdim, [16, 16], 1:2:inputdim, pt)
+psacl,stacl = Optimisers.destructure(acl)
+
+function loss(ps, st, x)
+    model = st(ps)
+    y = model(x)
+    return sum(y) # just a dummy loss
+end
+
+val, grad = DI.value_and_gradient(
+    loss, 
+    ADTypes.AutoEnzyme(;
+            mode=Enzyme.set_runtime_activity(Enzyme.Reverse),
+            function_annotation=Enzyme.Const,
+        ),
+    psacl, DI.Cache(stacl), DI.Constant(x)
+)
+
+# val, grad = DI.value_and_gradient(
+#     loss, 
+#     ADTypes.AutoMooncake(; config = Mooncake.Config()), 
+#     psacl, DI.Cache(stacl), DI.Constant(x)
+# )
+
+function loss_acl_manual(ps, st, x)
+    acl = st(ps)
+    s_net = acl.s
+    t_net = acl.t
+    mask = acl.mask
+    x₁, x₂, x₃ = partition(mask, x)
+    y₁ = exp.(s_net(x₂)) .* x₁ .+ t_net(x₂)
+    y = combine(mask, y₁, x₂, x₃)
+    # println("y = ", y)
+    return sum(y)
+end
+
+val, grad = DI.value_and_gradient(
+    loss_acl_manual, 
+    # ADTypes.AutoMooncake(; config = Mooncake.Config()), 
+    # ADTypes.AutoEnzyme(;
+    #         mode=Enzyme.set_runtime_activity(Enzyme.Reverse),
+    #         function_annotation=Enzyme.Const,
+    #     ),
+    psacl, DI.Cache(stacl), DI.Constant(x)
+)
+
+
+
+function mlp3(
+    input_dim::Int, 
+    hidden_dims::Int, 
+    output_dim::Int; 
+    activation=Flux.leakyrelu,
+    paramtype::Type{T} = Float64
+) where {T<:AbstractFloat}
+    m = Chain(
+        Flux.Dense(input_dim, hidden_dims, activation),
+        Flux.Dense(hidden_dims, hidden_dims, activation),
+        Flux.Dense(hidden_dims, output_dim),
+    )
+    return Flux._paramtype(paramtype, m)
+end
+
+function ls_msk(ps, st, x, mask)
+    t_net = st(ps)
+    x₁, x₂, x₃ = partition(mask, x)
+    y₁ = x₁ .+ t_net(x₂)
+    y = combine(mask, y₁, x₂, x₃)
+    # println("y = ", y)
+    return sum(abs2, y)
+end
+
+inputdim = 4
+mask_idx = 1:2:inputdim
+mask = PartitionMask(inputdim, mask_idx)
+cdim = length(mask_idx)
+
+x = randn(inputdim)
+
+t_net = mlp3(cdim, 16, cdim; paramtype = Float64)
+ps, st = Optimisers.destructure(t_net)
+
+ls_msk(ps, st, x, mask) # 3.0167880799441793
+
+val, grad = DI.value_and_gradient(
+    ls_msk, 
+    ADTypes.AutoMooncake(; config = Mooncake.Config()), 
+    ps, DI.Cache(st), DI.Constant(x), DI.Constant(mask)
+)
+
+
+struct ACL
+    mask::Bijectors.PartitionMask
+    t::Flux.Chain
+end
+@functor ACL (t, )
+
+acl = ACL(mask, t_net)
+psacl, stacl = Optimisers.destructure(acl)
+
+function loss_acl(ps, st, x)
+    acl = st(ps)
+    t_net = acl.t
+    mask = acl.mask
+    x₁, x₂, x₃ = partition(mask, x)
+    y₁ = x₁ .+ t_net(x₂)
+    y = combine(mask, y₁, x₂, x₃)
+    return sum(abs2, y)
+end
+loss_acl(psacl, stacl, x) # 3.0167880799441793
+
+val, grad = DI.value_and_gradient(
+    loss_acl, 
+    ADTypes.AutoEnzyme(;
+            mode=Enzyme.set_runtime_activity(Enzyme.Reverse),
+            function_annotation=Enzyme.Const,
+        ),
+    psacl, DI.Cache(stacl), DI.Constant(x)
+)
+
+val, grad = DI.value_and_gradient(
+    loss_acl, 
+    ADTypes.AutoMooncake(; config = Mooncake.Config()), 
+    psacl, DI.Cache(stacl), DI.Constant(x)
+)
diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl
@@ -38,7 +38,7 @@ function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat)
     return combine(af.mask, y₁, x₂, x₃)
 end
 
-function (af::AffineCoupling)(x::AbstractArray)
+function (af::AffineCoupling)(x::AbstractVecOrMat)
     return transform(af, x)
 end
 
@@ -191,4 +191,4 @@ In *NeurIPS*.
 """
 realnvp(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = realnvp(
     q0, [32, 32], 10; paramtype=paramtype
-)
+)
diff --git a/test/Project.toml b/test/Project.toml
@@ -17,4 +17,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
-Mooncake = "0.4.101"
+Mooncake = "0.4.140"
diff --git a/test/ad.jl b/test/ad.jl
@@ -84,7 +84,7 @@ end
             mode=Enzyme.set_runtime_activity(Enzyme.Reverse),
             function_annotation=Enzyme.Const,
         ),
-        ADTypes.AutoMooncake(; config=Mooncake.Config()),
+        # ADTypes.AutoMooncake(; config=nothing),
     ]
         @testset "$T" for T in [Float32, Float64]
             μ = 10 * ones(T, 2)

Original file line number	Diff line number	Diff line change
`@@ -84,7 +84,7 @@ end`
`84`	`84`	`mode=Enzyme.set_runtime_activity(Enzyme.Reverse),`
`85`	`85`	`function_annotation=Enzyme.Const,`
`86`	`86`	`),`
`87`		`- ADTypes.AutoMooncake(; config=Mooncake.Config()),`
	`87`	`+ # ADTypes.AutoMooncake(; config=nothing),`
`88`	`88`	`]`
`89`	`89`	`@testset "$T" for T in [Float32, Float64]`
`90`	`90`	`μ = 10 * ones(T, 2)`