update demos, debugging mooncake with elbo

zuhengxu · zuhengxu · commit 55fb607a1409 · 2025-07-13T22:27:11.000-07:00
diff --git a/example/Project.toml b/example/Project.toml
@@ -17,6 +17,7 @@ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [extras]
 CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl
@@ -11,114 +11,6 @@ using NormalizingFlows
 include("SyntheticTargets.jl")
 include("utils.jl")
 
-##################################
-# define affine coupling layer using Bijectors.jl interface
-#################################
-struct AffineCoupling <: Bijectors.Bijector
-    dim::Int
-    mask::Bijectors.PartitionMask
-    s::Flux.Chain
-    t::Flux.Chain
-end
-
-# let params track field s and t
-@functor AffineCoupling (s, t)
-
-function AffineCoupling(
-    dim::Int,  # dimension of input
-    hdims::Int, # dimension of hidden units for s and t
-    mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on
-)
-    cdims = length(mask_idx) # dimension of parts used to construct coupling law
-    s = mlp3(cdims, hdims, cdims)
-    t = mlp3(cdims, hdims, cdims)
-    mask = PartitionMask(dim, mask_idx)
-    return AffineCoupling(dim, mask, s, t)
-end
-
-function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat)
-    # partition vector using 'af.mask::PartitionMask`
-    x₁, x₂, x₃ = partition(af.mask, x)
-    y₁ = x₁ .* af.s(x₂) .+ af.t(x₂)
-    return combine(af.mask, y₁, x₂, x₃)
-end
-
-function (af::AffineCoupling)(x::AbstractArray)
-    return transform(af, x)
-end
-
-function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractVector)
-    x_1, x_2, x_3 = Bijectors.partition(af.mask, x)
-    y_1 = af.s(x_2) .* x_1 .+ af.t(x_2)
-    logjac = sum(log ∘ abs, af.s(x_2)) # this is a scalar
-    return combine(af.mask, y_1, x_2, x_3), logjac
-end
-
-function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractMatrix)
-    x_1, x_2, x_3 = Bijectors.partition(af.mask, x)
-    y_1 = af.s(x_2) .* x_1 .+ af.t(x_2)
-    logjac = sum(log ∘ abs, af.s(x_2); dims = 1) # 1 × size(x, 2)
-    return combine(af.mask, y_1, x_2, x_3), vec(logjac)
-end
-
-
-function Bijectors.with_logabsdet_jacobian(
-    iaf::Inverse{<:AffineCoupling}, y::AbstractVector
-)
-    af = iaf.orig
-    # partition vector using `af.mask::PartitionMask`
-    y_1, y_2, y_3 = partition(af.mask, y)
-    # inverse transformation
-    x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2)
-    logjac = -sum(log ∘ abs, af.s(y_2))
-    return combine(af.mask, x_1, y_2, y_3), logjac
-end
-
-function Bijectors.with_logabsdet_jacobian(
-    iaf::Inverse{<:AffineCoupling}, y::AbstractMatrix
-)
-    af = iaf.orig
-    # partition vector using `af.mask::PartitionMask`
-    y_1, y_2, y_3 = partition(af.mask, y)
-    # inverse transformation
-    x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2)
-    logjac = -sum(log ∘ abs, af.s(y_2); dims = 1)
-    return combine(af.mask, x_1, y_2, y_3), vec(logjac)
-end
-
-################### 
-# an equivalent definition of AffineCoupling using Bijectors.Coupling 
-# (see https://github.com/TuringLang/Bijectors.jl/blob/74d52d4eda72a6149b1a89b72524545525419b3f/src/bijectors/coupling.jl#L188C1-L188C1)
-###################
-
-# struct AffineCoupling <: Bijectors.Bijector
-#     dim::Int
-#     mask::Bijectors.PartitionMask
-#     s::Flux.Chain
-#     t::Flux.Chain
-# end
-
-# # let params track field s and t
-# @functor AffineCoupling (s, t)
-
-# function AffineCoupling(dim, mask, s, t)
-#     return Bijectors.Coupling(θ -> Bijectors.Shift(t(θ)) ∘ Bijectors.Scale(s(θ)), mask)
-# end
-
-# function AffineCoupling(
-#     dim::Int,  # dimension of input
-#     hdims::Int, # dimension of hidden units for s and t
-#     mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on
-# )
-#     cdims = length(mask_idx) # dimension of parts used to construct coupling law
-#     s = mlp3(cdims, hdims, cdims)
-#     t = mlp3(cdims, hdims, cdims)
-#     mask = PartitionMask(dim, mask_idx)
-#     return AffineCoupling(dim, mask, s, t)
-# end
-
-
-
 ##################################
 # start demo
 #################################
@@ -132,29 +24,30 @@ T = Float32
 target = Banana(2, 1.0f0, 100.0f0)
 logp = Base.Fix1(logpdf, target)
 
+
 ######################################
 # learn the target using Affine coupling flow
 ######################################
 @leaf MvNormal
-q0 = MvNormal(zeros(T, 2), ones(T, 2))
+q0 = MvNormal(zeros(T, 2), I)
 
 d = 2
-hdims = 32
-
-# alternating the coupling layers
-Ls = [AffineCoupling(d, hdims, [1]) ∘ AffineCoupling(d, hdims, [2]) for i in 1:3]
+hdims = [16, 16]
+nlayers = 3
 
-flow = create_flow(Ls, q0)
+# use NormalizingFlows.realnvp to create a RealNVP flow
+flow = realnvp(q0, hdims, nlayers; paramtype=T)
 flow_untrained = deepcopy(flow)
 
 
 ######################################
 # start training
 ######################################
-sample_per_iter = 64
+sample_per_iter = 16
 
 # callback function to log training progress
 cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype)
+# TODO: this example currently fails with AutoMooncake but works with AutoZygote; needs debugging
 adtype = ADTypes.AutoMooncake(; config = Mooncake.Config())
 checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000
 flow_trained, stats, _ = train_flow(
diff --git a/example/demo_neural_spline_flow.jl b/example/demo_neural_spline_flow.jl
@@ -11,104 +11,6 @@ using NormalizingFlows
 include("SyntheticTargets.jl")
 include("utils.jl")
 
-##################################
-# define neural spline layer using Bijectors.jl interface
-#################################
-"""
-Neural Rational quadratic Spline layer 
-
-# References
-[1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML],  (2019). 
-"""
-struct NeuralSplineLayer{T,A<:Flux.Chain} <: Bijectors.Bijector
-    dim::Int                # dimension of input
-    K::Int                  # number of knots
-    n_dims_transferred::Int  # number of dimensions that are transformed
-    nn::A   # networks that parmaterize the knots and derivatives
-    B::T                    # bound of the knots
-    mask::Bijectors.PartitionMask
-end
-
-function NeuralSplineLayer(
-    dim::T1,                # dimension of input
-    hdims::T1,              # dimension of hidden units for s and t
-    K::T1,                  # number of knots
-    B::T2,                  # bound of the knots
-    mask_idx::AbstractVector{<:Int}, # index of dimensione that one wants to apply transformations on
-) where {T1<:Int,T2<:Real}
-    num_of_transformed_dims = length(mask_idx)
-    input_dims = dim - num_of_transformed_dims
-    
-    # output dim of the NN
-    output_dims = (3K - 1)*num_of_transformed_dims
-    # one big mlp that outputs all the knots and derivatives for all the transformed dimensions
-    nn = mlp3(input_dims, hdims, output_dims)
-
-    mask = Bijectors.PartitionMask(dim, mask_idx)
-    return NeuralSplineLayer(dim, K, num_of_transformed_dims, nn, B, mask)
-end
-
-@functor NeuralSplineLayer (nn,)
-
-# define forward and inverse transformation
-"""
-Build a rational quadratic spline from the nn output
-Bijectors.jl has implemented the inverse and logabsdetjac for rational quadratic spline
-
-we just need to map the nn output to the knots and derivatives of the RQS
-"""
-function instantiate_rqs(nsl::NeuralSplineLayer, x::AbstractVector)
-    K, B = nsl.K, nsl.B
-    nnoutput = reshape(nsl.nn(x), nsl.n_dims_transferred, :)
-    ws = @view nnoutput[:, 1:K]
-    hs = @view nnoutput[:, (K + 1):(2K)]
-    ds = @view nnoutput[:, (2K + 1):(3K - 1)]
-    return Bijectors.RationalQuadraticSpline(ws, hs, ds, B)
-end
-
-function Bijectors.transform(nsl::NeuralSplineLayer, x::AbstractVector)
-    x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x)
-    # instantiate rqs knots and derivatives
-    rqs = instantiate_rqs(nsl, x_2)
-    y_1 = Bijectors.transform(rqs, x_1)
-    return Bijectors.combine(nsl.mask, y_1, x_2, x_3)
-end
-
-function Bijectors.transform(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector)
-    nsl = insl.orig
-    y1, y2, y3 = partition(nsl.mask, y)
-    rqs = instantiate_rqs(nsl, y2)
-    x1 = Bijectors.transform(Inverse(rqs), y1)
-    return Bijectors.combine(nsl.mask, x1, y2, y3)
-end
-
-function (nsl::NeuralSplineLayer)(x::AbstractVector)
-    return Bijectors.transform(nsl, x)
-end
-
-# define logabsdetjac
-function Bijectors.logabsdetjac(nsl::NeuralSplineLayer, x::AbstractVector)
-    x_1, x_2, _ = Bijectors.partition(nsl.mask, x)
-    rqs = instantiate_rqs(nsl, x_2)
-    logjac = logabsdetjac(rqs, x_1)
-    return logjac
-end
-
-function Bijectors.logabsdetjac(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector)
-    nsl = insl.orig
-    y1, y2, _ = partition(nsl.mask, y)
-    rqs = instantiate_rqs(nsl, y2)
-    logjac = logabsdetjac(Inverse(rqs), y1)
-    return logjac
-end
-
-function Bijectors.with_logabsdet_jacobian(nsl::NeuralSplineLayer, x::AbstractVector)
-    x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x)
-    rqs = instantiate_rqs(nsl, x_2)
-    y_1, logjac = with_logabsdet_jacobian(rqs, x_1)
-    return Bijectors.combine(nsl.mask, y_1, x_2, x_3), logjac
-end
-
 ##################################
 # start demo
 #################################
@@ -148,6 +50,7 @@ sample_per_iter = 64
 
 # callback function to log training progress
 cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype)
+# TODO: this example currently fails with AutoMooncake but works with AutoZygote; needs debugging
 adtype = ADTypes.AutoMooncake(; config = Mooncake.Config())
 checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000
 flow_trained, stats, _ = train_flow(
diff --git a/example/utils.jl b/example/utils.jl
@@ -13,11 +13,6 @@ function mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux
     )
 end
 
-function create_flow(Ls, q₀)
-    ts =  reduce(∘, Ls)
-    return transformed(q₀, ts)
-end
-
 function compare_trained_and_untrained_flow(
     flow_trained::Bijectors.MultivariateTransformed,
     flow_untrained::Bijectors.MultivariateTransformed,