From 8db828f93ecc53606f4b0b8f9e2b040f0ff91a5c Mon Sep 17 00:00:00 2001 From: Zuheng Date: Fri, 4 Jul 2025 21:55:11 -0700 Subject: [PATCH 01/45] make realnvp and nsf layers as part of the pkg --- Project.toml | 4 ++ src/NormalizingFlows.jl | 7 +++ src/flows/neuralspline.jl | 98 +++++++++++++++++++++++++++++++++++ src/flows/realnvp.jl | 106 ++++++++++++++++++++++++++++++++++++++ src/flows/utils.jl | 18 +++++++ 5 files changed, 233 insertions(+) create mode 100644 src/flows/neuralspline.jl create mode 100644 src/flows/realnvp.jl create mode 100644 src/flows/utils.jl diff --git a/Project.toml b/Project.toml index f03ebb6..3812492 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,8 @@ Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" @@ -27,6 +29,8 @@ CUDA = "5" DifferentiationInterface = "0.6, 0.7" Distributions = "0.25" DocStringExtensions = "0.9" +Flux = "0.16" +Functors = "0.5.2" Optimisers = "0.2.16, 0.3, 0.4" ProgressMeter = "1.0.0" StatsBase = "0.33, 0.34" diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 72318df..8700aa8 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -123,4 +123,11 @@ function _device_specific_rand( return Random.rand(rng, td, n) end + +# interface of contructing common flow layers +include("flows/utils.jl") +include("flows/realnvp.jl") +include("flows/neuralspline.jl") + + end diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl new file mode 100644 index 0000000..a42ee37 --- /dev/null +++ b/src/flows/neuralspline.jl @@ -0,0 +1,98 @@ +################################## +# define neural spline layer using Bijectors.jl interface +################################# +""" +Neural Rational quadratic Spline layer + +# References +[1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML], (2019). 
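+
+The coordinates selected by `mask` are transformed element-wise by a monotone rational
+quadratic spline on `[-B, B]`; the conditioner network `nn` maps the remaining
+coordinates to `3K - 1` values per transformed dimension (`K` knot widths, `K` knot
+heights, and `K - 1` internal derivatives).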
+""" +struct NeuralSplineLayer{T,A<:Flux.Chain} <: Bijectors.Bijector + dim::Int # dimension of input + K::Int # number of knots + n_dims_transferred::Int # number of dimensions that are transformed + nn::A # networks that parmaterize the knots and derivatives + B::T # bound of the knots + mask::Bijectors.PartitionMask +end + +function NeuralSplineLayer( + dim::T1, # dimension of input + hdims::T1, # dimension of hidden units for s and t + K::T1, # number of knots + B::T2, # bound of the knots + mask_idx::AbstractVector{<:Int}, # index of dimensione that one wants to apply transformations on +) where {T1<:Int,T2<:Real} + num_of_transformed_dims = length(mask_idx) + input_dims = dim - num_of_transformed_dims + + # output dim of the NN + output_dims = (3K - 1)*num_of_transformed_dims + # one big mlp that outputs all the knots and derivatives for all the transformed dimensions + nn = mlp3(input_dims, hdims, output_dims) + + mask = Bijectors.PartitionMask(dim, mask_idx) + return NeuralSplineLayer(dim, K, num_of_transformed_dims, nn, B, mask) +end + +@functor NeuralSplineLayer (nn,) + +# define forward and inverse transformation +""" +Build a rational quadratic spline from the nn output +Bijectors.jl has implemented the inverse and logabsdetjac for rational quadratic spline + +we just need to map the nn output to the knots and derivatives of the RQS +""" +function instantiate_rqs(nsl::NeuralSplineLayer, x::AbstractVector) + K, B = nsl.K, nsl.B + nnoutput = reshape(nsl.nn(x), nsl.n_dims_transferred, :) + ws = @view nnoutput[:, 1:K] + hs = @view nnoutput[:, (K + 1):(2K)] + ds = @view nnoutput[:, (2K + 1):(3K - 1)] + return Bijectors.RationalQuadraticSpline(ws, hs, ds, B) +end + +function Bijectors.transform(nsl::NeuralSplineLayer, x::AbstractVector) + x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) + # instantiate rqs knots and derivatives + rqs = instantiate_rqs(nsl, x_2) + y_1 = Bijectors.transform(rqs, x_1) + return Bijectors.combine(nsl.mask, y_1, x_2, x_3) +end + +function Bijectors.transform(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) + nsl = insl.orig + y1, y2, y3 = partition(nsl.mask, y) + rqs = instantiate_rqs(nsl, y2) + x1 = Bijectors.transform(Inverse(rqs), y1) + return Bijectors.combine(nsl.mask, x1, y2, y3) +end + +function (nsl::NeuralSplineLayer)(x::AbstractVector) + return Bijectors.transform(nsl, x) +end + +# define logabsdetjac +function Bijectors.logabsdetjac(nsl::NeuralSplineLayer, x::AbstractVector) + x_1, x_2, _ = Bijectors.partition(nsl.mask, x) + rqs = instantiate_rqs(nsl, x_2) + logjac = logabsdetjac(rqs, x_1) + return logjac +end + +function Bijectors.logabsdetjac(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) + nsl = insl.orig + y1, y2, _ = partition(nsl.mask, y) + rqs = instantiate_rqs(nsl, y2) + logjac = logabsdetjac(Inverse(rqs), y1) + return logjac +end + +function Bijectors.with_logabsdet_jacobian(nsl::NeuralSplineLayer, x::AbstractVector) + x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) + rqs = instantiate_rqs(nsl, x_2) + y_1, logjac = with_logabsdet_jacobian(rqs, x_1) + return Bijectors.combine(nsl.mask, y_1, x_2, x_3), logjac +end + diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl new file mode 100644 index 0000000..3282163 --- /dev/null +++ b/src/flows/realnvp.jl @@ -0,0 +1,106 @@ +################################## +# define affine coupling layer using Bijectors.jl interface +################################# +struct AffineCoupling <: Bijectors.Bijector + dim::Int + mask::Bijectors.PartitionMask + s::Flux.Chain + t::Flux.Chain 
+end + +# let params track field s and t +@functor AffineCoupling (s, t) + +function AffineCoupling( + dim::Int, # dimension of input + hdims::Int, # dimension of hidden units for s and t + mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on +) + cdims = length(mask_idx) # dimension of parts used to construct coupling law + s = mlp3(cdims, hdims, cdims) + t = mlp3(cdims, hdims, cdims) + mask = PartitionMask(dim, mask_idx) + return AffineCoupling(dim, mask, s, t) +end + +function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat) + # partition vector using 'af.mask::PartitionMask` + x₁, x₂, x₃ = partition(af.mask, x) + y₁ = x₁ .* af.s(x₂) .+ af.t(x₂) + return combine(af.mask, y₁, x₂, x₃) +end + +function (af::AffineCoupling)(x::AbstractArray) + return transform(af, x) +end + +function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractVector) + x_1, x_2, x_3 = Bijectors.partition(af.mask, x) + y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) + logjac = sum(log ∘ abs, af.s(x_2)) # this is a scalar + return combine(af.mask, y_1, x_2, x_3), logjac +end + +function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractMatrix) + x_1, x_2, x_3 = Bijectors.partition(af.mask, x) + y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) + logjac = sum(log ∘ abs, af.s(x_2); dims = 1) # 1 × size(x, 2) + return combine(af.mask, y_1, x_2, x_3), vec(logjac) +end + + +function Bijectors.with_logabsdet_jacobian( + iaf::Inverse{<:AffineCoupling}, y::AbstractVector +) + af = iaf.orig + # partition vector using `af.mask::PartitionMask` + y_1, y_2, y_3 = partition(af.mask, y) + # inverse transformation + x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) + logjac = -sum(log ∘ abs, af.s(y_2)) + return combine(af.mask, x_1, y_2, y_3), logjac +end + +function Bijectors.with_logabsdet_jacobian( + iaf::Inverse{<:AffineCoupling}, y::AbstractMatrix +) + af = iaf.orig + # partition vector using `af.mask::PartitionMask` + y_1, y_2, y_3 = partition(af.mask, y) + # inverse transformation + x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) + logjac = -sum(log ∘ abs, af.s(y_2); dims = 1) + return combine(af.mask, x_1, y_2, y_3), vec(logjac) +end + +################### +# an equivalent definition of AffineCoupling using Bijectors.Coupling +# (see https://github.com/TuringLang/Bijectors.jl/blob/74d52d4eda72a6149b1a89b72524545525419b3f/src/bijectors/coupling.jl#L188C1-L188C1) +################### + +# struct AffineCoupling <: Bijectors.Bijector +# dim::Int +# mask::Bijectors.PartitionMask +# s::Flux.Chain +# t::Flux.Chain +# end + +# # let params track field s and t +# @functor AffineCoupling (s, t) + +# function AffineCoupling(dim, mask, s, t) +# return Bijectors.Coupling(θ -> Bijectors.Shift(t(θ)) ∘ Bijectors.Scale(s(θ)), mask) +# end + +# function AffineCoupling( +# dim::Int, # dimension of input +# hdims::Int, # dimension of hidden units for s and t +# mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on +# ) +# cdims = length(mask_idx) # dimension of parts used to construct coupling law +# s = mlp3(cdims, hdims, cdims) +# t = mlp3(cdims, hdims, cdims) +# mask = PartitionMask(dim, mask_idx) +# return AffineCoupling(dim, mask, s, t) +# end + diff --git a/src/flows/utils.jl b/src/flows/utils.jl new file mode 100644 index 0000000..5348b43 --- /dev/null +++ b/src/flows/utils.jl @@ -0,0 +1,18 @@ +using Bijectors: transformed +using Flux + +""" +A simple wrapper for a 3 layer dense MLP +""" +function mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; 
activation=Flux.leakyrelu) + return Chain( + Flux.Dense(input_dim, hidden_dims, activation), + Flux.Dense(hidden_dims, hidden_dims, activation), + Flux.Dense(hidden_dims, output_dim), + ) +end + +function create_flow(Ls, q₀) + ts = reduce(∘, Ls) + return transformed(q₀, ts) +end From 574e25754f3b60728e2655c77726cbbd5298adcf Mon Sep 17 00:00:00 2001 From: Zuheng Date: Fri, 4 Jul 2025 22:05:06 -0700 Subject: [PATCH 02/45] import Functors --- src/NormalizingFlows.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 8700aa8..7bd39e0 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -8,6 +8,7 @@ using Optimisers using ProgressMeter using Random using StatsBase +using Functors import DifferentiationInterface as DI using DocStringExtensions From cd635653e1cf0b8798c60f5b2f3ad8551804102a Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 15:02:40 -0700 Subject: [PATCH 03/45] add fully connect nn constructor --- src/flows/utils.jl | 49 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/src/flows/utils.jl b/src/flows/utils.jl index 5348b43..04b9886 100644 --- a/src/flows/utils.jl +++ b/src/flows/utils.jl @@ -2,6 +2,8 @@ using Bijectors: transformed using Flux """ + mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux.leakyrelu) + A simple wrapper for a 3 layer dense MLP """ function mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux.leakyrelu) @@ -12,7 +14,52 @@ function mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux ) end +""" + fnn( + input_dim::Int, + hidden_dims::AbstractVector{<:Int}, + output_dim::Int; + inlayer_activation=Flux.leakyrelu, + output_activation=Flux.tanh, + ) + +Create a fully connected neural network (FNN). + +# Arguments +- `input_dim::Int`: The dimension of the input layer. +- `hidden_dims::AbstractVector{<:Int}`: A vector of integers specifying the dimensions of the hidden layers. +- `output_dim::Int`: The dimension of the output layer. +- `inlayer_activation`: The activation function for the hidden layers. Defaults to `Flux.leakyrelu`. +- `output_activation`: The activation function for the output layer. Defaults to `Flux.tanh`. + +# Returns +- A `Flux.Chain` representing the FNN. +""" +function fnn( + input_dim::Int, + hidden_dims::AbstractVector{<:Int}, + output_dim::Int; + inlayer_activation=Flux.leakyrelu, + output_activation=Flux.tanh, +) + # Create a chain of dense layers + # First layer + layers = Any[Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] + + # Hidden layers + for i in 1:(length(hidden_dims) - 1) + push!( + layers, + Flux.Dense(hidden_dims[i], hidden_dims[i + 1], inlayer_activation), + ) + end + + # Output layer + push!(layers, Flux.Dense(hidden_dims[end], output_dim, output_activation)) + return Chain(layers...) 
+end + function create_flow(Ls, q₀) ts = reduce(∘, Ls) return transformed(q₀, ts) -end +end \ No newline at end of file From e03213ca7f84cb76c9c4458eb13430c46fb61ec1 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 16:58:51 -0700 Subject: [PATCH 04/45] update realnvp default constructor --- src/NormalizingFlows.jl | 3 +- src/flows/neuralspline.jl | 2 +- src/flows/realnvp.jl | 90 ++++++++++++++++++++++++++++++--------- src/flows/utils.jl | 34 +++++++++++---- 4 files changed, 99 insertions(+), 30 deletions(-) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 7bd39e0..8bbe00b 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -1,13 +1,14 @@ module NormalizingFlows using ADTypes -using Bijectors using Distributions using LinearAlgebra using Optimisers using ProgressMeter using Random using StatsBase +using Bijectors +using Bijectors: PartitionMask, Inverse, combine, partition using Functors import DifferentiationInterface as DI diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index a42ee37..d9bd605 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -39,7 +39,7 @@ end # define forward and inverse transformation """ -Build a rational quadratic spline from the nn output +Build a rational quadratic spline (RQS) from the nn output Bijectors.jl has implemented the inverse and logabsdetjac for rational quadratic spline we just need to map the nn output to the knots and derivatives of the RQS diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index 3282163..bf713ff 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -1,6 +1,10 @@ -################################## -# define affine coupling layer using Bijectors.jl interface -################################# +""" +Default constructor of Affine Coupling flow layer + +following the general architecture as Eq(3) in [^AD2025] + +[^AD2024]: Agrawal, J., & Domke, J. (2025). Disentangling impact of capacity, objective, batchsize, estimators, and step-size on flow VI. 
In *AISTATS* +""" struct AffineCoupling <: Bijectors.Bijector dim::Int mask::Bijectors.PartitionMask @@ -12,13 +16,16 @@ end @functor AffineCoupling (s, t) function AffineCoupling( - dim::Int, # dimension of input - hdims::Int, # dimension of hidden units for s and t - mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on -) - cdims = length(mask_idx) # dimension of parts used to construct coupling law - s = mlp3(cdims, hdims, cdims) - t = mlp3(cdims, hdims, cdims) + dim::Int, # dimension of the problem + hdims::AbstractVector{Int}, # dimension of hidden units for s and t + mask_idx::AbstractVector{Int}, # index of dimensione that one wants to apply transformations on + paramtype::Type{T} +) where {T<:AbstractFloat} + cdims = length(mask_idx) # dimension of parts used to construct coupling law + # for the scaling network s, add tanh to the output to ensure stability during training + s = fnn(dim-cdims, hdims, cdims; output_activation=Flux.tanh, paramtype=paramtype) + # no transfomration for the output of the translation network t + t = fnn(dim-cdims, hdims, cdims; output_activation=nothing, paramtype=paramtype) mask = PartitionMask(dim, mask_idx) return AffineCoupling(dim, mask, s, t) end @@ -26,7 +33,8 @@ end function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat) # partition vector using 'af.mask::PartitionMask` x₁, x₂, x₃ = partition(af.mask, x) - y₁ = x₁ .* af.s(x₂) .+ af.t(x₂) + s_x₂ = af.s(x₂) + y₁ = x₁ .* exp.(s_x₂) .+ af.t(x₂) return combine(af.mask, y₁, x₂, x₃) end @@ -36,15 +44,17 @@ end function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractVector) x_1, x_2, x_3 = Bijectors.partition(af.mask, x) - y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) - logjac = sum(log ∘ abs, af.s(x_2)) # this is a scalar + s_x2 = af.s(x_2) + y_1 = exp.(s_x2) .* x_1 .+ af.t(x_2) + logjac = sum(s_x2) # this is a scalar return combine(af.mask, y_1, x_2, x_3), logjac end function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractMatrix) x_1, x_2, x_3 = Bijectors.partition(af.mask, x) - y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) - logjac = sum(log ∘ abs, af.s(x_2); dims = 1) # 1 × size(x, 2) + s_x2 = af.s(x_2) + y_1 = exp.(s_x2) .* x_1 .+ af.t(x_2) + logjac = sum(s_x2; dims=1) # 1 × size(x, 2) return combine(af.mask, y_1, x_2, x_3), vec(logjac) end @@ -56,8 +66,9 @@ function Bijectors.with_logabsdet_jacobian( # partition vector using `af.mask::PartitionMask` y_1, y_2, y_3 = partition(af.mask, y) # inverse transformation - x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) - logjac = -sum(log ∘ abs, af.s(y_2)) + s_y2 = af.s(y_2) + x_1 = (y_1 .- af.t(y_2)) .* exp.(-s_y2) + logjac = -sum(s_y2) return combine(af.mask, x_1, y_2, y_3), logjac end @@ -68,8 +79,9 @@ function Bijectors.with_logabsdet_jacobian( # partition vector using `af.mask::PartitionMask` y_1, y_2, y_3 = partition(af.mask, y) # inverse transformation - x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) - logjac = -sum(log ∘ abs, af.s(y_2); dims = 1) + s_y2 = af.s(y_2) + x_1 = (y_1 .- af.t(y_2)) .* exp.(-s_y2) + logjac = -sum(s_y2; dims=1) return combine(af.mask, x_1, y_2, y_3), vec(logjac) end @@ -104,3 +116,43 @@ end # return AffineCoupling(dim, mask, s, t) # end +""" +Default constructor of RealNVP flow layer + +single layer of realnvp flow, which is a composition of 2 affine coupling transformations +with complementary masks +""" +function RealNVP_layer( + dims::Int, # dimension of problem + hdims::AbstractVector{Int}; # dimension of hidden units for s and t + paramtype::Type{T} = Float64, # type of 
the parameters +) where {T<:AbstractFloat} + + mask_idx1 = 1:2:dims + mask_idx2 = 2:2:dims + + # by default use the odd-even masking strategy + af1 = AffineCoupling(dims, hdims, mask_idx1, paramtype) + af2 = AffineCoupling(dims, hdims, mask_idx2, paramtype) + + return reduce(∘, (af1, af2)) +end + + +function RealNVP( + dims::Int, # dimension of problem + hdims::AbstractVector{Int}, # dimension of hidden units for s and t + nlayers::Int; # number of RealNVP_layer + paramtype::Type{T} = Float64, # type of the parameters +) where {T<:AbstractFloat} + + q0 = MvNormal(zeros(dims), I) # std Gaussian as the reference distribution + Ls = [RealNVP_layer(dims, hdims; paramtype=paramtype) for _ in 1:nlayers] + + create_flow(Ls, q0) +end + +function RealNVP(dims:Int; paramtype::Type{T} = Float64) where {T<:AbstractFloat} + # default RealNVP with 10 layers, each couplling function has 2 hidden layers with 32 units + return RealNVP(dims, [32, 32], 10; paramtype=paramtype) +end diff --git a/src/flows/utils.jl b/src/flows/utils.jl index 04b9886..1ff9a69 100644 --- a/src/flows/utils.jl +++ b/src/flows/utils.jl @@ -6,21 +6,29 @@ using Flux A simple wrapper for a 3 layer dense MLP """ -function mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux.leakyrelu) - return Chain( +function mlp3( + input_dim::Int, + hidden_dims::Int, + output_dim::Int; + activation=Flux.leakyrelu, + paramtype::Type{T} = Float64 +) where {T<:AbstractFloat} + m = Chain( Flux.Dense(input_dim, hidden_dims, activation), Flux.Dense(hidden_dims, hidden_dims, activation), Flux.Dense(hidden_dims, output_dim), ) + return Flux._paramtype(paramtype, m) end """ fnn( input_dim::Int, - hidden_dims::AbstractVector{<:Int}, + hidden_dims::AbstractVector{Int}, output_dim::Int; inlayer_activation=Flux.leakyrelu, - output_activation=Flux.tanh, + output_activation=nothing, + paramtype::Type{T} = Float64, ) Create a fully connected neural network (FNN). @@ -31,17 +39,19 @@ Create a fully connected neural network (FNN). - `output_dim::Int`: The dimension of the output layer. - `inlayer_activation`: The activation function for the hidden layers. Defaults to `Flux.leakyrelu`. - `output_activation`: The activation function for the output layer. Defaults to `Flux.tanh`. +- `paramtype::Type{T} = Float64`: The type of the parameters in the network, defaults to `Float64`. # Returns - A `Flux.Chain` representing the FNN. """ function fnn( input_dim::Int, - hidden_dims::AbstractVector{<:Int}, + hidden_dims::AbstractVector{Int}, output_dim::Int; inlayer_activation=Flux.leakyrelu, - output_activation=Flux.tanh, -) + output_activation=nothing, + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} # Create a chain of dense layers # First layer layers = Any[Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] @@ -55,8 +65,14 @@ function fnn( end # Output layer - push!(layers, Flux.Dense(hidden_dims[end], output_dim, output_activation)) - return Chain(layers...) + if output_activation === nothing + push!(layers, Flux.Dense(hidden_dims[end], output_dim)) + else + push!(layers, Flux.Dense(hidden_dims[end], output_dim, output_activation)) + end + + m = Chain(layers...) 
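+    # cast all parameters of the chain to the requested float type (`paramtype`)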
+ return Flux._paramtype(paramtype, m) end function create_flow(Ls, q₀) From 34a964e484cc834bd0a1bd8625ae9ab5a27fd7ec Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 16:59:35 -0700 Subject: [PATCH 05/45] minor typo fix --- src/flows/realnvp.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index bf713ff..1edf0e5 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -152,7 +152,7 @@ function RealNVP( create_flow(Ls, q0) end -function RealNVP(dims:Int; paramtype::Type{T} = Float64) where {T<:AbstractFloat} +function RealNVP(dims::Int; paramtype::Type{T} = Float64) where {T<:AbstractFloat} # default RealNVP with 10 layers, each couplling function has 2 hidden layers with 32 units return RealNVP(dims, [32, 32], 10; paramtype=paramtype) end From 1c4118bc7038861f285ca0022c5f3e24b59dfde9 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 21:19:39 -0700 Subject: [PATCH 06/45] minor update of realnnvp constructor and add some doc --- src/NormalizingFlows.jl | 2 ++ src/flows/neuralspline.jl | 1 - src/flows/realnvp.jl | 64 ++++++++++++++++++++++++++++++--------- 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 8bbe00b..4794a9f 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -131,5 +131,7 @@ include("flows/utils.jl") include("flows/realnvp.jl") include("flows/neuralspline.jl") +export RealNVP_layer, realnvp + end diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index d9bd605..1e648b7 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -37,7 +37,6 @@ end @functor NeuralSplineLayer (nn,) -# define forward and inverse transformation """ Build a rational quadratic spline (RQS) from the nn output Bijectors.jl has implemented the inverse and logabsdetjac for rational quadratic spline diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index 1edf0e5..7b2942a 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -3,7 +3,7 @@ Default constructor of Affine Coupling flow layer following the general architecture as Eq(3) in [^AD2025] -[^AD2024]: Agrawal, J., & Domke, J. (2025). Disentangling impact of capacity, objective, batchsize, estimators, and step-size on flow VI. In *AISTATS* +[^AD2025]: Agrawal, J., & Domke, J. (2025). Disentangling impact of capacity, objective, batchsize, estimators, and step-size on flow VI. In *AISTATS* """ struct AffineCoupling <: Bijectors.Bijector dim::Int @@ -117,10 +117,21 @@ end # end """ -Default constructor of RealNVP flow layer + RealNVP_layer(dims, hdims; paramtype = Float64) -single layer of realnvp flow, which is a composition of 2 affine coupling transformations -with complementary masks +Default constructor of single layer of realnvp flow, +which is a composition of 2 affine coupling transformations with complementary masks. +The masking strategy is odd-even masking. + +# Arguments +- `dims::Int`: dimension of the problem +- `hdims::AbstractVector{Int}`: dimension of hidden units for s and t + +# Keyword Arguments +- `paramtype::Type{T} = Float64`: type of the parameters, defaults to `Float64` + +# Returns +- A `Bijectors.Bijector` representing the RealNVP layer. 
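+
+# Example
+`RealNVP_layer(4, [32, 32])` composes two `AffineCoupling` bijectors: one transforms the
+odd coordinates conditioned on the even ones, and the other transforms the even
+coordinates conditioned on the odd ones.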
""" function RealNVP_layer( dims::Int, # dimension of problem @@ -134,25 +145,50 @@ function RealNVP_layer( # by default use the odd-even masking strategy af1 = AffineCoupling(dims, hdims, mask_idx1, paramtype) af2 = AffineCoupling(dims, hdims, mask_idx2, paramtype) - return reduce(∘, (af1, af2)) end +""" + realnvp(q0, dims, hdims, nlayers; paramtype = Float64) -function RealNVP( - dims::Int, # dimension of problem +Default constructor of RealNVP flow, which is a composition of `nlayers` RealNVP_layer. +# Arguments +- `q0::Distribution{Continuous, Multivariate}`: reference distribution, e.g. `MvNormal(zeros(dims), I)` +- `dims::Int`: dimension of problem +- `hdims::AbstractVector{Int}`: dimension of hidden units for s and t +- `nlayers::Int`: number of RealNVP_layer +# Keyword Arguments +- `paramtype::Type{T} = Float64`: type of the parameters, defaults to `Float64` + +# Returns +- A `Bijectors.MultivariateTransformed` representing the RealNVP flow. + +""" + +function realnvp( + q0::Distribution{Continuous, Multivariate}, hdims::AbstractVector{Int}, # dimension of hidden units for s and t nlayers::Int; # number of RealNVP_layer paramtype::Type{T} = Float64, # type of the parameters ) where {T<:AbstractFloat} - q0 = MvNormal(zeros(dims), I) # std Gaussian as the reference distribution - Ls = [RealNVP_layer(dims, hdims; paramtype=paramtype) for _ in 1:nlayers] - + dims = length(q0) # dimension of the reference distribution == dim of the problem + Ls = [RealNVP_layer(dims, hdims; paramtype=paramtype) for _ in 1:nlayers] create_flow(Ls, q0) end -function RealNVP(dims::Int; paramtype::Type{T} = Float64) where {T<:AbstractFloat} - # default RealNVP with 10 layers, each couplling function has 2 hidden layers with 32 units - return RealNVP(dims, [32, 32], 10; paramtype=paramtype) -end +""" + realnvp(q0; paramtype = Float64) + +Default constructor of RealNVP with 10 layers, +each coupling function has 2 hidden layers with 32 units. +Following the general architecture as in [^ASD2020] (see Apdx. E). + + +[^ASD2020]: Agrawal, A., & Sheldon, D., & Domke, J. (2020). +Advances in Black-Box VI: Normalizing Flows, Importance Weighting, and Optimization. +In *NeurIPS*. +""" +realnvp(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = RealNVP( + q0, [32, 32], 10; paramtype=paramtype +) \ No newline at end of file From d6b86cb914db7d3a28eb1a43d0f59656c0ac1b23 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 21:21:26 -0700 Subject: [PATCH 07/45] fixing bug in Distribution types --- src/flows/realnvp.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index 7b2942a..7722207 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -153,7 +153,7 @@ end Default constructor of RealNVP flow, which is a composition of `nlayers` RealNVP_layer. # Arguments -- `q0::Distribution{Continuous, Multivariate}`: reference distribution, e.g. `MvNormal(zeros(dims), I)` +- `q0::Distribution{Multivariate,Continuous}`: reference distribution, e.g. 
`MvNormal(zeros(dims), I)` - `dims::Int`: dimension of problem - `hdims::AbstractVector{Int}`: dimension of hidden units for s and t - `nlayers::Int`: number of RealNVP_layer @@ -166,7 +166,7 @@ Default constructor of RealNVP flow, which is a composition of `nlayers` RealNVP """ function realnvp( - q0::Distribution{Continuous, Multivariate}, + q0::Distribution{Multivariate,Continuous}, hdims::AbstractVector{Int}, # dimension of hidden units for s and t nlayers::Int; # number of RealNVP_layer paramtype::Type{T} = Float64, # type of the parameters From aa2adeb8f6568a1059e171c337f035a05633772c Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 21:26:02 -0700 Subject: [PATCH 08/45] exclude nsf for now --- src/NormalizingFlows.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 4794a9f..44d71ee 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -129,7 +129,7 @@ end # interface of contructing common flow layers include("flows/utils.jl") include("flows/realnvp.jl") -include("flows/neuralspline.jl") +# include("flows/neuralspline.jl") export RealNVP_layer, realnvp From 00ca29cb1ed637d83e9735bf9a9e5ddc2023c5a2 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 21:29:12 -0700 Subject: [PATCH 09/45] minor ed in nsf --- src/flows/neuralspline.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index 1e648b7..c45c3d1 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -8,19 +8,19 @@ Neural Rational quadratic Spline layer [1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML], (2019). """ struct NeuralSplineLayer{T,A<:Flux.Chain} <: Bijectors.Bijector - dim::Int # dimension of input - K::Int # number of knots - n_dims_transferred::Int # number of dimensions that are transformed - nn::A # networks that parmaterize the knots and derivatives - B::T # bound of the knots + dim::Int # dimension of input + K::Int # number of knots + n_dims_transferred::Int # number of dimensions that are transformed + nn::A # networks that parmaterize the knots and derivatives + B::T # bound of the knots mask::Bijectors.PartitionMask end function NeuralSplineLayer( - dim::T1, # dimension of input - hdims::T1, # dimension of hidden units for s and t - K::T1, # number of knots - B::T2, # bound of the knots + dim::T1, # dimension of input + hdims::T1, # dimension of hidden units for s and t + K::T1, # number of knots + B::T2, # bound of the knots mask_idx::AbstractVector{<:Int}, # index of dimensione that one wants to apply transformations on ) where {T1<:Int,T2<:Real} num_of_transformed_dims = length(mask_idx) From 731e657cd3aaa2e92bb8235068905cd26b41345f Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 21:29:31 -0700 Subject: [PATCH 10/45] fix typo in realnvp --- src/flows/realnvp.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index 7722207..52f81ba 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -189,6 +189,6 @@ Following the general architecture as in [^ASD2020] (see Apdx. E). Advances in Black-Box VI: Normalizing Flows, Importance Weighting, and Optimization. In *NeurIPS*. 
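
# Example
A minimal usage sketch (the dimension and number of draws are illustrative):

    using Distributions, LinearAlgebra
    q0 = MvNormal(zeros(2), I)   # 2-d standard normal reference
    flow = realnvp(q0)           # 10 RealNVP layers; each coupling MLP has hidden sizes [32, 32]
    ys = rand(flow, 100)         # draw 100 samples from the flow
    ℓs = logpdf(flow, ys)        # log-density of the flow at each sample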
""" -realnvp(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = RealNVP( +realnvp(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = realnvp( q0, [32, 32], 10; paramtype=paramtype ) \ No newline at end of file From e4fa67b3d347f4789e94846065fb3a4cfa33a2b9 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 21:55:29 -0700 Subject: [PATCH 11/45] add realnvp test --- test/flow.jl | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 2 files changed, 66 insertions(+) create mode 100644 test/flow.jl diff --git a/test/flow.jl b/test/flow.jl new file mode 100644 index 0000000..5ad158a --- /dev/null +++ b/test/flow.jl @@ -0,0 +1,65 @@ +@testset "RealNVP flow" begin + Random.seed!(123) + + dim = 5 + nlayers = 2 + hdims = [32, 32] + for T in [Float32, Float64] + # Create a RealNVP flow + q₀ = MvNormal(zeros(T, dim), I) + @leaf MvNormal + flow = NormalizingFlows.realnvp(q₀, hdims, nlayers; paramtype=T) + + @testset "Sampling and density estimation for type: $T" begin + ys = rand(flow, 100) + ℓs = logpdf(flow, ys) + + @test size(ys) == (dim, 100) + @test length(ℓs) == 100 + + @test eltype(ys) == T + @test eltype(ℓs) == T + end + + + @testset "Inverse compatibility for type: $T" begin + x = rand(q₀) + y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x) + x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y) + + @test x ≈ x_reconstructed rtol=1e-6 + @test lj_fwd ≈ -lj_bwd rtol=1e-6 + + x_batch = rand(q₀, 10) + y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch) + x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch) + + @test x_batch ≈ x_batch_reconstructed rtol=1e-6 + @test ljs_fwd ≈ -ljs_bwd rtol=1e-6 + end + + + @testset "ELBO test for type: $T" begin + μ = randn(T, dim) + Σ = Diagonal(rand(T, dim) .+ T(1e-3)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # Define a simple log-likelihood function + logp(z) = logpdf(q₀, z) + + # Compute ELBO + batchsize = 64 + elbo_value = elbo(Random.default_rng(), flow, logp, batchsize) + elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize) + + # test elbo_value is not NaN and not Inf + @test !isnan(elbo_value) + @test !isinf(elbo_value) + @test !isnan(elbo_batch_value) + @test !isinf(elbo_batch_value) + end + + #todo add tests for ad + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 33a9808..4a63fb2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,3 +14,4 @@ using Test include("ad.jl") include("objectives.jl") include("interface.jl") +include("flow.jl") From 1dfebd1fccadcdce11e64404dc039af42f0a642c Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 22:26:50 -0700 Subject: [PATCH 12/45] export nsf layer --- src/NormalizingFlows.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 44d71ee..3071fd0 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -129,9 +129,11 @@ end # interface of contructing common flow layers include("flows/utils.jl") include("flows/realnvp.jl") -# include("flows/neuralspline.jl") +include("flows/neuralspline.jl") -export RealNVP_layer, realnvp +export create_flow +export RealNVP_layer, realnvp, AffineCoupling +export NeuralSplineLayer end From 55fb607a1409190b8658c29979157ca332ca286f Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 22:27:11 -0700 Subject: [PATCH 13/45] update demos, debugging 
mooncake with elbo --- example/Project.toml | 1 + example/demo_RealNVP.jl | 123 ++--------------------------- example/demo_neural_spline_flow.jl | 99 +---------------------- example/utils.jl | 5 -- 4 files changed, 10 insertions(+), 218 deletions(-) diff --git a/example/Project.toml b/example/Project.toml index 0b9b021..086ae58 100644 --- a/example/Project.toml +++ b/example/Project.toml @@ -17,6 +17,7 @@ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extras] CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl index 516e4c9..20eb9c5 100644 --- a/example/demo_RealNVP.jl +++ b/example/demo_RealNVP.jl @@ -11,114 +11,6 @@ using NormalizingFlows include("SyntheticTargets.jl") include("utils.jl") -################################## -# define affine coupling layer using Bijectors.jl interface -################################# -struct AffineCoupling <: Bijectors.Bijector - dim::Int - mask::Bijectors.PartitionMask - s::Flux.Chain - t::Flux.Chain -end - -# let params track field s and t -@functor AffineCoupling (s, t) - -function AffineCoupling( - dim::Int, # dimension of input - hdims::Int, # dimension of hidden units for s and t - mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on -) - cdims = length(mask_idx) # dimension of parts used to construct coupling law - s = mlp3(cdims, hdims, cdims) - t = mlp3(cdims, hdims, cdims) - mask = PartitionMask(dim, mask_idx) - return AffineCoupling(dim, mask, s, t) -end - -function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat) - # partition vector using 'af.mask::PartitionMask` - x₁, x₂, x₃ = partition(af.mask, x) - y₁ = x₁ .* af.s(x₂) .+ af.t(x₂) - return combine(af.mask, y₁, x₂, x₃) -end - -function (af::AffineCoupling)(x::AbstractArray) - return transform(af, x) -end - -function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(af.mask, x) - y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) - logjac = sum(log ∘ abs, af.s(x_2)) # this is a scalar - return combine(af.mask, y_1, x_2, x_3), logjac -end - -function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractMatrix) - x_1, x_2, x_3 = Bijectors.partition(af.mask, x) - y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) - logjac = sum(log ∘ abs, af.s(x_2); dims = 1) # 1 × size(x, 2) - return combine(af.mask, y_1, x_2, x_3), vec(logjac) -end - - -function Bijectors.with_logabsdet_jacobian( - iaf::Inverse{<:AffineCoupling}, y::AbstractVector -) - af = iaf.orig - # partition vector using `af.mask::PartitionMask` - y_1, y_2, y_3 = partition(af.mask, y) - # inverse transformation - x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) - logjac = -sum(log ∘ abs, af.s(y_2)) - return combine(af.mask, x_1, y_2, y_3), logjac -end - -function Bijectors.with_logabsdet_jacobian( - iaf::Inverse{<:AffineCoupling}, y::AbstractMatrix -) - af = iaf.orig - # partition vector using `af.mask::PartitionMask` - y_1, y_2, y_3 = partition(af.mask, y) - # inverse transformation - x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) - logjac = -sum(log ∘ abs, af.s(y_2); dims = 1) - return combine(af.mask, x_1, y_2, y_3), vec(logjac) -end - -################### -# an equivalent definition of AffineCoupling using Bijectors.Coupling -# (see 
https://github.com/TuringLang/Bijectors.jl/blob/74d52d4eda72a6149b1a89b72524545525419b3f/src/bijectors/coupling.jl#L188C1-L188C1) -################### - -# struct AffineCoupling <: Bijectors.Bijector -# dim::Int -# mask::Bijectors.PartitionMask -# s::Flux.Chain -# t::Flux.Chain -# end - -# # let params track field s and t -# @functor AffineCoupling (s, t) - -# function AffineCoupling(dim, mask, s, t) -# return Bijectors.Coupling(θ -> Bijectors.Shift(t(θ)) ∘ Bijectors.Scale(s(θ)), mask) -# end - -# function AffineCoupling( -# dim::Int, # dimension of input -# hdims::Int, # dimension of hidden units for s and t -# mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on -# ) -# cdims = length(mask_idx) # dimension of parts used to construct coupling law -# s = mlp3(cdims, hdims, cdims) -# t = mlp3(cdims, hdims, cdims) -# mask = PartitionMask(dim, mask_idx) -# return AffineCoupling(dim, mask, s, t) -# end - - - ################################## # start demo ################################# @@ -132,29 +24,30 @@ T = Float32 target = Banana(2, 1.0f0, 100.0f0) logp = Base.Fix1(logpdf, target) + ###################################### # learn the target using Affine coupling flow ###################################### @leaf MvNormal -q0 = MvNormal(zeros(T, 2), ones(T, 2)) +q0 = MvNormal(zeros(T, 2), I) d = 2 -hdims = 32 - -# alternating the coupling layers -Ls = [AffineCoupling(d, hdims, [1]) ∘ AffineCoupling(d, hdims, [2]) for i in 1:3] +hdims = [16, 16] +nlayers = 3 -flow = create_flow(Ls, q0) +# use NormalizingFlows.realnvp to create a RealNVP flow +flow = realnvp(q0, hdims, nlayers; paramtype=T) flow_untrained = deepcopy(flow) ###################################### # start training ###################################### -sample_per_iter = 64 +sample_per_iter = 16 # callback function to log training progress cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) +# TODO: now using AutoMooncake the example broke, but AutoZygote works, need to debug adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( diff --git a/example/demo_neural_spline_flow.jl b/example/demo_neural_spline_flow.jl index ffeba09..3ad8b86 100644 --- a/example/demo_neural_spline_flow.jl +++ b/example/demo_neural_spline_flow.jl @@ -11,104 +11,6 @@ using NormalizingFlows include("SyntheticTargets.jl") include("utils.jl") -################################## -# define neural spline layer using Bijectors.jl interface -################################# -""" -Neural Rational quadratic Spline layer - -# References -[1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML], (2019). 
-""" -struct NeuralSplineLayer{T,A<:Flux.Chain} <: Bijectors.Bijector - dim::Int # dimension of input - K::Int # number of knots - n_dims_transferred::Int # number of dimensions that are transformed - nn::A # networks that parmaterize the knots and derivatives - B::T # bound of the knots - mask::Bijectors.PartitionMask -end - -function NeuralSplineLayer( - dim::T1, # dimension of input - hdims::T1, # dimension of hidden units for s and t - K::T1, # number of knots - B::T2, # bound of the knots - mask_idx::AbstractVector{<:Int}, # index of dimensione that one wants to apply transformations on -) where {T1<:Int,T2<:Real} - num_of_transformed_dims = length(mask_idx) - input_dims = dim - num_of_transformed_dims - - # output dim of the NN - output_dims = (3K - 1)*num_of_transformed_dims - # one big mlp that outputs all the knots and derivatives for all the transformed dimensions - nn = mlp3(input_dims, hdims, output_dims) - - mask = Bijectors.PartitionMask(dim, mask_idx) - return NeuralSplineLayer(dim, K, num_of_transformed_dims, nn, B, mask) -end - -@functor NeuralSplineLayer (nn,) - -# define forward and inverse transformation -""" -Build a rational quadratic spline from the nn output -Bijectors.jl has implemented the inverse and logabsdetjac for rational quadratic spline - -we just need to map the nn output to the knots and derivatives of the RQS -""" -function instantiate_rqs(nsl::NeuralSplineLayer, x::AbstractVector) - K, B = nsl.K, nsl.B - nnoutput = reshape(nsl.nn(x), nsl.n_dims_transferred, :) - ws = @view nnoutput[:, 1:K] - hs = @view nnoutput[:, (K + 1):(2K)] - ds = @view nnoutput[:, (2K + 1):(3K - 1)] - return Bijectors.RationalQuadraticSpline(ws, hs, ds, B) -end - -function Bijectors.transform(nsl::NeuralSplineLayer, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) - # instantiate rqs knots and derivatives - rqs = instantiate_rqs(nsl, x_2) - y_1 = Bijectors.transform(rqs, x_1) - return Bijectors.combine(nsl.mask, y_1, x_2, x_3) -end - -function Bijectors.transform(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) - nsl = insl.orig - y1, y2, y3 = partition(nsl.mask, y) - rqs = instantiate_rqs(nsl, y2) - x1 = Bijectors.transform(Inverse(rqs), y1) - return Bijectors.combine(nsl.mask, x1, y2, y3) -end - -function (nsl::NeuralSplineLayer)(x::AbstractVector) - return Bijectors.transform(nsl, x) -end - -# define logabsdetjac -function Bijectors.logabsdetjac(nsl::NeuralSplineLayer, x::AbstractVector) - x_1, x_2, _ = Bijectors.partition(nsl.mask, x) - rqs = instantiate_rqs(nsl, x_2) - logjac = logabsdetjac(rqs, x_1) - return logjac -end - -function Bijectors.logabsdetjac(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) - nsl = insl.orig - y1, y2, _ = partition(nsl.mask, y) - rqs = instantiate_rqs(nsl, y2) - logjac = logabsdetjac(Inverse(rqs), y1) - return logjac -end - -function Bijectors.with_logabsdet_jacobian(nsl::NeuralSplineLayer, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) - rqs = instantiate_rqs(nsl, x_2) - y_1, logjac = with_logabsdet_jacobian(rqs, x_1) - return Bijectors.combine(nsl.mask, y_1, x_2, x_3), logjac -end - ################################## # start demo ################################# @@ -148,6 +50,7 @@ sample_per_iter = 64 # callback function to log training progress cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) +# TODO: now using AutoMooncake the example broke, but AutoZygote works, need to debug adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) checkconv(iter, stat, re, θ, 
st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( diff --git a/example/utils.jl b/example/utils.jl index b5d19ca..fb28810 100644 --- a/example/utils.jl +++ b/example/utils.jl @@ -13,11 +13,6 @@ function mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux ) end -function create_flow(Ls, q₀) - ts = reduce(∘, Ls) - return transformed(q₀, ts) -end - function compare_trained_and_untrained_flow( flow_trained::Bijectors.MultivariateTransformed, flow_untrained::Bijectors.MultivariateTransformed, From a2f6fbee34f01b4818d42bf9777d4315d664b08c Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 13 Jul 2025 22:32:09 -0700 Subject: [PATCH 14/45] add AD tests for realnvp elbo --- test/ad.jl | 47 +++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 2 +- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/test/ad.jl b/test/ad.jl index 725b354..0fdfa2b 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -73,3 +73,50 @@ end end end end + + +@testset "AD for ELBO on realnvp" begin + @testset "$at" for at in [ + ADTypes.AutoZygote(), + ADTypes.AutoForwardDiff(), + ADTypes.AutoReverseDiff(; compile=false), + ADTypes.AutoEnzyme(; + mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + function_annotation=Enzyme.Const, + ), + ADTypes.AutoMooncake(; config=Mooncake.Config()), + ] + @testset "$T" for T in [Float32, Float64] + μ = 10 * ones(T, 2) + Σ = Diagonal(4 * ones(T, 2)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # necessary for Zygote/mooncake to differentiate through the flow + # prevent updating params of q0 + @leaf MvNormal + q₀ = MvNormal(zeros(T, 2), ones(T, 2)) + flow = realnvp(q₀, [8, 8], 3; paramtype=T) + + θ, re = Optimisers.destructure(flow) + + # check grad computation for elbo + function loss(θ, rng, logp, sample_per_iter) + return -NormalizingFlows.elbo_batch(rng, re(θ), logp, sample_per_iter) + end + + rng = Random.default_rng() + sample_per_iter = 10 + + prep = NormalizingFlows._prepare_gradient( + loss, at, θ, rng, logp, sample_per_iter + ) + value, grad = NormalizingFlows._value_and_gradient( + loss, prep, at, θ, rng, logp, sample_per_iter + ) + + @test value !== nothing + @test all(grad .!= nothing) + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 4a63fb2..2808fb1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,7 +11,7 @@ import DifferentiationInterface as DI using Test -include("ad.jl") include("objectives.jl") include("interface.jl") include("flow.jl") +include("ad.jl") From e39b8a8fe1c4df71ede36f8163323709472dd1b3 Mon Sep 17 00:00:00 2001 From: zuhengxu Date: Wed, 23 Jul 2025 16:37:58 -0700 Subject: [PATCH 15/45] wip debug mooncake on coupling layers --- Project.toml | 4 +- example/Project.toml | 2 + example/demo_RealNVP.jl | 4 +- example/test_n.jl | 167 ++++++++++++++++++++++++++++++++++++++++ src/flows/realnvp.jl | 4 +- test/Project.toml | 2 +- test/ad.jl | 2 +- 7 files changed, 178 insertions(+), 7 deletions(-) create mode 100644 example/test_n.jl diff --git a/Project.toml b/Project.toml index 3812492..30b5039 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "NormalizingFlows" uuid = "50e4474d-9f12-44b7-af7a-91ab30ff6256" -version = "0.2.1" +version = "0.2.2" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" @@ -34,4 +34,4 @@ Functors = "0.5.2" Optimisers = "0.2.16, 0.3, 0.4" ProgressMeter = "1.0.0" StatsBase = "0.33, 0.34" -julia = "1.10" +julia = "1.11" diff --git a/example/Project.toml b/example/Project.toml index 086ae58..ef94533 100644 --- 
a/example/Project.toml +++ b/example/Project.toml @@ -3,8 +3,10 @@ ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl index 20eb9c5..67d2664 100644 --- a/example/demo_RealNVP.jl +++ b/example/demo_RealNVP.jl @@ -48,7 +48,9 @@ sample_per_iter = 16 # callback function to log training progress cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) # TODO: now using AutoMooncake the example broke, but AutoZygote works, need to debug -adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) +adtype = ADTypes.AutoMooncake(; config = nothing) +# adtype = ADTypes.AutoZygote() + checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( rng, diff --git a/example/test_n.jl b/example/test_n.jl new file mode 100644 index 0000000..595e96c --- /dev/null +++ b/example/test_n.jl @@ -0,0 +1,167 @@ +using Flux +using Bijectors +using Bijectors: partition, combine, PartitionMask + +using Random, Distributions, LinearAlgebra +using Functors +using Optimisers, ADTypes +using Mooncake, Zygote, Enzyme, ADTypes +import NormalizingFlows as NF + +import DifferentiationInterface as DI + + +pt = Float64 +inputdim = 4 +outputdim = 3 + +x = randn(pt, inputdim) + +bs = 64 +xs = randn(pt, inputdim, 64) + +# compose two fully connected networks +m1 = NF.fnn(inputdim, [16, 16], outputdim; output_activation=nothing, paramtype=pt) +m2 = NF.fnn(outputdim, [16, 16], inputdim; output_activation=Flux.tanh, paramtype=pt) +mm = reduce(∘, (m2, m1)) +psm, stm = Optimisers.destructure(mm) + +function lsm(ps, st, x) + model = st(ps) + y = model(x) + return sum(y) # just a dummy loss +end + +adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) + +val, grad = DI.value_and_gradient( + lsm, adtype, + psm, DI.Cache(stm), DI.Constant(xs) +) + + +acl = NF.AffineCoupling( inputdim, [16, 16], 1:2:inputdim, pt) +psacl,stacl = Optimisers.destructure(acl) + +function loss(ps, st, x) + model = st(ps) + y = model(x) + return sum(y) # just a dummy loss +end + +val, grad = DI.value_and_gradient( + loss, + ADTypes.AutoEnzyme(; + mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + function_annotation=Enzyme.Const, + ), + psacl, DI.Cache(stacl), DI.Constant(x) +) + +# val, grad = DI.value_and_gradient( +# loss, +# ADTypes.AutoMooncake(; config = Mooncake.Config()), +# psacl, DI.Cache(stacl), DI.Constant(x) +# ) + +function loss_acl_manual(ps, st, x) + acl = st(ps) + s_net = acl.s + t_net = acl.t + mask = acl.mask + x₁, x₂, x₃ = partition(mask, x) + y₁ = exp.(s_net(x₂)) .* x₁ .+ t_net(x₂) + y = combine(mask, y₁, x₂, x₃) + # println("y = ", y) + return sum(y) +end + +val, grad = DI.value_and_gradient( + loss_acl_manual, + # ADTypes.AutoMooncake(; config = Mooncake.Config()), + # ADTypes.AutoEnzyme(; + # mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + # function_annotation=Enzyme.Const, + # ), + psacl, DI.Cache(stacl), DI.Constant(x) +) + + + +function mlp3( + input_dim::Int, + 
hidden_dims::Int, + output_dim::Int; + activation=Flux.leakyrelu, + paramtype::Type{T} = Float64 +) where {T<:AbstractFloat} + m = Chain( + Flux.Dense(input_dim, hidden_dims, activation), + Flux.Dense(hidden_dims, hidden_dims, activation), + Flux.Dense(hidden_dims, output_dim), + ) + return Flux._paramtype(paramtype, m) +end + +function ls_msk(ps, st, x, mask) + t_net = st(ps) + x₁, x₂, x₃ = partition(mask, x) + y₁ = x₁ .+ t_net(x₂) + y = combine(mask, y₁, x₂, x₃) + # println("y = ", y) + return sum(abs2, y) +end + +inputdim = 4 +mask_idx = 1:2:inputdim +mask = PartitionMask(inputdim, mask_idx) +cdim = length(mask_idx) + +x = randn(inputdim) + +t_net = mlp3(cdim, 16, cdim; paramtype = Float64) +ps, st = Optimisers.destructure(t_net) + +ls_msk(ps, st, x, mask) # 3.0167880799441793 + +val, grad = DI.value_and_gradient( + ls_msk, + ADTypes.AutoMooncake(; config = Mooncake.Config()), + ps, DI.Cache(st), DI.Constant(x), DI.Constant(mask) +) + + +struct ACL + mask::Bijectors.PartitionMask + t::Flux.Chain +end +@functor ACL (t, ) + +acl = ACL(mask, t_net) +psacl, stacl = Optimisers.destructure(acl) + +function loss_acl(ps, st, x) + acl = st(ps) + t_net = acl.t + mask = acl.mask + x₁, x₂, x₃ = partition(mask, x) + y₁ = x₁ .+ t_net(x₂) + y = combine(mask, y₁, x₂, x₃) + return sum(abs2, y) +end +loss_acl(psacl, stacl, x) # 3.0167880799441793 + +val, grad = DI.value_and_gradient( + loss_acl, + ADTypes.AutoEnzyme(; + mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + function_annotation=Enzyme.Const, + ), + psacl, DI.Cache(stacl), DI.Constant(x) +) + +val, grad = DI.value_and_gradient( + loss_acl, + ADTypes.AutoMooncake(; config = Mooncake.Config()), + psacl, DI.Cache(stacl), DI.Constant(x) +) diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index 52f81ba..0d1e6a7 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -38,7 +38,7 @@ function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat) return combine(af.mask, y₁, x₂, x₃) end -function (af::AffineCoupling)(x::AbstractArray) +function (af::AffineCoupling)(x::AbstractVecOrMat) return transform(af, x) end @@ -191,4 +191,4 @@ In *NeurIPS*. 
""" realnvp(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = realnvp( q0, [32, 32], 10; paramtype=paramtype -) \ No newline at end of file +) diff --git a/test/Project.toml b/test/Project.toml index be5ffa0..27816cd 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -17,4 +17,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -Mooncake = "0.4.101" +Mooncake = "0.4.140" diff --git a/test/ad.jl b/test/ad.jl index 0fdfa2b..4a59f5f 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -84,7 +84,7 @@ end mode=Enzyme.set_runtime_activity(Enzyme.Reverse), function_annotation=Enzyme.Const, ), - ADTypes.AutoMooncake(; config=Mooncake.Config()), + # ADTypes.AutoMooncake(; config=nothing), ] @testset "$T" for T in [Float32, Float64] μ = 10 * ones(T, 2) From 84ce45f9523b8d4a1f8bf586779d86e54bef34cb Mon Sep 17 00:00:00 2001 From: Zuheng Date: Fri, 25 Jul 2025 13:22:01 -0700 Subject: [PATCH 16/45] found that bug revealed by mooncake 0.4.124 --- test/Project.toml | 3 --- test/ad.jl | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index 27816cd..66f38e3 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -15,6 +15,3 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" - -[compat] -Mooncake = "0.4.140" diff --git a/test/ad.jl b/test/ad.jl index 4a59f5f..0fdfa2b 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -84,7 +84,7 @@ end mode=Enzyme.set_runtime_activity(Enzyme.Reverse), function_annotation=Enzyme.Const, ), - # ADTypes.AutoMooncake(; config=nothing), + ADTypes.AutoMooncake(; config=Mooncake.Config()), ] @testset "$T" for T in [Float32, Float64] μ = 10 * ones(T, 2) From 9ba0a3b192b06394dbc3eda761ec1b0e13c45149 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Thu, 31 Jul 2025 15:44:55 -0700 Subject: [PATCH 17/45] add compat mooncake v0.4.142, fixed the autograd error on nested struct --- example/Project.toml | 3 + example/test_n.jl | 167 ------------------------------------------- 2 files changed, 3 insertions(+), 167 deletions(-) delete mode 100644 example/test_n.jl diff --git a/example/Project.toml b/example/Project.toml index ef94533..0082b0b 100644 --- a/example/Project.toml +++ b/example/Project.toml @@ -23,3 +23,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extras] CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" + +[compat] +Mooncake = "0.4.142" diff --git a/example/test_n.jl b/example/test_n.jl deleted file mode 100644 index 595e96c..0000000 --- a/example/test_n.jl +++ /dev/null @@ -1,167 +0,0 @@ -using Flux -using Bijectors -using Bijectors: partition, combine, PartitionMask - -using Random, Distributions, LinearAlgebra -using Functors -using Optimisers, ADTypes -using Mooncake, Zygote, Enzyme, ADTypes -import NormalizingFlows as NF - -import DifferentiationInterface as DI - - -pt = Float64 -inputdim = 4 -outputdim = 3 - -x = randn(pt, inputdim) - -bs = 64 -xs = randn(pt, inputdim, 64) - -# compose two fully connected networks -m1 = NF.fnn(inputdim, [16, 16], outputdim; output_activation=nothing, paramtype=pt) -m2 = NF.fnn(outputdim, [16, 16], inputdim; output_activation=Flux.tanh, paramtype=pt) -mm = reduce(∘, (m2, m1)) -psm, stm = Optimisers.destructure(mm) - -function lsm(ps, st, x) - model = st(ps) - y = model(x) - return sum(y) # just a dummy loss -end - -adtype = ADTypes.AutoMooncake(; config = 
Mooncake.Config()) - -val, grad = DI.value_and_gradient( - lsm, adtype, - psm, DI.Cache(stm), DI.Constant(xs) -) - - -acl = NF.AffineCoupling( inputdim, [16, 16], 1:2:inputdim, pt) -psacl,stacl = Optimisers.destructure(acl) - -function loss(ps, st, x) - model = st(ps) - y = model(x) - return sum(y) # just a dummy loss -end - -val, grad = DI.value_and_gradient( - loss, - ADTypes.AutoEnzyme(; - mode=Enzyme.set_runtime_activity(Enzyme.Reverse), - function_annotation=Enzyme.Const, - ), - psacl, DI.Cache(stacl), DI.Constant(x) -) - -# val, grad = DI.value_and_gradient( -# loss, -# ADTypes.AutoMooncake(; config = Mooncake.Config()), -# psacl, DI.Cache(stacl), DI.Constant(x) -# ) - -function loss_acl_manual(ps, st, x) - acl = st(ps) - s_net = acl.s - t_net = acl.t - mask = acl.mask - x₁, x₂, x₃ = partition(mask, x) - y₁ = exp.(s_net(x₂)) .* x₁ .+ t_net(x₂) - y = combine(mask, y₁, x₂, x₃) - # println("y = ", y) - return sum(y) -end - -val, grad = DI.value_and_gradient( - loss_acl_manual, - # ADTypes.AutoMooncake(; config = Mooncake.Config()), - # ADTypes.AutoEnzyme(; - # mode=Enzyme.set_runtime_activity(Enzyme.Reverse), - # function_annotation=Enzyme.Const, - # ), - psacl, DI.Cache(stacl), DI.Constant(x) -) - - - -function mlp3( - input_dim::Int, - hidden_dims::Int, - output_dim::Int; - activation=Flux.leakyrelu, - paramtype::Type{T} = Float64 -) where {T<:AbstractFloat} - m = Chain( - Flux.Dense(input_dim, hidden_dims, activation), - Flux.Dense(hidden_dims, hidden_dims, activation), - Flux.Dense(hidden_dims, output_dim), - ) - return Flux._paramtype(paramtype, m) -end - -function ls_msk(ps, st, x, mask) - t_net = st(ps) - x₁, x₂, x₃ = partition(mask, x) - y₁ = x₁ .+ t_net(x₂) - y = combine(mask, y₁, x₂, x₃) - # println("y = ", y) - return sum(abs2, y) -end - -inputdim = 4 -mask_idx = 1:2:inputdim -mask = PartitionMask(inputdim, mask_idx) -cdim = length(mask_idx) - -x = randn(inputdim) - -t_net = mlp3(cdim, 16, cdim; paramtype = Float64) -ps, st = Optimisers.destructure(t_net) - -ls_msk(ps, st, x, mask) # 3.0167880799441793 - -val, grad = DI.value_and_gradient( - ls_msk, - ADTypes.AutoMooncake(; config = Mooncake.Config()), - ps, DI.Cache(st), DI.Constant(x), DI.Constant(mask) -) - - -struct ACL - mask::Bijectors.PartitionMask - t::Flux.Chain -end -@functor ACL (t, ) - -acl = ACL(mask, t_net) -psacl, stacl = Optimisers.destructure(acl) - -function loss_acl(ps, st, x) - acl = st(ps) - t_net = acl.t - mask = acl.mask - x₁, x₂, x₃ = partition(mask, x) - y₁ = x₁ .+ t_net(x₂) - y = combine(mask, y₁, x₂, x₃) - return sum(abs2, y) -end -loss_acl(psacl, stacl, x) # 3.0167880799441793 - -val, grad = DI.value_and_gradient( - loss_acl, - ADTypes.AutoEnzyme(; - mode=Enzyme.set_runtime_activity(Enzyme.Reverse), - function_annotation=Enzyme.Const, - ), - psacl, DI.Cache(stacl), DI.Constant(x) -) - -val, grad = DI.value_and_gradient( - loss_acl, - ADTypes.AutoMooncake(; config = Mooncake.Config()), - psacl, DI.Cache(stacl), DI.Constant(x) -) From 81000b9257d6b754bcc281a7d7fc9dcf1e7eb82a Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sat, 2 Aug 2025 16:57:56 -0700 Subject: [PATCH 18/45] add mooncake compat >= v0.4.142 --- Project.toml | 2 +- example/Project.toml | 7 ++++--- test/Project.toml | 4 ++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 30b5039..100e236 100644 --- a/Project.toml +++ b/Project.toml @@ -34,4 +34,4 @@ Functors = "0.5.2" Optimisers = "0.2.16, 0.3, 0.4" ProgressMeter = "1.0.0" StatsBase = "0.33, 0.34" -julia = "1.11" +julia = "1.10.8" diff --git 
a/example/Project.toml b/example/Project.toml index 0082b0b..945bb6c 100644 --- a/example/Project.toml +++ b/example/Project.toml @@ -18,11 +18,12 @@ Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" -[extras] -CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" - [compat] Mooncake = "0.4.142" + +[extras] +CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" diff --git a/test/Project.toml b/test/Project.toml index 66f38e3..5473818 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -15,3 +15,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + +[compat] +Mooncake = "0.4.142" + From 4caae4981748c5a8350d5252536742b7d1f05864 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sat, 2 Aug 2025 23:56:25 -0700 Subject: [PATCH 19/45] add nsf interface --- example/Project.toml | 3 + example/demo_RealNVP.jl | 10 ++-- example/demo_neural_spline_flow.jl | 48 +++++++++++----- src/NormalizingFlows.jl | 4 +- src/flows/neuralspline.jl | 89 ++++++++++++++++++++++++------ 5 files changed, 115 insertions(+), 39 deletions(-) diff --git a/example/Project.toml b/example/Project.toml index 945bb6c..db875db 100644 --- a/example/Project.toml +++ b/example/Project.toml @@ -2,6 +2,7 @@ ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" +ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" @@ -9,9 +10,11 @@ DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c" +MonotonicSplines = "568f7cb4-8305-41bc-b90d-d32b39cc99d1" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl index 67d2664..62944d7 100644 --- a/example/demo_RealNVP.jl +++ b/example/demo_RealNVP.jl @@ -5,7 +5,7 @@ using Bijectors: partition, combine, PartitionMask using Random, Distributions, LinearAlgebra using Functors using Optimisers, ADTypes -using Mooncake +using Mooncake, Zygote using NormalizingFlows include("SyntheticTargets.jl") @@ -47,18 +47,16 @@ sample_per_iter = 16 # callback function to log training progress cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) -# TODO: now using AutoMooncake the example broke, but AutoZygote works, need to debug -adtype = ADTypes.AutoMooncake(; config = nothing) -# adtype = ADTypes.AutoZygote() +adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) checkconv(iter, stat, re, θ, st) = stat.gradient_norm < 
one(T)/1000 flow_trained, stats, _ = train_flow( rng, - elbo_batch, # using elbo_batch instead of elbo achieves 4-5 times speedup + elbo, # using elbo_batch instead of elbo achieves 4-5 times speedup flow, logp, sample_per_iter; - max_iters=100, # change to larger number of iterations (e.g., 50_000) for better results + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results optimiser=Optimisers.Adam(5e-4), ADbackend=adtype, show_progress=true, diff --git a/example/demo_neural_spline_flow.jl b/example/demo_neural_spline_flow.jl index 3ad8b86..d3658f8 100644 --- a/example/demo_neural_spline_flow.jl +++ b/example/demo_neural_spline_flow.jl @@ -28,18 +28,9 @@ logp = Base.Fix1(logpdf, target) # learn the target using Affine coupling flow ###################################### @leaf MvNormal -q0 = MvNormal(zeros(T, 2), ones(T, 2)) - -d = 2 -hdims = 64 -K = 10 -B = 30 -Ls = [ - NeuralSplineLayer(d, hdims, K, B, [1]) ∘ NeuralSplineLayer(d, hdims, K, B, [2]) for - i in 1:3 -] - -flow = create_flow(Ls, q0) +q0 = MvNormal(zeros(T, 2), I) + +flow = nsf(q0; paramtype=Float32) flow_untrained = deepcopy(flow) @@ -50,7 +41,6 @@ sample_per_iter = 64 # callback function to log training progress cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) -# TODO: now using AutoMooncake the example broke, but AutoZygote works, need to debug adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( @@ -73,3 +63,35 @@ losses = map(x -> x.loss, stats) ###################################### plot(losses; label="Loss", linewidth=2) # plot the loss compare_trained_and_untrained_flow(flow_trained, flow_untrained, target, 1000) + + + + + + + + + +# using MonotonicSplines, Plots, InverseFunctions, ChangesOfVariables + +# f = rand(RQSpline) +# f.pX, f.pY, f.dYdX + +# plot(f, xlims = (-6, 6)); plot!(inverse(f), xlims = (-6, 6)) + +# x = 1.2 +# y = f(x) +# with_logabsdet_jacobian(f, x) +# inverse(f)(y) +# with_logabsdet_jacobian(inverse(f), y) + + + +# # test auto grad +# function loss(x) +# y, laj = MonotonicSplines.rqs_forward(x, f.pX, f.pY, f.dYdX) +# return laj + 0.5 * sum((y .- 1).^2) +# end + +# xx = rand() +# val, g = DifferentiationInterface.value_and_gradient(loss, adtype, xx) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 3071fd0..71217fb 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -132,8 +132,8 @@ include("flows/realnvp.jl") include("flows/neuralspline.jl") export create_flow -export RealNVP_layer, realnvp, AffineCoupling -export NeuralSplineLayer +export AffineCoupling, RealNVP_layer, realnvp +export NeuralSplineCoupling, NSF_layer, nsf end diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index c45c3d1..01a1dbd 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -1,41 +1,40 @@ -################################## -# define neural spline layer using Bijectors.jl interface -################################# """ Neural Rational quadratic Spline layer # References [1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML], (2019). 
""" -struct NeuralSplineLayer{T,A<:Flux.Chain} <: Bijectors.Bijector +struct NeuralSplineCoupling{T,A<:Flux.Chain} <: Bijectors.Bijector dim::Int # dimension of input K::Int # number of knots n_dims_transferred::Int # number of dimensions that are transformed - nn::A # networks that parmaterize the knots and derivatives B::T # bound of the knots + nn::A # networks that parmaterize the knots and derivatives mask::Bijectors.PartitionMask end -function NeuralSplineLayer( +function NeuralSplineCoupling( dim::T1, # dimension of input - hdims::T1, # dimension of hidden units for s and t + hdims::AbstractVector{T1}, # dimension of hidden units for s and t K::T1, # number of knots B::T2, # bound of the knots mask_idx::AbstractVector{<:Int}, # index of dimensione that one wants to apply transformations on -) where {T1<:Int,T2<:Real} + paramtype::Type{T3}, # type of the parameters, e.g., Float64 or Float32 +) where {T1<:Int,T2<:Real,T3<:AbstractFloat} num_of_transformed_dims = length(mask_idx) input_dims = dim - num_of_transformed_dims # output dim of the NN output_dims = (3K - 1)*num_of_transformed_dims # one big mlp that outputs all the knots and derivatives for all the transformed dimensions - nn = mlp3(input_dims, hdims, output_dims) + # todo: ensure type stability + nn = fnn(input_dims, hdims, output_dims; output_activation=nothing, paramtype=paramtype) mask = Bijectors.PartitionMask(dim, mask_idx) - return NeuralSplineLayer(dim, K, num_of_transformed_dims, nn, B, mask) + return NeuralSplineCoupling(dim, K, num_of_transformed_dims, B, nn, mask) end -@functor NeuralSplineLayer (nn,) +@functor NeuralSplineCoupling (nn,) """ Build a rational quadratic spline (RQS) from the nn output @@ -43,7 +42,7 @@ Bijectors.jl has implemented the inverse and logabsdetjac for rational quadratic we just need to map the nn output to the knots and derivatives of the RQS """ -function instantiate_rqs(nsl::NeuralSplineLayer, x::AbstractVector) +function instantiate_rqs(nsl::NeuralSplineCoupling, x::AbstractVector) K, B = nsl.K, nsl.B nnoutput = reshape(nsl.nn(x), nsl.n_dims_transferred, :) ws = @view nnoutput[:, 1:K] @@ -52,7 +51,7 @@ function instantiate_rqs(nsl::NeuralSplineLayer, x::AbstractVector) return Bijectors.RationalQuadraticSpline(ws, hs, ds, B) end -function Bijectors.transform(nsl::NeuralSplineLayer, x::AbstractVector) +function Bijectors.transform(nsl::NeuralSplineCoupling, x::AbstractVector) x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) # instantiate rqs knots and derivatives rqs = instantiate_rqs(nsl, x_2) @@ -60,7 +59,7 @@ function Bijectors.transform(nsl::NeuralSplineLayer, x::AbstractVector) return Bijectors.combine(nsl.mask, y_1, x_2, x_3) end -function Bijectors.transform(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) +function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) nsl = insl.orig y1, y2, y3 = partition(nsl.mask, y) rqs = instantiate_rqs(nsl, y2) @@ -68,19 +67,19 @@ function Bijectors.transform(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVect return Bijectors.combine(nsl.mask, x1, y2, y3) end -function (nsl::NeuralSplineLayer)(x::AbstractVector) +function (nsl::NeuralSplineCoupling)(x::AbstractVector) return Bijectors.transform(nsl, x) end # define logabsdetjac -function Bijectors.logabsdetjac(nsl::NeuralSplineLayer, x::AbstractVector) +function Bijectors.logabsdetjac(nsl::NeuralSplineCoupling, x::AbstractVector) x_1, x_2, _ = Bijectors.partition(nsl.mask, x) rqs = instantiate_rqs(nsl, x_2) logjac = logabsdetjac(rqs, x_1) return logjac end 
-function Bijectors.logabsdetjac(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) +function Bijectors.logabsdetjac(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) nsl = insl.orig y1, y2, _ = partition(nsl.mask, y) rqs = instantiate_rqs(nsl, y2) @@ -88,10 +87,64 @@ function Bijectors.logabsdetjac(insl::Inverse{<:NeuralSplineLayer}, y::AbstractV return logjac end -function Bijectors.with_logabsdet_jacobian(nsl::NeuralSplineLayer, x::AbstractVector) +function Bijectors.with_logabsdet_jacobian(nsl::NeuralSplineCoupling, x::AbstractVector) x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) rqs = instantiate_rqs(nsl, x_2) y_1, logjac = with_logabsdet_jacobian(rqs, x_1) return Bijectors.combine(nsl.mask, y_1, x_2, x_3), logjac end + +""" + NSF_layer(dims, hdims; paramtype = Float64) + +Default constructor of single layer of Neural Spline Flow (NSF) +which is a composition of 2 neural spline coupling transformations with complementary masks. +The masking strategy is odd-even masking. + +# Arguments +- `dims::Int`: dimension of the problem +- `hdims::AbstractVector{Int}`: dimension of hidden units for s and t +- `K::Int`: number of knots +- `B::AbstractFloat`: bound of the knots + +# Keyword Arguments +- `paramtype::Type{T} = Float64`: type of the parameters, defaults to `Float64` + +# Returns +- A `Bijectors.Bijector` representing the NSF layer. +""" +function NSF_layer( + dims::T1, # dimension of problem + hdims::AbstractVector{T1}, # dimension of hidden units for nn + K::T1, # number of knots + B::T2; # bound of the knots + paramtype::Type{T2} = Float64, # type of the parameters +) where {T1<:Int,T2<:AbstractFloat} + + mask_idx1 = 1:2:dims + mask_idx2 = 2:2:dims + + # by default use the odd-even masking strategy + nsf1 = NeuralSplineCoupling(dims, hdims, K, B, mask_idx1, paramtype) + nsf2 = NeuralSplineCoupling(dims, hdims, K, B, mask_idx2, paramtype) + return reduce(∘, (nsf1, nsf2)) +end + +function nsf( + q0::Distribution{Multivariate,Continuous}, + hdims::AbstractVector{Int}, # dimension of hidden units for s and t + K::Int, + B::T, + nlayers::Int; # number of RealNVP_layer + paramtype::Type{T} = Float64, # type of the parameters +) where {T<:AbstractFloat} + + dims = length(q0) # dimension of the reference distribution == dim of the problem + Ls = [NSF_layer(dims, hdims, K, B; paramtype=paramtype) for _ in 1:nlayers] + create_flow(Ls, q0) +end + +nsf(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = nsf( + q0, [32, 32], 10, 30*one(T), 10; paramtype=paramtype +) From cf2e6741d0ad79b040b0d5bc9822a9ec1ca9377d Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 13:37:40 -0700 Subject: [PATCH 20/45] fix a typo in elbo_batch signiture --- src/objectives/elbo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/objectives/elbo.jl b/src/objectives/elbo.jl index d282e1e..6a4d4ba 100644 --- a/src/objectives/elbo.jl +++ b/src/objectives/elbo.jl @@ -77,5 +77,5 @@ function elbo_batch(rng::AbstractRNG, flow::Bijectors.MultivariateTransformed, l elbos = elbo_batch(flow, logp, xs) return mean(elbos) end -elbo_batch(flow::Bijectors.UnivariateTransformed, logp, n_samples) = +elbo_batch(flow::Bijectors.TransformedDistribution, logp, n_samples) = elbo_batch(Random.default_rng(), flow, logp, n_samples) From 7f9c382d5af0de3259634e25bf0db7b1526b5355 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 13:38:12 -0700 Subject: [PATCH 21/45] rm redundant comments --- src/flows/neuralspline.jl | 1 - 1 file changed, 1 deletion(-) diff --git 
a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index 01a1dbd..0d34358 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -27,7 +27,6 @@ function NeuralSplineCoupling( # output dim of the NN output_dims = (3K - 1)*num_of_transformed_dims # one big mlp that outputs all the knots and derivatives for all the transformed dimensions - # todo: ensure type stability nn = fnn(input_dims, hdims, output_dims; output_activation=nothing, paramtype=paramtype) mask = Bijectors.PartitionMask(dim, mask_idx) From 9f0cbad18560f6b20d3d2ab581f01e385b68e961 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 13:50:14 -0700 Subject: [PATCH 22/45] making target adapting to the chosen Floating type automatically --- example/demo_RealNVP.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl index 62944d7..d822d0b 100644 --- a/example/demo_RealNVP.jl +++ b/example/demo_RealNVP.jl @@ -21,7 +21,7 @@ T = Float32 ###################################### # a difficult banana target ###################################### -target = Banana(2, 1.0f0, 100.0f0) +target = Banana(2, one(T), 100one(T)) logp = Base.Fix1(logpdf, target) From 99a0fedd28bbd26e4490f63ad58c4d5f97e2922a Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 16:50:18 -0700 Subject: [PATCH 23/45] rm redundant flux dependencies --- example/Project.toml | 4 ---- example/utils.jl | 12 ------------ 2 files changed, 16 deletions(-) diff --git a/example/Project.toml b/example/Project.toml index db875db..79c847b 100644 --- a/example/Project.toml +++ b/example/Project.toml @@ -2,19 +2,15 @@ ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" -ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" -InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c" -MonotonicSplines = "568f7cb4-8305-41bc-b90d-d32b39cc99d1" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" diff --git a/example/utils.jl b/example/utils.jl index fb28810..8f76ebf 100644 --- a/example/utils.jl +++ b/example/utils.jl @@ -1,17 +1,5 @@ using Random, Distributions, LinearAlgebra using Bijectors: transformed -using Flux - -""" -A simple wrapper for a 3 layer dense MLP -""" -function mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux.leakyrelu) - return Chain( - Flux.Dense(input_dim, hidden_dims, activation), - Flux.Dense(hidden_dims, hidden_dims, activation), - Flux.Dense(hidden_dims, output_dim), - ) -end function compare_trained_and_untrained_flow( flow_trained::Bijectors.MultivariateTransformed, From c4128faf2743364051d5d23f4fcee39727ee97d9 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 16:51:50 -0700 Subject: [PATCH 24/45] add new nsf implementation and demo; much faster than the original nsf --- 
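[Editor's note, not part of the commit] The new NSC layer builds its splines with MonotonicSplines.jl instead of Bijectors.RationalQuadraticSpline. Below is a minimal sketch of the two calls it relies on, mirroring the usage in src/flows/new_nsf.jl added in this patch; the sizes (K = 10 knots, 2 transformed dimensions, a batch of 5) are illustrative assumptions.

    using MonotonicSplines
    K, d_trans, nbatch, B = 10, 2, 5, 30.0
    # flat NN output: (3K - 1) numbers per transformed dimension, one column per sample
    raw = randn(d_trans * (3K - 1), nbatch)
    # carve the raw output into knot positions and slopes, using B as the knot bound
    pX, pY, dYdX = MonotonicSplines.rqs_params_from_nn(raw, d_trans, B)
    # batched spline pass on the masked coordinates: returns y and log|det J| per sample
    x1 = randn(d_trans, nbatch)
    y1, logjac = MonotonicSplines.rqs_forward(x1, pX, pY, dYdX)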
.github/workflows/Examples.yml | 2 + Project.toml | 2 + example/demo_neural_spline_flow.jl | 50 ++----------- example/demo_new_nsf.jl | 65 +++++++++++++++++ src/NormalizingFlows.jl | 5 ++ src/flows/new_nsf.jl | 113 +++++++++++++++++++++++++++++ 6 files changed, 195 insertions(+), 42 deletions(-) create mode 100644 example/demo_new_nsf.jl create mode 100644 src/flows/new_nsf.jl diff --git a/.github/workflows/Examples.yml b/.github/workflows/Examples.yml index f9da45f..50e55c2 100644 --- a/.github/workflows/Examples.yml +++ b/.github/workflows/Examples.yml @@ -38,5 +38,7 @@ jobs: include("demo_RealNVP.jl"); @info "Running neural spline flow demo"; include("demo_neural_spline_flow.jl"); + @info "Running new neural spline flow demo"; + include("demo_new_nsf.jl"); @info "Running Hamiltonian flow demo"; include("demo_hamiltonian_flow.jl");' diff --git a/Project.toml b/Project.toml index 100e236..4c49b8f 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MonotonicSplines = "568f7cb4-8305-41bc-b90d-d32b39cc99d1" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -31,6 +32,7 @@ Distributions = "0.25" DocStringExtensions = "0.9" Flux = "0.16" Functors = "0.5.2" +MonotonicSplines = "0.3.3" Optimisers = "0.2.16, 0.3, 0.4" ProgressMeter = "1.0.0" StatsBase = "0.33, 0.34" diff --git a/example/demo_neural_spline_flow.jl b/example/demo_neural_spline_flow.jl index d3658f8..ae9ce07 100644 --- a/example/demo_neural_spline_flow.jl +++ b/example/demo_neural_spline_flow.jl @@ -1,4 +1,3 @@ -using Flux using Bijectors using Bijectors: partition, combine, PartitionMask @@ -19,21 +18,20 @@ rng = Random.default_rng() T = Float32 ###################################### -# neals funnel target +# a difficult banana target ###################################### -target = Funnel(2, 0.0f0, 9.0f0) -logp = Base.Fix1(logpdf, target) +target = Banana(2, one(T), 100one(T)) +logp = Base.Fix1(logpdf, target) ###################################### -# learn the target using Affine coupling flow +# learn the target using Neural Spline Flow ###################################### @leaf MvNormal q0 = MvNormal(zeros(T, 2), I) -flow = nsf(q0; paramtype=Float32) -flow_untrained = deepcopy(flow) - +flow = nsf(q0; paramtype=T) +flow_untrained = deepcopy(flow) ###################################### # start training ###################################### @@ -48,8 +46,8 @@ flow_trained, stats, _ = train_flow( flow, logp, sample_per_iter; - max_iters=100, # change to larger number of iterations (e.g., 50_000) for better results - optimiser=Optimisers.Adam(5e-5), + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(1e-4), ADbackend=adtype, show_progress=true, callback=cb, @@ -63,35 +61,3 @@ losses = map(x -> x.loss, stats) ###################################### plot(losses; label="Loss", linewidth=2) # plot the loss compare_trained_and_untrained_flow(flow_trained, flow_untrained, target, 1000) - - - - - - - - - -# using MonotonicSplines, Plots, InverseFunctions, ChangesOfVariables - -# f = rand(RQSpline) -# f.pX, f.pY, f.dYdX - -# plot(f, xlims = (-6, 6)); plot!(inverse(f), xlims = (-6, 6)) - -# x = 1.2 -# y = f(x) -# with_logabsdet_jacobian(f, x) -# 
inverse(f)(y) -# with_logabsdet_jacobian(inverse(f), y) - - - -# # test auto grad -# function loss(x) -# y, laj = MonotonicSplines.rqs_forward(x, f.pX, f.pY, f.dYdX) -# return laj + 0.5 * sum((y .- 1).^2) -# end - -# xx = rand() -# val, g = DifferentiationInterface.value_and_gradient(loss, adtype, xx) diff --git a/example/demo_new_nsf.jl b/example/demo_new_nsf.jl new file mode 100644 index 0000000..b016009 --- /dev/null +++ b/example/demo_new_nsf.jl @@ -0,0 +1,65 @@ +using Bijectors +using Bijectors: partition, combine, PartitionMask + +using Random, Distributions, LinearAlgebra +using Functors +using Optimisers, ADTypes +using Mooncake +using NormalizingFlows + +include("SyntheticTargets.jl") +include("utils.jl") + +################################## +# start demo +################################# +Random.seed!(123) +rng = Random.default_rng() +T = Float32 + +###################################### +# a difficult banana target +###################################### +target = Banana(2, one(T), 100one(T)) +logp = Base.Fix1(logpdf, target) + +###################################### +# learn the target using Neural Spline Flow +###################################### +@leaf MvNormal +q0 = MvNormal(zeros(T, 2), I) + + +flow = new_nsf(q0; paramtype=T) +flow_untrained = deepcopy(flow) +###################################### +# start training +###################################### +sample_per_iter = 64 + +# callback function to log training progress +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) +# TODO: mooncake has some issues with kernelabstractions? +# adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) +adtype = ADTypes.AutoZygote() +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 +flow_trained, stats, _ = train_flow( + elbo_batch, + flow, + logp, + sample_per_iter; + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(1e-4), + ADbackend=adtype, + show_progress=true, + callback=cb, + hasconverged=checkconv, +) +θ, re = Optimisers.destructure(flow_trained) +losses = map(x -> x.loss, stats) + +###################################### +# evaluate trained flow +###################################### +plot(losses; label="Loss", linewidth=2) # plot the loss +compare_trained_and_untrained_flow(flow_trained, flow_untrained, target, 1000) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 71217fb..770aba9 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -130,10 +130,15 @@ end include("flows/utils.jl") include("flows/realnvp.jl") include("flows/neuralspline.jl") +# a new implementation of Neural Spline Flow based on MonotonicSplines.jl +# the construction of the RQS seems to be more efficient than the one in Bijectors.jl +# and supports batched operations. 
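# --- Editorial aside (not part of the diff): "batched operations" means the new coupling
# maps a whole (dim, nbatch) matrix in one call and returns one log-Jacobian per column.
# A hypothetical usage sketch of the layer constructor from the file included below;
# the sizes are made up and the snippet assumes the package environment is loaded:
using NormalizingFlows, Bijectors
layer = new_NSF_layer(4, [16, 16], 10, 30.0; paramtype=Float64)   # dim 4, 10 knots, knot bound 30
X = randn(4, 8)                                                   # eight samples stored as columns
Y, logjacs = Bijectors.with_logabsdet_jacobian(layer, X)
# size(Y) == (4, 8) and length(logjacs) == 8: one log|det J| per sample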
+include("flows/new_nsf.jl") export create_flow export AffineCoupling, RealNVP_layer, realnvp export NeuralSplineCoupling, NSF_layer, nsf +export NSC, new_NSF_layer, new_nsf end diff --git a/src/flows/new_nsf.jl b/src/flows/new_nsf.jl new file mode 100644 index 0000000..47c35d7 --- /dev/null +++ b/src/flows/new_nsf.jl @@ -0,0 +1,113 @@ +using MonotonicSplines + +struct NSC{T,A<:Flux.Chain} <: Bijectors.Bijector + dim::Int # dimension of input + K::Int # number of knots + n_dims_transferred::Int # number of dimensions that are transformed + B::T # bound of the knots + nn::A # networks that parmaterize the knots and derivatives + mask::Bijectors.PartitionMask +end + +function NSC( + dim::T1, # dimension of input + hdims::AbstractVector{T1}, # dimension of hidden units for s and t + K::T1, # number of knots + B::T2, # bound of the knots + mask_idx::AbstractVector{T1}, # index of dimensione that one wants to apply transformations on + paramtype::Type{T2}, # type of the parameters, e.g., Float64 or Float32 +) where {T1<:Int,T2<:AbstractFloat} + num_of_transformed_dims = length(mask_idx) + input_dims = dim - num_of_transformed_dims + + # output dim of the NN + output_dims = (3K - 1)*num_of_transformed_dims + # one big mlp that outputs all the knots and derivatives for all the transformed dimensions + nn = fnn(input_dims, hdims, output_dims; output_activation=nothing, paramtype=paramtype) + + mask = Bijectors.PartitionMask(dim, mask_idx) + return NSC{T2, typeof(nn)}(dim, K, num_of_transformed_dims, B, nn, mask) +end + +@functor NSC (nn,) + +function get_nsl_params(nsl::NSC, x::AbstractVecOrMat) + nnoutput = nsl.nn(x) + px, py, dydx = MonotonicSplines.rqs_params_from_nn(nnoutput, nsl.n_dims_transferred, nsl.B) + return px, py, dydx +end + +function Bijectors.transform(nsl::NSC, x::AbstractVecOrMat) + x1, x2, x3 = Bijectors.partition(nsl.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsl_params(nsl, x2) + if x1 isa AbstractVector + x1 = reshape(x1, 1, length(x1)) # ensure x1 is a matrix + end + y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsl.mask, y1, x2, x3) +end + +function Bijectors.with_logabsdet_jacobian(nsl::NSC, x::AbstractVecOrMat) + x1, x2, x3 = Bijectors.partition(nsl.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsl_params(nsl, x2) + y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsl.mask, y1, x2, x3), vec(logjac) +end + +function Bijectors.transform(insl::Inverse{<:NSC}, y::AbstractVecOrMat) + nsl = insl.orig + y1, y2, y3 = partition(nsl.mask, y) + px, py, dydx = get_nsl_params(nsl, y2) + x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsl.mask, x1, y2, y3) +end + +function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NSC}, y::AbstractVecOrMat) + nsl = insl.orig + y1, y2, y3 = partition(nsl.mask, y) + px, py, dydx = get_nsl_params(nsl, y2) + x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsl.mask, x1, y2, y3), logjac isa Real ? 
logjac : vec(logjac) +end + +function (nsl::NSC)(x::AbstractVecOrMat) + return Bijectors.transform(nsl, x) +end + + +function new_NSF_layer( + dims::T1, # dimension of problem + hdims::AbstractVector{T1}, # dimension of hidden units for nn + K::T1, # number of knots + B::T2; # bound of the knots + paramtype::Type{T2} = Float64, # type of the parameters +) where {T1<:Int,T2<:AbstractFloat} + + mask_idx1 = 1:2:dims + mask_idx2 = 2:2:dims + + # by default use the odd-even masking strategy + nsf1 = NSC(dims, hdims, K, B, mask_idx1, paramtype) + nsf2 = NSC(dims, hdims, K, B, mask_idx2, paramtype) + return reduce(∘, (nsf1, nsf2)) +end + +function new_nsf( + q0::Distribution{Multivariate,Continuous}, + hdims::AbstractVector{Int}, # dimension of hidden units for s and t + K::Int, + B::T, + nlayers::Int; # number of RealNVP_layer + paramtype::Type{T} = Float64, # type of the parameters +) where {T<:AbstractFloat} + + dims = length(q0) # dimension of the reference distribution == dim of the problem + Ls = [new_NSF_layer(dims, hdims, K, B; paramtype=paramtype) for _ in 1:nlayers] + create_flow(Ls, q0) +end + +new_nsf(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = new_nsf( + q0, [32, 32], 10, 30*one(T), 10; paramtype=paramtype +) From 1f30b33ce66b5e88e21ba7d4a6d15091bb020354 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 17:00:18 -0700 Subject: [PATCH 25/45] rm redundant flux from realnvp demo --- example/demo_RealNVP.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl index d822d0b..45f4213 100644 --- a/example/demo_RealNVP.jl +++ b/example/demo_RealNVP.jl @@ -1,4 +1,3 @@ -using Flux using Bijectors using Bijectors: partition, combine, PartitionMask From 2903b8383a7d60b2138cd4f84a16e6bcbdbc3b15 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 17:30:16 -0700 Subject: [PATCH 26/45] dump the previous nsf implementation --- .github/workflows/Examples.yml | 2 - example/demo_neural_spline_flow.jl | 12 +-- example/demo_new_nsf.jl | 65 ----------------- src/NormalizingFlows.jl | 5 -- src/flows/neuralspline.jl | 101 ++++++++++++-------------- src/flows/new_nsf.jl | 113 ----------------------------- 6 files changed, 54 insertions(+), 244 deletions(-) delete mode 100644 example/demo_new_nsf.jl delete mode 100644 src/flows/new_nsf.jl diff --git a/.github/workflows/Examples.yml b/.github/workflows/Examples.yml index 50e55c2..f9da45f 100644 --- a/.github/workflows/Examples.yml +++ b/.github/workflows/Examples.yml @@ -38,7 +38,5 @@ jobs: include("demo_RealNVP.jl"); @info "Running neural spline flow demo"; include("demo_neural_spline_flow.jl"); - @info "Running new neural spline flow demo"; - include("demo_new_nsf.jl"); @info "Running Hamiltonian flow demo"; include("demo_hamiltonian_flow.jl");' diff --git a/example/demo_neural_spline_flow.jl b/example/demo_neural_spline_flow.jl index ae9ce07..8fea9bf 100644 --- a/example/demo_neural_spline_flow.jl +++ b/example/demo_neural_spline_flow.jl @@ -4,7 +4,7 @@ using Bijectors: partition, combine, PartitionMask using Random, Distributions, LinearAlgebra using Functors using Optimisers, ADTypes -using Mooncake +using Zygote using NormalizingFlows include("SyntheticTargets.jl") @@ -20,9 +20,9 @@ T = Float32 ###################################### # a difficult banana target ###################################### - target = Banana(2, one(T), 100one(T)) logp = Base.Fix1(logpdf, target) + ###################################### # learn the target using Neural Spline Flow 
###################################### @@ -39,14 +39,16 @@ sample_per_iter = 64 # callback function to log training progress cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) -adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) +# TODO: mooncake has some issues with kernelabstractions? +# adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) +adtype = ADTypes.AutoZygote() checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( - elbo, + elbo_batch, flow, logp, sample_per_iter; - max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + max_iters=10000, # change to larger number of iterations (e.g., 50_000) for better results optimiser=Optimisers.Adam(1e-4), ADbackend=adtype, show_progress=true, diff --git a/example/demo_new_nsf.jl b/example/demo_new_nsf.jl deleted file mode 100644 index b016009..0000000 --- a/example/demo_new_nsf.jl +++ /dev/null @@ -1,65 +0,0 @@ -using Bijectors -using Bijectors: partition, combine, PartitionMask - -using Random, Distributions, LinearAlgebra -using Functors -using Optimisers, ADTypes -using Mooncake -using NormalizingFlows - -include("SyntheticTargets.jl") -include("utils.jl") - -################################## -# start demo -################################# -Random.seed!(123) -rng = Random.default_rng() -T = Float32 - -###################################### -# a difficult banana target -###################################### -target = Banana(2, one(T), 100one(T)) -logp = Base.Fix1(logpdf, target) - -###################################### -# learn the target using Neural Spline Flow -###################################### -@leaf MvNormal -q0 = MvNormal(zeros(T, 2), I) - - -flow = new_nsf(q0; paramtype=T) -flow_untrained = deepcopy(flow) -###################################### -# start training -###################################### -sample_per_iter = 64 - -# callback function to log training progress -cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) -# TODO: mooncake has some issues with kernelabstractions? -# adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) -adtype = ADTypes.AutoZygote() -checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 -flow_trained, stats, _ = train_flow( - elbo_batch, - flow, - logp, - sample_per_iter; - max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results - optimiser=Optimisers.Adam(1e-4), - ADbackend=adtype, - show_progress=true, - callback=cb, - hasconverged=checkconv, -) -θ, re = Optimisers.destructure(flow_trained) -losses = map(x -> x.loss, stats) - -###################################### -# evaluate trained flow -###################################### -plot(losses; label="Loss", linewidth=2) # plot the loss -compare_trained_and_untrained_flow(flow_trained, flow_untrained, target, 1000) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 770aba9..71217fb 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -130,15 +130,10 @@ end include("flows/utils.jl") include("flows/realnvp.jl") include("flows/neuralspline.jl") -# a new implementation of Neural Spline Flow based on MonotonicSplines.jl -# the construction of the RQS seems to be more efficient than the one in Bijectors.jl -# and supports batched operations. 
-include("flows/new_nsf.jl") export create_flow export AffineCoupling, RealNVP_layer, realnvp export NeuralSplineCoupling, NSF_layer, nsf -export NSC, new_NSF_layer, new_nsf end diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index 0d34358..30af733 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -1,6 +1,10 @@ +using MonotonicSplines +# a new implementation of Neural Spline Flow based on MonotonicSplines.jl +# the construction of the RQS seems to be more efficient than the one in Bijectors.jl +# and supports batched operations. + """ Neural Rational quadratic Spline layer - # References [1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML], (2019). """ @@ -18,9 +22,9 @@ function NeuralSplineCoupling( hdims::AbstractVector{T1}, # dimension of hidden units for s and t K::T1, # number of knots B::T2, # bound of the knots - mask_idx::AbstractVector{<:Int}, # index of dimensione that one wants to apply transformations on - paramtype::Type{T3}, # type of the parameters, e.g., Float64 or Float32 -) where {T1<:Int,T2<:Real,T3<:AbstractFloat} + mask_idx::AbstractVector{T1}, # index of dimensione that one wants to apply transformations on + paramtype::Type{T2}, # type of the parameters, e.g., Float64 or Float32 +) where {T1<:Int,T2<:AbstractFloat} num_of_transformed_dims = length(mask_idx) input_dims = dim - num_of_transformed_dims @@ -30,86 +34,75 @@ function NeuralSplineCoupling( nn = fnn(input_dims, hdims, output_dims; output_activation=nothing, paramtype=paramtype) mask = Bijectors.PartitionMask(dim, mask_idx) - return NeuralSplineCoupling(dim, K, num_of_transformed_dims, B, nn, mask) + return NeuralSplineCoupling{T2, typeof(nn)}(dim, K, num_of_transformed_dims, B, nn, mask) end @functor NeuralSplineCoupling (nn,) -""" -Build a rational quadratic spline (RQS) from the nn output -Bijectors.jl has implemented the inverse and logabsdetjac for rational quadratic spline - -we just need to map the nn output to the knots and derivatives of the RQS -""" -function instantiate_rqs(nsl::NeuralSplineCoupling, x::AbstractVector) - K, B = nsl.K, nsl.B - nnoutput = reshape(nsl.nn(x), nsl.n_dims_transferred, :) - ws = @view nnoutput[:, 1:K] - hs = @view nnoutput[:, (K + 1):(2K)] - ds = @view nnoutput[:, (2K + 1):(3K - 1)] - return Bijectors.RationalQuadraticSpline(ws, hs, ds, B) +function get_nsc_params(nsc::NeuralSplineCoupling, x::AbstractVecOrMat) + nnoutput = nsc.nn(x) + px, py, dydx = MonotonicSplines.rqs_params_from_nn(nnoutput, nsc.n_dims_transferred, nsc.B) + return px, py, dydx end -function Bijectors.transform(nsl::NeuralSplineCoupling, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) - # instantiate rqs knots and derivatives - rqs = instantiate_rqs(nsl, x_2) - y_1 = Bijectors.transform(rqs, x_1) - return Bijectors.combine(nsl.mask, y_1, x_2, x_3) -end +# when input x is a vector instead of a matrix +# need this to transform it to a matrix with one row +# otherwise, rqs_forward and rqs_inverse will throw an error +_ensure_matrix(x) = x isa AbstractVector ? 
reshape(x, 1, length(x)) : x -function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) - nsl = insl.orig - y1, y2, y3 = partition(nsl.mask, y) - rqs = instantiate_rqs(nsl, y2) - x1 = Bijectors.transform(Inverse(rqs), y1) - return Bijectors.combine(nsl.mask, x1, y2, y3) +function Bijectors.transform(nsc::NeuralSplineCoupling, x::AbstractVecOrMat) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + x1 = _ensure_matrix(x1) + y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, y1, x2, x3) end -function (nsl::NeuralSplineCoupling)(x::AbstractVector) - return Bijectors.transform(nsl, x) +function Bijectors.with_logabsdet_jacobian(nsc::NeuralSplineCoupling, x::AbstractVecOrMat) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + x1 = _ensure_matrix(x1) + y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, y1, x2, x3), logjac isa Real ? logjac : vec(logjac) end -# define logabsdetjac -function Bijectors.logabsdetjac(nsl::NeuralSplineCoupling, x::AbstractVector) - x_1, x_2, _ = Bijectors.partition(nsl.mask, x) - rqs = instantiate_rqs(nsl, x_2) - logjac = logabsdetjac(rqs, x_1) - return logjac +function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVecOrMat) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + y1 = _ensure_matrix(y1) + x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, x1, y2, y3) end -function Bijectors.logabsdetjac(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) - nsl = insl.orig - y1, y2, _ = partition(nsl.mask, y) - rqs = instantiate_rqs(nsl, y2) - logjac = logabsdetjac(Inverse(rqs), y1) - return logjac +function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVecOrMat) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + y1 = _ensure_matrix(y1) + x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, x1, y2, y3), logjac isa Real ? logjac : vec(logjac) end -function Bijectors.with_logabsdet_jacobian(nsl::NeuralSplineCoupling, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) - rqs = instantiate_rqs(nsl, x_2) - y_1, logjac = with_logabsdet_jacobian(rqs, x_1) - return Bijectors.combine(nsl.mask, y_1, x_2, x_3), logjac +function (nsc::NeuralSplineCoupling)(x::AbstractVecOrMat) + return Bijectors.transform(nsc, x) end """ NSF_layer(dims, hdims; paramtype = Float64) - Default constructor of single layer of Neural Spline Flow (NSF) which is a composition of 2 neural spline coupling transformations with complementary masks. The masking strategy is odd-even masking. - # Arguments - `dims::Int`: dimension of the problem - `hdims::AbstractVector{Int}`: dimension of hidden units for s and t - `K::Int`: number of knots - `B::AbstractFloat`: bound of the knots - # Keyword Arguments - `paramtype::Type{T} = Float64`: type of the parameters, defaults to `Float64` - # Returns - A `Bijectors.Bijector` representing the NSF layer. 
""" diff --git a/src/flows/new_nsf.jl b/src/flows/new_nsf.jl deleted file mode 100644 index 47c35d7..0000000 --- a/src/flows/new_nsf.jl +++ /dev/null @@ -1,113 +0,0 @@ -using MonotonicSplines - -struct NSC{T,A<:Flux.Chain} <: Bijectors.Bijector - dim::Int # dimension of input - K::Int # number of knots - n_dims_transferred::Int # number of dimensions that are transformed - B::T # bound of the knots - nn::A # networks that parmaterize the knots and derivatives - mask::Bijectors.PartitionMask -end - -function NSC( - dim::T1, # dimension of input - hdims::AbstractVector{T1}, # dimension of hidden units for s and t - K::T1, # number of knots - B::T2, # bound of the knots - mask_idx::AbstractVector{T1}, # index of dimensione that one wants to apply transformations on - paramtype::Type{T2}, # type of the parameters, e.g., Float64 or Float32 -) where {T1<:Int,T2<:AbstractFloat} - num_of_transformed_dims = length(mask_idx) - input_dims = dim - num_of_transformed_dims - - # output dim of the NN - output_dims = (3K - 1)*num_of_transformed_dims - # one big mlp that outputs all the knots and derivatives for all the transformed dimensions - nn = fnn(input_dims, hdims, output_dims; output_activation=nothing, paramtype=paramtype) - - mask = Bijectors.PartitionMask(dim, mask_idx) - return NSC{T2, typeof(nn)}(dim, K, num_of_transformed_dims, B, nn, mask) -end - -@functor NSC (nn,) - -function get_nsl_params(nsl::NSC, x::AbstractVecOrMat) - nnoutput = nsl.nn(x) - px, py, dydx = MonotonicSplines.rqs_params_from_nn(nnoutput, nsl.n_dims_transferred, nsl.B) - return px, py, dydx -end - -function Bijectors.transform(nsl::NSC, x::AbstractVecOrMat) - x1, x2, x3 = Bijectors.partition(nsl.mask, x) - # instantiate rqs knots and derivatives - px, py, dydx = get_nsl_params(nsl, x2) - if x1 isa AbstractVector - x1 = reshape(x1, 1, length(x1)) # ensure x1 is a matrix - end - y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx) - return Bijectors.combine(nsl.mask, y1, x2, x3) -end - -function Bijectors.with_logabsdet_jacobian(nsl::NSC, x::AbstractVecOrMat) - x1, x2, x3 = Bijectors.partition(nsl.mask, x) - # instantiate rqs knots and derivatives - px, py, dydx = get_nsl_params(nsl, x2) - y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx) - return Bijectors.combine(nsl.mask, y1, x2, x3), vec(logjac) -end - -function Bijectors.transform(insl::Inverse{<:NSC}, y::AbstractVecOrMat) - nsl = insl.orig - y1, y2, y3 = partition(nsl.mask, y) - px, py, dydx = get_nsl_params(nsl, y2) - x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx) - return Bijectors.combine(nsl.mask, x1, y2, y3) -end - -function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NSC}, y::AbstractVecOrMat) - nsl = insl.orig - y1, y2, y3 = partition(nsl.mask, y) - px, py, dydx = get_nsl_params(nsl, y2) - x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx) - return Bijectors.combine(nsl.mask, x1, y2, y3), logjac isa Real ? 
logjac : vec(logjac) -end - -function (nsl::NSC)(x::AbstractVecOrMat) - return Bijectors.transform(nsl, x) -end - - -function new_NSF_layer( - dims::T1, # dimension of problem - hdims::AbstractVector{T1}, # dimension of hidden units for nn - K::T1, # number of knots - B::T2; # bound of the knots - paramtype::Type{T2} = Float64, # type of the parameters -) where {T1<:Int,T2<:AbstractFloat} - - mask_idx1 = 1:2:dims - mask_idx2 = 2:2:dims - - # by default use the odd-even masking strategy - nsf1 = NSC(dims, hdims, K, B, mask_idx1, paramtype) - nsf2 = NSC(dims, hdims, K, B, mask_idx2, paramtype) - return reduce(∘, (nsf1, nsf2)) -end - -function new_nsf( - q0::Distribution{Multivariate,Continuous}, - hdims::AbstractVector{Int}, # dimension of hidden units for s and t - K::Int, - B::T, - nlayers::Int; # number of RealNVP_layer - paramtype::Type{T} = Float64, # type of the parameters -) where {T<:AbstractFloat} - - dims = length(q0) # dimension of the reference distribution == dim of the problem - Ls = [new_NSF_layer(dims, hdims, K, B; paramtype=paramtype) for _ in 1:nlayers] - create_flow(Ls, q0) -end - -new_nsf(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = new_nsf( - q0, [32, 32], 10, 30*one(T), 10; paramtype=paramtype -) From 48bc3d30fa594afa1a072510f037cab5c99e6f74 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 17:33:58 -0700 Subject: [PATCH 27/45] add test for nsf --- test/flow.jl | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/test/flow.jl b/test/flow.jl index 5ad158a..273b20e 100644 --- a/test/flow.jl +++ b/test/flow.jl @@ -39,6 +39,72 @@ end + @testset "ELBO test for type: $T" begin + μ = randn(T, dim) + Σ = Diagonal(rand(T, dim) .+ T(1e-3)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # Define a simple log-likelihood function + logp(z) = logpdf(q₀, z) + + # Compute ELBO + batchsize = 64 + elbo_value = elbo(Random.default_rng(), flow, logp, batchsize) + elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize) + + # test elbo_value is not NaN and not Inf + @test !isnan(elbo_value) + @test !isinf(elbo_value) + @test !isnan(elbo_batch_value) + @test !isinf(elbo_batch_value) + end + + #todo add tests for ad + end +end + +@testset "Neural Spline flow" begin + Random.seed!(123) + + dim = 5 + nlayers = 2 + hdims = [32, 32] + for T in [Float32, Float64] + # Create a RealNVP flow + q₀ = MvNormal(zeros(T, dim), I) + @leaf MvNormal + flow = NormalizingFlows.nsf(q₀; paramtype=T) + + @testset "Sampling and density estimation for type: $T" begin + ys = rand(flow, 100) + ℓs = logpdf(flow, ys) + + @test size(ys) == (dim, 100) + @test length(ℓs) == 100 + + @test eltype(ys) == T + @test eltype(ℓs) == T + end + + + @testset "Inverse compatibility for type: $T" begin + x = rand(q₀) + y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x) + x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y) + + @test x ≈ x_reconstructed rtol=1e-6 + @test lj_fwd ≈ -lj_bwd rtol=1e-6 + + x_batch = rand(q₀, 10) + y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch) + x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch) + + @test x_batch ≈ x_batch_reconstructed rtol=1e-6 + @test ljs_fwd ≈ -ljs_bwd rtol=1e-6 + end + + @testset "ELBO test for type: $T" begin μ = randn(T, dim) Σ = Diagonal(rand(T, dim) .+ T(1e-3)) From 9494de1be22992c8cf2ffe0e09a7e577f597c7ca Mon Sep 17 00:00:00 2001 From: Zuheng 
Date: Sun, 3 Aug 2025 17:34:57 -0700 Subject: [PATCH 28/45] add ad test for nsf --- test/ad.jl | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/test/ad.jl b/test/ad.jl index 0fdfa2b..32c4b18 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -120,3 +120,50 @@ end end end end + +@testset "AD for ELBO on NSF" begin + @testset "$at" for at in [ + ADTypes.AutoZygote(), + ADTypes.AutoForwardDiff(), + ADTypes.AutoReverseDiff(; compile=false), + ADTypes.AutoEnzyme(; + mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + function_annotation=Enzyme.Const, + ), + # it doesn't work with mooncake yet + ADTypes.AutoMooncake(; config=Mooncake.Config()), + ] + @testset "$T" for T in [Float32, Float64] + μ = 10 * ones(T, 2) + Σ = Diagonal(4 * ones(T, 2)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # necessary for Zygote/mooncake to differentiate through the flow + # prevent updating params of q0 + @leaf MvNormal + q₀ = MvNormal(zeros(T, 2), ones(T, 2)) + flow = realnvp(q₀, [8, 8], 3; paramtype=T) + + θ, re = Optimisers.destructure(flow) + + # check grad computation for elbo + function loss(θ, rng, logp, sample_per_iter) + return -NormalizingFlows.elbo_batch(rng, re(θ), logp, sample_per_iter) + end + + rng = Random.default_rng() + sample_per_iter = 10 + + prep = NormalizingFlows._prepare_gradient( + loss, at, θ, rng, logp, sample_per_iter + ) + value, grad = NormalizingFlows._value_and_gradient( + loss, prep, at, θ, rng, logp, sample_per_iter + ) + + @test value !== nothing + @test all(grad .!= nothing) + end + end +end From 8f61fc9611e1ebc8435c643423df47a01b88abcd Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 17:35:31 -0700 Subject: [PATCH 29/45] fix typo in nsf test --- test/ad.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ad.jl b/test/ad.jl index 32c4b18..4103deb 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -143,7 +143,7 @@ end # prevent updating params of q0 @leaf MvNormal q₀ = MvNormal(zeros(T, 2), ones(T, 2)) - flow = realnvp(q₀, [8, 8], 3; paramtype=T) + flow = nsf(q₀, [8, 8], 10, 5one(T), 3; paramtype=T) θ, re = Optimisers.destructure(flow) From 977caafdf58a55ff78a01d18d7349384349c9e56 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 18:26:00 -0700 Subject: [PATCH 30/45] fix nsf test error regarding rand() --- src/NormalizingFlows.jl | 2 ++ src/flows/neuralspline.jl | 49 +++++++++++++++++++++++++++++++-------- test/Project.toml | 2 +- test/ad.jl | 17 +++++++------- test/flow.jl | 17 +++++++------- 5 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 71217fb..d453cc2 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -129,6 +129,8 @@ end # interface of contructing common flow layers include("flows/utils.jl") include("flows/realnvp.jl") + +using MonotonicSplines include("flows/neuralspline.jl") export create_flow diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index 30af733..86374ce 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -1,10 +1,9 @@ -using MonotonicSplines # a new implementation of Neural Spline Flow based on MonotonicSplines.jl # the construction of the RQS seems to be more efficient than the one in Bijectors.jl # and supports batched operations. 
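# --- Editorial aside (not part of the diff): the changes below split the former
# AbstractVecOrMat methods into separate AbstractVector and AbstractMatrix methods.
# The single-sample path promotes the masked coordinates to a one-column matrix,
# reuses the batched spline kernel, and then drops the batch dimension again via
# vec(...) and logjac[1]. Minimal sketch of the column-per-sample convention this
# relies on (variable names here are illustrative only):
x1 = randn(2)                      # masked coordinates of one sample
X1 = reshape(x1, length(x1), 1)    # 2x1 matrix: a batch holding a single column
x1 == vec(X1)                      # true, so the vector method can reuse the matrix path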
""" -Neural Rational quadratic Spline layer +Neural Rational Quadratic Spline Coupling layer # References [1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML], (2019). """ @@ -41,49 +40,79 @@ end function get_nsc_params(nsc::NeuralSplineCoupling, x::AbstractVecOrMat) nnoutput = nsc.nn(x) - px, py, dydx = MonotonicSplines.rqs_params_from_nn(nnoutput, nsc.n_dims_transferred, nsc.B) + px, py, dydx = MonotonicSplines.rqs_params_from_nn( + nnoutput, nsc.n_dims_transferred, nsc.B + ) return px, py, dydx end # when input x is a vector instead of a matrix # need this to transform it to a matrix with one row # otherwise, rqs_forward and rqs_inverse will throw an error -_ensure_matrix(x) = x isa AbstractVector ? reshape(x, 1, length(x)) : x +_ensure_matrix(x) = x isa AbstractVector ? reshape(x, length(x), 1) : x -function Bijectors.transform(nsc::NeuralSplineCoupling, x::AbstractVecOrMat) +function Bijectors.transform(nsc::NeuralSplineCoupling, x::AbstractVector) x1, x2, x3 = Bijectors.partition(nsc.mask, x) # instantiate rqs knots and derivatives px, py, dydx = get_nsc_params(nsc, x2) x1 = _ensure_matrix(x1) y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(y1), x2, x3) +end +function Bijectors.transform(nsc::NeuralSplineCoupling, x::AbstractMatrix) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx) return Bijectors.combine(nsc.mask, y1, x2, x3) end -function Bijectors.with_logabsdet_jacobian(nsc::NeuralSplineCoupling, x::AbstractVecOrMat) +function Bijectors.with_logabsdet_jacobian(nsc::NeuralSplineCoupling, x::AbstractVector) x1, x2, x3 = Bijectors.partition(nsc.mask, x) # instantiate rqs knots and derivatives px, py, dydx = get_nsc_params(nsc, x2) x1 = _ensure_matrix(x1) y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx) - return Bijectors.combine(nsc.mask, y1, x2, x3), logjac isa Real ? 
logjac : vec(logjac) + return Bijectors.combine(nsc.mask, vec(y1), x2, x3), logjac[1] +end +function Bijectors.with_logabsdet_jacobian(nsc::NeuralSplineCoupling, x::AbstractMatrix) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, y1, x2, x3), vec(logjac) end -function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVecOrMat) +function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) nsc = insl.orig y1, y2, y3 = partition(nsc.mask, y) px, py, dydx = get_nsc_params(nsc, y2) y1 = _ensure_matrix(y1) x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(x1), y2, y3) +end +function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractMatrix) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx) return Bijectors.combine(nsc.mask, x1, y2, y3) end -function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVecOrMat) +function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) nsc = insl.orig y1, y2, y3 = partition(nsc.mask, y) px, py, dydx = get_nsc_params(nsc, y2) y1 = _ensure_matrix(y1) x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx) - return Bijectors.combine(nsc.mask, x1, y2, y3), logjac isa Real ? logjac : vec(logjac) + return Bijectors.combine(nsc.mask, vec(x1), y2, y3), logjac[1] +end +function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractMatrix) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, x1, y2, y3), vec(logjac) end function (nsc::NeuralSplineCoupling)(x::AbstractVecOrMat) diff --git a/test/Project.toml b/test/Project.toml index 5473818..6e9455f 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -8,6 +8,7 @@ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MonotonicSplines = "568f7cb4-8305-41bc-b90d-d32b39cc99d1" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" @@ -18,4 +19,3 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] Mooncake = "0.4.142" - diff --git a/test/ad.jl b/test/ad.jl index 4103deb..b8a4211 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -123,15 +123,16 @@ end @testset "AD for ELBO on NSF" begin @testset "$at" for at in [ + # now NSF only works with Zygote + # TODO: make it work with other ADs (possibly by adapting MonotonicSplines/src/rqspline_pullbacks.jl to rrules?) 
ADTypes.AutoZygote(), - ADTypes.AutoForwardDiff(), - ADTypes.AutoReverseDiff(; compile=false), - ADTypes.AutoEnzyme(; - mode=Enzyme.set_runtime_activity(Enzyme.Reverse), - function_annotation=Enzyme.Const, - ), - # it doesn't work with mooncake yet - ADTypes.AutoMooncake(; config=Mooncake.Config()), + # ADTypes.AutoForwardDiff(), + # ADTypes.AutoReverseDiff(; compile=false), + # ADTypes.AutoEnzyme(; + # mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + # function_annotation=Enzyme.Const, + # ), + # ADTypes.AutoMooncake(; config=Mooncake.Config()), ] @testset "$T" for T in [Float32, Float64] μ = 10 * ones(T, 2) diff --git a/test/flow.jl b/test/flow.jl index 273b20e..fa4ef25 100644 --- a/test/flow.jl +++ b/test/flow.jl @@ -59,8 +59,6 @@ @test !isnan(elbo_batch_value) @test !isinf(elbo_batch_value) end - - #todo add tests for ad end end @@ -69,12 +67,15 @@ end dim = 5 nlayers = 2 + K = 10 hdims = [32, 32] for T in [Float32, Float64] - # Create a RealNVP flow - q₀ = MvNormal(zeros(T, dim), I) + # Create a nsf @leaf MvNormal - flow = NormalizingFlows.nsf(q₀; paramtype=T) + q₀ = MvNormal(zeros(T, dim), I) + + B = 5one(T) + flow = NormalizingFlows.nsf(q₀, hdims, K, B, nlayers; paramtype=T) @testset "Sampling and density estimation for type: $T" begin ys = rand(flow, 100) @@ -100,8 +101,8 @@ end y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch) x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch) - @test x_batch ≈ x_batch_reconstructed rtol=1e-6 - @test ljs_fwd ≈ -ljs_bwd rtol=1e-6 + @test x_batch ≈ x_batch_reconstructed rtol=1e-4 + @test ljs_fwd ≈ -ljs_bwd rtol=1e-4 end @@ -125,7 +126,5 @@ end @test !isnan(elbo_batch_value) @test !isinf(elbo_batch_value) end - - #todo add tests for ad end end From 0b9e656aaf5862cddda9c9bd3aa0bdc71b403878 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Sun, 3 Aug 2025 20:03:17 -0700 Subject: [PATCH 31/45] relax rtol for nsf invertibility error in FLoat32 --- test/flow.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/flow.jl b/test/flow.jl index fa4ef25..0f7efff 100644 --- a/test/flow.jl +++ b/test/flow.jl @@ -71,8 +71,8 @@ end hdims = [32, 32] for T in [Float32, Float64] # Create a nsf - @leaf MvNormal q₀ = MvNormal(zeros(T, dim), I) + @leaf MvNormal B = 5one(T) flow = NormalizingFlows.nsf(q₀, hdims, K, B, nlayers; paramtype=T) @@ -94,8 +94,8 @@ end y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x) x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y) - @test x ≈ x_reconstructed rtol=1e-6 - @test lj_fwd ≈ -lj_bwd rtol=1e-6 + @test x ≈ x_reconstructed rtol=1e-4 + @test lj_fwd ≈ -lj_bwd rtol=1e-4 x_batch = rand(q₀, 10) y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch) From 48829adda06ce15ec19d7f727a9547d3ec0d3045 Mon Sep 17 00:00:00 2001 From: zuhengxu Date: Fri, 8 Aug 2025 14:36:07 -0700 Subject: [PATCH 32/45] update doc --- docs/Project.toml | 1 + docs/make.jl | 7 ++-- docs/src/api.md | 79 ++++++++++++++++++++++----------------- docs/src/example.md | 41 ++++++++++---------- src/NormalizingFlows.jl | 10 ++--- src/flows/neuralspline.jl | 49 ++++++++++++++++++------ src/flows/realnvp.jl | 54 +++++++++++++------------- src/flows/utils.jl | 28 +++++++++++--- src/objectives/elbo.jl | 34 ++++++++--------- 9 files changed, 181 insertions(+), 122 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index 27726a5..6d3e869 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -5,5 
+5,6 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/docs/make.jl b/docs/make.jl index 304e983..5bb4df0 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -7,9 +7,10 @@ DocMeta.setdocmeta!( makedocs(; modules=[NormalizingFlows], - repo="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", sitename="NormalizingFlows.jl", - format=Documenter.HTML(), + format=Documenter.HTML(; + repolink="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", + ), pages=[ "Home" => "index.md", "API" => "api.md", @@ -17,4 +18,4 @@ makedocs(; "Customize your own flow layer" => "customized_layer.md", ], checkdocs=:exports, -) +) \ No newline at end of file diff --git a/docs/src/api.md b/docs/src/api.md index 13fe0c2..4b9b566 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -1,27 +1,29 @@ -## API +# API ```@index ``` - ## Main Function ```@docs NormalizingFlows.train_flow ``` -The flow object can be constructed by `transformed` function in `Bijectors.jl` package. -For example of Gaussian VI, we can construct the flow as follows: -```@julia +The flow object can be constructed by `transformed` function in `Bijectors.jl`. +For example, for Gaussian VI, we can construct the flow as follows: + +```julia using Distributions, Bijectors -T= Float32 +T = Float32 @leaf MvNormal # to prevent params in q₀ from being optimized q₀ = MvNormal(zeros(T, 2), ones(T, 2)) flow = Bijectors.transformed(q₀, Bijectors.Shift(zeros(T,2)) ∘ Bijectors.Scale(ones(T, 2))) ``` -To train the Gaussian VI targeting at distirbution $p$ via ELBO maiximization, we can run -```@julia -using NormalizingFlows + +To train the Gaussian VI targeting distribution `p` via ELBO maximization, run: + +```julia +using NormalizingFlows, Optimisers sample_per_iter = 10 flow_trained, stats, _ = train_flow( @@ -29,34 +31,44 @@ flow_trained, stats, _ = train_flow( flow, logp, sample_per_iter; - max_iters=2_000, - optimiser=Optimisers.ADAM(0.01 * one(T)), + max_iters = 2_000, + optimiser = Optimisers.ADAM(0.01 * one(T)), ) ``` -## Variational Objectives -We have implemented two variational objectives, namely, ELBO and the log-likelihood objective. -Users can also define their own objective functions, and pass it to the [`train_flow`](@ref) function. -`train_flow` will optimize the flow parameters by maximizing `vo`. -The objective function should take the following general form: -```julia -vo(rng, flow, args...) + +## Coupling-based flows (default constructors) + +These helpers construct commonly used coupling-based flows with sensible defaults. + +```@docs +NormalizingFlows.realnvp +NormalizingFlows.nsf +NormalizingFlows.RealNVP_layer +NormalizingFlows.NSF_layer +NormalizingFlows.AffineCoupling +NormalizingFlows.NeuralSplineCoupling +NormalizingFlows.create_flow ``` -where `rng` is the random number generator, `flow` is the flow object, and `args...` are the -additional arguments that users can pass to the objective function. -#### Evidence Lower Bound (ELBO) -By maximizing the ELBO, it is equivalent to minimizing the reverse KL divergence between $q_\theta$ and $p$, i.e., -```math +## Variational Objectives + +We provide ELBO (reverse KL) and expected log-likelihood (forward KL). 
You can also +supply your own objective with the signature `vo(rng, flow, args...)`. + +### Evidence Lower Bound (ELBO) + +By maximizing the ELBO, it is equivalent to minimizing the reverse KL divergence between $q_\theta$ and $p$: + +```math \begin{aligned} &\min _{\theta} \mathbb{E}_{q_{\theta}}\left[\log q_{\theta}(Z)-\log p(Z)\right] \quad \text{(Reverse KL)}\\ & = \max _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(F_n \circ \cdots \circ -F_1(X)\right)\right] \quad \text{(ELBO)} +F_1(X)\right)\right] \quad \text{(ELBO)} \end{aligned} ``` -Reverse KL minimization is typically used for **Bayesian computation**, -where one only has access to the log-(unnormalized)density of the target distribution $p$ (e.g., a Bayesian posterior distribution), -and hope to generate approximate samples from it. + +Reverse KL minimization is typically used for Bayesian computation when only `logp` is available. ```@docs NormalizingFlows.elbo @@ -66,24 +78,23 @@ NormalizingFlows.elbo NormalizingFlows.elbo_batch ``` -#### Log-likelihood +### Log-likelihood + +By maximizing the log-likelihood, it is equivalent to minimizing the forward KL divergence between $q_\theta$ and $p$: -By maximizing the log-likelihood, it is equivalent to minimizing the forward KL divergence between $q_\theta$ and $p$, i.e., -```math +```math \begin{aligned} & \min_{\theta} \mathbb{E}_{p}\left[\log q_{\theta}(Z)-\log p(Z)\right] \quad \text{(Forward KL)} \\ & = \max_{\theta} \mathbb{E}_{p}\left[\log q_{\theta}(Z)\right] \quad \text{(Expected log-likelihood)} \end{aligned} ``` -Forward KL minimization is typically used for **generative modeling**, -where one is given a set of samples from the target distribution $p$ (e.g., images) -and aims to learn the density or a generative process that outputs high quality samples. + +Forward KL minimization is typically used for generative modeling when samples from `p` are given. ```@docs NormalizingFlows.loglikelihood ``` - ## Training Loop ```@docs diff --git a/docs/src/example.md b/docs/src/example.md index 01a9a67..2812725 100644 --- a/docs/src/example.md +++ b/docs/src/example.md @@ -1,38 +1,40 @@ ## Example: Using Planar Flow -Here we provide a minimal demonstration of learning a synthetic 2d banana distribution -using *planar flows* (Renzende *et al.* 2015) by maximizing the [Evidence Lower Bound (ELBO)](@ref). +Here we provide a minimal demonstration of learning a synthetic 2D banana distribution +using planar flows (Rezende and Mohamed, 2015) by maximizing the ELBO. To complete this task, the two key inputs are: - the log-density function of the target distribution, - the planar flow. -#### The Target Distribution +- the log-density function of the target distribution +- the planar flow + +### The Target Distribution + +The `Banana` object is defined in `example/targets/banana.jl` (see the source for details). -The `Banana` object is defined in `example/targets/banana.jl`, see the [source code](https://github.com/zuhengxu/NormalizingFlows.jl/blob/main/example/targets/banana.jl) for details. 
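If you only want to follow along without the example folder, a minimal self-contained sketch of a banana-shaped log-density (a volume-preserving shear of a Gaussian; the actual `Banana` type may use a different parameterization) is:

```julia
using Distributions

# banana-shaped 2D log-density: a sheared Gaussian
# `b` controls the curvature, `s` the spread of the first coordinate
function banana_logpdf(x::AbstractVector; b=0.1, s=100.0)
    z1 = x[1]                        # the shear leaves the first coordinate unchanged
    z2 = x[2] - b * x[1]^2 + s * b   # undo the shear on the second coordinate
    # the shear has unit Jacobian determinant, so no correction term is needed
    return logpdf(Normal(0, sqrt(s)), z1) + logpdf(Normal(0, 1), z2)
end

banana_logpdf([0.0, 0.0])  # evaluate at the origin
```

The snippet below uses the `Banana` type shipped with the repository instead: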
```julia p = Banana(2, 1.0f-1, 100.0f0) logp = Base.Fix1(logpdf, p) ``` -Visualize the contour of the log-density and the sample scatters of the target distribution: -![Banana](banana.png) +Visualize the contour of the log-density and the sample scatters of the target distribution: +![Banana](banana.png) +### The Planar Flow -#### The Planar Flow +The planar flow is defined by repeatedly applying a sequence of invertible +transformations to a base distribution $q_0$. The building blocks for a planar flow +of length $N$ are the following invertible transformations, called planar layers: -The planar flow is defined by repeated applying a sequence of invertible -transformations to a base distribution $q_0$. The building blocks for a planar flow -of length $N$ are the following invertible transformations, called *planar layers*: ```math -\text{planar layers}: -T_{n, \theta_n}(x)=x+u_n \cdot \tanh \left(w_n^T x+b_n\right), \quad n=1, \ldots, N, +T_{n, \theta_n}(x)=x+u_n \cdot \tanh \left(w_n^T x+b_n\right), \quad n=1, \ldots, N. ``` -where $\theta_n = (u_n, w_n, b_n), n=1, \dots, N$ are the parameters to be learned. -Thankfully, [`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) -provides a nice framework to define a normalizing flow. -Here we used the `PlanarLayer()` from `Bijectors.jl` to construct a -20-layer planar flow, of which the base distribution is a 2d standard Gaussian distribution. + +Here $\theta_n = (u_n, w_n, b_n), n=1, \dots, N$ are the parameters to be learned. +[`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) provides `PlanarLayer()`. +Below is a 20-layer planar flow on a 2D standard Gaussian base distribution. ```julia using Bijectors, FunctionChains @@ -51,8 +53,9 @@ q₀ = MvNormal(zeros(Float32, 2), I) flow = create_planar_flow(20, q₀) flow_untrained = deepcopy(flow) # keep a copy of the untrained flow for comparison ``` -*Notice that here the flow layers are chained together using `fchain` function from [`FunctionChains.jl`](https://github.com/oschulz/FunctionChains.jl). -Alternatively, one can do* + +Notice: Using `fchain` (FunctionChains.jl) reduces compilation time versus chaining with `∘` for many layers. + ```julia ts = reduce(∘, [f32(PlanarLayer(d)) for i in 1:20]) ``` diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index d453cc2..4d400e1 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -21,11 +21,11 @@ export train_flow, elbo, elbo_batch, loglikelihood Train the given normalizing flow `flow` by calling `optimize`. -# Arguments -- `rng::AbstractRNG`: random number generator -- `vo`: variational objective -- `flow`: normalizing flow to be trained, we recommend to define flow as `<:Bijectors.TransformedDistribution` -- `args...`: additional arguments for `vo` +Arguments +- `rng::AbstractRNG`: random number generator (default: `Random.default_rng()`) +- `vo`: objective with signature `vo(rng, flow, args...)` +- `flow`: a `Bijectors.TransformedDistribution` (recommended) +- `args...`: additional arguments passed to `vo` # Keyword Arguments - `max_iters::Int=1000`: maximum number of iterations diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index 86374ce..edbd850 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -121,19 +121,23 @@ end """ - NSF_layer(dims, hdims; paramtype = Float64) -Default constructor of single layer of Neural Spline Flow (NSF) -which is a composition of 2 neural spline coupling transformations with complementary masks. 
-The masking strategy is odd-even masking. -# Arguments + NSF_layer(dims, hdims, K, B; paramtype = Float64) + +Default constructor of a single layer of Neural Spline Flow (NSF), which is a +composition of two neural spline coupling transformations with complementary +odd–even masks. + +Arguments - `dims::Int`: dimension of the problem -- `hdims::AbstractVector{Int}`: dimension of hidden units for s and t -- `K::Int`: number of knots -- `B::AbstractFloat`: bound of the knots -# Keyword Arguments -- `paramtype::Type{T} = Float64`: type of the parameters, defaults to `Float64` -# Returns -- A `Bijectors.Bijector` representing the NSF layer. +- `hdims::AbstractVector{Int}`: hidden sizes of the MLP used to parameterize the spline +- `K::Int`: number of knots for the rational quadratic spline +- `B::AbstractFloat`: boundary for the spline domain + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type + +Returns +- A `Bijectors.Bijector` representing the NSF layer """ function NSF_layer( dims::T1, # dimension of problem @@ -152,6 +156,27 @@ function NSF_layer( return reduce(∘, (nsf1, nsf2)) end +""" + nsf(q0, hdims, K, B, nlayers; paramtype = Float64) + +Default constructor of Neural Spline Flow (NSF), which composes `nlayers` NSF_layer +blocks with odd-even masking. + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g., `MvNormal(zeros(d), I)`). +- `hdims::AbstractVector{Int}`: hidden layer sizes of the coupling networks. +- `K::Int`: number of spline knots. +- `B::AbstractFloat`: boundary range for spline knots. +- `nlayers::Int`: number of NSF_layer blocks. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type (e.g., `Float32` for GPU-friendly). + +Returns +- `Bijectors.MultivariateTransformed` representing the NSF flow. + +Use the shorthand `nsf(q0)` to construct a default configuration. +""" function nsf( q0::Distribution{Multivariate,Continuous}, hdims::AbstractVector{Int}, # dimension of hidden units for s and t diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index 0d1e6a7..d4837f1 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -1,9 +1,12 @@ """ -Default constructor of Affine Coupling flow layer +Affine coupling layer used in RealNVP. -following the general architecture as Eq(3) in [^AD2025] +Implements two subnetworks `s` (scale, exponentiated) and `t` (shift) applied to +one partition of the input, conditioned on the complementary partition. The +scale network uses `tanh` on its output before exponentiation to improve +stability during training. -[^AD2025]: Agrawal, J., & Domke, J. (2025). Disentangling impact of capacity, objective, batchsize, estimators, and step-size on flow VI. In *AISTATS* +See also: Dinh et al., 2016 (RealNVP). """ struct AffineCoupling <: Bijectors.Bijector dim::Int @@ -119,19 +122,18 @@ end """ RealNVP_layer(dims, hdims; paramtype = Float64) -Default constructor of single layer of realnvp flow, -which is a composition of 2 affine coupling transformations with complementary masks. -The masking strategy is odd-even masking. +Construct a single RealNVP layer using two affine coupling bijections with +odd–even masks. 
-# Arguments -- `dims::Int`: dimension of the problem -- `hdims::AbstractVector{Int}`: dimension of hidden units for s and t +Arguments +- `dims::Int`: dimensionality of the target distribution +- `hdims::AbstractVector{Int}`: hidden sizes for the `s` and `t` MLPs -# Keyword Arguments -- `paramtype::Type{T} = Float64`: type of the parameters, defaults to `Float64` +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type -# Returns -- A `Bijectors.Bijector` representing the RealNVP layer. +Returns +- A `Bijectors.Bijector` representing the RealNVP layer """ function RealNVP_layer( dims::Int, # dimension of problem @@ -149,22 +151,24 @@ function RealNVP_layer( end """ - realnvp(q0, dims, hdims, nlayers; paramtype = Float64) + realnvp(q0, hdims, nlayers; paramtype = Float64) + realnvp(q0; paramtype = Float64) -Default constructor of RealNVP flow, which is a composition of `nlayers` RealNVP_layer. -# Arguments -- `q0::Distribution{Multivariate,Continuous}`: reference distribution, e.g. `MvNormal(zeros(dims), I)` -- `dims::Int`: dimension of problem -- `hdims::AbstractVector{Int}`: dimension of hidden units for s and t -- `nlayers::Int`: number of RealNVP_layer -# Keyword Arguments -- `paramtype::Type{T} = Float64`: type of the parameters, defaults to `Float64` +Construct a RealNVP flow by stacking `nlayers` RealNVP_layer blocks with +odd–even masking. The no-argument variant uses 10 layers with `[32, 32]` +hidden sizes per coupling network. -# Returns -- A `Bijectors.MultivariateTransformed` representing the RealNVP flow. +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g. `MvNormal(zeros(d), I)`) +- `hdims::AbstractVector{Int}`: hidden sizes for the `s` and `t` MLPs +- `nlayers::Int`: number of RealNVP layers -""" +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type +Returns +- `Bijectors.MultivariateTransformed` representing the RealNVP flow +""" function realnvp( q0::Distribution{Multivariate,Continuous}, hdims::AbstractVector{Int}, # dimension of hidden units for s and t diff --git a/src/flows/utils.jl b/src/flows/utils.jl index 1ff9a69..e18e06b 100644 --- a/src/flows/utils.jl +++ b/src/flows/utils.jl @@ -1,6 +1,29 @@ using Bijectors: transformed using Flux +""" + create_flow(layers, q0) + +Construct a normalizing flow by composing the provided bijector layers and +attaching them to the base distribution `q0`. + +- `layers`: an iterable of `Bijectors.Bijector` objects that are composed in order + (left-to-right) via function composition. +- `q0`: the base distribution (e.g., `MvNormal(zeros(d), I)`). + +Returns a `Bijectors.TransformedDistribution` representing the resulting flow. + +Example + + using Distributions + q0 = MvNormal(zeros(2), I) + flow = create_flow((Bijectors.Scale([1.0, 2.0]), Bijectors.Shift([0.0, 1.0])), q0) +""" +function create_flow(Ls, q₀) + ts = reduce(∘, Ls) + return transformed(q₀, ts) +end + """ mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux.leakyrelu) @@ -73,9 +96,4 @@ function fnn( m = Chain(layers...) 
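    # convert every parameter in the chain (weights and biases) to the requested
    # element type `paramtype` (e.g. Float32) before returning the model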
return Flux._paramtype(paramtype, m) -end - -function create_flow(Ls, q₀) - ts = reduce(∘, Ls) - return transformed(q₀, ts) end \ No newline at end of file diff --git a/src/objectives/elbo.jl b/src/objectives/elbo.jl index 6a4d4ba..d29933e 100644 --- a/src/objectives/elbo.jl +++ b/src/objectives/elbo.jl @@ -7,10 +7,11 @@ function elbo_single_sample(flow::Bijectors.TransformedDistribution, logp, x) end """ - elbo(flow, logp, xs) - elbo([rng, ]flow, logp, n_samples) + elbo(flow, logp, xs) + elbo([rng, ] flow, logp, n_samples) -Compute the ELBO for a batch of samples `xs` from the reference distribution `flow.dist`. +Monte Carlo estimates of the ELBO from a batch of samples `xs` from the +reference distribution `flow.dist`. # Arguments - `rng`: random number generator @@ -46,25 +47,20 @@ end """ - elbo_batch(flow, logp, xs) - elbo_batch([rng, ]flow, logp, n_samples) + elbo_batch(flow, logp, xs) + elbo_batch([rng, ] flow, logp, n_samples) -Instead of broadcasting over elbo_single_sample, this function directly -computes the ELBO in a batched manner, which requires the flow.transform to be able to -handle batched transformation directly. +Batched ELBO evaluation that transforms a matrix of samples in one call. This +is more efficient for invertible neural-network flows (RealNVP/NSF) as it leverages +the batched operation of the neural networks. -This will be more efficient than `elbo` for invertible neural networks such as RealNVP, -Neural Spline Flow, etc. - -# Arguments -- `rng`: random number generator -- `flow`: variational distribution to be trained. In particular - `flow = transformed(q₀, T::Bijectors.Bijector)`, - q₀ is a reference distribution that one can easily sample and compute logpdf -- `logp`: log-pdf of the target distribution (not necessarily normalized) -- `xs`: samples from reference dist q₀ -- `n_samples`: number of samples from reference dist q₀ +Inputs +- `flow::Bijectors.MultivariateTransformed` +- `logp`: function returning log-density of target +- `xs` or `n_samples`: column-wise sample batch or number of samples +Returns +- Scalar estimate of the ELBO """ function elbo_batch(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) # requires the flow transformation to be able to handle batched inputs From 4e6bfbe9af7e7d0a6a3b9226c7fc92acf38235c5 Mon Sep 17 00:00:00 2001 From: zuhengxu Date: Fri, 8 Aug 2025 14:43:41 -0700 Subject: [PATCH 33/45] wip doc build erro --- docs/make.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 5bb4df0..304e983 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -7,10 +7,9 @@ DocMeta.setdocmeta!( makedocs(; modules=[NormalizingFlows], + repo="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", sitename="NormalizingFlows.jl", - format=Documenter.HTML(; - repolink="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", - ), + format=Documenter.HTML(), pages=[ "Home" => "index.md", "API" => "api.md", @@ -18,4 +17,4 @@ makedocs(; "Customize your own flow layer" => "customized_layer.md", ], checkdocs=:exports, -) \ No newline at end of file +) From eb1966480c95bc660582a11d881e00f5480d664f Mon Sep 17 00:00:00 2001 From: zuhengxu Date: Fri, 8 Aug 2025 15:12:43 -0700 Subject: [PATCH 34/45] updating docs --- docs/make.jl | 7 +++++-- docs/src/index.md | 8 +++----- src/flows/realnvp.jl | 7 ++----- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 304e983..f8f8496 100644 --- a/docs/make.jl 
+++ b/docs/make.jl @@ -1,15 +1,18 @@ using NormalizingFlows using Documenter +using Random +using Distributions + DocMeta.setdocmeta!( NormalizingFlows, :DocTestSetup, :(using NormalizingFlows); recursive=true ) makedocs(; modules=[NormalizingFlows], - repo="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", sitename="NormalizingFlows.jl", - format=Documenter.HTML(), + repo="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", + format=Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"), pages=[ "Home" => "index.md", "API" => "api.md", diff --git a/docs/src/index.md b/docs/src/index.md index f685b8b..b708d88 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -4,7 +4,7 @@ CurrentModule = NormalizingFlows # NormalizingFlows.jl -Documentation for [NormalizingFlows](https://github.com/TuringLang/NormalizingFlows.jl). +Documentation for [NormalizingFlows.jl](https://github.com/TuringLang/NormalizingFlows.jl). The purpose of this package is to provide a simple and flexible interface for @@ -15,8 +15,6 @@ construct (e.g., define customized flow layers) and combine various components for variational approximation of general target distributions, *without being tied to specific probabilistic programming frameworks or applications*. -See the [documentation](https://turinglang.org/NormalizingFlows.jl/dev/) for more. - ## Installation To install the package, run the following command in the Julia REPL: ``` @@ -67,7 +65,7 @@ optimization problems: \text{Reverse KL:}\quad &\argmin _{\theta} \mathbb{E}_{q_{\theta}}\left[\log q_{\theta}(Z)-\log p(Z)\right] \\ &= \argmin _{\theta} \mathbb{E}_{q_0}\left[\log \frac{q_\theta(T_N\circ \cdots \circ T_1(Z_0))}{p(T_N\circ \cdots \circ T_1(Z_0))}\right] \\ -&= \argmax _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(F_n \circ \cdots \circ F_1(X)\right)\right] +&= \argmax _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(T_n \circ \cdots \circ T_1(X)\right)\right] \end{aligned} ``` and @@ -82,4 +80,4 @@ Both problems can be solved via standard stochastic optimization algorithms, such as stochastic gradient descent (SGD) and its variants. - +For a detailed introduction of normalizing flows, please refer to diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index d4837f1..2817619 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -186,12 +186,9 @@ end Default constructor of RealNVP with 10 layers, each coupling function has 2 hidden layers with 32 units. -Following the general architecture as in [^ASD2020] (see Apdx. E). +Following the general architecture as in the Apdx. E of [^ASD2020]. - -[^ASD2020]: Agrawal, A., & Sheldon, D., & Domke, J. (2020). -Advances in Black-Box VI: Normalizing Flows, Importance Weighting, and Optimization. -In *NeurIPS*. +[^ASD2020]: Agrawal, A., & Sheldon, D., & Domke, J. (2020). Advances in Black-Box VI: Normalizing Flows, Importance Weighting, and Optimization. In *NeurIPS*. 
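A minimal usage sketch (assuming a 2D standard-Gaussian base distribution):

    using Distributions, LinearAlgebra
    q0 = MvNormal(zeros(2), I)   # base distribution
    flow = realnvp(q0)           # 10 layers, two hidden layers of 32 units each
    x = rand(flow, 16)           # 16 samples, stored column-wise
    lp = logpdf(flow, x)         # log-density of the flow at each sample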
""" realnvp(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = realnvp( q0, [32, 32], 10; paramtype=paramtype From ae531003c216dfe53dfbcac512efb1fe63632780 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Fri, 8 Aug 2025 15:36:56 -0700 Subject: [PATCH 35/45] update gha perms --- .github/workflows/Docs.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/Docs.yml b/.github/workflows/Docs.yml index 64d5e3d..7a98981 100644 --- a/.github/workflows/Docs.yml +++ b/.github/workflows/Docs.yml @@ -15,7 +15,7 @@ concurrency: permissions: contents: write - pull-requests: read + pull-requests: write jobs: docs: @@ -25,9 +25,10 @@ jobs: - name: Build and deploy Documenter.jl docs uses: TuringLang/actions/DocsDocumenter@main - - run: | - julia --project=docs -e ' - using Documenter: DocMeta, doctest - using NormalizingFlows - DocMeta.setdocmeta!(NormalizingFlows, :DocTestSetup, :(using NormalizingFlows); recursive=true) - doctest(NormalizingFlows)' + - name: Run doctests + shell: julia --project=docs --color=yes {0} + run: | + using Documenter: DocMeta, doctest + using NormalizingFlows + DocMeta.setdocmeta!(NormalizingFlows, :DocTestSetup, :(using NormalizingFlows); recursive=true) + doctest(NormalizingFlows) From b8b229b0274c13fb5a7cd571716324e4c8c14cf4 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Fri, 8 Aug 2025 15:37:06 -0700 Subject: [PATCH 36/45] minor ed on docs --- docs/src/api.md | 4 ++-- example/demo_neural_spline_flow.jl | 2 +- src/objectives/elbo.jl | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 4b9b566..e5c25a5 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -63,8 +63,8 @@ By maximizing the ELBO, it is equivalent to minimizing the reverse KL divergence \begin{aligned} &\min _{\theta} \mathbb{E}_{q_{\theta}}\left[\log q_{\theta}(Z)-\log p(Z)\right] \quad \text{(Reverse KL)}\\ & = \max _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ -T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(F_n \circ \cdots \circ -F_1(X)\right)\right] \quad \text{(ELBO)} +T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(T_n \circ \cdots \circ +T_1(X)\right)\right] \quad \text{(ELBO)} \end{aligned} ``` diff --git a/example/demo_neural_spline_flow.jl b/example/demo_neural_spline_flow.jl index 8fea9bf..a53ed38 100644 --- a/example/demo_neural_spline_flow.jl +++ b/example/demo_neural_spline_flow.jl @@ -48,7 +48,7 @@ flow_trained, stats, _ = train_flow( flow, logp, sample_per_iter; - max_iters=10000, # change to larger number of iterations (e.g., 50_000) for better results + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results optimiser=Optimisers.Adam(1e-4), ADbackend=adtype, show_progress=true, diff --git a/src/objectives/elbo.jl b/src/objectives/elbo.jl index d29933e..f7ba355 100644 --- a/src/objectives/elbo.jl +++ b/src/objectives/elbo.jl @@ -50,8 +50,9 @@ end elbo_batch(flow, logp, xs) elbo_batch([rng, ] flow, logp, n_samples) -Batched ELBO evaluation that transforms a matrix of samples in one call. This -is more efficient for invertible neural-network flows (RealNVP/NSF) as it leverages +Batched ELBO estimates that transforms a matrix of samples (each column represents a single +sample) in one call. +This is more efficient for invertible neural-network flows (RealNVP/NSF) as it leverages the batched operation of the neural networks. 
Inputs From 2d5b9c5c7b9ca009cb2642d99a24b72523ce9d34 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Fri, 8 Aug 2025 22:42:26 -0700 Subject: [PATCH 37/45] update docs --- docs/make.jl | 7 +- docs/src/NSF.md | 47 ++++++++++ docs/src/PlanarFlow.md | 135 +++++++++++++++++++++++++++++ docs/src/RealNVP.md | 54 ++++++++++++ docs/src/api.md | 82 ++++++++---------- docs/src/customized_layer.md | 9 +- docs/src/example.md | 125 -------------------------- docs/src/index.md | 2 +- docs/src/usage.md | 40 +++++++++ example/demo_RealNVP.jl | 5 +- example/demo_neural_spline_flow.jl | 6 +- src/NormalizingFlows.jl | 5 +- src/flows/neuralspline.jl | 93 ++++++++++++++------ src/flows/realnvp.jl | 77 +++++++++++----- 14 files changed, 448 insertions(+), 239 deletions(-) create mode 100644 docs/src/NSF.md create mode 100644 docs/src/PlanarFlow.md create mode 100644 docs/src/RealNVP.md delete mode 100644 docs/src/example.md create mode 100644 docs/src/usage.md diff --git a/docs/make.jl b/docs/make.jl index f8f8496..e402e10 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,8 +15,13 @@ makedocs(; format=Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"), pages=[ "Home" => "index.md", + "General usage" => "usage.md", "API" => "api.md", - "Example" => "example.md", + "Example" => [ + "Planar Flow" => "PlanarFlow.md", + "RealNVP" => "RealNVP.md", + "Neural Spline Flow" => "NSF.md", + ], "Customize your own flow layer" => "customized_layer.md", ], checkdocs=:exports, diff --git a/docs/src/NSF.md b/docs/src/NSF.md new file mode 100644 index 0000000..df03a41 --- /dev/null +++ b/docs/src/NSF.md @@ -0,0 +1,47 @@ +# Demo of NSF on 2D Banana Distribution + +```julia +using Random, Distributions, LinearAlgebra +using Functors +using Optimisers, ADTypes +using Zygote +using NormalizingFlows + + +target = Banana(2, one(T), 100one(T)) +logp = Base.Fix1(logpdf, target) + +###################################### +# learn the target using Neural Spline Flow +###################################### +@leaf MvNormal +q0 = MvNormal(zeros(T, 2), I) + + +flow = nsf(q0; paramtype=T) +flow_untrained = deepcopy(flow) +###################################### +# start training +###################################### +sample_per_iter = 64 + +# callback function to log training progress +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) +# nsf only supports AutoZygote +adtype = ADTypes.AutoZygote() +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 +flow_trained, stats, _ = train_flow( + elbo_batch, + flow, + logp, + sample_per_iter; + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(1e-4), + ADbackend=adtype, + show_progress=true, + callback=cb, + hasconverged=checkconv, +) +θ, re = Optimisers.destructure(flow_trained) +losses = map(x -> x.loss, stats) +``` \ No newline at end of file diff --git a/docs/src/PlanarFlow.md b/docs/src/PlanarFlow.md new file mode 100644 index 0000000..696f433 --- /dev/null +++ b/docs/src/PlanarFlow.md @@ -0,0 +1,135 @@ +# Planar Flow on a 2D Banana Distribution + +This example demonstrates learning a synthetic 2D banana distribution with a planar normalizing flow [^RM2015] by maximizing the Evidence Lower BOund (ELBO). + +The two required ingredients are: + +- A log-density function `logp` for the target distribution. +- A parametrised invertible transformation (the planar flow) applied to a simple base distribution. 
+ +## Target Distribution + +The banana target used here is defined in `example/targets/banana.jl` (see source for details): + +```julia +using Random, Distributions +Random.seed!(123) + +target = Banana(2, 1.0, 10.0) # (dimension, nonlinearity, scale) +logp = Base.Fix1(logpdf, target) +``` + +You can visualise its contour and samples (figure shipped as `banana.png`). + +![Banana](banana.png) + +## Planar Flow + +A planar flow of length N applies a sequence of planar layers to a base distribution q₀: + +```math +T_{n,\theta_n}(x) = x + u_n \tanh(w_n^T x + b_n), \qquad n = 1,\ldots,N. +``` + +Parameters θₙ = (uₙ, wₙ, bₙ) are learned. `Bijectors.jl` provides `PlanarLayer`. + +```julia +using Bijectors +using Functors # for @leaf + +function create_planar_flow(n_layers::Int, q₀) + d = length(q₀) + Ls = [PlanarLayer(d) for _ in 1:n_layers] + ts = reduce(∘, Ls) # alternatively: FunctionChains.fchain(Ls) + return transformed(q₀, ts) +end + +@leaf MvNormal # prevent updating base distribution parameters +q₀ = MvNormal(zeros(2), ones(2)) +flow = create_planar_flow(10, q₀) +flow_untrained = deepcopy(flow) # keep copy for comparison +``` + +If you build *many* layers (e.g. > ~30) you may reduce compilation time by using `FunctionChains.jl`: + +```julia +# uncomment the following lines to use FunctionChains +# using FunctionChains +# ts = fchain([PlanarLayer(d) for _ in 1:n_layers]) +``` +See [this comment](https://github.com/TuringLang/NormalizingFlows.jl/blob/8f4371d48228adf368d851e221af076ff929f1cf/src/NormalizingFlows.jl#L52) +for how the compilation time might be a concern. + +## Training the Flow + +We maximize the ELBO (here using the minibatch estimator `elbo_batch`) with the generic `train_flow` interface. + +```julia +using NormalizingFlows +using ADTypes, Optimisers +using Mooncake + +sample_per_iter = 32 +adtype = ADTypes.AutoMooncake(; config=Mooncake.Config()) # try AutoZygote() / AutoForwardDiff() / etc. +# optional: callback function to track the batch size per iteration and the AD backend used +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter, ad=adtype) +# optional: defined stopping criteria when the gradient norm is less than 1e-3 +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < 1e-3 + +flow_trained, stats, _ = train_flow( + elbo_batch, + flow, + logp, + sample_per_iter; + max_iters = 20_000, + optimiser = Optimisers.Adam(1e-2), + ADbackend = adtype, + callback = cb, + hasconverged = checkconv, + show_progress = false, +) + +losses = map(x -> x.loss, stats) +``` + +Plot the losses (negative ELBO): + +```julia +using Plots +plot(losses; xlabel = "iteration", ylabel = "negative ELBO", label = "", lw = 2) +``` + +![elbo](elbo.png) + +## Evaluating the Trained Flow + +The trained flow is a `Bijectors.TransformedDistribution`, so we can call `rand` to draw iid samples and call `logpdf` to evaluate the log-density function of the flow. +See [documentation of `Bijectors.jl`](https://turinglang.org/Bijectors.jl/dev/distributions/) for details. 
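As a quick sanity check of the learned density (a small sketch; `flow_trained` is the object returned by `train_flow` above):

```julia
# evaluate the learned log-density at a few of its own samples
xs = rand(flow_trained, 5)                                      # 5 samples, one per column
logq = [logpdf(flow_trained, xs[:, i]) for i in 1:size(xs, 2)]
```

To compare the flow against the target visually, draw larger sample sets: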
+```julia +n_samples = 1_000 +samples_trained = rand(flow_trained, n_samples) +samples_untrained = rand(flow_untrained, n_samples) +samples_true = rand(target, n_samples) +``` + +Simple visual comparison: + +```julia +using Plots +scatter(samples_true[1, :], samples_true[2, :]; label="Target", ms=2, alpha=0.5) +scatter!(samples_untrained[1, :], samples_untrained[2, :]; label="Untrained", ms=2, alpha=0.5) +scatter!(samples_trained[1, :], samples_trained[2, :]; label="Trained", ms=2, alpha=0.5) +plot!(title = "Planar Flow: Before vs After Training", xlabel = "x₁", ylabel = "x₂", legend = :topleft) +``` + +![compare](comparison.png) + +## Notes + +- Use `elbo` instead of `elbo_batch` for a single-sample estimator. +- Switch AD backends by changing `adtype` (see `ADTypes.jl`). +- Marking the base distribution with `@leaf` prevents its parameters from being updated during training. + +## Reference + +[^RM2015]: Rezende, D. & Mohamed, S. (2015). Variational Inference with Normalizing Flows. ICML. diff --git a/docs/src/RealNVP.md b/docs/src/RealNVP.md new file mode 100644 index 0000000..53fd2c5 --- /dev/null +++ b/docs/src/RealNVP.md @@ -0,0 +1,54 @@ +# Demo of RealNVP on 2D Banana Distribution + +```julia +using Random, Distributions, LinearAlgebra +using Functors +using Optimisers, ADTypes +using Mooncake +using NormalizingFlows + + +target = Banana(2, one(T), 100one(T)) +logp = Base.Fix1(logpdf, target) + +###################################### +# set up the RealNVP +###################################### +@leaf MvNormal +q0 = MvNormal(zeros(T, 2), I) + +d = 2 +hdims = [16, 16] +nlayers = 3 + +# use NormalizingFlows.realnvp to create a RealNVP flow +flow = realnvp(q0, hdims, nlayers; paramtype=T) +flow_untrained = deepcopy(flow) + + +###################################### +# start training +###################################### +sample_per_iter = 16 + +# callback function to log training progress +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) +adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) + +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 +flow_trained, stats, _ = train_flow( + rng, + elbo, # using elbo_batch instead of elbo achieves 4-5 times speedup + flow, + logp, + sample_per_iter; + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(5e-4), + ADbackend=adtype, + show_progress=true, + callback=cb, + hasconverged=checkconv, +) +θ, re = Optimisers.destructure(flow_trained) +losses = map(x -> x.loss, stats) +``` \ No newline at end of file diff --git a/docs/src/api.md b/docs/src/api.md index e5c25a5..de7530c 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -3,53 +3,6 @@ ```@index ``` -## Main Function - -```@docs -NormalizingFlows.train_flow -``` - -The flow object can be constructed by `transformed` function in `Bijectors.jl`. 
-For example, for Gaussian VI, we can construct the flow as follows: - -```julia -using Distributions, Bijectors -T = Float32 -@leaf MvNormal # to prevent params in q₀ from being optimized -q₀ = MvNormal(zeros(T, 2), ones(T, 2)) -flow = Bijectors.transformed(q₀, Bijectors.Shift(zeros(T,2)) ∘ Bijectors.Scale(ones(T, 2))) -``` - -To train the Gaussian VI targeting distribution `p` via ELBO maximization, run: - -```julia -using NormalizingFlows, Optimisers - -sample_per_iter = 10 -flow_trained, stats, _ = train_flow( - elbo, - flow, - logp, - sample_per_iter; - max_iters = 2_000, - optimiser = Optimisers.ADAM(0.01 * one(T)), -) -``` - -## Coupling-based flows (default constructors) - -These helpers construct commonly used coupling-based flows with sensible defaults. - -```@docs -NormalizingFlows.realnvp -NormalizingFlows.nsf -NormalizingFlows.RealNVP_layer -NormalizingFlows.NSF_layer -NormalizingFlows.AffineCoupling -NormalizingFlows.NeuralSplineCoupling -NormalizingFlows.create_flow -``` - ## Variational Objectives We provide ELBO (reverse KL) and expected log-likelihood (forward KL). You can also @@ -100,3 +53,38 @@ NormalizingFlows.loglikelihood ```@docs NormalizingFlows.optimize ``` + + +## Available Flows + +`NormalizingFlows.jl` provides two commonly used normalizing flows: `RealNVP` and +`Neural Spline Flow (NSF)`. + +### RealNVP (Affine Coupling Flow) + +These helpers construct commonly used coupling-based flows with sensible defaults. + +```@docs +NormalizingFlows.realnvp +NormalizingFlows.RealNVP_layer +NormalizingFlows.AffineCoupling +``` + +### Neural Spline Flow (NSF) + +```@docs +NormalizingFlows.nsf +NormalizingFlows.NSF_layer +NormalizingFlows.NeuralSplineCoupling +``` + +## Utility Functions + +```@docs +NormalizingFlows.create_flow +``` + +```@docs +NormalizingFlows.fnn +``` + diff --git a/docs/src/customized_layer.md b/docs/src/customized_layer.md index af9062c..9fb1d1d 100644 --- a/docs/src/customized_layer.md +++ b/docs/src/customized_layer.md @@ -12,7 +12,11 @@ for more details. In this tutorial, we demonstrate how to define a customized normalizing flow -layer -- an `Affine Coupling Layer` (Dinh *et al.*, 2016) -- using `Bijectors.jl` and `Flux.jl`. +layer -- an `Affine Coupling Layer` -- using `Bijectors.jl` and `Flux.jl`, +which is the building block of the RealNVP flow [^LJS2017]. +It's worth mentioning that the [`realnvp`](@ref) implemented in `NormalizingFlows.jl` +is slightly different from this tutorial with some optimization for the training stability +and performance. ## Affine Coupling Flow @@ -176,5 +180,4 @@ logpdf(flow, x[:,1]) ## Reference -Dinh, L., Sohl-Dickstein, J. and Bengio, S., 2016. *Density estimation using real nvp.* -arXiv:1605.08803. \ No newline at end of file +[^LJS2017]: Dinh, L., Sohl-Dickstein, J. and Bengio, S., 2017. Density estimation using real nvp. in *ICLR* \ No newline at end of file diff --git a/docs/src/example.md b/docs/src/example.md deleted file mode 100644 index 2812725..0000000 --- a/docs/src/example.md +++ /dev/null @@ -1,125 +0,0 @@ -## Example: Using Planar Flow - -Here we provide a minimal demonstration of learning a synthetic 2D banana distribution -using planar flows (Rezende and Mohamed, 2015) by maximizing the ELBO. -To complete this task, the two key inputs are: -- the log-density function of the target distribution, -- the planar flow. 
- -- the log-density function of the target distribution -- the planar flow - -### The Target Distribution - -The `Banana` object is defined in `example/targets/banana.jl` (see the source for details). - -```julia -p = Banana(2, 1.0f-1, 100.0f0) -logp = Base.Fix1(logpdf, p) -``` - -Visualize the contour of the log-density and the sample scatters of the target distribution: - -![Banana](banana.png) - -### The Planar Flow - -The planar flow is defined by repeatedly applying a sequence of invertible -transformations to a base distribution $q_0$. The building blocks for a planar flow -of length $N$ are the following invertible transformations, called planar layers: - -```math -T_{n, \theta_n}(x)=x+u_n \cdot \tanh \left(w_n^T x+b_n\right), \quad n=1, \ldots, N. -``` - -Here $\theta_n = (u_n, w_n, b_n), n=1, \dots, N$ are the parameters to be learned. -[`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) provides `PlanarLayer()`. -Below is a 20-layer planar flow on a 2D standard Gaussian base distribution. - -```julia -using Bijectors, FunctionChains -using Functors - -function create_planar_flow(n_layers::Int, q₀) - d = length(q₀) - Ls = [f32(PlanarLayer(d)) for _ in 1:n_layers] - ts = fchain(Ls) - return transformed(q₀, ts) -end - -# create a 20-layer planar flow -@leaf MvNormal # to prevent params in q₀ from being optimized -q₀ = MvNormal(zeros(Float32, 2), I) -flow = create_planar_flow(20, q₀) -flow_untrained = deepcopy(flow) # keep a copy of the untrained flow for comparison -``` - -Notice: Using `fchain` (FunctionChains.jl) reduces compilation time versus chaining with `∘` for many layers. - -```julia -ts = reduce(∘, [f32(PlanarLayer(d)) for i in 1:20]) -``` -*However, we recommend using `fchain` to reduce the compilation time when the number of layers is large. -See [this comment](https://github.com/TuringLang/NormalizingFlows.jl/blob/8f4371d48228adf368d851e221af076ff929f1cf/src/NormalizingFlows.jl#L52) -for how the compilation time might be a concern.* - - -#### Flow Training -Then we can train the flow by maximizing the ELBO using the [`train_flow`](@ref) function as follows: -```julia -using NormalizingFlows -using ADTypes -using Optimisers - -sample_per_iter = 10 -# callback function to track the number of samples used per iteration -cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,) -# defined stopping criteria when the gradient norm is less than 1e-3 -checkconv(iter, stat, re, θ, st) = stat.gradient_norm < 1e-3 -flow_trained, stats, _ = train_flow( - elbo, - flow, - logp, - sample_per_iter; - max_iters=200_00, - optimiser=Optimisers.ADAM(), - callback=cb, - hasconverged=checkconv, - ADbackend=AutoZygote(), # using Zygote as the AD backend -) -``` - -Examine the loss values during training: -```julia -using Plots - -losses = map(x -> x.loss, stats) -plot(losses; xlabel = "#iteration", ylabel= "negative ELBO", label="", linewidth=2) -``` -![elbo](elbo.png) - -## Evaluating Trained Flow -Finally, we can evaluate the trained flow by sampling from it and compare it with the target distribution. -Since the flow is defined as a `Bijectors.TransformedDistribution`, one can -easily sample from it using `rand` function, or examine the density using `logpdf` function. -See [documentation of `Bijectors.jl`](https://turinglang.org/Bijectors.jl/dev/distributions/) for details. 
-```julia -using Random, Distributions - -nsample = 1000 -samples_trained = rand(flow_trained, n_samples) # 1000 iid samples from the trained flow -samples_untrained = rand(flow_untrained, n_samples) # 1000 iid samples from the untrained flow -samples_true = rand(p, n_samples) # 1000 iid samples from the target - -# plot -scatter(samples_true[1, :], samples_true[2, :]; label="True Distribution", color=:blue, markersize=2, alpha=0.5) -scatter!(samples_untrained[1, :], samples_untrained[2, :]; label="Untrained Flow", color=:red, markersize=2, alpha=0.5) -scatter!(samples_trained[1, :], samples_trained[2, :]; label="Trained Flow", color=:green, markersize=2, alpha=0.5) -plot!(title = "Comparison of Trained and Untrained Flow", xlabel = "X", ylabel= "Y", legend=:topleft) -``` -![compare](comparison.png) - - -## Reference - -- Rezende, D. and Mohamed, S., 2015. *Variational inference with normalizing flows*. International Conference on Machine Learning diff --git a/docs/src/index.md b/docs/src/index.md index b708d88..d53c2b0 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -19,7 +19,7 @@ for variational approximation of general target distributions, To install the package, run the following command in the Julia REPL: ``` ] # enter Pkg mode -(@v1.11) pkg> add git@github.com:TuringLang/NormalizingFlows.jl.git +(@v1.11) pkg> add NormalizingFlows ``` Then simply run the following command to use the package: ```julia diff --git a/docs/src/usage.md b/docs/src/usage.md new file mode 100644 index 0000000..cefb816 --- /dev/null +++ b/docs/src/usage.md @@ -0,0 +1,40 @@ +## General usage + +`train_flow` is the main function to train a normalizing flow. +The users mostly need to specify a normalizing flow `flow`, +the variational objective `vo` and its corresponding arguments `args...`. + +```@docs +NormalizingFlows.train_flow +``` + +The flow object can be constructed by `transformed` function in `Bijectors.jl`. 
+For example, for mean-field Gaussian VI, we can construct the flow family as follows: + +```julia +using Distributions, Bijectors +T = Float32 +@leaf MvNormal # to prevent params in q₀ from being optimized +q₀ = MvNormal(zeros(T, 2), ones(T, 2)) +# the flow family is defined by a shift and a scale +flow = Bijectors.transformed(q₀, Bijectors.Shift(zeros(T,2)) ∘ Bijectors.Scale(ones(T, 2))) +``` + +To train the Gaussian VI targeting distribution `p` via ELBO maximization, run: + +```julia +using NormalizingFlows, Optimisers +using ADTypes, Mooncake + +sample_per_iter = 10 +flow_trained, stats, _ = train_flow( + elbo, + flow, + logp, + sample_per_iter; + max_iters=5_000, + optimiser=Optimisers.Adam(one(T)/100), + ADbackend=ADTypes.AutoMooncake(; config=Mooncake.Config()), + show_progress=true, +) +``` \ No newline at end of file diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl index 45f4213..bf5897c 100644 --- a/example/demo_RealNVP.jl +++ b/example/demo_RealNVP.jl @@ -1,10 +1,7 @@ -using Bijectors -using Bijectors: partition, combine, PartitionMask - using Random, Distributions, LinearAlgebra using Functors using Optimisers, ADTypes -using Mooncake, Zygote +using Mooncake using NormalizingFlows include("SyntheticTargets.jl") diff --git a/example/demo_neural_spline_flow.jl b/example/demo_neural_spline_flow.jl index a53ed38..4e98c30 100644 --- a/example/demo_neural_spline_flow.jl +++ b/example/demo_neural_spline_flow.jl @@ -1,6 +1,3 @@ -using Bijectors -using Bijectors: partition, combine, PartitionMask - using Random, Distributions, LinearAlgebra using Functors using Optimisers, ADTypes @@ -39,8 +36,7 @@ sample_per_iter = 64 # callback function to log training progress cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) -# TODO: mooncake has some issues with kernelabstractions? -# adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) +# nsf only supports AutoZygote adtype = ADTypes.AutoZygote() checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 4d400e1..2eaa677 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -23,8 +23,9 @@ Train the given normalizing flow `flow` by calling `optimize`. Arguments - `rng::AbstractRNG`: random number generator (default: `Random.default_rng()`) -- `vo`: objective with signature `vo(rng, flow, args...)` -- `flow`: a `Bijectors.TransformedDistribution` (recommended) +- `vo`: variational objective with signature `vo(rng, flow, args...)`. + We implement [`elbo`](@ref), [`elbo_batch`](@ref), and [`loglikelihood`](@ref). +- `flow`: the normalizing flow---a `Bijectors.TransformedDistribution` (recommended) - `args...`: additional arguments passed to `vo` # Keyword Arguments diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index edbd850..bd57cbf 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -1,11 +1,36 @@ -# a new implementation of Neural Spline Flow based on MonotonicSplines.jl -# the construction of the RQS seems to be more efficient than the one in Bijectors.jl -# and supports batched operations. - """ -Neural Rational Quadratic Spline Coupling layer -# References -[1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML], (2019). 
+ NeuralSplineCoupling(dim, hdims, K, B, mask_idx, paramtype) + NeuralSplineCoupling(dim, K, n_dims_transferred, B, nn, mask) + +Neural Rational Quadratic Spline (RQS) coupling bijector [^DBMP2019]. + +A conditioner network takes the unchanged partition as input and outputs the +parameters of monotonic rational quadratic splines for the transformed +coordinates. Batched inputs (matrices with column vectors) are supported. + +Arguments +- `dim::Int`: total input dimension. +- `hdims::AbstractVector{Int}`: hidden sizes for the conditioner MLP. +- `K::Int`: number of spline knots per transformed coordinate. +- `B::AbstractFloat`: boundary/box constraint for spline domain. +- `mask_idx::AbstractVector{Int}`: indices of the transformed coordinates. + +Keyword Arguments +- `paramtype::Type{<:AbstractFloat}`: parameter element type. + +Fields +- `nn::Flux.Chain`: conditioner that outputs all spline params for all transformed dims. +- `mask::Bijectors.PartitionMask`: partition specification. + +Notes +- Output dimensionality of the conditioner is `(3K - 1) * n_transformed`. +- For computation performance, we rely on +[`MonotonicSplines.jl`](https://github.com/bat/MonotonicSplines.jl) for the +building the rational quadratic spline functions. +- See `MonotonicSplines.rqs_forward` and `MonotonicSplines.rqs_inverse` for forward/inverse +and log-determinant computations. + +[^DBMP2019]: Durkan, C., Bekasov, A., Murray, I. and Papamarkou, T. (2019). Neural Spline Flows. *NeurIPS.* """ struct NeuralSplineCoupling{T,A<:Flux.Chain} <: Bijectors.Bijector dim::Int # dimension of input @@ -123,21 +148,24 @@ end """ NSF_layer(dims, hdims, K, B; paramtype = Float64) -Default constructor of a single layer of Neural Spline Flow (NSF), which is a -composition of two neural spline coupling transformations with complementary -odd–even masks. +Build a single Neural Spline Flow (NSF) layer by composing two +`NeuralSplineCoupling` bijectors with complementary odd–even masks. Arguments -- `dims::Int`: dimension of the problem -- `hdims::AbstractVector{Int}`: hidden sizes of the MLP used to parameterize the spline -- `K::Int`: number of knots for the rational quadratic spline -- `B::AbstractFloat`: boundary for the spline domain +- `dims::Int`: dimensionality of the problem. +- `hdims::AbstractVector{Int}`: hidden sizes of the conditioner network. +- `K::Int`: number of spline knots. +- `B::AbstractFloat`: spline boundary. Keyword Arguments -- `paramtype::Type{T} = Float64`: parameter element type +- `paramtype::Type{T} = Float64`: parameter element type. Returns -- A `Bijectors.Bijector` representing the NSF layer +- A `Bijectors.Bijector` representing the NSF layer. + +Example +- `layer = NSF_layer(4, [64,64], 10, 3.0)` +- `y = layer(randn(4, 32))` """ function NSF_layer( dims::T1, # dimension of problem @@ -158,24 +186,35 @@ end """ nsf(q0, hdims, K, B, nlayers; paramtype = Float64) + nsf(q0; paramtype = Float64) -Default constructor of Neural Spline Flow (NSF), which composes `nlayers` NSF_layer -blocks with odd-even masking. +Construct an NSF by stacking `nlayers` `NSF_layer` blocks. The one-argument +variant defaults to 10 layers with `[32, 32]` hidden sizes, 10 knots, and +boundary `30` (scaled by `one(T)`). Arguments -- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g., `MvNormal(zeros(d), I)`). -- `hdims::AbstractVector{Int}`: hidden layer sizes of the coupling networks. -- `K::Int`: number of spline knots. -- `B::AbstractFloat`: boundary range for spline knots. 
-- `nlayers::Int`: number of NSF_layer blocks. +- `q0::Distribution{Multivariate,Continuous}`: base distribution. +- `hdims::AbstractVector{Int}`: hidden sizes of the conditioner network. +- `K::Int`: spline knots per coordinate. +- `B::AbstractFloat`: spline boundary. +- `nlayers::Int`: number of NSF layers. Keyword Arguments -- `paramtype::Type{T} = Float64`: parameter element type (e.g., `Float32` for GPU-friendly). +- `paramtype::Type{T} = Float64`: parameter element type. Returns -- `Bijectors.MultivariateTransformed` representing the NSF flow. - -Use the shorthand `nsf(q0)` to construct a default configuration. +- `Bijectors.TransformedDistribution` representing the NSF flow. + +Notes: +- Under the hood, `nsf` relies on the rational quadratic spline function implememented in +`MonotonicSplines.jl` for performance reasons. `MonotonicSplines.jl` uses +`KernelAbstractions.jl` to support batched operations. +Because of this, so far `nsf` only supports `Zygote` as the AD type. + + +Example +- `q0 = MvNormal(zeros(3), I); flow = nsf(q0, [64,64], 8, 3.0, 6)` +- `x = rand(flow, 128); lp = logpdf(flow, x)` """ function nsf( q0::Distribution{Multivariate,Continuous}, diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index 2817619..6d34925 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -1,12 +1,34 @@ """ -Affine coupling layer used in RealNVP. + AffineCoupling(dim, hdims, mask_idx, paramtype) + AffineCoupling(dim, mask, s, t) -Implements two subnetworks `s` (scale, exponentiated) and `t` (shift) applied to -one partition of the input, conditioned on the complementary partition. The -scale network uses `tanh` on its output before exponentiation to improve -stability during training. +Affine coupling bijector used in RealNVP [^LJS2017]. -See also: Dinh et al., 2016 (RealNVP). +Two subnetworks `s` (log-scale, exponentiated in the forward pass) and `t` (shift) +act on one partition of the input, conditioned on the complementary partition +(as defined by `mask`). For numerical stability, the output of `s` passes +through `tanh` before exponentiation. + +Arguments +- `dim::Int`: total dimensionality of the input. +- `hdims::AbstractVector{Int}`: hidden sizes for the conditioner MLPs `s` and `t`. +- `mask_idx::AbstractVector{Int}`: indices of the dimensions to transform. + The complement is used as the conditioner input. + +Keyword Arguments +- `paramtype::Type{<:AbstractFloat}`: parameter element type (e.g. `Float32`). + +Fields +- `mask::Bijectors.PartitionMask`: partition specification. +- `s::Flux.Chain`: conditioner producing log-scales for the transformed block. +- `t::Flux.Chain`: conditioner producing shifts for the transformed block. + +Notes +- Forward: with `(x₁,x₂,x₃) = partition(mask, x)`, compute `y₁ = x₁ .* exp.(s(x₂)) .+ t(x₂)`. +- Log-determinant: `sum(s(x₂))` (or columnwise for batched matrices). +- All methods support both vectors and column-major batches (matrices). + +[^LJS2017]: Dinh, L., Sohl-Dickstein, J. and Bengio, S. (2017). Density estimation using Real NVP. ICLR. """ struct AffineCoupling <: Bijectors.Bijector dim::Int @@ -122,18 +144,22 @@ end """ RealNVP_layer(dims, hdims; paramtype = Float64) -Construct a single RealNVP layer using two affine coupling bijections with -odd–even masks. +Construct a single RealNVP layer by composing two `AffineCoupling` bijectors +with complementary odd–even masks. 
Arguments -- `dims::Int`: dimensionality of the target distribution -- `hdims::AbstractVector{Int}`: hidden sizes for the `s` and `t` MLPs +- `dims::Int`: dimensionality of the problem. +- `hdims::AbstractVector{Int}`: hidden sizes of the conditioner networks. Keyword Arguments -- `paramtype::Type{T} = Float64`: parameter element type +- `paramtype::Type{T} = Float64`: parameter element type. Returns -- A `Bijectors.Bijector` representing the RealNVP layer +- A `Bijectors.Bijector` representing the RealNVP layer. + +Example +- `layer = RealNVP_layer(4, [64, 64])` +- `y = layer(randn(4, 16))` # batched forward """ function RealNVP_layer( dims::Int, # dimension of problem @@ -154,20 +180,24 @@ end realnvp(q0, hdims, nlayers; paramtype = Float64) realnvp(q0; paramtype = Float64) -Construct a RealNVP flow by stacking `nlayers` RealNVP_layer blocks with -odd–even masking. The no-argument variant uses 10 layers with `[32, 32]` -hidden sizes per coupling network. +Construct a RealNVP flow by stacking `nlayers` `RealNVP_layer` blocks with +odd–even masking. The 1-argument variant defaults to 10 layers with +hidden sizes `[32, 32]` per conditioner. Arguments -- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g. `MvNormal(zeros(d), I)`) -- `hdims::AbstractVector{Int}`: hidden sizes for the `s` and `t` MLPs -- `nlayers::Int`: number of RealNVP layers +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g. `MvNormal(zeros(d), I)`). +- `hdims::AbstractVector{Int}`: hidden sizes for the conditioner networks. +- `nlayers::Int`: number of stacked RealNVP layers. Keyword Arguments -- `paramtype::Type{T} = Float64`: parameter element type +- `paramtype::Type{T} = Float64`: parameter element type (use `Float32` for GPU friendliness). Returns -- `Bijectors.MultivariateTransformed` representing the RealNVP flow +- `Bijectors.TransformedDistribution` representing the RealNVP flow. + +Example +- `q0 = MvNormal(zeros(2), I); flow = realnvp(q0, [64,64], 8)` +- `x = rand(flow, 128); lp = logpdf(flow, x)` """ function realnvp( q0::Distribution{Multivariate,Continuous}, @@ -184,11 +214,10 @@ end """ realnvp(q0; paramtype = Float64) -Default constructor of RealNVP with 10 layers, -each coupling function has 2 hidden layers with 32 units. -Following the general architecture as in the Apdx. E of [^ASD2020]. +Default constructor: 10 layers, each conditioner uses hidden sizes `[32, 32]`. +Follows a common RealNVP architecture similar to Appendix E of [^ASD2020]. -[^ASD2020]: Agrawal, A., & Sheldon, D., & Domke, J. (2020). Advances in Black-Box VI: Normalizing Flows, Importance Weighting, and Optimization. In *NeurIPS*. +[^ASD2020]: Agrawal, A., Sheldon, D., Domke, J. (2020). Advances in Black-Box VI: Normalizing Flows, Importance Weighting, and Optimization. NeurIPS. 
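For completeness, the one-line `realnvp` example above expands into a runnable sketch along these lines; the target distribution, layer sizes, and batch size are arbitrary, and it assumes `using NormalizingFlows` makes `realnvp` and `elbo_batch` available as exported names.

```julia
# Runnable expansion of the realnvp example above; sizes and target are arbitrary.
using Random, Distributions, LinearAlgebra
using NormalizingFlows

q0   = MvNormal(zeros(2), I)
flow = realnvp(q0, [32, 32], 5)          # 5 RealNVP layers, conditioners with hidden sizes [32, 32]
xs   = rand(flow, 128)                   # column-wise samples, size (2, 128)
lps  = logpdf(flow, xs)                  # 128 log-densities

target  = MvNormal([1.0, -1.0], I)
logp(z) = logpdf(target, z)
el = elbo_batch(Random.default_rng(), flow, logp, 64)   # scalar ELBO estimate
```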
""" realnvp(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = realnvp( q0, [32, 32], 10; paramtype=paramtype From 8a04147b2e8948b100d0b704168980d3abd8b807 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Fri, 8 Aug 2025 22:47:53 -0700 Subject: [PATCH 38/45] update readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d4cb2c0..5bf5651 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Build Status](https://github.com/TuringLang/NormalizingFlows.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/TuringLang/NormalizingFlows.jl/actions/workflows/CI.yml?query=branch%3Amain) -**Last updated: 2025-Mar-04** +**Last updated: 2025-Aug-08** A normalizing flow library for Julia. @@ -17,6 +17,8 @@ without being tied to specific probabilistic programming frameworks or applicati See the [documentation](https://turinglang.org/NormalizingFlows.jl/dev/) for more. +We also provide several demos and examples in [example](https://github.com/TuringLang/NormalizingFlows.jl/tree/main/example). + ## Installation To install the package, run the following command in the Julia REPL: ```julia @@ -90,3 +92,4 @@ where one wants to learn the underlying distribution of some data. - [Flux.jl](https://fluxml.ai/Flux.jl/stable/) - [Optimisers.jl](https://github.com/FluxML/Optimisers.jl) - [AdvancedVI.jl](https://github.com/TuringLang/AdvancedVI.jl) +- [MonotonicSplines.jl](https://github.com/bat/MonotonicSplines.jl) From 2431e57ebd1a3036b546d68804c6917283fc4088 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Tue, 19 Aug 2025 22:47:27 -0700 Subject: [PATCH 39/45] incorpoerate comments from red-portal and sunxd3 --- docs/src/example.md | 0 src/flows/neuralspline.jl | 41 +++++++++++++++++++-------------------- src/flows/realnvp.jl | 34 +------------------------------- src/flows/utils.jl | 9 +++++---- test/flow.jl | 28 +++++++++++++------------- 5 files changed, 40 insertions(+), 72 deletions(-) create mode 100644 docs/src/example.md diff --git a/docs/src/example.md b/docs/src/example.md new file mode 100644 index 0000000..e69de29 diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl index bd57cbf..0d8eab0 100644 --- a/src/flows/neuralspline.jl +++ b/src/flows/neuralspline.jl @@ -1,6 +1,6 @@ """ NeuralSplineCoupling(dim, hdims, K, B, mask_idx, paramtype) - NeuralSplineCoupling(dim, K, n_dims_transferred, B, nn, mask) + NeuralSplineCoupling(dim, K, n_dims_transformed, B, nn, mask) Neural Rational Quadratic Spline (RQS) coupling bijector [^DBMP2019]. @@ -19,7 +19,7 @@ Keyword Arguments - `paramtype::Type{<:AbstractFloat}`: parameter element type. Fields -- `nn::Flux.Chain`: conditioner that outputs all spline params for all transformed dims. +- `nn::Flux.Chain`: conditioner that outputs all spline params for all transformed dim. - `mask::Bijectors.PartitionMask`: partition specification. Notes @@ -35,9 +35,9 @@ and log-determinant computations. 
struct NeuralSplineCoupling{T,A<:Flux.Chain} <: Bijectors.Bijector dim::Int # dimension of input K::Int # number of knots - n_dims_transferred::Int # number of dimensions that are transformed + n_dims_transformed::Int # number of dimensions that are transformed B::T # bound of the knots - nn::A # networks that parmaterize the knots and derivatives + nn::A # networks that parameterize the knots and derivatives mask::Bijectors.PartitionMask end @@ -46,13 +46,12 @@ function NeuralSplineCoupling( hdims::AbstractVector{T1}, # dimension of hidden units for s and t K::T1, # number of knots B::T2, # bound of the knots - mask_idx::AbstractVector{T1}, # index of dimensione that one wants to apply transformations on + mask_idx::AbstractVector{T1}, # indices of the transformed dimensions paramtype::Type{T2}, # type of the parameters, e.g., Float64 or Float32 ) where {T1<:Int,T2<:AbstractFloat} num_of_transformed_dims = length(mask_idx) input_dims = dim - num_of_transformed_dims - # output dim of the NN output_dims = (3K - 1)*num_of_transformed_dims # one big mlp that outputs all the knots and derivatives for all the transformed dimensions nn = fnn(input_dims, hdims, output_dims; output_activation=nothing, paramtype=paramtype) @@ -66,7 +65,7 @@ end function get_nsc_params(nsc::NeuralSplineCoupling, x::AbstractVecOrMat) nnoutput = nsc.nn(x) px, py, dydx = MonotonicSplines.rqs_params_from_nn( - nnoutput, nsc.n_dims_transferred, nsc.B + nnoutput, nsc.n_dims_transformed, nsc.B ) return px, py, dydx end @@ -146,13 +145,13 @@ end """ - NSF_layer(dims, hdims, K, B; paramtype = Float64) + NSF_layer(dim, hdims, K, B; paramtype = Float64) Build a single Neural Spline Flow (NSF) layer by composing two `NeuralSplineCoupling` bijectors with complementary odd–even masks. Arguments -- `dims::Int`: dimensionality of the problem. +- `dim::Int`: dimensionality of the problem. - `hdims::AbstractVector{Int}`: hidden sizes of the conditioner network. - `K::Int`: number of spline knots. - `B::AbstractFloat`: spline boundary. @@ -168,19 +167,19 @@ Example - `y = layer(randn(4, 32))` """ function NSF_layer( - dims::T1, # dimension of problem + dim::T1, # dimension of problem hdims::AbstractVector{T1}, # dimension of hidden units for nn K::T1, # number of knots B::T2; # bound of the knots paramtype::Type{T2} = Float64, # type of the parameters ) where {T1<:Int,T2<:AbstractFloat} - mask_idx1 = 1:2:dims - mask_idx2 = 2:2:dims + mask_idx1 = 1:2:dim + mask_idx2 = 2:2:dim # by default use the odd-even masking strategy - nsf1 = NeuralSplineCoupling(dims, hdims, K, B, mask_idx1, paramtype) - nsf2 = NeuralSplineCoupling(dims, hdims, K, B, mask_idx2, paramtype) + nsf1 = NeuralSplineCoupling(dim, hdims, K, B, mask_idx1, paramtype) + nsf2 = NeuralSplineCoupling(dim, hdims, K, B, mask_idx2, paramtype) return reduce(∘, (nsf1, nsf2)) end @@ -205,11 +204,11 @@ Keyword Arguments Returns - `Bijectors.TransformedDistribution` representing the NSF flow. -Notes: -- Under the hood, `nsf` relies on the rational quadratic spline function implememented in -`MonotonicSplines.jl` for performance reasons. `MonotonicSplines.jl` uses -`KernelAbstractions.jl` to support batched operations. -Because of this, so far `nsf` only supports `Zygote` as the AD type. +!!! note + Under the hood, `nsf` relies on the rational quadratic spline function implememented in + `MonotonicSplines.jl` for performance reasons. `MonotonicSplines.jl` uses + `KernelAbstractions.jl` to support batched operations. + Because of this, so far `nsf` only supports `Zygote` as the AD type. 
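A quick numerical check of the conditioner sizing used by `NeuralSplineCoupling` above, mirroring the `(3K - 1) * n_transformed` rule; the concrete numbers are arbitrary and only illustrate the bookkeeping.

```julia
# Size bookkeeping for the spline conditioner, mirroring
# `output_dims = (3K - 1)*num_of_transformed_dims` in the constructor above.
dim, K   = 4, 10
mask_idx = 1:2:dim                      # transform the odd coordinates
n_tr     = length(mask_idx)             # 2 transformed coordinates
in_dim   = dim - n_tr                   # the conditioner sees the complementary block: 2
out_dim  = (3K - 1) * n_tr              # widths, heights and interior derivatives: 58 outputs
```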
Example @@ -225,8 +224,8 @@ function nsf( paramtype::Type{T} = Float64, # type of the parameters ) where {T<:AbstractFloat} - dims = length(q0) # dimension of the reference distribution == dim of the problem - Ls = [NSF_layer(dims, hdims, K, B; paramtype=paramtype) for _ in 1:nlayers] + dim = length(q0) # dimension of the reference distribution == dim of the problem + Ls = [NSF_layer(dim, hdims, K, B; paramtype=paramtype) for _ in 1:nlayers] create_flow(Ls, q0) end diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index 6d34925..42855b7 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -37,13 +37,12 @@ struct AffineCoupling <: Bijectors.Bijector t::Flux.Chain end -# let params track field s and t @functor AffineCoupling (s, t) function AffineCoupling( dim::Int, # dimension of the problem hdims::AbstractVector{Int}, # dimension of hidden units for s and t - mask_idx::AbstractVector{Int}, # index of dimensione that one wants to apply transformations on + mask_idx::AbstractVector{Int}, # indices of the transformed dimensions paramtype::Type{T} ) where {T<:AbstractFloat} cdims = length(mask_idx) # dimension of parts used to construct coupling law @@ -110,37 +109,6 @@ function Bijectors.with_logabsdet_jacobian( return combine(af.mask, x_1, y_2, y_3), vec(logjac) end -################### -# an equivalent definition of AffineCoupling using Bijectors.Coupling -# (see https://github.com/TuringLang/Bijectors.jl/blob/74d52d4eda72a6149b1a89b72524545525419b3f/src/bijectors/coupling.jl#L188C1-L188C1) -################### - -# struct AffineCoupling <: Bijectors.Bijector -# dim::Int -# mask::Bijectors.PartitionMask -# s::Flux.Chain -# t::Flux.Chain -# end - -# # let params track field s and t -# @functor AffineCoupling (s, t) - -# function AffineCoupling(dim, mask, s, t) -# return Bijectors.Coupling(θ -> Bijectors.Shift(t(θ)) ∘ Bijectors.Scale(s(θ)), mask) -# end - -# function AffineCoupling( -# dim::Int, # dimension of input -# hdims::Int, # dimension of hidden units for s and t -# mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on -# ) -# cdims = length(mask_idx) # dimension of parts used to construct coupling law -# s = mlp3(cdims, hdims, cdims) -# t = mlp3(cdims, hdims, cdims) -# mask = PartitionMask(dim, mask_idx) -# return AffineCoupling(dim, mask, s, t) -# end - """ RealNVP_layer(dims, hdims; paramtype = Float64) diff --git a/src/flows/utils.jl b/src/flows/utils.jl index e18e06b..2463616 100644 --- a/src/flows/utils.jl +++ b/src/flows/utils.jl @@ -8,14 +8,15 @@ Construct a normalizing flow by composing the provided bijector layers and attaching them to the base distribution `q0`. - `layers`: an iterable of `Bijectors.Bijector` objects that are composed in order - (left-to-right) via function composition. + (left-to-right) via function composition +(for instance, if `layers = [l1, l2, l3]`, the flow will be `l3∘l2∘l1(q0)`). - `q0`: the base distribution (e.g., `MvNormal(zeros(d), I)`). Returns a `Bijectors.TransformedDistribution` representing the resulting flow. 
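Because the composition order of `create_flow` is easy to get backwards (a later patch in this series revisits exactly this sentence), here is a tiny check of what `reduce(∘, layers)` means, using the same `Shift`/`Scale` bijectors as the example that follows; the numerical values are arbitrary.

```julia
# reduce(∘, (l1, l2, l3)) == l1 ∘ l2 ∘ l3, so l3 acts first on a draw from q0.
using Bijectors

l1, l2, l3 = Bijectors.Shift([1.0]), Bijectors.Scale([2.0]), Bijectors.Shift([10.0])
ts = reduce(∘, (l1, l2, l3))
@assert ts([0.0]) == [21.0]               # shift by 10, scale by 2, then shift by 1
@assert ts([0.0]) == l1(l2(l3([0.0])))    # same thing written out explicitly
```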
Example - using Distributions + using Distributions, Bijectors, LinearAlgebra q0 = MvNormal(zeros(2), I) flow = create_flow((Bijectors.Scale([1.0, 2.0]), Bijectors.Shift([0.0, 1.0])), q0) """ @@ -77,7 +78,7 @@ function fnn( ) where {T<:AbstractFloat} # Create a chain of dense layers # First layer - layers = Any[Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] + layers = [Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] # Hidden layers for i in 1:(length(hidden_dims) - 1) @@ -96,4 +97,4 @@ function fnn( m = Chain(layers...) return Flux._paramtype(paramtype, m) -end \ No newline at end of file +end diff --git a/test/flow.jl b/test/flow.jl index 0f7efff..de8e1ee 100644 --- a/test/flow.jl +++ b/test/flow.jl @@ -45,19 +45,19 @@ target = MvNormal(μ, Σ) logp(z) = logpdf(target, z) - # Define a simple log-likelihood function - logp(z) = logpdf(q₀, z) - # Compute ELBO batchsize = 64 elbo_value = elbo(Random.default_rng(), flow, logp, batchsize) elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize) + # test when batchsize == 1 + batchsize_single = 1 + elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single) + # test elbo_value is not NaN and not Inf - @test !isnan(elbo_value) - @test !isinf(elbo_value) - @test !isnan(elbo_batch_value) - @test !isinf(elbo_batch_value) + @test isfinite(elbo_value) + @test isfinite(elbo_batch_value) + @test isfinite(elbo_value_single) end end end @@ -112,19 +112,19 @@ end target = MvNormal(μ, Σ) logp(z) = logpdf(target, z) - # Define a simple log-likelihood function - logp(z) = logpdf(q₀, z) - # Compute ELBO batchsize = 64 elbo_value = elbo(Random.default_rng(), flow, logp, batchsize) elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize) + # test when batchsize == 1 + batchsize_single = 1 + elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single) + # test elbo_value is not NaN and not Inf - @test !isnan(elbo_value) - @test !isinf(elbo_value) - @test !isnan(elbo_batch_value) - @test !isinf(elbo_batch_value) + @test isfinite(elbo_value) + @test isfinite(elbo_batch_value) + @test isfinite(elbo_value_single) end end end From 4dec51a9b755e772b96f442674e3a3e431cce977 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Tue, 19 Aug 2025 23:09:16 -0700 Subject: [PATCH 40/45] fix test error --- src/flows/utils.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/flows/utils.jl b/src/flows/utils.jl index 2463616..953e818 100644 --- a/src/flows/utils.jl +++ b/src/flows/utils.jl @@ -77,8 +77,8 @@ function fnn( paramtype::Type{T} = Float64, ) where {T<:AbstractFloat} # Create a chain of dense layers - # First layer - layers = [Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] + # First layer (need to use Any[] otherwise it complains about type instability when pushing layers) + layers = Any[Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] # Hidden layers for i in 1:(length(hidden_dims) - 1) From fb6b80b827729af305c060109cd8e8c53a1ec577 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Tue, 19 Aug 2025 23:41:16 -0700 Subject: [PATCH 41/45] add planar and radial flow; updating docs --- docs/src/api.md | 13 +++- example/demo_RealNVP.jl | 2 +- example/demo_planar_flow.jl | 9 +-- example/demo_radial_flow.jl | 9 +-- src/NormalizingFlows.jl | 2 + src/flows/planar_radial.jl | 60 ++++++++++++++++ test/flow.jl | 134 ++++++++++++++++++++++++++++++++++++ 7 files changed, 209 insertions(+), 20 deletions(-) create mode 100644 src/flows/planar_radial.jl diff --git 
a/docs/src/api.md b/docs/src/api.md index de7530c..e6ec251 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -57,8 +57,8 @@ NormalizingFlows.optimize ## Available Flows -`NormalizingFlows.jl` provides two commonly used normalizing flows: `RealNVP` and -`Neural Spline Flow (NSF)`. +`NormalizingFlows.jl` provides two commonly used normalizing flows---`RealNVP` and +`Neural Spline Flow (NSF)`---and two simple flows---`Planar Flow` and `Radial Flow`. ### RealNVP (Affine Coupling Flow) @@ -78,7 +78,14 @@ NormalizingFlows.NSF_layer NormalizingFlows.NeuralSplineCoupling ``` -## Utility Functions +#### Planar and Radial Flows + +```@docs +NormalizingFlows.planarflow +NormalizingFlows.radialflow +``` + +## Utility Functions ```@docs NormalizingFlows.create_flow diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl index bf5897c..f3d705e 100644 --- a/example/demo_RealNVP.jl +++ b/example/demo_RealNVP.jl @@ -48,7 +48,7 @@ adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( rng, - elbo, # using elbo_batch instead of elbo achieves 4-5 times speedup + elbo_batch, # using elbo_batch instead of elbo achieves 4-5 times speedup flow, logp, sample_per_iter; diff --git a/example/demo_planar_flow.jl b/example/demo_planar_flow.jl index 25a694e..3fbe7c8 100644 --- a/example/demo_planar_flow.jl +++ b/example/demo_planar_flow.jl @@ -20,16 +20,9 @@ logp = Base.Fix1(logpdf, target) ###################################### # setup planar flow ###################################### -function create_planar_flow(n_layers::Int, q₀) - d = length(q₀) - Ls = [PlanarLayer(d) for _ in 1:n_layers] - ts = reduce(∘, Ls) - return transformed(q₀, ts) -end - @leaf MvNormal q0 = MvNormal(zeros(T, 2), ones(T, 2)) -flow = create_planar_flow(10, q0) +flow = planarflow(q0, 10; paramtype=T) flow_untrained = deepcopy(flow) ###################################### diff --git a/example/demo_radial_flow.jl b/example/demo_radial_flow.jl index 89b4c56..044d550 100644 --- a/example/demo_radial_flow.jl +++ b/example/demo_radial_flow.jl @@ -19,17 +19,10 @@ logp = Base.Fix1(logpdf, target) ###################################### # setup radial flow ###################################### -function create_radial_flow(n_layers::Int, q₀) - d = length(q₀) - Ls = [RadialLayer(d) for _ in 1:n_layers] - ts = reduce(∘, Ls) - return transformed(q₀, ts) -end - # create a 10-layer radial flow @leaf MvNormal q0 = MvNormal(zeros(T, 2), ones(T, 2)) -flow = create_radial_flow(10, q0) +flow = radialflow(q0, 10; paramtype=T) flow_untrained = deepcopy(flow) diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 2eaa677..70c4699 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -129,12 +129,14 @@ end # interface of contructing common flow layers include("flows/utils.jl") +include("flows/planar_radial.jl") include("flows/realnvp.jl") using MonotonicSplines include("flows/neuralspline.jl") export create_flow +export planarflow, radialflow export AffineCoupling, RealNVP_layer, realnvp export NeuralSplineCoupling, NSF_layer, nsf diff --git a/src/flows/planar_radial.jl b/src/flows/planar_radial.jl new file mode 100644 index 0000000..554cd42 --- /dev/null +++ b/src/flows/planar_radial.jl @@ -0,0 +1,60 @@ +""" + planarflow(q0, nlayers; paramtype = Float64) + +Construct a Planar Flow by stacking `nlayers` `Bijectors.PlanarLayer` blocks +on top of a base distribution `q0`. 
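The demo edits above replace a hand-rolled helper with the new `planarflow` constructor; in spirit the constructor does the following (a sketch of the equivalent manual composition the demo used before this patch, leaving out the `paramtype` conversion the helper also performs).

```julia
# Roughly what `planarflow(q0, 10)` builds, following the demo's old manual construction.
using Distributions, Bijectors, LinearAlgebra

q0 = MvNormal(zeros(2), I)
Ls = [Bijectors.PlanarLayer(2) for _ in 1:10]        # one PlanarLayer per flow layer
flow_manual = Bijectors.transformed(q0, reduce(∘, Ls))
```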
+ +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g., `MvNormal(zeros(d), I)`). +- `nlayers::Int`: number of planar layers to compose. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type (use `Float32` for GPU friendliness). + +Returns +- `Bijectors.TransformedDistribution` representing the planar flow. + +Example +- `q0 = MvNormal(zeros(2), I); flow = planarflow(q0, 10)` +- `x = rand(flow, 128); lp = logpdf(flow, x)` +""" +function planarflow( + q0::Distribution{Multivariate,Continuous}, + nlayers::Int; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + dim = length(q0) + Ls = [Flux._paramtype(paramtype, Bijectors.PlanarLayer(dim)) for _ in 1:nlayers] + return create_flow(Ls, q0) +end + + +""" + radialflow(q0, nlayers; paramtype = Float64) + +Construct a Radial Flow by stacking `nlayers` `Bijectors.RadialLayer` blocks +on top of a base distribution `q0`. + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g., `MvNormal(zeros(d), I)`). +- `nlayers::Int`: number of radial layers to compose. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type (use `Float32` for GPU friendliness). + +Returns +- `Bijectors.TransformedDistribution` representing the radial flow. + +Example +- `q0 = MvNormal(zeros(2), I); flow = radialflow(q0, 6)` +- `x = rand(flow); lp = logpdf(flow, x)` +""" +function radialflow( + q0::Distribution{Multivariate,Continuous}, + nlayers::Int; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + dim = length(q0) + Ls = [Flux._paramtype(paramtype, Bijectors.RadialLayer(dim)) for _ in 1:nlayers] + return create_flow(Ls, q0) +end diff --git a/test/flow.jl b/test/flow.jl index de8e1ee..6968558 100644 --- a/test/flow.jl +++ b/test/flow.jl @@ -128,3 +128,137 @@ end end end end + + + +@testset "Planar flow" begin + Random.seed!(123) + + dim = 5 + nlayers = 10 + for T in [Float32, Float64] + # Create a nsf + q₀ = MvNormal(zeros(T, dim), I) + @leaf MvNormal + + flow = NormalizingFlows.planarflow(q₀, nlayers; paramtype=T) + + @testset "Sampling and density estimation for type: $T" begin + ys = rand(flow, 100) + ℓs = logpdf(flow, ys) + + @test size(ys) == (dim, 100) + @test length(ℓs) == 100 + + @test eltype(ys) == T + @test eltype(ℓs) == T + end + + + @testset "Inverse compatibility for type: $T" begin + x = rand(q₀) + y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x) + x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y) + + @test x ≈ x_reconstructed rtol=1e-4 + @test lj_fwd ≈ -lj_bwd rtol=1e-4 + + x_batch = rand(q₀, 10) + y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch) + x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch) + + @test x_batch ≈ x_batch_reconstructed rtol=1e-4 + @test ljs_fwd ≈ -ljs_bwd rtol=1e-4 + end + + + @testset "ELBO test for type: $T" begin + μ = randn(T, dim) + Σ = Diagonal(rand(T, dim) .+ T(1e-3)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # Compute ELBO + batchsize = 64 + elbo_value = elbo(Random.default_rng(), flow, logp, batchsize) + elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize) + + # test when batchsize == 1 + batchsize_single = 1 + elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single) + + # test elbo_value is not NaN and not Inf + @test isfinite(elbo_value) + @test isfinite(elbo_batch_value) + @test isfinite(elbo_value_single) + end + 
end +end + + + +@testset "Radial flow" begin + Random.seed!(123) + + dim = 5 + nlayers = 10 + for T in [Float32, Float64] + # Create a nsf + q₀ = MvNormal(zeros(T, dim), I) + @leaf MvNormal + + flow = NormalizingFlows.radialflow(q₀, nlayers; paramtype=T) + + @testset "Sampling and density estimation for type: $T" begin + ys = rand(flow, 100) + ℓs = logpdf(flow, ys) + + @test size(ys) == (dim, 100) + @test length(ℓs) == 100 + + @test eltype(ys) == T + @test eltype(ℓs) == T + end + + + @testset "Inverse compatibility for type: $T" begin + x = rand(q₀) + y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x) + x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y) + + @test x ≈ x_reconstructed rtol=1e-4 + @test lj_fwd ≈ -lj_bwd rtol=1e-4 + + x_batch = rand(q₀, 10) + y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch) + x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch) + + @test x_batch ≈ x_batch_reconstructed rtol=1e-4 + @test ljs_fwd ≈ -ljs_bwd rtol=1e-4 + end + + + @testset "ELBO test for type: $T" begin + μ = randn(T, dim) + Σ = Diagonal(rand(T, dim) .+ T(1e-3)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # Compute ELBO + batchsize = 64 + elbo_value = elbo(Random.default_rng(), flow, logp, batchsize) + elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize) + + # test when batchsize == 1 + batchsize_single = 1 + elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single) + + # test elbo_value is not NaN and not Inf + @test isfinite(elbo_value) + @test isfinite(elbo_batch_value) + @test isfinite(elbo_value_single) + end + end +end + + From 862c6dc5342bbc00db957577ec1590745d8b5056 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Wed, 20 Aug 2025 11:15:38 -0700 Subject: [PATCH 42/45] fixed error in doc for creat_flow --- docs/src/example.md | 0 src/flows/utils.jl | 6 +++--- src/objectives/elbo.jl | 31 ++++++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) delete mode 100644 docs/src/example.md diff --git a/docs/src/example.md b/docs/src/example.md deleted file mode 100644 index e69de29..0000000 diff --git a/src/flows/utils.jl b/src/flows/utils.jl index 953e818..1169d2b 100644 --- a/src/flows/utils.jl +++ b/src/flows/utils.jl @@ -9,7 +9,7 @@ attaching them to the base distribution `q0`. - `layers`: an iterable of `Bijectors.Bijector` objects that are composed in order (left-to-right) via function composition -(for instance, if `layers = [l1, l2, l3]`, the flow will be `l3∘l2∘l1(q0)`). +(for instance, if `layers = [l1, l2, l3]`, the flow will be `l1∘l2∘l3(q0)`). - `q0`: the base distribution (e.g., `MvNormal(zeros(d), I)`). Returns a `Bijectors.TransformedDistribution` representing the resulting flow. @@ -18,7 +18,7 @@ Example using Distributions, Bijectors, LinearAlgebra q0 = MvNormal(zeros(2), I) - flow = create_flow((Bijectors.Scale([1.0, 2.0]), Bijectors.Shift([0.0, 1.0])), q0) + flow = create_flow((Bijectors.Shift([0.0, 1.0]), Bijectors.Scale([1.0, 2.0])), q0) """ function create_flow(Ls, q₀) ts = reduce(∘, Ls) @@ -62,7 +62,7 @@ Create a fully connected neural network (FNN). - `hidden_dims::AbstractVector{<:Int}`: A vector of integers specifying the dimensions of the hidden layers. - `output_dim::Int`: The dimension of the output layer. - `inlayer_activation`: The activation function for the hidden layers. Defaults to `Flux.leakyrelu`. 
-- `output_activation`: The activation function for the output layer. Defaults to `Flux.tanh`. +- `output_activation`: The activation function for the output layer. Defaults to `nothing`. - `paramtype::Type{T} = Float64`: The type of the parameters in the network, defaults to `Float64`. # Returns diff --git a/src/objectives/elbo.jl b/src/objectives/elbo.jl index f7ba355..6ddd47b 100644 --- a/src/objectives/elbo.jl +++ b/src/objectives/elbo.jl @@ -46,6 +46,29 @@ function elbo(flow::Bijectors.TransformedDistribution, logp, n_samples) end +""" + _batched_elbos(flow, logp, xs) + +Batched ELBO estimates that transforms a matrix of samples (each column represents a single +sample) in one call. +This is more efficient for invertible neural-network flows (RealNVP/NSF) as it leverages +the batched operation of the neural networks. + +Inputs +- `flow::Bijectors.MultivariateTransformed` +- `logp`: function returning log-density of target +- `xs`: column-wise sample batch + +Returns +- a vector of ELBO estimates for each sample in the batch +""" +function _batched_elbos(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) + # requires the flow transformation to be able to handle batched inputs + ys, logabsdetjac = with_logabsdet_jacobian(flow.transform, xs) + elbos = logp(ys) .- logpdf(flow.dist, xs) .+ logabsdetjac + return elbos +end + """ elbo_batch(flow, logp, xs) elbo_batch([rng, ] flow, logp, n_samples) @@ -64,14 +87,12 @@ Returns - Scalar estimate of the ELBO """ function elbo_batch(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) - # requires the flow transformation to be able to handle batched inputs - ys, logabsdetjac = with_logabsdet_jacobian(flow.transform, xs) - elbos = logp(ys) .- logpdf(flow.dist, xs) .+ logabsdetjac - return elbos + elbos = _batched_elbos(flow, logp, xs) + return mean(elbos) end function elbo_batch(rng::AbstractRNG, flow::Bijectors.MultivariateTransformed, logp, n_samples) xs = _device_specific_rand(rng, flow.dist, n_samples) - elbos = elbo_batch(flow, logp, xs) + elbos = _batched_elbos(flow, logp, xs) return mean(elbos) end elbo_batch(flow::Bijectors.TransformedDistribution, logp, n_samples) = From dc7360567508a8534ba48af94216607b5b2c5d5b Mon Sep 17 00:00:00 2001 From: Zuheng Date: Wed, 20 Aug 2025 12:27:47 -0700 Subject: [PATCH 43/45] rm redundant comments --- src/flows/realnvp.jl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl index 42855b7..2f93dc5 100644 --- a/src/flows/realnvp.jl +++ b/src/flows/realnvp.jl @@ -39,10 +39,10 @@ end @functor AffineCoupling (s, t) -function AffineCoupling( - dim::Int, # dimension of the problem - hdims::AbstractVector{Int}, # dimension of hidden units for s and t - mask_idx::AbstractVector{Int}, # indices of the transformed dimensions +function AffineCoupling( + dim::Int, + hdims::AbstractVector{Int}, + mask_idx::AbstractVector{Int}, paramtype::Type{T} ) where {T<:AbstractFloat} cdims = length(mask_idx) # dimension of parts used to construct coupling law @@ -130,9 +130,9 @@ Example - `y = layer(randn(4, 16))` # batched forward """ function RealNVP_layer( - dims::Int, # dimension of problem - hdims::AbstractVector{Int}; # dimension of hidden units for s and t - paramtype::Type{T} = Float64, # type of the parameters + dims::Int, + hdims::AbstractVector{Int}; + paramtype::Type{T} = Float64, ) where {T<:AbstractFloat} mask_idx1 = 1:2:dims @@ -169,12 +169,12 @@ Example """ function realnvp( 
q0::Distribution{Multivariate,Continuous}, - hdims::AbstractVector{Int}, # dimension of hidden units for s and t - nlayers::Int; # number of RealNVP_layer - paramtype::Type{T} = Float64, # type of the parameters + hdims::AbstractVector{Int}, + nlayers::Int; + paramtype::Type{T} = Float64, ) where {T<:AbstractFloat} - dims = length(q0) # dimension of the reference distribution == dim of the problem + dims = length(q0) Ls = [RealNVP_layer(dims, hdims; paramtype=paramtype) for _ in 1:nlayers] create_flow(Ls, q0) end From 5753150b110c28f616a21bc8ebbbbfd6fb1ec9c3 Mon Sep 17 00:00:00 2001 From: Zuheng Date: Wed, 20 Aug 2025 12:28:18 -0700 Subject: [PATCH 44/45] change Any[] to Flux.Dense[] --- example/utils.jl | 1 + src/flows/utils.jl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/example/utils.jl b/example/utils.jl index 8f76ebf..bdd2858 100644 --- a/example/utils.jl +++ b/example/utils.jl @@ -1,4 +1,5 @@ using Random, Distributions, LinearAlgebra +using Bijectors using Bijectors: transformed function compare_trained_and_untrained_flow( diff --git a/src/flows/utils.jl b/src/flows/utils.jl index 1169d2b..5ab4b65 100644 --- a/src/flows/utils.jl +++ b/src/flows/utils.jl @@ -78,7 +78,7 @@ function fnn( ) where {T<:AbstractFloat} # Create a chain of dense layers # First layer (need to use Any[] otherwise it complains about type instability when pushing layers) - layers = Any[Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] + layers = Flux.Dense[Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] # Hidden layers for i in 1:(length(hidden_dims) - 1) From d9525e7acd581bf8c7f8f9ddc17f67a352f1f81a Mon Sep 17 00:00:00 2001 From: Zuheng Date: Wed, 20 Aug 2025 13:34:40 -0700 Subject: [PATCH 45/45] minor comment update --- src/flows/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/flows/utils.jl b/src/flows/utils.jl index 5ab4b65..83c2a44 100644 --- a/src/flows/utils.jl +++ b/src/flows/utils.jl @@ -77,7 +77,7 @@ function fnn( paramtype::Type{T} = Float64, ) where {T<:AbstractFloat} # Create a chain of dense layers - # First layer (need to use Any[] otherwise it complains about type instability when pushing layers) + # First layer layers = Flux.Dense[Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] # Hidden layers
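Finally, a small usage sketch for the `fnn` helper touched in these last patches. The positional/keyword names follow the calls shown earlier (e.g. in the `NeuralSplineCoupling` constructor); `fnn` is not in the export list, so it is written with the module prefix, and the sizes are arbitrary.

```julia
# Sketch of the `fnn` helper: a Flux.Chain of Dense layers with the requested sizes.
using NormalizingFlows

nn = NormalizingFlows.fnn(2, [32, 32], 10; output_activation=nothing, paramtype=Float32)
x  = randn(Float32, 2, 5)          # a batch of five 2-dimensional inputs (column-wise)
@assert size(nn(x)) == (10, 5)     # maps 2 -> 32 -> 32 -> 10 for each column
```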