diff --git a/.github/workflows/Docs.yml b/.github/workflows/Docs.yml index 64d5e3d..7a98981 100644 --- a/.github/workflows/Docs.yml +++ b/.github/workflows/Docs.yml @@ -15,7 +15,7 @@ concurrency: permissions: contents: write - pull-requests: read + pull-requests: write jobs: docs: @@ -25,9 +25,10 @@ jobs: - name: Build and deploy Documenter.jl docs uses: TuringLang/actions/DocsDocumenter@main - - run: | - julia --project=docs -e ' - using Documenter: DocMeta, doctest - using NormalizingFlows - DocMeta.setdocmeta!(NormalizingFlows, :DocTestSetup, :(using NormalizingFlows); recursive=true) - doctest(NormalizingFlows)' + - name: Run doctests + shell: julia --project=docs --color=yes {0} + run: | + using Documenter: DocMeta, doctest + using NormalizingFlows + DocMeta.setdocmeta!(NormalizingFlows, :DocTestSetup, :(using NormalizingFlows); recursive=true) + doctest(NormalizingFlows) diff --git a/Project.toml b/Project.toml index f03ebb6..4c49b8f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "NormalizingFlows" uuid = "50e4474d-9f12-44b7-af7a-91ab30ff6256" -version = "0.2.1" +version = "0.2.2" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" @@ -8,7 +8,10 @@ Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MonotonicSplines = "568f7cb4-8305-41bc-b90d-d32b39cc99d1" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -27,7 +30,10 @@ CUDA = "5" DifferentiationInterface = "0.6, 0.7" Distributions = "0.25" DocStringExtensions = "0.9" +Flux = "0.16" +Functors = "0.5.2" +MonotonicSplines = "0.3.3" Optimisers = "0.2.16, 0.3, 0.4" ProgressMeter = "1.0.0" StatsBase = "0.33, 0.34" -julia = "1.10" +julia = "1.10.8" diff --git a/README.md b/README.md index d4cb2c0..5bf5651 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Build Status](https://github.com/TuringLang/NormalizingFlows.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/TuringLang/NormalizingFlows.jl/actions/workflows/CI.yml?query=branch%3Amain) -**Last updated: 2025-Mar-04** +**Last updated: 2025-Aug-08** A normalizing flow library for Julia. @@ -17,6 +17,8 @@ without being tied to specific probabilistic programming frameworks or applicati See the [documentation](https://turinglang.org/NormalizingFlows.jl/dev/) for more. +We also provide several demos and examples in [example](https://github.com/TuringLang/NormalizingFlows.jl/tree/main/example). + ## Installation To install the package, run the following command in the Julia REPL: ```julia @@ -90,3 +92,4 @@ where one wants to learn the underlying distribution of some data. 
- [Flux.jl](https://fluxml.ai/Flux.jl/stable/) - [Optimisers.jl](https://github.com/FluxML/Optimisers.jl) - [AdvancedVI.jl](https://github.com/TuringLang/AdvancedVI.jl) +- [MonotonicSplines.jl](https://github.com/bat/MonotonicSplines.jl) diff --git a/docs/Project.toml b/docs/Project.toml index 27726a5..6d3e869 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -5,5 +5,6 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/docs/make.jl b/docs/make.jl index 304e983..e402e10 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,19 +1,27 @@ using NormalizingFlows using Documenter +using Random +using Distributions + DocMeta.setdocmeta!( NormalizingFlows, :DocTestSetup, :(using NormalizingFlows); recursive=true ) makedocs(; modules=[NormalizingFlows], - repo="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", sitename="NormalizingFlows.jl", - format=Documenter.HTML(), + repo="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", + format=Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"), pages=[ "Home" => "index.md", + "General usage" => "usage.md", "API" => "api.md", - "Example" => "example.md", + "Example" => [ + "Planar Flow" => "PlanarFlow.md", + "RealNVP" => "RealNVP.md", + "Neural Spline Flow" => "NSF.md", + ], "Customize your own flow layer" => "customized_layer.md", ], checkdocs=:exports, diff --git a/docs/src/NSF.md b/docs/src/NSF.md new file mode 100644 index 0000000..df03a41 --- /dev/null +++ b/docs/src/NSF.md @@ -0,0 +1,47 @@ +# Demo of NSF on 2D Banana Distribution + +```julia +using Random, Distributions, LinearAlgebra +using Functors +using Optimisers, ADTypes +using Zygote +using NormalizingFlows + + +target = Banana(2, one(T), 100one(T)) +logp = Base.Fix1(logpdf, target) + +###################################### +# learn the target using Neural Spline Flow +###################################### +@leaf MvNormal +q0 = MvNormal(zeros(T, 2), I) + + +flow = nsf(q0; paramtype=T) +flow_untrained = deepcopy(flow) +###################################### +# start training +###################################### +sample_per_iter = 64 + +# callback function to log training progress +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) +# nsf only supports AutoZygote +adtype = ADTypes.AutoZygote() +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 +flow_trained, stats, _ = train_flow( + elbo_batch, + flow, + logp, + sample_per_iter; + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(1e-4), + ADbackend=adtype, + show_progress=true, + callback=cb, + hasconverged=checkconv, +) +θ, re = Optimisers.destructure(flow_trained) +losses = map(x -> x.loss, stats) +``` \ No newline at end of file diff --git a/docs/src/PlanarFlow.md b/docs/src/PlanarFlow.md new file mode 100644 index 0000000..696f433 --- /dev/null +++ b/docs/src/PlanarFlow.md @@ -0,0 +1,135 @@ +# Planar Flow on a 2D Banana Distribution + +This example demonstrates learning a synthetic 2D banana distribution with a planar normalizing flow [^RM2015] by maximizing the Evidence Lower BOund (ELBO). 
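Concretely, this mirrors the objective spelled out in the API documentation: writing $Z_0 \sim q_0$ for a draw from the base distribution, the quantity being maximized is

```math
\mathrm{ELBO}(\theta) = \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ T_1(Z_0)\right) - \log q_0(Z_0) + \sum_{n=1}^N \log J_n\left(T_n \circ \cdots \circ T_1(Z_0)\right) \right],
```

where $J_n$ denotes the Jacobian-determinant term contributed by the $n$-th layer.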
+ +The two required ingredients are: + +- A log-density function `logp` for the target distribution. +- A parametrised invertible transformation (the planar flow) applied to a simple base distribution. + +## Target Distribution + +The banana target used here is defined in `example/targets/banana.jl` (see source for details): + +```julia +using Random, Distributions +Random.seed!(123) + +target = Banana(2, 1.0, 10.0) # (dimension, nonlinearity, scale) +logp = Base.Fix1(logpdf, target) +``` + +You can visualise its contour and samples (figure shipped as `banana.png`). + +![Banana](banana.png) + +## Planar Flow + +A planar flow of length N applies a sequence of planar layers to a base distribution q₀: + +```math +T_{n,\theta_n}(x) = x + u_n \tanh(w_n^T x + b_n), \qquad n = 1,\ldots,N. +``` + +Parameters θₙ = (uₙ, wₙ, bₙ) are learned. `Bijectors.jl` provides `PlanarLayer`. + +```julia +using Bijectors +using Functors # for @leaf + +function create_planar_flow(n_layers::Int, q₀) + d = length(q₀) + Ls = [PlanarLayer(d) for _ in 1:n_layers] + ts = reduce(∘, Ls) # alternatively: FunctionChains.fchain(Ls) + return transformed(q₀, ts) +end + +@leaf MvNormal # prevent updating base distribution parameters +q₀ = MvNormal(zeros(2), ones(2)) +flow = create_planar_flow(10, q₀) +flow_untrained = deepcopy(flow) # keep copy for comparison +``` + +If you build *many* layers (e.g. > ~30) you may reduce compilation time by using `FunctionChains.jl`: + +```julia +# uncomment the following lines to use FunctionChains +# using FunctionChains +# ts = fchain([PlanarLayer(d) for _ in 1:n_layers]) +``` +See [this comment](https://github.com/TuringLang/NormalizingFlows.jl/blob/8f4371d48228adf368d851e221af076ff929f1cf/src/NormalizingFlows.jl#L52) +for how the compilation time might be a concern. + +## Training the Flow + +We maximize the ELBO (here using the minibatch estimator `elbo_batch`) with the generic `train_flow` interface. + +```julia +using NormalizingFlows +using ADTypes, Optimisers +using Mooncake + +sample_per_iter = 32 +adtype = ADTypes.AutoMooncake(; config=Mooncake.Config()) # try AutoZygote() / AutoForwardDiff() / etc. +# optional: callback function to track the batch size per iteration and the AD backend used +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter, ad=adtype) +# optional: defined stopping criteria when the gradient norm is less than 1e-3 +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < 1e-3 + +flow_trained, stats, _ = train_flow( + elbo_batch, + flow, + logp, + sample_per_iter; + max_iters = 20_000, + optimiser = Optimisers.Adam(1e-2), + ADbackend = adtype, + callback = cb, + hasconverged = checkconv, + show_progress = false, +) + +losses = map(x -> x.loss, stats) +``` + +Plot the losses (negative ELBO): + +```julia +using Plots +plot(losses; xlabel = "iteration", ylabel = "negative ELBO", label = "", lw = 2) +``` + +![elbo](elbo.png) + +## Evaluating the Trained Flow + +The trained flow is a `Bijectors.TransformedDistribution`, so we can call `rand` to draw iid samples and call `logpdf` to evaluate the log-density function of the flow. +See [documentation of `Bijectors.jl`](https://turinglang.org/Bijectors.jl/dev/distributions/) for details. 
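For instance, the log-density of the trained flow can be evaluated at an arbitrary point (a minimal sketch):

```julia
logpdf(flow_trained, zeros(2))   # log-density of the trained flow at the origin
```

Drawing iid samples works analogously via `rand`: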
+```julia +n_samples = 1_000 +samples_trained = rand(flow_trained, n_samples) +samples_untrained = rand(flow_untrained, n_samples) +samples_true = rand(target, n_samples) +``` + +Simple visual comparison: + +```julia +using Plots +scatter(samples_true[1, :], samples_true[2, :]; label="Target", ms=2, alpha=0.5) +scatter!(samples_untrained[1, :], samples_untrained[2, :]; label="Untrained", ms=2, alpha=0.5) +scatter!(samples_trained[1, :], samples_trained[2, :]; label="Trained", ms=2, alpha=0.5) +plot!(title = "Planar Flow: Before vs After Training", xlabel = "x₁", ylabel = "x₂", legend = :topleft) +``` + +![compare](comparison.png) + +## Notes + +- Use `elbo` instead of `elbo_batch` for a single-sample estimator. +- Switch AD backends by changing `adtype` (see `ADTypes.jl`). +- Marking the base distribution with `@leaf` prevents its parameters from being updated during training. + +## Reference + +[^RM2015]: Rezende, D. & Mohamed, S. (2015). Variational Inference with Normalizing Flows. ICML. diff --git a/docs/src/RealNVP.md b/docs/src/RealNVP.md new file mode 100644 index 0000000..53fd2c5 --- /dev/null +++ b/docs/src/RealNVP.md @@ -0,0 +1,54 @@ +# Demo of RealNVP on 2D Banana Distribution + +```julia +using Random, Distributions, LinearAlgebra +using Functors +using Optimisers, ADTypes +using Mooncake +using NormalizingFlows + + +target = Banana(2, one(T), 100one(T)) +logp = Base.Fix1(logpdf, target) + +###################################### +# set up the RealNVP +###################################### +@leaf MvNormal +q0 = MvNormal(zeros(T, 2), I) + +d = 2 +hdims = [16, 16] +nlayers = 3 + +# use NormalizingFlows.realnvp to create a RealNVP flow +flow = realnvp(q0, hdims, nlayers; paramtype=T) +flow_untrained = deepcopy(flow) + + +###################################### +# start training +###################################### +sample_per_iter = 16 + +# callback function to log training progress +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) +adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) + +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 +flow_trained, stats, _ = train_flow( + rng, + elbo, # using elbo_batch instead of elbo achieves 4-5 times speedup + flow, + logp, + sample_per_iter; + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(5e-4), + ADbackend=adtype, + show_progress=true, + callback=cb, + hasconverged=checkconv, +) +θ, re = Optimisers.destructure(flow_trained) +losses = map(x -> x.loss, stats) +``` \ No newline at end of file diff --git a/docs/src/api.md b/docs/src/api.md index 13fe0c2..e6ec251 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -1,62 +1,27 @@ -## API +# API ```@index ``` +## Variational Objectives -## Main Function +We provide ELBO (reverse KL) and expected log-likelihood (forward KL). You can also +supply your own objective with the signature `vo(rng, flow, args...)`. -```@docs -NormalizingFlows.train_flow -``` +### Evidence Lower Bound (ELBO) -The flow object can be constructed by `transformed` function in `Bijectors.jl` package. 
-For example of Gaussian VI, we can construct the flow as follows: -```@julia -using Distributions, Bijectors -T= Float32 -@leaf MvNormal # to prevent params in q₀ from being optimized -q₀ = MvNormal(zeros(T, 2), ones(T, 2)) -flow = Bijectors.transformed(q₀, Bijectors.Shift(zeros(T,2)) ∘ Bijectors.Scale(ones(T, 2))) -``` -To train the Gaussian VI targeting at distirbution $p$ via ELBO maiximization, we can run -```@julia -using NormalizingFlows - -sample_per_iter = 10 -flow_trained, stats, _ = train_flow( - elbo, - flow, - logp, - sample_per_iter; - max_iters=2_000, - optimiser=Optimisers.ADAM(0.01 * one(T)), -) -``` -## Variational Objectives -We have implemented two variational objectives, namely, ELBO and the log-likelihood objective. -Users can also define their own objective functions, and pass it to the [`train_flow`](@ref) function. -`train_flow` will optimize the flow parameters by maximizing `vo`. -The objective function should take the following general form: -```julia -vo(rng, flow, args...) -``` -where `rng` is the random number generator, `flow` is the flow object, and `args...` are the -additional arguments that users can pass to the objective function. +By maximizing the ELBO, it is equivalent to minimizing the reverse KL divergence between $q_\theta$ and $p$: -#### Evidence Lower Bound (ELBO) -By maximizing the ELBO, it is equivalent to minimizing the reverse KL divergence between $q_\theta$ and $p$, i.e., -```math +```math \begin{aligned} &\min _{\theta} \mathbb{E}_{q_{\theta}}\left[\log q_{\theta}(Z)-\log p(Z)\right] \quad \text{(Reverse KL)}\\ & = \max _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ -T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(F_n \circ \cdots \circ -F_1(X)\right)\right] \quad \text{(ELBO)} +T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(T_n \circ \cdots \circ +T_1(X)\right)\right] \quad \text{(ELBO)} \end{aligned} ``` -Reverse KL minimization is typically used for **Bayesian computation**, -where one only has access to the log-(unnormalized)density of the target distribution $p$ (e.g., a Bayesian posterior distribution), -and hope to generate approximate samples from it. + +Reverse KL minimization is typically used for Bayesian computation when only `logp` is available. ```@docs NormalizingFlows.elbo @@ -66,26 +31,67 @@ NormalizingFlows.elbo NormalizingFlows.elbo_batch ``` -#### Log-likelihood +### Log-likelihood + +By maximizing the log-likelihood, it is equivalent to minimizing the forward KL divergence between $q_\theta$ and $p$: -By maximizing the log-likelihood, it is equivalent to minimizing the forward KL divergence between $q_\theta$ and $p$, i.e., -```math +```math \begin{aligned} & \min_{\theta} \mathbb{E}_{p}\left[\log q_{\theta}(Z)-\log p(Z)\right] \quad \text{(Forward KL)} \\ & = \max_{\theta} \mathbb{E}_{p}\left[\log q_{\theta}(Z)\right] \quad \text{(Expected log-likelihood)} \end{aligned} ``` -Forward KL minimization is typically used for **generative modeling**, -where one is given a set of samples from the target distribution $p$ (e.g., images) -and aims to learn the density or a generative process that outputs high quality samples. + +Forward KL minimization is typically used for generative modeling when samples from `p` are given. 
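For example, a maximum-likelihood fit could look like the following minimal sketch. Here the data matrix `xs` (one observation per column) and the flow `flow` are assumed to be defined already, and the exact arguments accepted by `loglikelihood` are documented below:

```julia
using NormalizingFlows, Optimisers, ADTypes, Mooncake

# xs::AbstractMatrix: observed data with one column per sample (assumed given)
flow_trained, stats, _ = train_flow(
    loglikelihood,    # forward KL / expected log-likelihood objective
    flow,
    xs;               # additional arguments are forwarded to the objective
    max_iters=5_000,
    optimiser=Optimisers.Adam(1e-3),
    ADbackend=ADTypes.AutoMooncake(; config=Mooncake.Config()),
)
```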
```@docs NormalizingFlows.loglikelihood ``` - ## Training Loop ```@docs NormalizingFlows.optimize ``` + + +## Available Flows + +`NormalizingFlows.jl` provides two commonly used normalizing flows---`RealNVP` and +`Neural Spline Flow (NSF)`---and two simple flows---`Planar Flow` and `Radial Flow`. + +### RealNVP (Affine Coupling Flow) + +These helpers construct commonly used coupling-based flows with sensible defaults. + +```@docs +NormalizingFlows.realnvp +NormalizingFlows.RealNVP_layer +NormalizingFlows.AffineCoupling +``` + +### Neural Spline Flow (NSF) + +```@docs +NormalizingFlows.nsf +NormalizingFlows.NSF_layer +NormalizingFlows.NeuralSplineCoupling +``` + +#### Planar and Radial Flows + +```@docs +NormalizingFlows.planarflow +NormalizingFlows.radialflow +``` + +## Utility Functions + +```@docs +NormalizingFlows.create_flow +``` + +```@docs +NormalizingFlows.fnn +``` + diff --git a/docs/src/customized_layer.md b/docs/src/customized_layer.md index af9062c..9fb1d1d 100644 --- a/docs/src/customized_layer.md +++ b/docs/src/customized_layer.md @@ -12,7 +12,11 @@ for more details. In this tutorial, we demonstrate how to define a customized normalizing flow -layer -- an `Affine Coupling Layer` (Dinh *et al.*, 2016) -- using `Bijectors.jl` and `Flux.jl`. +layer -- an `Affine Coupling Layer` -- using `Bijectors.jl` and `Flux.jl`, +which is the building block of the RealNVP flow [^LJS2017]. +It's worth mentioning that the [`realnvp`](@ref) implemented in `NormalizingFlows.jl` +is slightly different from this tutorial with some optimization for the training stability +and performance. ## Affine Coupling Flow @@ -176,5 +180,4 @@ logpdf(flow, x[:,1]) ## Reference -Dinh, L., Sohl-Dickstein, J. and Bengio, S., 2016. *Density estimation using real nvp.* -arXiv:1605.08803. \ No newline at end of file +[^LJS2017]: Dinh, L., Sohl-Dickstein, J. and Bengio, S., 2017. Density estimation using real nvp. in *ICLR* \ No newline at end of file diff --git a/docs/src/example.md b/docs/src/example.md deleted file mode 100644 index 01a9a67..0000000 --- a/docs/src/example.md +++ /dev/null @@ -1,122 +0,0 @@ -## Example: Using Planar Flow - -Here we provide a minimal demonstration of learning a synthetic 2d banana distribution -using *planar flows* (Renzende *et al.* 2015) by maximizing the [Evidence Lower Bound (ELBO)](@ref). -To complete this task, the two key inputs are: -- the log-density function of the target distribution, -- the planar flow. - -#### The Target Distribution - -The `Banana` object is defined in `example/targets/banana.jl`, see the [source code](https://github.com/zuhengxu/NormalizingFlows.jl/blob/main/example/targets/banana.jl) for details. -```julia -p = Banana(2, 1.0f-1, 100.0f0) -logp = Base.Fix1(logpdf, p) -``` -Visualize the contour of the log-density and the sample scatters of the target distribution: -![Banana](banana.png) - - - - -#### The Planar Flow - -The planar flow is defined by repeated applying a sequence of invertible -transformations to a base distribution $q_0$. The building blocks for a planar flow -of length $N$ are the following invertible transformations, called *planar layers*: -```math -\text{planar layers}: -T_{n, \theta_n}(x)=x+u_n \cdot \tanh \left(w_n^T x+b_n\right), \quad n=1, \ldots, N, -``` -where $\theta_n = (u_n, w_n, b_n), n=1, \dots, N$ are the parameters to be learned. -Thankfully, [`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) -provides a nice framework to define a normalizing flow. 
-Here we used the `PlanarLayer()` from `Bijectors.jl` to construct a -20-layer planar flow, of which the base distribution is a 2d standard Gaussian distribution. - -```julia -using Bijectors, FunctionChains -using Functors - -function create_planar_flow(n_layers::Int, q₀) - d = length(q₀) - Ls = [f32(PlanarLayer(d)) for _ in 1:n_layers] - ts = fchain(Ls) - return transformed(q₀, ts) -end - -# create a 20-layer planar flow -@leaf MvNormal # to prevent params in q₀ from being optimized -q₀ = MvNormal(zeros(Float32, 2), I) -flow = create_planar_flow(20, q₀) -flow_untrained = deepcopy(flow) # keep a copy of the untrained flow for comparison -``` -*Notice that here the flow layers are chained together using `fchain` function from [`FunctionChains.jl`](https://github.com/oschulz/FunctionChains.jl). -Alternatively, one can do* -```julia -ts = reduce(∘, [f32(PlanarLayer(d)) for i in 1:20]) -``` -*However, we recommend using `fchain` to reduce the compilation time when the number of layers is large. -See [this comment](https://github.com/TuringLang/NormalizingFlows.jl/blob/8f4371d48228adf368d851e221af076ff929f1cf/src/NormalizingFlows.jl#L52) -for how the compilation time might be a concern.* - - -#### Flow Training -Then we can train the flow by maximizing the ELBO using the [`train_flow`](@ref) function as follows: -```julia -using NormalizingFlows -using ADTypes -using Optimisers - -sample_per_iter = 10 -# callback function to track the number of samples used per iteration -cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,) -# defined stopping criteria when the gradient norm is less than 1e-3 -checkconv(iter, stat, re, θ, st) = stat.gradient_norm < 1e-3 -flow_trained, stats, _ = train_flow( - elbo, - flow, - logp, - sample_per_iter; - max_iters=200_00, - optimiser=Optimisers.ADAM(), - callback=cb, - hasconverged=checkconv, - ADbackend=AutoZygote(), # using Zygote as the AD backend -) -``` - -Examine the loss values during training: -```julia -using Plots - -losses = map(x -> x.loss, stats) -plot(losses; xlabel = "#iteration", ylabel= "negative ELBO", label="", linewidth=2) -``` -![elbo](elbo.png) - -## Evaluating Trained Flow -Finally, we can evaluate the trained flow by sampling from it and compare it with the target distribution. -Since the flow is defined as a `Bijectors.TransformedDistribution`, one can -easily sample from it using `rand` function, or examine the density using `logpdf` function. -See [documentation of `Bijectors.jl`](https://turinglang.org/Bijectors.jl/dev/distributions/) for details. -```julia -using Random, Distributions - -nsample = 1000 -samples_trained = rand(flow_trained, n_samples) # 1000 iid samples from the trained flow -samples_untrained = rand(flow_untrained, n_samples) # 1000 iid samples from the untrained flow -samples_true = rand(p, n_samples) # 1000 iid samples from the target - -# plot -scatter(samples_true[1, :], samples_true[2, :]; label="True Distribution", color=:blue, markersize=2, alpha=0.5) -scatter!(samples_untrained[1, :], samples_untrained[2, :]; label="Untrained Flow", color=:red, markersize=2, alpha=0.5) -scatter!(samples_trained[1, :], samples_trained[2, :]; label="Trained Flow", color=:green, markersize=2, alpha=0.5) -plot!(title = "Comparison of Trained and Untrained Flow", xlabel = "X", ylabel= "Y", legend=:topleft) -``` -![compare](comparison.png) - - -## Reference - -- Rezende, D. and Mohamed, S., 2015. *Variational inference with normalizing flows*. 
International Conference on Machine Learning diff --git a/docs/src/index.md b/docs/src/index.md index f685b8b..d53c2b0 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -4,7 +4,7 @@ CurrentModule = NormalizingFlows # NormalizingFlows.jl -Documentation for [NormalizingFlows](https://github.com/TuringLang/NormalizingFlows.jl). +Documentation for [NormalizingFlows.jl](https://github.com/TuringLang/NormalizingFlows.jl). The purpose of this package is to provide a simple and flexible interface for @@ -15,13 +15,11 @@ construct (e.g., define customized flow layers) and combine various components for variational approximation of general target distributions, *without being tied to specific probabilistic programming frameworks or applications*. -See the [documentation](https://turinglang.org/NormalizingFlows.jl/dev/) for more. - ## Installation To install the package, run the following command in the Julia REPL: ``` ] # enter Pkg mode -(@v1.11) pkg> add git@github.com:TuringLang/NormalizingFlows.jl.git +(@v1.11) pkg> add NormalizingFlows ``` Then simply run the following command to use the package: ```julia @@ -67,7 +65,7 @@ optimization problems: \text{Reverse KL:}\quad &\argmin _{\theta} \mathbb{E}_{q_{\theta}}\left[\log q_{\theta}(Z)-\log p(Z)\right] \\ &= \argmin _{\theta} \mathbb{E}_{q_0}\left[\log \frac{q_\theta(T_N\circ \cdots \circ T_1(Z_0))}{p(T_N\circ \cdots \circ T_1(Z_0))}\right] \\ -&= \argmax _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(F_n \circ \cdots \circ F_1(X)\right)\right] +&= \argmax _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(T_n \circ \cdots \circ T_1(X)\right)\right] \end{aligned} ``` and @@ -82,4 +80,4 @@ Both problems can be solved via standard stochastic optimization algorithms, such as stochastic gradient descent (SGD) and its variants. - +For a detailed introduction of normalizing flows, please refer to diff --git a/docs/src/usage.md b/docs/src/usage.md new file mode 100644 index 0000000..cefb816 --- /dev/null +++ b/docs/src/usage.md @@ -0,0 +1,40 @@ +## General usage + +`train_flow` is the main function to train a normalizing flow. +The users mostly need to specify a normalizing flow `flow`, +the variational objective `vo` and its corresponding arguments `args...`. + +```@docs +NormalizingFlows.train_flow +``` + +The flow object can be constructed by `transformed` function in `Bijectors.jl`. 
+For example, for mean-field Gaussian VI, we can construct the flow family as follows: + +```julia +using Distributions, Bijectors +T = Float32 +@leaf MvNormal # to prevent params in q₀ from being optimized +q₀ = MvNormal(zeros(T, 2), ones(T, 2)) +# the flow family is defined by a shift and a scale +flow = Bijectors.transformed(q₀, Bijectors.Shift(zeros(T,2)) ∘ Bijectors.Scale(ones(T, 2))) +``` + +To train the Gaussian VI targeting distribution `p` via ELBO maximization, run: + +```julia +using NormalizingFlows, Optimisers +using ADTypes, Mooncake + +sample_per_iter = 10 +flow_trained, stats, _ = train_flow( + elbo, + flow, + logp, + sample_per_iter; + max_iters=5_000, + optimiser=Optimisers.Adam(one(T)/100), + ADbackend=ADTypes.AutoMooncake(; config=Mooncake.Config()), + show_progress=true, +) +``` \ No newline at end of file diff --git a/example/Project.toml b/example/Project.toml index 0b9b021..79c847b 100644 --- a/example/Project.toml +++ b/example/Project.toml @@ -3,9 +3,10 @@ ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -16,7 +17,12 @@ Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + +[compat] +Mooncake = "0.4.142" [extras] CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl index 516e4c9..f3d705e 100644 --- a/example/demo_RealNVP.jl +++ b/example/demo_RealNVP.jl @@ -1,7 +1,3 @@ -using Flux -using Bijectors -using Bijectors: partition, combine, PartitionMask - using Random, Distributions, LinearAlgebra using Functors using Optimisers, ADTypes @@ -11,114 +7,6 @@ using NormalizingFlows include("SyntheticTargets.jl") include("utils.jl") -################################## -# define affine coupling layer using Bijectors.jl interface -################################# -struct AffineCoupling <: Bijectors.Bijector - dim::Int - mask::Bijectors.PartitionMask - s::Flux.Chain - t::Flux.Chain -end - -# let params track field s and t -@functor AffineCoupling (s, t) - -function AffineCoupling( - dim::Int, # dimension of input - hdims::Int, # dimension of hidden units for s and t - mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on -) - cdims = length(mask_idx) # dimension of parts used to construct coupling law - s = mlp3(cdims, hdims, cdims) - t = mlp3(cdims, hdims, cdims) - mask = PartitionMask(dim, mask_idx) - return AffineCoupling(dim, mask, s, t) -end - -function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat) - # partition vector using 'af.mask::PartitionMask` - x₁, x₂, x₃ = partition(af.mask, x) - y₁ = x₁ .* af.s(x₂) .+ af.t(x₂) - return combine(af.mask, y₁, x₂, x₃) 
-end - -function (af::AffineCoupling)(x::AbstractArray) - return transform(af, x) -end - -function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(af.mask, x) - y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) - logjac = sum(log ∘ abs, af.s(x_2)) # this is a scalar - return combine(af.mask, y_1, x_2, x_3), logjac -end - -function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractMatrix) - x_1, x_2, x_3 = Bijectors.partition(af.mask, x) - y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) - logjac = sum(log ∘ abs, af.s(x_2); dims = 1) # 1 × size(x, 2) - return combine(af.mask, y_1, x_2, x_3), vec(logjac) -end - - -function Bijectors.with_logabsdet_jacobian( - iaf::Inverse{<:AffineCoupling}, y::AbstractVector -) - af = iaf.orig - # partition vector using `af.mask::PartitionMask` - y_1, y_2, y_3 = partition(af.mask, y) - # inverse transformation - x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) - logjac = -sum(log ∘ abs, af.s(y_2)) - return combine(af.mask, x_1, y_2, y_3), logjac -end - -function Bijectors.with_logabsdet_jacobian( - iaf::Inverse{<:AffineCoupling}, y::AbstractMatrix -) - af = iaf.orig - # partition vector using `af.mask::PartitionMask` - y_1, y_2, y_3 = partition(af.mask, y) - # inverse transformation - x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) - logjac = -sum(log ∘ abs, af.s(y_2); dims = 1) - return combine(af.mask, x_1, y_2, y_3), vec(logjac) -end - -################### -# an equivalent definition of AffineCoupling using Bijectors.Coupling -# (see https://github.com/TuringLang/Bijectors.jl/blob/74d52d4eda72a6149b1a89b72524545525419b3f/src/bijectors/coupling.jl#L188C1-L188C1) -################### - -# struct AffineCoupling <: Bijectors.Bijector -# dim::Int -# mask::Bijectors.PartitionMask -# s::Flux.Chain -# t::Flux.Chain -# end - -# # let params track field s and t -# @functor AffineCoupling (s, t) - -# function AffineCoupling(dim, mask, s, t) -# return Bijectors.Coupling(θ -> Bijectors.Shift(t(θ)) ∘ Bijectors.Scale(s(θ)), mask) -# end - -# function AffineCoupling( -# dim::Int, # dimension of input -# hdims::Int, # dimension of hidden units for s and t -# mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on -# ) -# cdims = length(mask_idx) # dimension of parts used to construct coupling law -# s = mlp3(cdims, hdims, cdims) -# t = mlp3(cdims, hdims, cdims) -# mask = PartitionMask(dim, mask_idx) -# return AffineCoupling(dim, mask, s, t) -# end - - - ################################## # start demo ################################# @@ -129,33 +17,34 @@ T = Float32 ###################################### # a difficult banana target ###################################### -target = Banana(2, 1.0f0, 100.0f0) +target = Banana(2, one(T), 100one(T)) logp = Base.Fix1(logpdf, target) + ###################################### # learn the target using Affine coupling flow ###################################### @leaf MvNormal -q0 = MvNormal(zeros(T, 2), ones(T, 2)) +q0 = MvNormal(zeros(T, 2), I) d = 2 -hdims = 32 - -# alternating the coupling layers -Ls = [AffineCoupling(d, hdims, [1]) ∘ AffineCoupling(d, hdims, [2]) for i in 1:3] +hdims = [16, 16] +nlayers = 3 -flow = create_flow(Ls, q0) +# use NormalizingFlows.realnvp to create a RealNVP flow +flow = realnvp(q0, hdims, nlayers; paramtype=T) flow_untrained = deepcopy(flow) ###################################### # start training ###################################### -sample_per_iter = 64 +sample_per_iter = 16 # callback function to log training progress cb(iter, 
opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) + checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( rng, @@ -163,7 +52,7 @@ flow_trained, stats, _ = train_flow( flow, logp, sample_per_iter; - max_iters=100, # change to larger number of iterations (e.g., 50_000) for better results + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results optimiser=Optimisers.Adam(5e-4), ADbackend=adtype, show_progress=true, diff --git a/example/demo_neural_spline_flow.jl b/example/demo_neural_spline_flow.jl index ffeba09..4e98c30 100644 --- a/example/demo_neural_spline_flow.jl +++ b/example/demo_neural_spline_flow.jl @@ -1,114 +1,12 @@ -using Flux -using Bijectors -using Bijectors: partition, combine, PartitionMask - using Random, Distributions, LinearAlgebra using Functors using Optimisers, ADTypes -using Mooncake +using Zygote using NormalizingFlows include("SyntheticTargets.jl") include("utils.jl") -################################## -# define neural spline layer using Bijectors.jl interface -################################# -""" -Neural Rational quadratic Spline layer - -# References -[1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML], (2019). -""" -struct NeuralSplineLayer{T,A<:Flux.Chain} <: Bijectors.Bijector - dim::Int # dimension of input - K::Int # number of knots - n_dims_transferred::Int # number of dimensions that are transformed - nn::A # networks that parmaterize the knots and derivatives - B::T # bound of the knots - mask::Bijectors.PartitionMask -end - -function NeuralSplineLayer( - dim::T1, # dimension of input - hdims::T1, # dimension of hidden units for s and t - K::T1, # number of knots - B::T2, # bound of the knots - mask_idx::AbstractVector{<:Int}, # index of dimensione that one wants to apply transformations on -) where {T1<:Int,T2<:Real} - num_of_transformed_dims = length(mask_idx) - input_dims = dim - num_of_transformed_dims - - # output dim of the NN - output_dims = (3K - 1)*num_of_transformed_dims - # one big mlp that outputs all the knots and derivatives for all the transformed dimensions - nn = mlp3(input_dims, hdims, output_dims) - - mask = Bijectors.PartitionMask(dim, mask_idx) - return NeuralSplineLayer(dim, K, num_of_transformed_dims, nn, B, mask) -end - -@functor NeuralSplineLayer (nn,) - -# define forward and inverse transformation -""" -Build a rational quadratic spline from the nn output -Bijectors.jl has implemented the inverse and logabsdetjac for rational quadratic spline - -we just need to map the nn output to the knots and derivatives of the RQS -""" -function instantiate_rqs(nsl::NeuralSplineLayer, x::AbstractVector) - K, B = nsl.K, nsl.B - nnoutput = reshape(nsl.nn(x), nsl.n_dims_transferred, :) - ws = @view nnoutput[:, 1:K] - hs = @view nnoutput[:, (K + 1):(2K)] - ds = @view nnoutput[:, (2K + 1):(3K - 1)] - return Bijectors.RationalQuadraticSpline(ws, hs, ds, B) -end - -function Bijectors.transform(nsl::NeuralSplineLayer, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) - # instantiate rqs knots and derivatives - rqs = instantiate_rqs(nsl, x_2) - y_1 = Bijectors.transform(rqs, x_1) - return Bijectors.combine(nsl.mask, y_1, x_2, x_3) -end - -function Bijectors.transform(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) - nsl = insl.orig - y1, y2, y3 = partition(nsl.mask, y) - rqs = instantiate_rqs(nsl, y2) - x1 = 
Bijectors.transform(Inverse(rqs), y1) - return Bijectors.combine(nsl.mask, x1, y2, y3) -end - -function (nsl::NeuralSplineLayer)(x::AbstractVector) - return Bijectors.transform(nsl, x) -end - -# define logabsdetjac -function Bijectors.logabsdetjac(nsl::NeuralSplineLayer, x::AbstractVector) - x_1, x_2, _ = Bijectors.partition(nsl.mask, x) - rqs = instantiate_rqs(nsl, x_2) - logjac = logabsdetjac(rqs, x_1) - return logjac -end - -function Bijectors.logabsdetjac(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) - nsl = insl.orig - y1, y2, _ = partition(nsl.mask, y) - rqs = instantiate_rqs(nsl, y2) - logjac = logabsdetjac(Inverse(rqs), y1) - return logjac -end - -function Bijectors.with_logabsdet_jacobian(nsl::NeuralSplineLayer, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) - rqs = instantiate_rqs(nsl, x_2) - y_1, logjac = with_logabsdet_jacobian(rqs, x_1) - return Bijectors.combine(nsl.mask, y_1, x_2, x_3), logjac -end - ################################## # start demo ################################# @@ -117,30 +15,20 @@ rng = Random.default_rng() T = Float32 ###################################### -# neals funnel target +# a difficult banana target ###################################### -target = Funnel(2, 0.0f0, 9.0f0) +target = Banana(2, one(T), 100one(T)) logp = Base.Fix1(logpdf, target) ###################################### -# learn the target using Affine coupling flow +# learn the target using Neural Spline Flow ###################################### @leaf MvNormal -q0 = MvNormal(zeros(T, 2), ones(T, 2)) +q0 = MvNormal(zeros(T, 2), I) -d = 2 -hdims = 64 -K = 10 -B = 30 -Ls = [ - NeuralSplineLayer(d, hdims, K, B, [1]) ∘ NeuralSplineLayer(d, hdims, K, B, [2]) for - i in 1:3 -] -flow = create_flow(Ls, q0) +flow = nsf(q0; paramtype=T) flow_untrained = deepcopy(flow) - - ###################################### # start training ###################################### @@ -148,15 +36,16 @@ sample_per_iter = 64 # callback function to log training progress cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) -adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) +# nsf only supports AutoZygote +adtype = ADTypes.AutoZygote() checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( - elbo, + elbo_batch, flow, logp, sample_per_iter; - max_iters=100, # change to larger number of iterations (e.g., 50_000) for better results - optimiser=Optimisers.Adam(5e-5), + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(1e-4), ADbackend=adtype, show_progress=true, callback=cb, diff --git a/example/demo_planar_flow.jl b/example/demo_planar_flow.jl index 25a694e..3fbe7c8 100644 --- a/example/demo_planar_flow.jl +++ b/example/demo_planar_flow.jl @@ -20,16 +20,9 @@ logp = Base.Fix1(logpdf, target) ###################################### # setup planar flow ###################################### -function create_planar_flow(n_layers::Int, q₀) - d = length(q₀) - Ls = [PlanarLayer(d) for _ in 1:n_layers] - ts = reduce(∘, Ls) - return transformed(q₀, ts) -end - @leaf MvNormal q0 = MvNormal(zeros(T, 2), ones(T, 2)) -flow = create_planar_flow(10, q0) +flow = planarflow(q0, 10; paramtype=T) flow_untrained = deepcopy(flow) ###################################### diff --git a/example/demo_radial_flow.jl b/example/demo_radial_flow.jl index 89b4c56..044d550 100644 --- a/example/demo_radial_flow.jl +++ b/example/demo_radial_flow.jl @@ -19,17 +19,10 @@ logp = 
Base.Fix1(logpdf, target) ###################################### # setup radial flow ###################################### -function create_radial_flow(n_layers::Int, q₀) - d = length(q₀) - Ls = [RadialLayer(d) for _ in 1:n_layers] - ts = reduce(∘, Ls) - return transformed(q₀, ts) -end - # create a 10-layer radial flow @leaf MvNormal q0 = MvNormal(zeros(T, 2), ones(T, 2)) -flow = create_radial_flow(10, q0) +flow = radialflow(q0, 10; paramtype=T) flow_untrained = deepcopy(flow) diff --git a/example/utils.jl b/example/utils.jl index b5d19ca..bdd2858 100644 --- a/example/utils.jl +++ b/example/utils.jl @@ -1,22 +1,6 @@ using Random, Distributions, LinearAlgebra +using Bijectors using Bijectors: transformed -using Flux - -""" -A simple wrapper for a 3 layer dense MLP -""" -function mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux.leakyrelu) - return Chain( - Flux.Dense(input_dim, hidden_dims, activation), - Flux.Dense(hidden_dims, hidden_dims, activation), - Flux.Dense(hidden_dims, output_dim), - ) -end - -function create_flow(Ls, q₀) - ts = reduce(∘, Ls) - return transformed(q₀, ts) -end function compare_trained_and_untrained_flow( flow_trained::Bijectors.MultivariateTransformed, diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 72318df..70c4699 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -1,13 +1,15 @@ module NormalizingFlows using ADTypes -using Bijectors using Distributions using LinearAlgebra using Optimisers using ProgressMeter using Random using StatsBase +using Bijectors +using Bijectors: PartitionMask, Inverse, combine, partition +using Functors import DifferentiationInterface as DI using DocStringExtensions @@ -19,11 +21,12 @@ export train_flow, elbo, elbo_batch, loglikelihood Train the given normalizing flow `flow` by calling `optimize`. -# Arguments -- `rng::AbstractRNG`: random number generator -- `vo`: variational objective -- `flow`: normalizing flow to be trained, we recommend to define flow as `<:Bijectors.TransformedDistribution` -- `args...`: additional arguments for `vo` +Arguments +- `rng::AbstractRNG`: random number generator (default: `Random.default_rng()`) +- `vo`: variational objective with signature `vo(rng, flow, args...)`. + We implement [`elbo`](@ref), [`elbo_batch`](@ref), and [`loglikelihood`](@ref). +- `flow`: the normalizing flow---a `Bijectors.TransformedDistribution` (recommended) +- `args...`: additional arguments passed to `vo` # Keyword Arguments - `max_iters::Int=1000`: maximum number of iterations @@ -123,4 +126,19 @@ function _device_specific_rand( return Random.rand(rng, td, n) end + +# interface of contructing common flow layers +include("flows/utils.jl") +include("flows/planar_radial.jl") +include("flows/realnvp.jl") + +using MonotonicSplines +include("flows/neuralspline.jl") + +export create_flow +export planarflow, radialflow +export AffineCoupling, RealNVP_layer, realnvp +export NeuralSplineCoupling, NSF_layer, nsf + + end diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl new file mode 100644 index 0000000..0d8eab0 --- /dev/null +++ b/src/flows/neuralspline.jl @@ -0,0 +1,234 @@ +""" + NeuralSplineCoupling(dim, hdims, K, B, mask_idx, paramtype) + NeuralSplineCoupling(dim, K, n_dims_transformed, B, nn, mask) + +Neural Rational Quadratic Spline (RQS) coupling bijector [^DBMP2019]. + +A conditioner network takes the unchanged partition as input and outputs the +parameters of monotonic rational quadratic splines for the transformed +coordinates. 
Batched inputs (matrices with column vectors) are supported. + +Arguments +- `dim::Int`: total input dimension. +- `hdims::AbstractVector{Int}`: hidden sizes for the conditioner MLP. +- `K::Int`: number of spline knots per transformed coordinate. +- `B::AbstractFloat`: boundary/box constraint for spline domain. +- `mask_idx::AbstractVector{Int}`: indices of the transformed coordinates. + +Keyword Arguments +- `paramtype::Type{<:AbstractFloat}`: parameter element type. + +Fields +- `nn::Flux.Chain`: conditioner that outputs all spline params for all transformed dim. +- `mask::Bijectors.PartitionMask`: partition specification. + +Notes +- Output dimensionality of the conditioner is `(3K - 1) * n_transformed`. +- For computation performance, we rely on +[`MonotonicSplines.jl`](https://github.com/bat/MonotonicSplines.jl) for the +building the rational quadratic spline functions. +- See `MonotonicSplines.rqs_forward` and `MonotonicSplines.rqs_inverse` for forward/inverse +and log-determinant computations. + +[^DBMP2019]: Durkan, C., Bekasov, A., Murray, I. and Papamarkou, T. (2019). Neural Spline Flows. *NeurIPS.* +""" +struct NeuralSplineCoupling{T,A<:Flux.Chain} <: Bijectors.Bijector + dim::Int # dimension of input + K::Int # number of knots + n_dims_transformed::Int # number of dimensions that are transformed + B::T # bound of the knots + nn::A # networks that parameterize the knots and derivatives + mask::Bijectors.PartitionMask +end + +function NeuralSplineCoupling( + dim::T1, # dimension of input + hdims::AbstractVector{T1}, # dimension of hidden units for s and t + K::T1, # number of knots + B::T2, # bound of the knots + mask_idx::AbstractVector{T1}, # indices of the transformed dimensions + paramtype::Type{T2}, # type of the parameters, e.g., Float64 or Float32 +) where {T1<:Int,T2<:AbstractFloat} + num_of_transformed_dims = length(mask_idx) + input_dims = dim - num_of_transformed_dims + + output_dims = (3K - 1)*num_of_transformed_dims + # one big mlp that outputs all the knots and derivatives for all the transformed dimensions + nn = fnn(input_dims, hdims, output_dims; output_activation=nothing, paramtype=paramtype) + + mask = Bijectors.PartitionMask(dim, mask_idx) + return NeuralSplineCoupling{T2, typeof(nn)}(dim, K, num_of_transformed_dims, B, nn, mask) +end + +@functor NeuralSplineCoupling (nn,) + +function get_nsc_params(nsc::NeuralSplineCoupling, x::AbstractVecOrMat) + nnoutput = nsc.nn(x) + px, py, dydx = MonotonicSplines.rqs_params_from_nn( + nnoutput, nsc.n_dims_transformed, nsc.B + ) + return px, py, dydx +end + +# when input x is a vector instead of a matrix +# need this to transform it to a matrix with one row +# otherwise, rqs_forward and rqs_inverse will throw an error +_ensure_matrix(x) = x isa AbstractVector ? 
reshape(x, length(x), 1) : x + +function Bijectors.transform(nsc::NeuralSplineCoupling, x::AbstractVector) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + x1 = _ensure_matrix(x1) + y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(y1), x2, x3) +end +function Bijectors.transform(nsc::NeuralSplineCoupling, x::AbstractMatrix) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, y1, x2, x3) +end + +function Bijectors.with_logabsdet_jacobian(nsc::NeuralSplineCoupling, x::AbstractVector) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + x1 = _ensure_matrix(x1) + y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(y1), x2, x3), logjac[1] +end +function Bijectors.with_logabsdet_jacobian(nsc::NeuralSplineCoupling, x::AbstractMatrix) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, y1, x2, x3), vec(logjac) +end + +function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + y1 = _ensure_matrix(y1) + x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(x1), y2, y3) +end +function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractMatrix) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, x1, y2, y3) +end + +function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + y1 = _ensure_matrix(y1) + x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(x1), y2, y3), logjac[1] +end +function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractMatrix) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, x1, y2, y3), vec(logjac) +end + +function (nsc::NeuralSplineCoupling)(x::AbstractVecOrMat) + return Bijectors.transform(nsc, x) +end + + +""" + NSF_layer(dim, hdims, K, B; paramtype = Float64) + +Build a single Neural Spline Flow (NSF) layer by composing two +`NeuralSplineCoupling` bijectors with complementary odd–even masks. + +Arguments +- `dim::Int`: dimensionality of the problem. +- `hdims::AbstractVector{Int}`: hidden sizes of the conditioner network. +- `K::Int`: number of spline knots. +- `B::AbstractFloat`: spline boundary. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type. + +Returns +- A `Bijectors.Bijector` representing the NSF layer. 
+ +Example +- `layer = NSF_layer(4, [64,64], 10, 3.0)` +- `y = layer(randn(4, 32))` +""" +function NSF_layer( + dim::T1, # dimension of problem + hdims::AbstractVector{T1}, # dimension of hidden units for nn + K::T1, # number of knots + B::T2; # bound of the knots + paramtype::Type{T2} = Float64, # type of the parameters +) where {T1<:Int,T2<:AbstractFloat} + + mask_idx1 = 1:2:dim + mask_idx2 = 2:2:dim + + # by default use the odd-even masking strategy + nsf1 = NeuralSplineCoupling(dim, hdims, K, B, mask_idx1, paramtype) + nsf2 = NeuralSplineCoupling(dim, hdims, K, B, mask_idx2, paramtype) + return reduce(∘, (nsf1, nsf2)) +end + +""" + nsf(q0, hdims, K, B, nlayers; paramtype = Float64) + nsf(q0; paramtype = Float64) + +Construct an NSF by stacking `nlayers` `NSF_layer` blocks. The one-argument +variant defaults to 10 layers with `[32, 32]` hidden sizes, 10 knots, and +boundary `30` (scaled by `one(T)`). + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution. +- `hdims::AbstractVector{Int}`: hidden sizes of the conditioner network. +- `K::Int`: spline knots per coordinate. +- `B::AbstractFloat`: spline boundary. +- `nlayers::Int`: number of NSF layers. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type. + +Returns +- `Bijectors.TransformedDistribution` representing the NSF flow. + +!!! note + Under the hood, `nsf` relies on the rational quadratic spline function implememented in + `MonotonicSplines.jl` for performance reasons. `MonotonicSplines.jl` uses + `KernelAbstractions.jl` to support batched operations. + Because of this, so far `nsf` only supports `Zygote` as the AD type. + + +Example +- `q0 = MvNormal(zeros(3), I); flow = nsf(q0, [64,64], 8, 3.0, 6)` +- `x = rand(flow, 128); lp = logpdf(flow, x)` +""" +function nsf( + q0::Distribution{Multivariate,Continuous}, + hdims::AbstractVector{Int}, # dimension of hidden units for s and t + K::Int, + B::T, + nlayers::Int; # number of RealNVP_layer + paramtype::Type{T} = Float64, # type of the parameters +) where {T<:AbstractFloat} + + dim = length(q0) # dimension of the reference distribution == dim of the problem + Ls = [NSF_layer(dim, hdims, K, B; paramtype=paramtype) for _ in 1:nlayers] + create_flow(Ls, q0) +end + +nsf(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = nsf( + q0, [32, 32], 10, 30*one(T), 10; paramtype=paramtype +) diff --git a/src/flows/planar_radial.jl b/src/flows/planar_radial.jl new file mode 100644 index 0000000..554cd42 --- /dev/null +++ b/src/flows/planar_radial.jl @@ -0,0 +1,60 @@ +""" + planarflow(q0, nlayers; paramtype = Float64) + +Construct a Planar Flow by stacking `nlayers` `Bijectors.PlanarLayer` blocks +on top of a base distribution `q0`. + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g., `MvNormal(zeros(d), I)`). +- `nlayers::Int`: number of planar layers to compose. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type (use `Float32` for GPU friendliness). + +Returns +- `Bijectors.TransformedDistribution` representing the planar flow. 
+ +Example +- `q0 = MvNormal(zeros(2), I); flow = planarflow(q0, 10)` +- `x = rand(flow, 128); lp = logpdf(flow, x)` +""" +function planarflow( + q0::Distribution{Multivariate,Continuous}, + nlayers::Int; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + dim = length(q0) + Ls = [Flux._paramtype(paramtype, Bijectors.PlanarLayer(dim)) for _ in 1:nlayers] + return create_flow(Ls, q0) +end + + +""" + radialflow(q0, nlayers; paramtype = Float64) + +Construct a Radial Flow by stacking `nlayers` `Bijectors.RadialLayer` blocks +on top of a base distribution `q0`. + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g., `MvNormal(zeros(d), I)`). +- `nlayers::Int`: number of radial layers to compose. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type (use `Float32` for GPU friendliness). + +Returns +- `Bijectors.TransformedDistribution` representing the radial flow. + +Example +- `q0 = MvNormal(zeros(2), I); flow = radialflow(q0, 6)` +- `x = rand(flow); lp = logpdf(flow, x)` +""" +function radialflow( + q0::Distribution{Multivariate,Continuous}, + nlayers::Int; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + dim = length(q0) + Ls = [Flux._paramtype(paramtype, Bijectors.RadialLayer(dim)) for _ in 1:nlayers] + return create_flow(Ls, q0) +end diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl new file mode 100644 index 0000000..2f93dc5 --- /dev/null +++ b/src/flows/realnvp.jl @@ -0,0 +1,192 @@ +""" + AffineCoupling(dim, hdims, mask_idx, paramtype) + AffineCoupling(dim, mask, s, t) + +Affine coupling bijector used in RealNVP [^LJS2017]. + +Two subnetworks `s` (log-scale, exponentiated in the forward pass) and `t` (shift) +act on one partition of the input, conditioned on the complementary partition +(as defined by `mask`). For numerical stability, the output of `s` passes +through `tanh` before exponentiation. + +Arguments +- `dim::Int`: total dimensionality of the input. +- `hdims::AbstractVector{Int}`: hidden sizes for the conditioner MLPs `s` and `t`. +- `mask_idx::AbstractVector{Int}`: indices of the dimensions to transform. + The complement is used as the conditioner input. + +Keyword Arguments +- `paramtype::Type{<:AbstractFloat}`: parameter element type (e.g. `Float32`). + +Fields +- `mask::Bijectors.PartitionMask`: partition specification. +- `s::Flux.Chain`: conditioner producing log-scales for the transformed block. +- `t::Flux.Chain`: conditioner producing shifts for the transformed block. + +Notes +- Forward: with `(x₁,x₂,x₃) = partition(mask, x)`, compute `y₁ = x₁ .* exp.(s(x₂)) .+ t(x₂)`. +- Log-determinant: `sum(s(x₂))` (or columnwise for batched matrices). +- All methods support both vectors and column-major batches (matrices). + +[^LJS2017]: Dinh, L., Sohl-Dickstein, J. and Bengio, S. (2017). Density estimation using Real NVP. ICLR. 
+""" +struct AffineCoupling <: Bijectors.Bijector + dim::Int + mask::Bijectors.PartitionMask + s::Flux.Chain + t::Flux.Chain +end + +@functor AffineCoupling (s, t) + +function AffineCoupling( + dim::Int, + hdims::AbstractVector{Int}, + mask_idx::AbstractVector{Int}, + paramtype::Type{T} +) where {T<:AbstractFloat} + cdims = length(mask_idx) # dimension of parts used to construct coupling law + # for the scaling network s, add tanh to the output to ensure stability during training + s = fnn(dim-cdims, hdims, cdims; output_activation=Flux.tanh, paramtype=paramtype) + # no transfomration for the output of the translation network t + t = fnn(dim-cdims, hdims, cdims; output_activation=nothing, paramtype=paramtype) + mask = PartitionMask(dim, mask_idx) + return AffineCoupling(dim, mask, s, t) +end + +function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat) + # partition vector using 'af.mask::PartitionMask` + x₁, x₂, x₃ = partition(af.mask, x) + s_x₂ = af.s(x₂) + y₁ = x₁ .* exp.(s_x₂) .+ af.t(x₂) + return combine(af.mask, y₁, x₂, x₃) +end + +function (af::AffineCoupling)(x::AbstractVecOrMat) + return transform(af, x) +end + +function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractVector) + x_1, x_2, x_3 = Bijectors.partition(af.mask, x) + s_x2 = af.s(x_2) + y_1 = exp.(s_x2) .* x_1 .+ af.t(x_2) + logjac = sum(s_x2) # this is a scalar + return combine(af.mask, y_1, x_2, x_3), logjac +end + +function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractMatrix) + x_1, x_2, x_3 = Bijectors.partition(af.mask, x) + s_x2 = af.s(x_2) + y_1 = exp.(s_x2) .* x_1 .+ af.t(x_2) + logjac = sum(s_x2; dims=1) # 1 × size(x, 2) + return combine(af.mask, y_1, x_2, x_3), vec(logjac) +end + + +function Bijectors.with_logabsdet_jacobian( + iaf::Inverse{<:AffineCoupling}, y::AbstractVector +) + af = iaf.orig + # partition vector using `af.mask::PartitionMask` + y_1, y_2, y_3 = partition(af.mask, y) + # inverse transformation + s_y2 = af.s(y_2) + x_1 = (y_1 .- af.t(y_2)) .* exp.(-s_y2) + logjac = -sum(s_y2) + return combine(af.mask, x_1, y_2, y_3), logjac +end + +function Bijectors.with_logabsdet_jacobian( + iaf::Inverse{<:AffineCoupling}, y::AbstractMatrix +) + af = iaf.orig + # partition vector using `af.mask::PartitionMask` + y_1, y_2, y_3 = partition(af.mask, y) + # inverse transformation + s_y2 = af.s(y_2) + x_1 = (y_1 .- af.t(y_2)) .* exp.(-s_y2) + logjac = -sum(s_y2; dims=1) + return combine(af.mask, x_1, y_2, y_3), vec(logjac) +end + +""" + RealNVP_layer(dims, hdims; paramtype = Float64) + +Construct a single RealNVP layer by composing two `AffineCoupling` bijectors +with complementary odd–even masks. + +Arguments +- `dims::Int`: dimensionality of the problem. +- `hdims::AbstractVector{Int}`: hidden sizes of the conditioner networks. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type. + +Returns +- A `Bijectors.Bijector` representing the RealNVP layer. 
+ +Example +- `layer = RealNVP_layer(4, [64, 64])` +- `y = layer(randn(4, 16))` # batched forward +""" +function RealNVP_layer( + dims::Int, + hdims::AbstractVector{Int}; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + + mask_idx1 = 1:2:dims + mask_idx2 = 2:2:dims + + # by default use the odd-even masking strategy + af1 = AffineCoupling(dims, hdims, mask_idx1, paramtype) + af2 = AffineCoupling(dims, hdims, mask_idx2, paramtype) + return reduce(∘, (af1, af2)) +end + +""" + realnvp(q0, hdims, nlayers; paramtype = Float64) + realnvp(q0; paramtype = Float64) + +Construct a RealNVP flow by stacking `nlayers` `RealNVP_layer` blocks with +odd–even masking. The 1-argument variant defaults to 10 layers with +hidden sizes `[32, 32]` per conditioner. + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g. `MvNormal(zeros(d), I)`). +- `hdims::AbstractVector{Int}`: hidden sizes for the conditioner networks. +- `nlayers::Int`: number of stacked RealNVP layers. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type (use `Float32` for GPU friendliness). + +Returns +- `Bijectors.TransformedDistribution` representing the RealNVP flow. + +Example +- `q0 = MvNormal(zeros(2), I); flow = realnvp(q0, [64,64], 8)` +- `x = rand(flow, 128); lp = logpdf(flow, x)` +""" +function realnvp( + q0::Distribution{Multivariate,Continuous}, + hdims::AbstractVector{Int}, + nlayers::Int; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + + dims = length(q0) + Ls = [RealNVP_layer(dims, hdims; paramtype=paramtype) for _ in 1:nlayers] + create_flow(Ls, q0) +end + +""" + realnvp(q0; paramtype = Float64) + +Default constructor: 10 layers, each conditioner uses hidden sizes `[32, 32]`. +Follows a common RealNVP architecture similar to Appendix E of [^ASD2020]. + +[^ASD2020]: Agrawal, A., Sheldon, D., Domke, J. (2020). Advances in Black-Box VI: Normalizing Flows, Importance Weighting, and Optimization. NeurIPS. +""" +realnvp(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = realnvp( + q0, [32, 32], 10; paramtype=paramtype +) diff --git a/src/flows/utils.jl b/src/flows/utils.jl new file mode 100644 index 0000000..83c2a44 --- /dev/null +++ b/src/flows/utils.jl @@ -0,0 +1,100 @@ +using Bijectors: transformed +using Flux + +""" + create_flow(layers, q0) + +Construct a normalizing flow by composing the provided bijector layers and +attaching them to the base distribution `q0`. + +- `layers`: an iterable of `Bijectors.Bijector` objects that are composed in order + (left-to-right) via function composition +(for instance, if `layers = [l1, l2, l3]`, the flow will be `l1∘l2∘l3(q0)`). +- `q0`: the base distribution (e.g., `MvNormal(zeros(d), I)`). + +Returns a `Bijectors.TransformedDistribution` representing the resulting flow. 
+ +Example + + using Distributions, Bijectors, LinearAlgebra + q0 = MvNormal(zeros(2), I) + flow = create_flow((Bijectors.Shift([0.0, 1.0]), Bijectors.Scale([1.0, 2.0])), q0) +""" +function create_flow(Ls, q₀) + ts = reduce(∘, Ls) + return transformed(q₀, ts) +end + +""" + mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux.leakyrelu) + +A simple wrapper for a 3 layer dense MLP +""" +function mlp3( + input_dim::Int, + hidden_dims::Int, + output_dim::Int; + activation=Flux.leakyrelu, + paramtype::Type{T} = Float64 +) where {T<:AbstractFloat} + m = Chain( + Flux.Dense(input_dim, hidden_dims, activation), + Flux.Dense(hidden_dims, hidden_dims, activation), + Flux.Dense(hidden_dims, output_dim), + ) + return Flux._paramtype(paramtype, m) +end + +""" + fnn( + input_dim::Int, + hidden_dims::AbstractVector{Int}, + output_dim::Int; + inlayer_activation=Flux.leakyrelu, + output_activation=nothing, + paramtype::Type{T} = Float64, + ) + +Create a fully connected neural network (FNN). + +# Arguments +- `input_dim::Int`: The dimension of the input layer. +- `hidden_dims::AbstractVector{<:Int}`: A vector of integers specifying the dimensions of the hidden layers. +- `output_dim::Int`: The dimension of the output layer. +- `inlayer_activation`: The activation function for the hidden layers. Defaults to `Flux.leakyrelu`. +- `output_activation`: The activation function for the output layer. Defaults to `nothing`. +- `paramtype::Type{T} = Float64`: The type of the parameters in the network, defaults to `Float64`. + +# Returns +- A `Flux.Chain` representing the FNN. +""" +function fnn( + input_dim::Int, + hidden_dims::AbstractVector{Int}, + output_dim::Int; + inlayer_activation=Flux.leakyrelu, + output_activation=nothing, + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + # Create a chain of dense layers + # First layer + layers = Flux.Dense[Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] + + # Hidden layers + for i in 1:(length(hidden_dims) - 1) + push!( + layers, + Flux.Dense(hidden_dims[i], hidden_dims[i + 1], inlayer_activation), + ) + end + + # Output layer + if output_activation === nothing + push!(layers, Flux.Dense(hidden_dims[end], output_dim)) + else + push!(layers, Flux.Dense(hidden_dims[end], output_dim, output_activation)) + end + + m = Chain(layers...) + return Flux._paramtype(paramtype, m) +end diff --git a/src/objectives/elbo.jl b/src/objectives/elbo.jl index d282e1e..6ddd47b 100644 --- a/src/objectives/elbo.jl +++ b/src/objectives/elbo.jl @@ -7,10 +7,11 @@ function elbo_single_sample(flow::Bijectors.TransformedDistribution, logp, x) end """ - elbo(flow, logp, xs) - elbo([rng, ]flow, logp, n_samples) + elbo(flow, logp, xs) + elbo([rng, ] flow, logp, n_samples) -Compute the ELBO for a batch of samples `xs` from the reference distribution `flow.dist`. +Monte Carlo estimates of the ELBO from a batch of samples `xs` from the +reference distribution `flow.dist`. # Arguments - `rng`: random number generator @@ -46,36 +47,53 @@ end """ - elbo_batch(flow, logp, xs) - elbo_batch([rng, ]flow, logp, n_samples) + _batched_elbos(flow, logp, xs) -Instead of broadcasting over elbo_single_sample, this function directly -computes the ELBO in a batched manner, which requires the flow.transform to be able to -handle batched transformation directly. +Batched ELBO estimates that transforms a matrix of samples (each column represents a single +sample) in one call. 
+This is more efficient for invertible neural-network flows (RealNVP/NSF) as it leverages +the batched operation of the neural networks. -This will be more efficient than `elbo` for invertible neural networks such as RealNVP, -Neural Spline Flow, etc. - -# Arguments -- `rng`: random number generator -- `flow`: variational distribution to be trained. In particular - `flow = transformed(q₀, T::Bijectors.Bijector)`, - q₀ is a reference distribution that one can easily sample and compute logpdf -- `logp`: log-pdf of the target distribution (not necessarily normalized) -- `xs`: samples from reference dist q₀ -- `n_samples`: number of samples from reference dist q₀ +Inputs +- `flow::Bijectors.MultivariateTransformed` +- `logp`: function returning log-density of target +- `xs`: column-wise sample batch +Returns +- a vector of ELBO estimates for each sample in the batch """ -function elbo_batch(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) +function _batched_elbos(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) # requires the flow transformation to be able to handle batched inputs ys, logabsdetjac = with_logabsdet_jacobian(flow.transform, xs) elbos = logp(ys) .- logpdf(flow.dist, xs) .+ logabsdetjac return elbos end + +""" + elbo_batch(flow, logp, xs) + elbo_batch([rng, ] flow, logp, n_samples) + +Batched ELBO estimates that transforms a matrix of samples (each column represents a single +sample) in one call. +This is more efficient for invertible neural-network flows (RealNVP/NSF) as it leverages +the batched operation of the neural networks. + +Inputs +- `flow::Bijectors.MultivariateTransformed` +- `logp`: function returning log-density of target +- `xs` or `n_samples`: column-wise sample batch or number of samples + +Returns +- Scalar estimate of the ELBO +""" +function elbo_batch(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) + elbos = _batched_elbos(flow, logp, xs) + return mean(elbos) +end function elbo_batch(rng::AbstractRNG, flow::Bijectors.MultivariateTransformed, logp, n_samples) xs = _device_specific_rand(rng, flow.dist, n_samples) - elbos = elbo_batch(flow, logp, xs) + elbos = _batched_elbos(flow, logp, xs) return mean(elbos) end -elbo_batch(flow::Bijectors.UnivariateTransformed, logp, n_samples) = +elbo_batch(flow::Bijectors.TransformedDistribution, logp, n_samples) = elbo_batch(Random.default_rng(), flow, logp, n_samples) diff --git a/test/Project.toml b/test/Project.toml index be5ffa0..6e9455f 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -8,6 +8,7 @@ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MonotonicSplines = "568f7cb4-8305-41bc-b90d-d32b39cc99d1" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" @@ -17,4 +18,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -Mooncake = "0.4.101" +Mooncake = "0.4.142" diff --git a/test/ad.jl b/test/ad.jl index 725b354..b8a4211 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -73,3 +73,98 @@ end end end end + + +@testset "AD for ELBO on realnvp" begin + @testset "$at" for at in [ + ADTypes.AutoZygote(), + ADTypes.AutoForwardDiff(), + ADTypes.AutoReverseDiff(; compile=false), + ADTypes.AutoEnzyme(; + mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + 
function_annotation=Enzyme.Const, + ), + ADTypes.AutoMooncake(; config=Mooncake.Config()), + ] + @testset "$T" for T in [Float32, Float64] + μ = 10 * ones(T, 2) + Σ = Diagonal(4 * ones(T, 2)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # necessary for Zygote/mooncake to differentiate through the flow + # prevent updating params of q0 + @leaf MvNormal + q₀ = MvNormal(zeros(T, 2), ones(T, 2)) + flow = realnvp(q₀, [8, 8], 3; paramtype=T) + + θ, re = Optimisers.destructure(flow) + + # check grad computation for elbo + function loss(θ, rng, logp, sample_per_iter) + return -NormalizingFlows.elbo_batch(rng, re(θ), logp, sample_per_iter) + end + + rng = Random.default_rng() + sample_per_iter = 10 + + prep = NormalizingFlows._prepare_gradient( + loss, at, θ, rng, logp, sample_per_iter + ) + value, grad = NormalizingFlows._value_and_gradient( + loss, prep, at, θ, rng, logp, sample_per_iter + ) + + @test value !== nothing + @test all(grad .!= nothing) + end + end +end + +@testset "AD for ELBO on NSF" begin + @testset "$at" for at in [ + # now NSF only works with Zygote + # TODO: make it work with other ADs (possibly by adapting MonotonicSplines/src/rqspline_pullbacks.jl to rrules?) + ADTypes.AutoZygote(), + # ADTypes.AutoForwardDiff(), + # ADTypes.AutoReverseDiff(; compile=false), + # ADTypes.AutoEnzyme(; + # mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + # function_annotation=Enzyme.Const, + # ), + # ADTypes.AutoMooncake(; config=Mooncake.Config()), + ] + @testset "$T" for T in [Float32, Float64] + μ = 10 * ones(T, 2) + Σ = Diagonal(4 * ones(T, 2)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # necessary for Zygote/mooncake to differentiate through the flow + # prevent updating params of q0 + @leaf MvNormal + q₀ = MvNormal(zeros(T, 2), ones(T, 2)) + flow = nsf(q₀, [8, 8], 10, 5one(T), 3; paramtype=T) + + θ, re = Optimisers.destructure(flow) + + # check grad computation for elbo + function loss(θ, rng, logp, sample_per_iter) + return -NormalizingFlows.elbo_batch(rng, re(θ), logp, sample_per_iter) + end + + rng = Random.default_rng() + sample_per_iter = 10 + + prep = NormalizingFlows._prepare_gradient( + loss, at, θ, rng, logp, sample_per_iter + ) + value, grad = NormalizingFlows._value_and_gradient( + loss, prep, at, θ, rng, logp, sample_per_iter + ) + + @test value !== nothing + @test all(grad .!= nothing) + end + end +end diff --git a/test/flow.jl b/test/flow.jl new file mode 100644 index 0000000..6968558 --- /dev/null +++ b/test/flow.jl @@ -0,0 +1,264 @@ +@testset "RealNVP flow" begin + Random.seed!(123) + + dim = 5 + nlayers = 2 + hdims = [32, 32] + for T in [Float32, Float64] + # Create a RealNVP flow + q₀ = MvNormal(zeros(T, dim), I) + @leaf MvNormal + flow = NormalizingFlows.realnvp(q₀, hdims, nlayers; paramtype=T) + + @testset "Sampling and density estimation for type: $T" begin + ys = rand(flow, 100) + ℓs = logpdf(flow, ys) + + @test size(ys) == (dim, 100) + @test length(ℓs) == 100 + + @test eltype(ys) == T + @test eltype(ℓs) == T + end + + + @testset "Inverse compatibility for type: $T" begin + x = rand(q₀) + y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x) + x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y) + + @test x ≈ x_reconstructed rtol=1e-6 + @test lj_fwd ≈ -lj_bwd rtol=1e-6 + + x_batch = rand(q₀, 10) + y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch) + x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch) + + @test 
x_batch ≈ x_batch_reconstructed rtol=1e-6
+            @test ljs_fwd ≈ -ljs_bwd rtol=1e-6
+        end
+
+
+        @testset "ELBO test for type: $T" begin
+            μ = randn(T, dim)
+            Σ = Diagonal(rand(T, dim) .+ T(1e-3))
+            target = MvNormal(μ, Σ)
+            logp(z) = logpdf(target, z)
+
+            # Compute ELBO
+            batchsize = 64
+            elbo_value = elbo(Random.default_rng(), flow, logp, batchsize)
+            elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize)
+
+            # test when batchsize == 1
+            batchsize_single = 1
+            elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single)
+
+            # test elbo_value is not NaN and not Inf
+            @test isfinite(elbo_value)
+            @test isfinite(elbo_batch_value)
+            @test isfinite(elbo_value_single)
+        end
+    end
+end
+
+@testset "Neural Spline flow" begin
+    Random.seed!(123)
+
+    dim = 5
+    nlayers = 2
+    K = 10
+    hdims = [32, 32]
+    for T in [Float32, Float64]
+        # Create an NSF flow
+        q₀ = MvNormal(zeros(T, dim), I)
+        @leaf MvNormal
+
+        B = 5one(T)
+        flow = NormalizingFlows.nsf(q₀, hdims, K, B, nlayers; paramtype=T)
+
+        @testset "Sampling and density estimation for type: $T" begin
+            ys = rand(flow, 100)
+            ℓs = logpdf(flow, ys)
+
+            @test size(ys) == (dim, 100)
+            @test length(ℓs) == 100
+
+            @test eltype(ys) == T
+            @test eltype(ℓs) == T
+        end
+
+
+        @testset "Inverse compatibility for type: $T" begin
+            x = rand(q₀)
+            y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x)
+            x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y)
+
+            @test x ≈ x_reconstructed rtol=1e-4
+            @test lj_fwd ≈ -lj_bwd rtol=1e-4
+
+            x_batch = rand(q₀, 10)
+            y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch)
+            x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch)
+
+            @test x_batch ≈ x_batch_reconstructed rtol=1e-4
+            @test ljs_fwd ≈ -ljs_bwd rtol=1e-4
+        end
+
+
+        @testset "ELBO test for type: $T" begin
+            μ = randn(T, dim)
+            Σ = Diagonal(rand(T, dim) .+ T(1e-3))
+            target = MvNormal(μ, Σ)
+            logp(z) = logpdf(target, z)
+
+            # Compute ELBO
+            batchsize = 64
+            elbo_value = elbo(Random.default_rng(), flow, logp, batchsize)
+            elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize)
+
+            # test when batchsize == 1
+            batchsize_single = 1
+            elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single)
+
+            # test elbo_value is not NaN and not Inf
+            @test isfinite(elbo_value)
+            @test isfinite(elbo_batch_value)
+            @test isfinite(elbo_value_single)
+        end
+    end
+end
+
+
+
+@testset "Planar flow" begin
+    Random.seed!(123)
+
+    dim = 5
+    nlayers = 10
+    for T in [Float32, Float64]
+        # Create a planar flow
+        q₀ = MvNormal(zeros(T, dim), I)
+        @leaf MvNormal
+
+        flow = NormalizingFlows.planarflow(q₀, nlayers; paramtype=T)
+
+        @testset "Sampling and density estimation for type: $T" begin
+            ys = rand(flow, 100)
+            ℓs = logpdf(flow, ys)
+
+            @test size(ys) == (dim, 100)
+            @test length(ℓs) == 100
+
+            @test eltype(ys) == T
+            @test eltype(ℓs) == T
+        end
+
+
+        @testset "Inverse compatibility for type: $T" begin
+            x = rand(q₀)
+            y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x)
+            x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y)
+
+            @test x ≈ x_reconstructed rtol=1e-4
+            @test lj_fwd ≈ -lj_bwd rtol=1e-4
+
+            x_batch = rand(q₀, 10)
+            y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch)
+            x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch)
+
+            @test x_batch ≈ x_batch_reconstructed rtol=1e-4
+            @test ljs_fwd ≈ -ljs_bwd rtol=1e-4
+        end
+
+
+        @testset "ELBO test for type: $T" begin
+            μ = randn(T, dim)
+            Σ = Diagonal(rand(T, dim) .+ T(1e-3))
+            target = MvNormal(μ, Σ)
+            logp(z) = logpdf(target, z)
+
+            # Compute ELBO
+            batchsize = 64
+            elbo_value = elbo(Random.default_rng(), flow, logp, batchsize)
+            elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize)
+
+            # test when batchsize == 1
+            batchsize_single = 1
+            elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single)
+
+            # test elbo_value is not NaN and not Inf
+            @test isfinite(elbo_value)
+            @test isfinite(elbo_batch_value)
+            @test isfinite(elbo_value_single)
+        end
+    end
+end
+
+
+
+@testset "Radial flow" begin
+    Random.seed!(123)
+
+    dim = 5
+    nlayers = 10
+    for T in [Float32, Float64]
+        # Create a radial flow
+        q₀ = MvNormal(zeros(T, dim), I)
+        @leaf MvNormal
+
+        flow = NormalizingFlows.radialflow(q₀, nlayers; paramtype=T)
+
+        @testset "Sampling and density estimation for type: $T" begin
+            ys = rand(flow, 100)
+            ℓs = logpdf(flow, ys)
+
+            @test size(ys) == (dim, 100)
+            @test length(ℓs) == 100
+
+            @test eltype(ys) == T
+            @test eltype(ℓs) == T
+        end
+
+
+        @testset "Inverse compatibility for type: $T" begin
+            x = rand(q₀)
+            y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x)
+            x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y)
+
+            @test x ≈ x_reconstructed rtol=1e-4
+            @test lj_fwd ≈ -lj_bwd rtol=1e-4
+
+            x_batch = rand(q₀, 10)
+            y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch)
+            x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch)
+
+            @test x_batch ≈ x_batch_reconstructed rtol=1e-4
+            @test ljs_fwd ≈ -ljs_bwd rtol=1e-4
+        end
+
+
+        @testset "ELBO test for type: $T" begin
+            μ = randn(T, dim)
+            Σ = Diagonal(rand(T, dim) .+ T(1e-3))
+            target = MvNormal(μ, Σ)
+            logp(z) = logpdf(target, z)
+
+            # Compute ELBO
+            batchsize = 64
+            elbo_value = elbo(Random.default_rng(), flow, logp, batchsize)
+            elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize)
+
+            # test when batchsize == 1
+            batchsize_single = 1
+            elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single)
+
+            # test elbo_value is not NaN and not Inf
+            @test isfinite(elbo_value)
+            @test isfinite(elbo_batch_value)
+            @test isfinite(elbo_value_single)
+        end
+    end
+end
+
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 33a9808..2808fb1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,6 +11,7 @@ import DifferentiationInterface as DI
 using Test
 
-include("ad.jl")
 include("objectives.jl")
 include("interface.jl")
+include("flow.jl")
+include("ad.jl")
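
A minimal usage sketch of the constructors and objectives added above (the dimensions, layer
sizes, and batch size are illustrative; it assumes Distributions, LinearAlgebra, and Functors
are loaded alongside NormalizingFlows):

    using Distributions, LinearAlgebra, Random
    using Functors: @leaf
    using NormalizingFlows

    @leaf MvNormal                          # keep the base distribution's parameters fixed
    q0 = MvNormal(zeros(Float32, 2), I)

    flow = realnvp(q0; paramtype=Float32)   # default: 10 layers, [32, 32] conditioners
    # flow = nsf(q0, [32, 32], 10, 5f0, 3; paramtype=Float32)  # neural spline flow variant
    # flow = planarflow(q0, 10; paramtype=Float32)             # planar flow variant

    xs = rand(flow, 128)                    # column-wise samples
    lps = logpdf(flow, xs)

    target = MvNormal(ones(Float32, 2), Diagonal(ones(Float32, 2)))
    logp(z) = logpdf(target, z)
    elbo_batch(Random.default_rng(), flow, logp, 64)  # batched Monte Carlo ELBO estimate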