diff --git a/.github/workflows/Docs.yml b/.github/workflows/Docs.yml index 64d5e3d..7a98981 100644 --- a/.github/workflows/Docs.yml +++ b/.github/workflows/Docs.yml @@ -15,7 +15,7 @@ concurrency: permissions: contents: write - pull-requests: read + pull-requests: write jobs: docs: @@ -25,9 +25,10 @@ jobs: - name: Build and deploy Documenter.jl docs uses: TuringLang/actions/DocsDocumenter@main - - run: | - julia --project=docs -e ' - using Documenter: DocMeta, doctest - using NormalizingFlows - DocMeta.setdocmeta!(NormalizingFlows, :DocTestSetup, :(using NormalizingFlows); recursive=true) - doctest(NormalizingFlows)' + - name: Run doctests + shell: julia --project=docs --color=yes {0} + run: | + using Documenter: DocMeta, doctest + using NormalizingFlows + DocMeta.setdocmeta!(NormalizingFlows, :DocTestSetup, :(using NormalizingFlows); recursive=true) + doctest(NormalizingFlows) diff --git a/Project.toml b/Project.toml index f03ebb6..4c49b8f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "NormalizingFlows" uuid = "50e4474d-9f12-44b7-af7a-91ab30ff6256" -version = "0.2.1" +version = "0.2.2" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" @@ -8,7 +8,10 @@ Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MonotonicSplines = "568f7cb4-8305-41bc-b90d-d32b39cc99d1" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -27,7 +30,10 @@ CUDA = "5" DifferentiationInterface = "0.6, 0.7" Distributions = "0.25" DocStringExtensions = "0.9" +Flux = "0.16" +Functors = "0.5.2" +MonotonicSplines = "0.3.3" Optimisers = "0.2.16, 0.3, 0.4" ProgressMeter = "1.0.0" StatsBase = "0.33, 0.34" -julia = "1.10" +julia = "1.10.8" diff --git a/README.md b/README.md index d4cb2c0..5bf5651 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Build Status](https://github.com/TuringLang/NormalizingFlows.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/TuringLang/NormalizingFlows.jl/actions/workflows/CI.yml?query=branch%3Amain) -**Last updated: 2025-Mar-04** +**Last updated: 2025-Aug-08** A normalizing flow library for Julia. @@ -17,6 +17,8 @@ without being tied to specific probabilistic programming frameworks or applicati See the [documentation](https://turinglang.org/NormalizingFlows.jl/dev/) for more. +We also provide several demos and examples in [example](https://github.com/TuringLang/NormalizingFlows.jl/tree/main/example). + ## Installation To install the package, run the following command in the Julia REPL: ```julia @@ -90,3 +92,4 @@ where one wants to learn the underlying distribution of some data. 
- [Flux.jl](https://fluxml.ai/Flux.jl/stable/) - [Optimisers.jl](https://github.com/FluxML/Optimisers.jl) - [AdvancedVI.jl](https://github.com/TuringLang/AdvancedVI.jl) +- [MonotonicSplines.jl](https://github.com/bat/MonotonicSplines.jl) diff --git a/docs/Project.toml b/docs/Project.toml index 27726a5..6d3e869 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -5,5 +5,6 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/docs/make.jl b/docs/make.jl index 304e983..e402e10 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,19 +1,27 @@ using NormalizingFlows using Documenter +using Random +using Distributions + DocMeta.setdocmeta!( NormalizingFlows, :DocTestSetup, :(using NormalizingFlows); recursive=true ) makedocs(; modules=[NormalizingFlows], - repo="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", sitename="NormalizingFlows.jl", - format=Documenter.HTML(), + repo="https://github.com/TuringLang/NormalizingFlows.jl/blob/{commit}{path}#{line}", + format=Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"), pages=[ "Home" => "index.md", + "General usage" => "usage.md", "API" => "api.md", - "Example" => "example.md", + "Example" => [ + "Planar Flow" => "PlanarFlow.md", + "RealNVP" => "RealNVP.md", + "Neural Spline Flow" => "NSF.md", + ], "Customize your own flow layer" => "customized_layer.md", ], checkdocs=:exports, diff --git a/docs/src/NSF.md b/docs/src/NSF.md new file mode 100644 index 0000000..df03a41 --- /dev/null +++ b/docs/src/NSF.md @@ -0,0 +1,47 @@ +# Demo of NSF on 2D Banana Distribution + +```julia +using Random, Distributions, LinearAlgebra +using Functors +using Optimisers, ADTypes +using Zygote +using NormalizingFlows + + +target = Banana(2, one(T), 100one(T)) +logp = Base.Fix1(logpdf, target) + +###################################### +# learn the target using Neural Spline Flow +###################################### +@leaf MvNormal +q0 = MvNormal(zeros(T, 2), I) + + +flow = nsf(q0; paramtype=T) +flow_untrained = deepcopy(flow) +###################################### +# start training +###################################### +sample_per_iter = 64 + +# callback function to log training progress +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) +# nsf only supports AutoZygote +adtype = ADTypes.AutoZygote() +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 +flow_trained, stats, _ = train_flow( + elbo_batch, + flow, + logp, + sample_per_iter; + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(1e-4), + ADbackend=adtype, + show_progress=true, + callback=cb, + hasconverged=checkconv, +) +θ, re = Optimisers.destructure(flow_trained) +losses = map(x -> x.loss, stats) +``` \ No newline at end of file diff --git a/docs/src/PlanarFlow.md b/docs/src/PlanarFlow.md new file mode 100644 index 0000000..696f433 --- /dev/null +++ b/docs/src/PlanarFlow.md @@ -0,0 +1,135 @@ +# Planar Flow on a 2D Banana Distribution + +This example demonstrates learning a synthetic 2D banana distribution with a planar normalizing flow [^RM2015] by maximizing the Evidence Lower BOund (ELBO). 
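Concretely, this mirrors the objective spelled out in the API documentation: writing $Z_0 \sim q_0$ for a draw from the base distribution, the quantity being maximized is

```math
\mathrm{ELBO}(\theta) = \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ T_1(Z_0)\right) - \log q_0(Z_0) + \sum_{n=1}^N \log J_n\left(T_n \circ \cdots \circ T_1(Z_0)\right) \right],
```

where $J_n$ denotes the Jacobian-determinant term contributed by the $n$-th layer.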
+ +The two required ingredients are: + +- A log-density function `logp` for the target distribution. +- A parametrised invertible transformation (the planar flow) applied to a simple base distribution. + +## Target Distribution + +The banana target used here is defined in `example/targets/banana.jl` (see source for details): + +```julia +using Random, Distributions +Random.seed!(123) + +target = Banana(2, 1.0, 10.0) # (dimension, nonlinearity, scale) +logp = Base.Fix1(logpdf, target) +``` + +You can visualise its contour and samples (figure shipped as `banana.png`). + +![Banana](banana.png) + +## Planar Flow + +A planar flow of length N applies a sequence of planar layers to a base distribution q₀: + +```math +T_{n,\theta_n}(x) = x + u_n \tanh(w_n^T x + b_n), \qquad n = 1,\ldots,N. +``` + +Parameters θₙ = (uₙ, wₙ, bₙ) are learned. `Bijectors.jl` provides `PlanarLayer`. + +```julia +using Bijectors +using Functors # for @leaf + +function create_planar_flow(n_layers::Int, q₀) + d = length(q₀) + Ls = [PlanarLayer(d) for _ in 1:n_layers] + ts = reduce(∘, Ls) # alternatively: FunctionChains.fchain(Ls) + return transformed(q₀, ts) +end + +@leaf MvNormal # prevent updating base distribution parameters +q₀ = MvNormal(zeros(2), ones(2)) +flow = create_planar_flow(10, q₀) +flow_untrained = deepcopy(flow) # keep copy for comparison +``` + +If you build *many* layers (e.g. > ~30) you may reduce compilation time by using `FunctionChains.jl`: + +```julia +# uncomment the following lines to use FunctionChains +# using FunctionChains +# ts = fchain([PlanarLayer(d) for _ in 1:n_layers]) +``` +See [this comment](https://github.com/TuringLang/NormalizingFlows.jl/blob/8f4371d48228adf368d851e221af076ff929f1cf/src/NormalizingFlows.jl#L52) +for how the compilation time might be a concern. + +## Training the Flow + +We maximize the ELBO (here using the minibatch estimator `elbo_batch`) with the generic `train_flow` interface. + +```julia +using NormalizingFlows +using ADTypes, Optimisers +using Mooncake + +sample_per_iter = 32 +adtype = ADTypes.AutoMooncake(; config=Mooncake.Config()) # try AutoZygote() / AutoForwardDiff() / etc. +# optional: callback function to track the batch size per iteration and the AD backend used +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter, ad=adtype) +# optional: defined stopping criteria when the gradient norm is less than 1e-3 +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < 1e-3 + +flow_trained, stats, _ = train_flow( + elbo_batch, + flow, + logp, + sample_per_iter; + max_iters = 20_000, + optimiser = Optimisers.Adam(1e-2), + ADbackend = adtype, + callback = cb, + hasconverged = checkconv, + show_progress = false, +) + +losses = map(x -> x.loss, stats) +``` + +Plot the losses (negative ELBO): + +```julia +using Plots +plot(losses; xlabel = "iteration", ylabel = "negative ELBO", label = "", lw = 2) +``` + +![elbo](elbo.png) + +## Evaluating the Trained Flow + +The trained flow is a `Bijectors.TransformedDistribution`, so we can call `rand` to draw iid samples and call `logpdf` to evaluate the log-density function of the flow. +See [documentation of `Bijectors.jl`](https://turinglang.org/Bijectors.jl/dev/distributions/) for details. 
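For instance, the log-density of the trained flow can be evaluated at an arbitrary point (a minimal sketch):

```julia
logpdf(flow_trained, zeros(2))   # log-density of the trained flow at the origin
```

Drawing iid samples works analogously via `rand`: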
+```julia +n_samples = 1_000 +samples_trained = rand(flow_trained, n_samples) +samples_untrained = rand(flow_untrained, n_samples) +samples_true = rand(target, n_samples) +``` + +Simple visual comparison: + +```julia +using Plots +scatter(samples_true[1, :], samples_true[2, :]; label="Target", ms=2, alpha=0.5) +scatter!(samples_untrained[1, :], samples_untrained[2, :]; label="Untrained", ms=2, alpha=0.5) +scatter!(samples_trained[1, :], samples_trained[2, :]; label="Trained", ms=2, alpha=0.5) +plot!(title = "Planar Flow: Before vs After Training", xlabel = "x₁", ylabel = "x₂", legend = :topleft) +``` + +![compare](comparison.png) + +## Notes + +- Use `elbo` instead of `elbo_batch` for a single-sample estimator. +- Switch AD backends by changing `adtype` (see `ADTypes.jl`). +- Marking the base distribution with `@leaf` prevents its parameters from being updated during training. + +## Reference + +[^RM2015]: Rezende, D. & Mohamed, S. (2015). Variational Inference with Normalizing Flows. ICML. diff --git a/docs/src/RealNVP.md b/docs/src/RealNVP.md new file mode 100644 index 0000000..53fd2c5 --- /dev/null +++ b/docs/src/RealNVP.md @@ -0,0 +1,54 @@ +# Demo of RealNVP on 2D Banana Distribution + +```julia +using Random, Distributions, LinearAlgebra +using Functors +using Optimisers, ADTypes +using Mooncake +using NormalizingFlows + + +target = Banana(2, one(T), 100one(T)) +logp = Base.Fix1(logpdf, target) + +###################################### +# set up the RealNVP +###################################### +@leaf MvNormal +q0 = MvNormal(zeros(T, 2), I) + +d = 2 +hdims = [16, 16] +nlayers = 3 + +# use NormalizingFlows.realnvp to create a RealNVP flow +flow = realnvp(q0, hdims, nlayers; paramtype=T) +flow_untrained = deepcopy(flow) + + +###################################### +# start training +###################################### +sample_per_iter = 16 + +# callback function to log training progress +cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) +adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) + +checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 +flow_trained, stats, _ = train_flow( + rng, + elbo, # using elbo_batch instead of elbo achieves 4-5 times speedup + flow, + logp, + sample_per_iter; + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(5e-4), + ADbackend=adtype, + show_progress=true, + callback=cb, + hasconverged=checkconv, +) +θ, re = Optimisers.destructure(flow_trained) +losses = map(x -> x.loss, stats) +``` \ No newline at end of file diff --git a/docs/src/api.md b/docs/src/api.md index 13fe0c2..e6ec251 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -1,62 +1,27 @@ -## API +# API ```@index ``` +## Variational Objectives -## Main Function +We provide ELBO (reverse KL) and expected log-likelihood (forward KL). You can also +supply your own objective with the signature `vo(rng, flow, args...)`. -```@docs -NormalizingFlows.train_flow -``` +### Evidence Lower Bound (ELBO) -The flow object can be constructed by `transformed` function in `Bijectors.jl` package. 
-For example of Gaussian VI, we can construct the flow as follows: -```@julia -using Distributions, Bijectors -T= Float32 -@leaf MvNormal # to prevent params in q₀ from being optimized -q₀ = MvNormal(zeros(T, 2), ones(T, 2)) -flow = Bijectors.transformed(q₀, Bijectors.Shift(zeros(T,2)) ∘ Bijectors.Scale(ones(T, 2))) -``` -To train the Gaussian VI targeting at distirbution $p$ via ELBO maiximization, we can run -```@julia -using NormalizingFlows - -sample_per_iter = 10 -flow_trained, stats, _ = train_flow( - elbo, - flow, - logp, - sample_per_iter; - max_iters=2_000, - optimiser=Optimisers.ADAM(0.01 * one(T)), -) -``` -## Variational Objectives -We have implemented two variational objectives, namely, ELBO and the log-likelihood objective. -Users can also define their own objective functions, and pass it to the [`train_flow`](@ref) function. -`train_flow` will optimize the flow parameters by maximizing `vo`. -The objective function should take the following general form: -```julia -vo(rng, flow, args...) -``` -where `rng` is the random number generator, `flow` is the flow object, and `args...` are the -additional arguments that users can pass to the objective function. +By maximizing the ELBO, it is equivalent to minimizing the reverse KL divergence between $q_\theta$ and $p$: -#### Evidence Lower Bound (ELBO) -By maximizing the ELBO, it is equivalent to minimizing the reverse KL divergence between $q_\theta$ and $p$, i.e., -```math +```math \begin{aligned} &\min _{\theta} \mathbb{E}_{q_{\theta}}\left[\log q_{\theta}(Z)-\log p(Z)\right] \quad \text{(Reverse KL)}\\ & = \max _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ -T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(F_n \circ \cdots \circ -F_1(X)\right)\right] \quad \text{(ELBO)} +T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(T_n \circ \cdots \circ +T_1(X)\right)\right] \quad \text{(ELBO)} \end{aligned} ``` -Reverse KL minimization is typically used for **Bayesian computation**, -where one only has access to the log-(unnormalized)density of the target distribution $p$ (e.g., a Bayesian posterior distribution), -and hope to generate approximate samples from it. + +Reverse KL minimization is typically used for Bayesian computation when only `logp` is available. ```@docs NormalizingFlows.elbo @@ -66,26 +31,67 @@ NormalizingFlows.elbo NormalizingFlows.elbo_batch ``` -#### Log-likelihood +### Log-likelihood + +By maximizing the log-likelihood, it is equivalent to minimizing the forward KL divergence between $q_\theta$ and $p$: -By maximizing the log-likelihood, it is equivalent to minimizing the forward KL divergence between $q_\theta$ and $p$, i.e., -```math +```math \begin{aligned} & \min_{\theta} \mathbb{E}_{p}\left[\log q_{\theta}(Z)-\log p(Z)\right] \quad \text{(Forward KL)} \\ & = \max_{\theta} \mathbb{E}_{p}\left[\log q_{\theta}(Z)\right] \quad \text{(Expected log-likelihood)} \end{aligned} ``` -Forward KL minimization is typically used for **generative modeling**, -where one is given a set of samples from the target distribution $p$ (e.g., images) -and aims to learn the density or a generative process that outputs high quality samples. + +Forward KL minimization is typically used for generative modeling when samples from `p` are given. 
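For example, a maximum-likelihood fit could look like the following minimal sketch. Here the data matrix `xs` (one observation per column) and the flow `flow` are assumed to be defined already, and the exact arguments accepted by `loglikelihood` are documented below:

```julia
using NormalizingFlows, Optimisers, ADTypes, Mooncake

# xs::AbstractMatrix: observed data with one column per sample (assumed given)
flow_trained, stats, _ = train_flow(
    loglikelihood,    # forward KL / expected log-likelihood objective
    flow,
    xs;               # additional arguments are forwarded to the objective
    max_iters=5_000,
    optimiser=Optimisers.Adam(1e-3),
    ADbackend=ADTypes.AutoMooncake(; config=Mooncake.Config()),
)
```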
```@docs NormalizingFlows.loglikelihood ``` - ## Training Loop ```@docs NormalizingFlows.optimize ``` + + +## Available Flows + +`NormalizingFlows.jl` provides two commonly used normalizing flows---`RealNVP` and +`Neural Spline Flow (NSF)`---and two simple flows---`Planar Flow` and `Radial Flow`. + +### RealNVP (Affine Coupling Flow) + +These helpers construct commonly used coupling-based flows with sensible defaults. + +```@docs +NormalizingFlows.realnvp +NormalizingFlows.RealNVP_layer +NormalizingFlows.AffineCoupling +``` + +### Neural Spline Flow (NSF) + +```@docs +NormalizingFlows.nsf +NormalizingFlows.NSF_layer +NormalizingFlows.NeuralSplineCoupling +``` + +#### Planar and Radial Flows + +```@docs +NormalizingFlows.planarflow +NormalizingFlows.radialflow +``` + +## Utility Functions + +```@docs +NormalizingFlows.create_flow +``` + +```@docs +NormalizingFlows.fnn +``` + diff --git a/docs/src/customized_layer.md b/docs/src/customized_layer.md index af9062c..9fb1d1d 100644 --- a/docs/src/customized_layer.md +++ b/docs/src/customized_layer.md @@ -12,7 +12,11 @@ for more details. In this tutorial, we demonstrate how to define a customized normalizing flow -layer -- an `Affine Coupling Layer` (Dinh *et al.*, 2016) -- using `Bijectors.jl` and `Flux.jl`. +layer -- an `Affine Coupling Layer` -- using `Bijectors.jl` and `Flux.jl`, +which is the building block of the RealNVP flow [^LJS2017]. +It's worth mentioning that the [`realnvp`](@ref) implemented in `NormalizingFlows.jl` +is slightly different from this tutorial with some optimization for the training stability +and performance. ## Affine Coupling Flow @@ -176,5 +180,4 @@ logpdf(flow, x[:,1]) ## Reference -Dinh, L., Sohl-Dickstein, J. and Bengio, S., 2016. *Density estimation using real nvp.* -arXiv:1605.08803. \ No newline at end of file +[^LJS2017]: Dinh, L., Sohl-Dickstein, J. and Bengio, S., 2017. Density estimation using real nvp. in *ICLR* \ No newline at end of file diff --git a/docs/src/example.md b/docs/src/example.md deleted file mode 100644 index 01a9a67..0000000 --- a/docs/src/example.md +++ /dev/null @@ -1,122 +0,0 @@ -## Example: Using Planar Flow - -Here we provide a minimal demonstration of learning a synthetic 2d banana distribution -using *planar flows* (Renzende *et al.* 2015) by maximizing the [Evidence Lower Bound (ELBO)](@ref). -To complete this task, the two key inputs are: -- the log-density function of the target distribution, -- the planar flow. - -#### The Target Distribution - -The `Banana` object is defined in `example/targets/banana.jl`, see the [source code](https://github.com/zuhengxu/NormalizingFlows.jl/blob/main/example/targets/banana.jl) for details. -```julia -p = Banana(2, 1.0f-1, 100.0f0) -logp = Base.Fix1(logpdf, p) -``` -Visualize the contour of the log-density and the sample scatters of the target distribution: -![Banana](banana.png) - - - - -#### The Planar Flow - -The planar flow is defined by repeated applying a sequence of invertible -transformations to a base distribution $q_0$. The building blocks for a planar flow -of length $N$ are the following invertible transformations, called *planar layers*: -```math -\text{planar layers}: -T_{n, \theta_n}(x)=x+u_n \cdot \tanh \left(w_n^T x+b_n\right), \quad n=1, \ldots, N, -``` -where $\theta_n = (u_n, w_n, b_n), n=1, \dots, N$ are the parameters to be learned. -Thankfully, [`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) -provides a nice framework to define a normalizing flow. 
-Here we used the `PlanarLayer()` from `Bijectors.jl` to construct a -20-layer planar flow, of which the base distribution is a 2d standard Gaussian distribution. - -```julia -using Bijectors, FunctionChains -using Functors - -function create_planar_flow(n_layers::Int, q₀) - d = length(q₀) - Ls = [f32(PlanarLayer(d)) for _ in 1:n_layers] - ts = fchain(Ls) - return transformed(q₀, ts) -end - -# create a 20-layer planar flow -@leaf MvNormal # to prevent params in q₀ from being optimized -q₀ = MvNormal(zeros(Float32, 2), I) -flow = create_planar_flow(20, q₀) -flow_untrained = deepcopy(flow) # keep a copy of the untrained flow for comparison -``` -*Notice that here the flow layers are chained together using `fchain` function from [`FunctionChains.jl`](https://github.com/oschulz/FunctionChains.jl). -Alternatively, one can do* -```julia -ts = reduce(∘, [f32(PlanarLayer(d)) for i in 1:20]) -``` -*However, we recommend using `fchain` to reduce the compilation time when the number of layers is large. -See [this comment](https://github.com/TuringLang/NormalizingFlows.jl/blob/8f4371d48228adf368d851e221af076ff929f1cf/src/NormalizingFlows.jl#L52) -for how the compilation time might be a concern.* - - -#### Flow Training -Then we can train the flow by maximizing the ELBO using the [`train_flow`](@ref) function as follows: -```julia -using NormalizingFlows -using ADTypes -using Optimisers - -sample_per_iter = 10 -# callback function to track the number of samples used per iteration -cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,) -# defined stopping criteria when the gradient norm is less than 1e-3 -checkconv(iter, stat, re, θ, st) = stat.gradient_norm < 1e-3 -flow_trained, stats, _ = train_flow( - elbo, - flow, - logp, - sample_per_iter; - max_iters=200_00, - optimiser=Optimisers.ADAM(), - callback=cb, - hasconverged=checkconv, - ADbackend=AutoZygote(), # using Zygote as the AD backend -) -``` - -Examine the loss values during training: -```julia -using Plots - -losses = map(x -> x.loss, stats) -plot(losses; xlabel = "#iteration", ylabel= "negative ELBO", label="", linewidth=2) -``` -![elbo](elbo.png) - -## Evaluating Trained Flow -Finally, we can evaluate the trained flow by sampling from it and compare it with the target distribution. -Since the flow is defined as a `Bijectors.TransformedDistribution`, one can -easily sample from it using `rand` function, or examine the density using `logpdf` function. -See [documentation of `Bijectors.jl`](https://turinglang.org/Bijectors.jl/dev/distributions/) for details. -```julia -using Random, Distributions - -nsample = 1000 -samples_trained = rand(flow_trained, n_samples) # 1000 iid samples from the trained flow -samples_untrained = rand(flow_untrained, n_samples) # 1000 iid samples from the untrained flow -samples_true = rand(p, n_samples) # 1000 iid samples from the target - -# plot -scatter(samples_true[1, :], samples_true[2, :]; label="True Distribution", color=:blue, markersize=2, alpha=0.5) -scatter!(samples_untrained[1, :], samples_untrained[2, :]; label="Untrained Flow", color=:red, markersize=2, alpha=0.5) -scatter!(samples_trained[1, :], samples_trained[2, :]; label="Trained Flow", color=:green, markersize=2, alpha=0.5) -plot!(title = "Comparison of Trained and Untrained Flow", xlabel = "X", ylabel= "Y", legend=:topleft) -``` -![compare](comparison.png) - - -## Reference - -- Rezende, D. and Mohamed, S., 2015. *Variational inference with normalizing flows*. 
International Conference on Machine Learning diff --git a/docs/src/index.md b/docs/src/index.md index f685b8b..d53c2b0 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -4,7 +4,7 @@ CurrentModule = NormalizingFlows # NormalizingFlows.jl -Documentation for [NormalizingFlows](https://github.com/TuringLang/NormalizingFlows.jl). +Documentation for [NormalizingFlows.jl](https://github.com/TuringLang/NormalizingFlows.jl). The purpose of this package is to provide a simple and flexible interface for @@ -15,13 +15,11 @@ construct (e.g., define customized flow layers) and combine various components for variational approximation of general target distributions, *without being tied to specific probabilistic programming frameworks or applications*. -See the [documentation](https://turinglang.org/NormalizingFlows.jl/dev/) for more. - ## Installation To install the package, run the following command in the Julia REPL: ``` ] # enter Pkg mode -(@v1.11) pkg> add git@github.com:TuringLang/NormalizingFlows.jl.git +(@v1.11) pkg> add NormalizingFlows ``` Then simply run the following command to use the package: ```julia @@ -67,7 +65,7 @@ optimization problems: \text{Reverse KL:}\quad &\argmin _{\theta} \mathbb{E}_{q_{\theta}}\left[\log q_{\theta}(Z)-\log p(Z)\right] \\ &= \argmin _{\theta} \mathbb{E}_{q_0}\left[\log \frac{q_\theta(T_N\circ \cdots \circ T_1(Z_0))}{p(T_N\circ \cdots \circ T_1(Z_0))}\right] \\ -&= \argmax _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(F_n \circ \cdots \circ F_1(X)\right)\right] +&= \argmax _{\theta} \mathbb{E}_{q_0}\left[ \log p\left(T_N \circ \cdots \circ T_1(Z_0)\right)-\log q_0(X)+\sum_{n=1}^N \log J_n\left(T_n \circ \cdots \circ T_1(X)\right)\right] \end{aligned} ``` and @@ -82,4 +80,4 @@ Both problems can be solved via standard stochastic optimization algorithms, such as stochastic gradient descent (SGD) and its variants. - +For a detailed introduction of normalizing flows, please refer to diff --git a/docs/src/usage.md b/docs/src/usage.md new file mode 100644 index 0000000..cefb816 --- /dev/null +++ b/docs/src/usage.md @@ -0,0 +1,40 @@ +## General usage + +`train_flow` is the main function to train a normalizing flow. +The users mostly need to specify a normalizing flow `flow`, +the variational objective `vo` and its corresponding arguments `args...`. + +```@docs +NormalizingFlows.train_flow +``` + +The flow object can be constructed by `transformed` function in `Bijectors.jl`. 
+For example, for mean-field Gaussian VI, we can construct the flow family as follows: + +```julia +using Distributions, Bijectors +T = Float32 +@leaf MvNormal # to prevent params in q₀ from being optimized +q₀ = MvNormal(zeros(T, 2), ones(T, 2)) +# the flow family is defined by a shift and a scale +flow = Bijectors.transformed(q₀, Bijectors.Shift(zeros(T,2)) ∘ Bijectors.Scale(ones(T, 2))) +``` + +To train the Gaussian VI targeting distribution `p` via ELBO maximization, run: + +```julia +using NormalizingFlows, Optimisers +using ADTypes, Mooncake + +sample_per_iter = 10 +flow_trained, stats, _ = train_flow( + elbo, + flow, + logp, + sample_per_iter; + max_iters=5_000, + optimiser=Optimisers.Adam(one(T)/100), + ADbackend=ADTypes.AutoMooncake(; config=Mooncake.Config()), + show_progress=true, +) +``` \ No newline at end of file diff --git a/example/Project.toml b/example/Project.toml index 0b9b021..79c847b 100644 --- a/example/Project.toml +++ b/example/Project.toml @@ -3,9 +3,10 @@ ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -16,7 +17,12 @@ Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + +[compat] +Mooncake = "0.4.142" [extras] CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" diff --git a/example/demo_RealNVP.jl b/example/demo_RealNVP.jl index 516e4c9..f3d705e 100644 --- a/example/demo_RealNVP.jl +++ b/example/demo_RealNVP.jl @@ -1,7 +1,3 @@ -using Flux -using Bijectors -using Bijectors: partition, combine, PartitionMask - using Random, Distributions, LinearAlgebra using Functors using Optimisers, ADTypes @@ -11,114 +7,6 @@ using NormalizingFlows include("SyntheticTargets.jl") include("utils.jl") -################################## -# define affine coupling layer using Bijectors.jl interface -################################# -struct AffineCoupling <: Bijectors.Bijector - dim::Int - mask::Bijectors.PartitionMask - s::Flux.Chain - t::Flux.Chain -end - -# let params track field s and t -@functor AffineCoupling (s, t) - -function AffineCoupling( - dim::Int, # dimension of input - hdims::Int, # dimension of hidden units for s and t - mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on -) - cdims = length(mask_idx) # dimension of parts used to construct coupling law - s = mlp3(cdims, hdims, cdims) - t = mlp3(cdims, hdims, cdims) - mask = PartitionMask(dim, mask_idx) - return AffineCoupling(dim, mask, s, t) -end - -function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat) - # partition vector using 'af.mask::PartitionMask` - x₁, x₂, x₃ = partition(af.mask, x) - y₁ = x₁ .* af.s(x₂) .+ af.t(x₂) - return combine(af.mask, y₁, x₂, x₃) 
-end - -function (af::AffineCoupling)(x::AbstractArray) - return transform(af, x) -end - -function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(af.mask, x) - y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) - logjac = sum(log ∘ abs, af.s(x_2)) # this is a scalar - return combine(af.mask, y_1, x_2, x_3), logjac -end - -function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractMatrix) - x_1, x_2, x_3 = Bijectors.partition(af.mask, x) - y_1 = af.s(x_2) .* x_1 .+ af.t(x_2) - logjac = sum(log ∘ abs, af.s(x_2); dims = 1) # 1 × size(x, 2) - return combine(af.mask, y_1, x_2, x_3), vec(logjac) -end - - -function Bijectors.with_logabsdet_jacobian( - iaf::Inverse{<:AffineCoupling}, y::AbstractVector -) - af = iaf.orig - # partition vector using `af.mask::PartitionMask` - y_1, y_2, y_3 = partition(af.mask, y) - # inverse transformation - x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) - logjac = -sum(log ∘ abs, af.s(y_2)) - return combine(af.mask, x_1, y_2, y_3), logjac -end - -function Bijectors.with_logabsdet_jacobian( - iaf::Inverse{<:AffineCoupling}, y::AbstractMatrix -) - af = iaf.orig - # partition vector using `af.mask::PartitionMask` - y_1, y_2, y_3 = partition(af.mask, y) - # inverse transformation - x_1 = (y_1 .- af.t(y_2)) ./ af.s(y_2) - logjac = -sum(log ∘ abs, af.s(y_2); dims = 1) - return combine(af.mask, x_1, y_2, y_3), vec(logjac) -end - -################### -# an equivalent definition of AffineCoupling using Bijectors.Coupling -# (see https://github.com/TuringLang/Bijectors.jl/blob/74d52d4eda72a6149b1a89b72524545525419b3f/src/bijectors/coupling.jl#L188C1-L188C1) -################### - -# struct AffineCoupling <: Bijectors.Bijector -# dim::Int -# mask::Bijectors.PartitionMask -# s::Flux.Chain -# t::Flux.Chain -# end - -# # let params track field s and t -# @functor AffineCoupling (s, t) - -# function AffineCoupling(dim, mask, s, t) -# return Bijectors.Coupling(θ -> Bijectors.Shift(t(θ)) ∘ Bijectors.Scale(s(θ)), mask) -# end - -# function AffineCoupling( -# dim::Int, # dimension of input -# hdims::Int, # dimension of hidden units for s and t -# mask_idx::AbstractVector, # index of dimensione that one wants to apply transformations on -# ) -# cdims = length(mask_idx) # dimension of parts used to construct coupling law -# s = mlp3(cdims, hdims, cdims) -# t = mlp3(cdims, hdims, cdims) -# mask = PartitionMask(dim, mask_idx) -# return AffineCoupling(dim, mask, s, t) -# end - - - ################################## # start demo ################################# @@ -129,33 +17,34 @@ T = Float32 ###################################### # a difficult banana target ###################################### -target = Banana(2, 1.0f0, 100.0f0) +target = Banana(2, one(T), 100one(T)) logp = Base.Fix1(logpdf, target) + ###################################### # learn the target using Affine coupling flow ###################################### @leaf MvNormal -q0 = MvNormal(zeros(T, 2), ones(T, 2)) +q0 = MvNormal(zeros(T, 2), I) d = 2 -hdims = 32 - -# alternating the coupling layers -Ls = [AffineCoupling(d, hdims, [1]) ∘ AffineCoupling(d, hdims, [2]) for i in 1:3] +hdims = [16, 16] +nlayers = 3 -flow = create_flow(Ls, q0) +# use NormalizingFlows.realnvp to create a RealNVP flow +flow = realnvp(q0, hdims, nlayers; paramtype=T) flow_untrained = deepcopy(flow) ###################################### # start training ###################################### -sample_per_iter = 64 +sample_per_iter = 16 # callback function to log training progress cb(iter, 
opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) + checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( rng, @@ -163,7 +52,7 @@ flow_trained, stats, _ = train_flow( flow, logp, sample_per_iter; - max_iters=100, # change to larger number of iterations (e.g., 50_000) for better results + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results optimiser=Optimisers.Adam(5e-4), ADbackend=adtype, show_progress=true, diff --git a/example/demo_neural_spline_flow.jl b/example/demo_neural_spline_flow.jl index ffeba09..4e98c30 100644 --- a/example/demo_neural_spline_flow.jl +++ b/example/demo_neural_spline_flow.jl @@ -1,114 +1,12 @@ -using Flux -using Bijectors -using Bijectors: partition, combine, PartitionMask - using Random, Distributions, LinearAlgebra using Functors using Optimisers, ADTypes -using Mooncake +using Zygote using NormalizingFlows include("SyntheticTargets.jl") include("utils.jl") -################################## -# define neural spline layer using Bijectors.jl interface -################################# -""" -Neural Rational quadratic Spline layer - -# References -[1] Durkan, C., Bekasov, A., Murray, I., & Papamakarios, G., Neural Spline Flows, CoRR, arXiv:1906.04032 [stat.ML], (2019). -""" -struct NeuralSplineLayer{T,A<:Flux.Chain} <: Bijectors.Bijector - dim::Int # dimension of input - K::Int # number of knots - n_dims_transferred::Int # number of dimensions that are transformed - nn::A # networks that parmaterize the knots and derivatives - B::T # bound of the knots - mask::Bijectors.PartitionMask -end - -function NeuralSplineLayer( - dim::T1, # dimension of input - hdims::T1, # dimension of hidden units for s and t - K::T1, # number of knots - B::T2, # bound of the knots - mask_idx::AbstractVector{<:Int}, # index of dimensione that one wants to apply transformations on -) where {T1<:Int,T2<:Real} - num_of_transformed_dims = length(mask_idx) - input_dims = dim - num_of_transformed_dims - - # output dim of the NN - output_dims = (3K - 1)*num_of_transformed_dims - # one big mlp that outputs all the knots and derivatives for all the transformed dimensions - nn = mlp3(input_dims, hdims, output_dims) - - mask = Bijectors.PartitionMask(dim, mask_idx) - return NeuralSplineLayer(dim, K, num_of_transformed_dims, nn, B, mask) -end - -@functor NeuralSplineLayer (nn,) - -# define forward and inverse transformation -""" -Build a rational quadratic spline from the nn output -Bijectors.jl has implemented the inverse and logabsdetjac for rational quadratic spline - -we just need to map the nn output to the knots and derivatives of the RQS -""" -function instantiate_rqs(nsl::NeuralSplineLayer, x::AbstractVector) - K, B = nsl.K, nsl.B - nnoutput = reshape(nsl.nn(x), nsl.n_dims_transferred, :) - ws = @view nnoutput[:, 1:K] - hs = @view nnoutput[:, (K + 1):(2K)] - ds = @view nnoutput[:, (2K + 1):(3K - 1)] - return Bijectors.RationalQuadraticSpline(ws, hs, ds, B) -end - -function Bijectors.transform(nsl::NeuralSplineLayer, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) - # instantiate rqs knots and derivatives - rqs = instantiate_rqs(nsl, x_2) - y_1 = Bijectors.transform(rqs, x_1) - return Bijectors.combine(nsl.mask, y_1, x_2, x_3) -end - -function Bijectors.transform(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) - nsl = insl.orig - y1, y2, y3 = partition(nsl.mask, y) - rqs = instantiate_rqs(nsl, y2) - x1 = 
Bijectors.transform(Inverse(rqs), y1) - return Bijectors.combine(nsl.mask, x1, y2, y3) -end - -function (nsl::NeuralSplineLayer)(x::AbstractVector) - return Bijectors.transform(nsl, x) -end - -# define logabsdetjac -function Bijectors.logabsdetjac(nsl::NeuralSplineLayer, x::AbstractVector) - x_1, x_2, _ = Bijectors.partition(nsl.mask, x) - rqs = instantiate_rqs(nsl, x_2) - logjac = logabsdetjac(rqs, x_1) - return logjac -end - -function Bijectors.logabsdetjac(insl::Inverse{<:NeuralSplineLayer}, y::AbstractVector) - nsl = insl.orig - y1, y2, _ = partition(nsl.mask, y) - rqs = instantiate_rqs(nsl, y2) - logjac = logabsdetjac(Inverse(rqs), y1) - return logjac -end - -function Bijectors.with_logabsdet_jacobian(nsl::NeuralSplineLayer, x::AbstractVector) - x_1, x_2, x_3 = Bijectors.partition(nsl.mask, x) - rqs = instantiate_rqs(nsl, x_2) - y_1, logjac = with_logabsdet_jacobian(rqs, x_1) - return Bijectors.combine(nsl.mask, y_1, x_2, x_3), logjac -end - ################################## # start demo ################################# @@ -117,30 +15,20 @@ rng = Random.default_rng() T = Float32 ###################################### -# neals funnel target +# a difficult banana target ###################################### -target = Funnel(2, 0.0f0, 9.0f0) +target = Banana(2, one(T), 100one(T)) logp = Base.Fix1(logpdf, target) ###################################### -# learn the target using Affine coupling flow +# learn the target using Neural Spline Flow ###################################### @leaf MvNormal -q0 = MvNormal(zeros(T, 2), ones(T, 2)) +q0 = MvNormal(zeros(T, 2), I) -d = 2 -hdims = 64 -K = 10 -B = 30 -Ls = [ - NeuralSplineLayer(d, hdims, K, B, [1]) ∘ NeuralSplineLayer(d, hdims, K, B, [2]) for - i in 1:3 -] -flow = create_flow(Ls, q0) +flow = nsf(q0; paramtype=T) flow_untrained = deepcopy(flow) - - ###################################### # start training ###################################### @@ -148,15 +36,16 @@ sample_per_iter = 64 # callback function to log training progress cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter,ad=adtype) -adtype = ADTypes.AutoMooncake(; config = Mooncake.Config()) +# nsf only supports AutoZygote +adtype = ADTypes.AutoZygote() checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000 flow_trained, stats, _ = train_flow( - elbo, + elbo_batch, flow, logp, sample_per_iter; - max_iters=100, # change to larger number of iterations (e.g., 50_000) for better results - optimiser=Optimisers.Adam(5e-5), + max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results + optimiser=Optimisers.Adam(1e-4), ADbackend=adtype, show_progress=true, callback=cb, diff --git a/example/demo_planar_flow.jl b/example/demo_planar_flow.jl index 25a694e..3fbe7c8 100644 --- a/example/demo_planar_flow.jl +++ b/example/demo_planar_flow.jl @@ -20,16 +20,9 @@ logp = Base.Fix1(logpdf, target) ###################################### # setup planar flow ###################################### -function create_planar_flow(n_layers::Int, q₀) - d = length(q₀) - Ls = [PlanarLayer(d) for _ in 1:n_layers] - ts = reduce(∘, Ls) - return transformed(q₀, ts) -end - @leaf MvNormal q0 = MvNormal(zeros(T, 2), ones(T, 2)) -flow = create_planar_flow(10, q0) +flow = planarflow(q0, 10; paramtype=T) flow_untrained = deepcopy(flow) ###################################### diff --git a/example/demo_radial_flow.jl b/example/demo_radial_flow.jl index 89b4c56..044d550 100644 --- a/example/demo_radial_flow.jl +++ b/example/demo_radial_flow.jl @@ -19,17 +19,10 @@ logp = 
Base.Fix1(logpdf, target) ###################################### # setup radial flow ###################################### -function create_radial_flow(n_layers::Int, q₀) - d = length(q₀) - Ls = [RadialLayer(d) for _ in 1:n_layers] - ts = reduce(∘, Ls) - return transformed(q₀, ts) -end - # create a 10-layer radial flow @leaf MvNormal q0 = MvNormal(zeros(T, 2), ones(T, 2)) -flow = create_radial_flow(10, q0) +flow = radialflow(q0, 10; paramtype=T) flow_untrained = deepcopy(flow) diff --git a/example/utils.jl b/example/utils.jl index b5d19ca..bdd2858 100644 --- a/example/utils.jl +++ b/example/utils.jl @@ -1,22 +1,6 @@ using Random, Distributions, LinearAlgebra +using Bijectors using Bijectors: transformed -using Flux - -""" -A simple wrapper for a 3 layer dense MLP -""" -function mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux.leakyrelu) - return Chain( - Flux.Dense(input_dim, hidden_dims, activation), - Flux.Dense(hidden_dims, hidden_dims, activation), - Flux.Dense(hidden_dims, output_dim), - ) -end - -function create_flow(Ls, q₀) - ts = reduce(∘, Ls) - return transformed(q₀, ts) -end function compare_trained_and_untrained_flow( flow_trained::Bijectors.MultivariateTransformed, diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl index 72318df..70c4699 100644 --- a/src/NormalizingFlows.jl +++ b/src/NormalizingFlows.jl @@ -1,13 +1,15 @@ module NormalizingFlows using ADTypes -using Bijectors using Distributions using LinearAlgebra using Optimisers using ProgressMeter using Random using StatsBase +using Bijectors +using Bijectors: PartitionMask, Inverse, combine, partition +using Functors import DifferentiationInterface as DI using DocStringExtensions @@ -19,11 +21,12 @@ export train_flow, elbo, elbo_batch, loglikelihood Train the given normalizing flow `flow` by calling `optimize`. -# Arguments -- `rng::AbstractRNG`: random number generator -- `vo`: variational objective -- `flow`: normalizing flow to be trained, we recommend to define flow as `<:Bijectors.TransformedDistribution` -- `args...`: additional arguments for `vo` +Arguments +- `rng::AbstractRNG`: random number generator (default: `Random.default_rng()`) +- `vo`: variational objective with signature `vo(rng, flow, args...)`. + We implement [`elbo`](@ref), [`elbo_batch`](@ref), and [`loglikelihood`](@ref). +- `flow`: the normalizing flow---a `Bijectors.TransformedDistribution` (recommended) +- `args...`: additional arguments passed to `vo` # Keyword Arguments - `max_iters::Int=1000`: maximum number of iterations @@ -123,4 +126,19 @@ function _device_specific_rand( return Random.rand(rng, td, n) end + +# interface of contructing common flow layers +include("flows/utils.jl") +include("flows/planar_radial.jl") +include("flows/realnvp.jl") + +using MonotonicSplines +include("flows/neuralspline.jl") + +export create_flow +export planarflow, radialflow +export AffineCoupling, RealNVP_layer, realnvp +export NeuralSplineCoupling, NSF_layer, nsf + + end diff --git a/src/flows/neuralspline.jl b/src/flows/neuralspline.jl new file mode 100644 index 0000000..0d8eab0 --- /dev/null +++ b/src/flows/neuralspline.jl @@ -0,0 +1,234 @@ +""" + NeuralSplineCoupling(dim, hdims, K, B, mask_idx, paramtype) + NeuralSplineCoupling(dim, K, n_dims_transformed, B, nn, mask) + +Neural Rational Quadratic Spline (RQS) coupling bijector [^DBMP2019]. + +A conditioner network takes the unchanged partition as input and outputs the +parameters of monotonic rational quadratic splines for the transformed +coordinates. 
Batched inputs (matrices with column vectors) are supported. + +Arguments +- `dim::Int`: total input dimension. +- `hdims::AbstractVector{Int}`: hidden sizes for the conditioner MLP. +- `K::Int`: number of spline knots per transformed coordinate. +- `B::AbstractFloat`: boundary/box constraint for spline domain. +- `mask_idx::AbstractVector{Int}`: indices of the transformed coordinates. + +Keyword Arguments +- `paramtype::Type{<:AbstractFloat}`: parameter element type. + +Fields +- `nn::Flux.Chain`: conditioner that outputs all spline params for all transformed dim. +- `mask::Bijectors.PartitionMask`: partition specification. + +Notes +- Output dimensionality of the conditioner is `(3K - 1) * n_transformed`. +- For computation performance, we rely on +[`MonotonicSplines.jl`](https://github.com/bat/MonotonicSplines.jl) for the +building the rational quadratic spline functions. +- See `MonotonicSplines.rqs_forward` and `MonotonicSplines.rqs_inverse` for forward/inverse +and log-determinant computations. + +[^DBMP2019]: Durkan, C., Bekasov, A., Murray, I. and Papamarkou, T. (2019). Neural Spline Flows. *NeurIPS.* +""" +struct NeuralSplineCoupling{T,A<:Flux.Chain} <: Bijectors.Bijector + dim::Int # dimension of input + K::Int # number of knots + n_dims_transformed::Int # number of dimensions that are transformed + B::T # bound of the knots + nn::A # networks that parameterize the knots and derivatives + mask::Bijectors.PartitionMask +end + +function NeuralSplineCoupling( + dim::T1, # dimension of input + hdims::AbstractVector{T1}, # dimension of hidden units for s and t + K::T1, # number of knots + B::T2, # bound of the knots + mask_idx::AbstractVector{T1}, # indices of the transformed dimensions + paramtype::Type{T2}, # type of the parameters, e.g., Float64 or Float32 +) where {T1<:Int,T2<:AbstractFloat} + num_of_transformed_dims = length(mask_idx) + input_dims = dim - num_of_transformed_dims + + output_dims = (3K - 1)*num_of_transformed_dims + # one big mlp that outputs all the knots and derivatives for all the transformed dimensions + nn = fnn(input_dims, hdims, output_dims; output_activation=nothing, paramtype=paramtype) + + mask = Bijectors.PartitionMask(dim, mask_idx) + return NeuralSplineCoupling{T2, typeof(nn)}(dim, K, num_of_transformed_dims, B, nn, mask) +end + +@functor NeuralSplineCoupling (nn,) + +function get_nsc_params(nsc::NeuralSplineCoupling, x::AbstractVecOrMat) + nnoutput = nsc.nn(x) + px, py, dydx = MonotonicSplines.rqs_params_from_nn( + nnoutput, nsc.n_dims_transformed, nsc.B + ) + return px, py, dydx +end + +# when input x is a vector instead of a matrix +# need this to transform it to a matrix with one row +# otherwise, rqs_forward and rqs_inverse will throw an error +_ensure_matrix(x) = x isa AbstractVector ? 
reshape(x, length(x), 1) : x + +function Bijectors.transform(nsc::NeuralSplineCoupling, x::AbstractVector) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + x1 = _ensure_matrix(x1) + y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(y1), x2, x3) +end +function Bijectors.transform(nsc::NeuralSplineCoupling, x::AbstractMatrix) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, y1, x2, x3) +end + +function Bijectors.with_logabsdet_jacobian(nsc::NeuralSplineCoupling, x::AbstractVector) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + x1 = _ensure_matrix(x1) + y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(y1), x2, x3), logjac[1] +end +function Bijectors.with_logabsdet_jacobian(nsc::NeuralSplineCoupling, x::AbstractMatrix) + x1, x2, x3 = Bijectors.partition(nsc.mask, x) + # instantiate rqs knots and derivatives + px, py, dydx = get_nsc_params(nsc, x2) + y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx) + return Bijectors.combine(nsc.mask, y1, x2, x3), vec(logjac) +end + +function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + y1 = _ensure_matrix(y1) + x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(x1), y2, y3) +end +function Bijectors.transform(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractMatrix) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, x1, y2, y3) +end + +function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractVector) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + y1 = _ensure_matrix(y1) + x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, vec(x1), y2, y3), logjac[1] +end +function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NeuralSplineCoupling}, y::AbstractMatrix) + nsc = insl.orig + y1, y2, y3 = partition(nsc.mask, y) + px, py, dydx = get_nsc_params(nsc, y2) + x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx) + return Bijectors.combine(nsc.mask, x1, y2, y3), vec(logjac) +end + +function (nsc::NeuralSplineCoupling)(x::AbstractVecOrMat) + return Bijectors.transform(nsc, x) +end + + +""" + NSF_layer(dim, hdims, K, B; paramtype = Float64) + +Build a single Neural Spline Flow (NSF) layer by composing two +`NeuralSplineCoupling` bijectors with complementary odd–even masks. + +Arguments +- `dim::Int`: dimensionality of the problem. +- `hdims::AbstractVector{Int}`: hidden sizes of the conditioner network. +- `K::Int`: number of spline knots. +- `B::AbstractFloat`: spline boundary. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type. + +Returns +- A `Bijectors.Bijector` representing the NSF layer. 
+ +Example +- `layer = NSF_layer(4, [64,64], 10, 3.0)` +- `y = layer(randn(4, 32))` +""" +function NSF_layer( + dim::T1, # dimension of problem + hdims::AbstractVector{T1}, # dimension of hidden units for nn + K::T1, # number of knots + B::T2; # bound of the knots + paramtype::Type{T2} = Float64, # type of the parameters +) where {T1<:Int,T2<:AbstractFloat} + + mask_idx1 = 1:2:dim + mask_idx2 = 2:2:dim + + # by default use the odd-even masking strategy + nsf1 = NeuralSplineCoupling(dim, hdims, K, B, mask_idx1, paramtype) + nsf2 = NeuralSplineCoupling(dim, hdims, K, B, mask_idx2, paramtype) + return reduce(∘, (nsf1, nsf2)) +end + +""" + nsf(q0, hdims, K, B, nlayers; paramtype = Float64) + nsf(q0; paramtype = Float64) + +Construct an NSF by stacking `nlayers` `NSF_layer` blocks. The one-argument +variant defaults to 10 layers with `[32, 32]` hidden sizes, 10 knots, and +boundary `30` (scaled by `one(T)`). + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution. +- `hdims::AbstractVector{Int}`: hidden sizes of the conditioner network. +- `K::Int`: spline knots per coordinate. +- `B::AbstractFloat`: spline boundary. +- `nlayers::Int`: number of NSF layers. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type. + +Returns +- `Bijectors.TransformedDistribution` representing the NSF flow. + +!!! note + Under the hood, `nsf` relies on the rational quadratic spline function implememented in + `MonotonicSplines.jl` for performance reasons. `MonotonicSplines.jl` uses + `KernelAbstractions.jl` to support batched operations. + Because of this, so far `nsf` only supports `Zygote` as the AD type. + + +Example +- `q0 = MvNormal(zeros(3), I); flow = nsf(q0, [64,64], 8, 3.0, 6)` +- `x = rand(flow, 128); lp = logpdf(flow, x)` +""" +function nsf( + q0::Distribution{Multivariate,Continuous}, + hdims::AbstractVector{Int}, # dimension of hidden units for s and t + K::Int, + B::T, + nlayers::Int; # number of RealNVP_layer + paramtype::Type{T} = Float64, # type of the parameters +) where {T<:AbstractFloat} + + dim = length(q0) # dimension of the reference distribution == dim of the problem + Ls = [NSF_layer(dim, hdims, K, B; paramtype=paramtype) for _ in 1:nlayers] + create_flow(Ls, q0) +end + +nsf(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = nsf( + q0, [32, 32], 10, 30*one(T), 10; paramtype=paramtype +) diff --git a/src/flows/planar_radial.jl b/src/flows/planar_radial.jl new file mode 100644 index 0000000..554cd42 --- /dev/null +++ b/src/flows/planar_radial.jl @@ -0,0 +1,60 @@ +""" + planarflow(q0, nlayers; paramtype = Float64) + +Construct a Planar Flow by stacking `nlayers` `Bijectors.PlanarLayer` blocks +on top of a base distribution `q0`. + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g., `MvNormal(zeros(d), I)`). +- `nlayers::Int`: number of planar layers to compose. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type (use `Float32` for GPU friendliness). + +Returns +- `Bijectors.TransformedDistribution` representing the planar flow. 
+ +Example +- `q0 = MvNormal(zeros(2), I); flow = planarflow(q0, 10)` +- `x = rand(flow, 128); lp = logpdf(flow, x)` +""" +function planarflow( + q0::Distribution{Multivariate,Continuous}, + nlayers::Int; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + dim = length(q0) + Ls = [Flux._paramtype(paramtype, Bijectors.PlanarLayer(dim)) for _ in 1:nlayers] + return create_flow(Ls, q0) +end + + +""" + radialflow(q0, nlayers; paramtype = Float64) + +Construct a Radial Flow by stacking `nlayers` `Bijectors.RadialLayer` blocks +on top of a base distribution `q0`. + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g., `MvNormal(zeros(d), I)`). +- `nlayers::Int`: number of radial layers to compose. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type (use `Float32` for GPU friendliness). + +Returns +- `Bijectors.TransformedDistribution` representing the radial flow. + +Example +- `q0 = MvNormal(zeros(2), I); flow = radialflow(q0, 6)` +- `x = rand(flow); lp = logpdf(flow, x)` +""" +function radialflow( + q0::Distribution{Multivariate,Continuous}, + nlayers::Int; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + dim = length(q0) + Ls = [Flux._paramtype(paramtype, Bijectors.RadialLayer(dim)) for _ in 1:nlayers] + return create_flow(Ls, q0) +end diff --git a/src/flows/realnvp.jl b/src/flows/realnvp.jl new file mode 100644 index 0000000..2f93dc5 --- /dev/null +++ b/src/flows/realnvp.jl @@ -0,0 +1,192 @@ +""" + AffineCoupling(dim, hdims, mask_idx, paramtype) + AffineCoupling(dim, mask, s, t) + +Affine coupling bijector used in RealNVP [^LJS2017]. + +Two subnetworks `s` (log-scale, exponentiated in the forward pass) and `t` (shift) +act on one partition of the input, conditioned on the complementary partition +(as defined by `mask`). For numerical stability, the output of `s` passes +through `tanh` before exponentiation. + +Arguments +- `dim::Int`: total dimensionality of the input. +- `hdims::AbstractVector{Int}`: hidden sizes for the conditioner MLPs `s` and `t`. +- `mask_idx::AbstractVector{Int}`: indices of the dimensions to transform. + The complement is used as the conditioner input. + +Keyword Arguments +- `paramtype::Type{<:AbstractFloat}`: parameter element type (e.g. `Float32`). + +Fields +- `mask::Bijectors.PartitionMask`: partition specification. +- `s::Flux.Chain`: conditioner producing log-scales for the transformed block. +- `t::Flux.Chain`: conditioner producing shifts for the transformed block. + +Notes +- Forward: with `(x₁,x₂,x₃) = partition(mask, x)`, compute `y₁ = x₁ .* exp.(s(x₂)) .+ t(x₂)`. +- Log-determinant: `sum(s(x₂))` (or columnwise for batched matrices). +- All methods support both vectors and column-major batches (matrices). + +[^LJS2017]: Dinh, L., Sohl-Dickstein, J. and Bengio, S. (2017). Density estimation using Real NVP. ICLR. 
+""" +struct AffineCoupling <: Bijectors.Bijector + dim::Int + mask::Bijectors.PartitionMask + s::Flux.Chain + t::Flux.Chain +end + +@functor AffineCoupling (s, t) + +function AffineCoupling( + dim::Int, + hdims::AbstractVector{Int}, + mask_idx::AbstractVector{Int}, + paramtype::Type{T} +) where {T<:AbstractFloat} + cdims = length(mask_idx) # dimension of parts used to construct coupling law + # for the scaling network s, add tanh to the output to ensure stability during training + s = fnn(dim-cdims, hdims, cdims; output_activation=Flux.tanh, paramtype=paramtype) + # no transfomration for the output of the translation network t + t = fnn(dim-cdims, hdims, cdims; output_activation=nothing, paramtype=paramtype) + mask = PartitionMask(dim, mask_idx) + return AffineCoupling(dim, mask, s, t) +end + +function Bijectors.transform(af::AffineCoupling, x::AbstractVecOrMat) + # partition vector using 'af.mask::PartitionMask` + x₁, x₂, x₃ = partition(af.mask, x) + s_x₂ = af.s(x₂) + y₁ = x₁ .* exp.(s_x₂) .+ af.t(x₂) + return combine(af.mask, y₁, x₂, x₃) +end + +function (af::AffineCoupling)(x::AbstractVecOrMat) + return transform(af, x) +end + +function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractVector) + x_1, x_2, x_3 = Bijectors.partition(af.mask, x) + s_x2 = af.s(x_2) + y_1 = exp.(s_x2) .* x_1 .+ af.t(x_2) + logjac = sum(s_x2) # this is a scalar + return combine(af.mask, y_1, x_2, x_3), logjac +end + +function Bijectors.with_logabsdet_jacobian(af::AffineCoupling, x::AbstractMatrix) + x_1, x_2, x_3 = Bijectors.partition(af.mask, x) + s_x2 = af.s(x_2) + y_1 = exp.(s_x2) .* x_1 .+ af.t(x_2) + logjac = sum(s_x2; dims=1) # 1 × size(x, 2) + return combine(af.mask, y_1, x_2, x_3), vec(logjac) +end + + +function Bijectors.with_logabsdet_jacobian( + iaf::Inverse{<:AffineCoupling}, y::AbstractVector +) + af = iaf.orig + # partition vector using `af.mask::PartitionMask` + y_1, y_2, y_3 = partition(af.mask, y) + # inverse transformation + s_y2 = af.s(y_2) + x_1 = (y_1 .- af.t(y_2)) .* exp.(-s_y2) + logjac = -sum(s_y2) + return combine(af.mask, x_1, y_2, y_3), logjac +end + +function Bijectors.with_logabsdet_jacobian( + iaf::Inverse{<:AffineCoupling}, y::AbstractMatrix +) + af = iaf.orig + # partition vector using `af.mask::PartitionMask` + y_1, y_2, y_3 = partition(af.mask, y) + # inverse transformation + s_y2 = af.s(y_2) + x_1 = (y_1 .- af.t(y_2)) .* exp.(-s_y2) + logjac = -sum(s_y2; dims=1) + return combine(af.mask, x_1, y_2, y_3), vec(logjac) +end + +""" + RealNVP_layer(dims, hdims; paramtype = Float64) + +Construct a single RealNVP layer by composing two `AffineCoupling` bijectors +with complementary odd–even masks. + +Arguments +- `dims::Int`: dimensionality of the problem. +- `hdims::AbstractVector{Int}`: hidden sizes of the conditioner networks. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type. + +Returns +- A `Bijectors.Bijector` representing the RealNVP layer. 
+ +Example +- `layer = RealNVP_layer(4, [64, 64])` +- `y = layer(randn(4, 16))` # batched forward +""" +function RealNVP_layer( + dims::Int, + hdims::AbstractVector{Int}; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + + mask_idx1 = 1:2:dims + mask_idx2 = 2:2:dims + + # by default use the odd-even masking strategy + af1 = AffineCoupling(dims, hdims, mask_idx1, paramtype) + af2 = AffineCoupling(dims, hdims, mask_idx2, paramtype) + return reduce(∘, (af1, af2)) +end + +""" + realnvp(q0, hdims, nlayers; paramtype = Float64) + realnvp(q0; paramtype = Float64) + +Construct a RealNVP flow by stacking `nlayers` `RealNVP_layer` blocks with +odd–even masking. The 1-argument variant defaults to 10 layers with +hidden sizes `[32, 32]` per conditioner. + +Arguments +- `q0::Distribution{Multivariate,Continuous}`: base distribution (e.g. `MvNormal(zeros(d), I)`). +- `hdims::AbstractVector{Int}`: hidden sizes for the conditioner networks. +- `nlayers::Int`: number of stacked RealNVP layers. + +Keyword Arguments +- `paramtype::Type{T} = Float64`: parameter element type (use `Float32` for GPU friendliness). + +Returns +- `Bijectors.TransformedDistribution` representing the RealNVP flow. + +Example +- `q0 = MvNormal(zeros(2), I); flow = realnvp(q0, [64,64], 8)` +- `x = rand(flow, 128); lp = logpdf(flow, x)` +""" +function realnvp( + q0::Distribution{Multivariate,Continuous}, + hdims::AbstractVector{Int}, + nlayers::Int; + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + + dims = length(q0) + Ls = [RealNVP_layer(dims, hdims; paramtype=paramtype) for _ in 1:nlayers] + create_flow(Ls, q0) +end + +""" + realnvp(q0; paramtype = Float64) + +Default constructor: 10 layers, each conditioner uses hidden sizes `[32, 32]`. +Follows a common RealNVP architecture similar to Appendix E of [^ASD2020]. + +[^ASD2020]: Agrawal, A., Sheldon, D., Domke, J. (2020). Advances in Black-Box VI: Normalizing Flows, Importance Weighting, and Optimization. NeurIPS. +""" +realnvp(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = realnvp( + q0, [32, 32], 10; paramtype=paramtype +) diff --git a/src/flows/utils.jl b/src/flows/utils.jl new file mode 100644 index 0000000..83c2a44 --- /dev/null +++ b/src/flows/utils.jl @@ -0,0 +1,100 @@ +using Bijectors: transformed +using Flux + +""" + create_flow(layers, q0) + +Construct a normalizing flow by composing the provided bijector layers and +attaching them to the base distribution `q0`. + +- `layers`: an iterable of `Bijectors.Bijector` objects that are composed in order + (left-to-right) via function composition +(for instance, if `layers = [l1, l2, l3]`, the flow will be `l1∘l2∘l3(q0)`). +- `q0`: the base distribution (e.g., `MvNormal(zeros(d), I)`). + +Returns a `Bijectors.TransformedDistribution` representing the resulting flow. 
+ +Example + + using Distributions, Bijectors, LinearAlgebra + q0 = MvNormal(zeros(2), I) + flow = create_flow((Bijectors.Shift([0.0, 1.0]), Bijectors.Scale([1.0, 2.0])), q0) +""" +function create_flow(Ls, q₀) + ts = reduce(∘, Ls) + return transformed(q₀, ts) +end + +""" + mlp3(input_dim::Int, hidden_dims::Int, output_dim::Int; activation=Flux.leakyrelu) + +A simple wrapper for a 3 layer dense MLP +""" +function mlp3( + input_dim::Int, + hidden_dims::Int, + output_dim::Int; + activation=Flux.leakyrelu, + paramtype::Type{T} = Float64 +) where {T<:AbstractFloat} + m = Chain( + Flux.Dense(input_dim, hidden_dims, activation), + Flux.Dense(hidden_dims, hidden_dims, activation), + Flux.Dense(hidden_dims, output_dim), + ) + return Flux._paramtype(paramtype, m) +end + +""" + fnn( + input_dim::Int, + hidden_dims::AbstractVector{Int}, + output_dim::Int; + inlayer_activation=Flux.leakyrelu, + output_activation=nothing, + paramtype::Type{T} = Float64, + ) + +Create a fully connected neural network (FNN). + +# Arguments +- `input_dim::Int`: The dimension of the input layer. +- `hidden_dims::AbstractVector{<:Int}`: A vector of integers specifying the dimensions of the hidden layers. +- `output_dim::Int`: The dimension of the output layer. +- `inlayer_activation`: The activation function for the hidden layers. Defaults to `Flux.leakyrelu`. +- `output_activation`: The activation function for the output layer. Defaults to `nothing`. +- `paramtype::Type{T} = Float64`: The type of the parameters in the network, defaults to `Float64`. + +# Returns +- A `Flux.Chain` representing the FNN. +""" +function fnn( + input_dim::Int, + hidden_dims::AbstractVector{Int}, + output_dim::Int; + inlayer_activation=Flux.leakyrelu, + output_activation=nothing, + paramtype::Type{T} = Float64, +) where {T<:AbstractFloat} + # Create a chain of dense layers + # First layer + layers = Flux.Dense[Flux.Dense(input_dim, hidden_dims[1], inlayer_activation)] + + # Hidden layers + for i in 1:(length(hidden_dims) - 1) + push!( + layers, + Flux.Dense(hidden_dims[i], hidden_dims[i + 1], inlayer_activation), + ) + end + + # Output layer + if output_activation === nothing + push!(layers, Flux.Dense(hidden_dims[end], output_dim)) + else + push!(layers, Flux.Dense(hidden_dims[end], output_dim, output_activation)) + end + + m = Chain(layers...) + return Flux._paramtype(paramtype, m) +end diff --git a/src/objectives/elbo.jl b/src/objectives/elbo.jl index d282e1e..6ddd47b 100644 --- a/src/objectives/elbo.jl +++ b/src/objectives/elbo.jl @@ -7,10 +7,11 @@ function elbo_single_sample(flow::Bijectors.TransformedDistribution, logp, x) end """ - elbo(flow, logp, xs) - elbo([rng, ]flow, logp, n_samples) + elbo(flow, logp, xs) + elbo([rng, ] flow, logp, n_samples) -Compute the ELBO for a batch of samples `xs` from the reference distribution `flow.dist`. +Monte Carlo estimates of the ELBO from a batch of samples `xs` from the +reference distribution `flow.dist`. # Arguments - `rng`: random number generator @@ -46,36 +47,53 @@ end """ - elbo_batch(flow, logp, xs) - elbo_batch([rng, ]flow, logp, n_samples) + _batched_elbos(flow, logp, xs) -Instead of broadcasting over elbo_single_sample, this function directly -computes the ELBO in a batched manner, which requires the flow.transform to be able to -handle batched transformation directly. +Batched ELBO estimates that transforms a matrix of samples (each column represents a single +sample) in one call. 
+This is more efficient for invertible neural-network flows (RealNVP/NSF) as it leverages +the batched operation of the neural networks. -This will be more efficient than `elbo` for invertible neural networks such as RealNVP, -Neural Spline Flow, etc. - -# Arguments -- `rng`: random number generator -- `flow`: variational distribution to be trained. In particular - `flow = transformed(q₀, T::Bijectors.Bijector)`, - q₀ is a reference distribution that one can easily sample and compute logpdf -- `logp`: log-pdf of the target distribution (not necessarily normalized) -- `xs`: samples from reference dist q₀ -- `n_samples`: number of samples from reference dist q₀ +Inputs +- `flow::Bijectors.MultivariateTransformed` +- `logp`: function returning log-density of target +- `xs`: column-wise sample batch +Returns +- a vector of ELBO estimates for each sample in the batch """ -function elbo_batch(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) +function _batched_elbos(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) # requires the flow transformation to be able to handle batched inputs ys, logabsdetjac = with_logabsdet_jacobian(flow.transform, xs) elbos = logp(ys) .- logpdf(flow.dist, xs) .+ logabsdetjac return elbos end + +""" + elbo_batch(flow, logp, xs) + elbo_batch([rng, ] flow, logp, n_samples) + +Batched ELBO estimates that transforms a matrix of samples (each column represents a single +sample) in one call. +This is more efficient for invertible neural-network flows (RealNVP/NSF) as it leverages +the batched operation of the neural networks. + +Inputs +- `flow::Bijectors.MultivariateTransformed` +- `logp`: function returning log-density of target +- `xs` or `n_samples`: column-wise sample batch or number of samples + +Returns +- Scalar estimate of the ELBO +""" +function elbo_batch(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) + elbos = _batched_elbos(flow, logp, xs) + return mean(elbos) +end function elbo_batch(rng::AbstractRNG, flow::Bijectors.MultivariateTransformed, logp, n_samples) xs = _device_specific_rand(rng, flow.dist, n_samples) - elbos = elbo_batch(flow, logp, xs) + elbos = _batched_elbos(flow, logp, xs) return mean(elbos) end -elbo_batch(flow::Bijectors.UnivariateTransformed, logp, n_samples) = +elbo_batch(flow::Bijectors.TransformedDistribution, logp, n_samples) = elbo_batch(Random.default_rng(), flow, logp, n_samples) diff --git a/test/Project.toml b/test/Project.toml index be5ffa0..6e9455f 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -8,6 +8,7 @@ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MonotonicSplines = "568f7cb4-8305-41bc-b90d-d32b39cc99d1" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" @@ -17,4 +18,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -Mooncake = "0.4.101" +Mooncake = "0.4.142" diff --git a/test/ad.jl b/test/ad.jl index 725b354..b8a4211 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -73,3 +73,98 @@ end end end end + + +@testset "AD for ELBO on realnvp" begin + @testset "$at" for at in [ + ADTypes.AutoZygote(), + ADTypes.AutoForwardDiff(), + ADTypes.AutoReverseDiff(; compile=false), + ADTypes.AutoEnzyme(; + mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + 
function_annotation=Enzyme.Const, + ), + ADTypes.AutoMooncake(; config=Mooncake.Config()), + ] + @testset "$T" for T in [Float32, Float64] + μ = 10 * ones(T, 2) + Σ = Diagonal(4 * ones(T, 2)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # necessary for Zygote/mooncake to differentiate through the flow + # prevent updating params of q0 + @leaf MvNormal + q₀ = MvNormal(zeros(T, 2), ones(T, 2)) + flow = realnvp(q₀, [8, 8], 3; paramtype=T) + + θ, re = Optimisers.destructure(flow) + + # check grad computation for elbo + function loss(θ, rng, logp, sample_per_iter) + return -NormalizingFlows.elbo_batch(rng, re(θ), logp, sample_per_iter) + end + + rng = Random.default_rng() + sample_per_iter = 10 + + prep = NormalizingFlows._prepare_gradient( + loss, at, θ, rng, logp, sample_per_iter + ) + value, grad = NormalizingFlows._value_and_gradient( + loss, prep, at, θ, rng, logp, sample_per_iter + ) + + @test value !== nothing + @test all(grad .!= nothing) + end + end +end + +@testset "AD for ELBO on NSF" begin + @testset "$at" for at in [ + # now NSF only works with Zygote + # TODO: make it work with other ADs (possibly by adapting MonotonicSplines/src/rqspline_pullbacks.jl to rrules?) + ADTypes.AutoZygote(), + # ADTypes.AutoForwardDiff(), + # ADTypes.AutoReverseDiff(; compile=false), + # ADTypes.AutoEnzyme(; + # mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + # function_annotation=Enzyme.Const, + # ), + # ADTypes.AutoMooncake(; config=Mooncake.Config()), + ] + @testset "$T" for T in [Float32, Float64] + μ = 10 * ones(T, 2) + Σ = Diagonal(4 * ones(T, 2)) + target = MvNormal(μ, Σ) + logp(z) = logpdf(target, z) + + # necessary for Zygote/mooncake to differentiate through the flow + # prevent updating params of q0 + @leaf MvNormal + q₀ = MvNormal(zeros(T, 2), ones(T, 2)) + flow = nsf(q₀, [8, 8], 10, 5one(T), 3; paramtype=T) + + θ, re = Optimisers.destructure(flow) + + # check grad computation for elbo + function loss(θ, rng, logp, sample_per_iter) + return -NormalizingFlows.elbo_batch(rng, re(θ), logp, sample_per_iter) + end + + rng = Random.default_rng() + sample_per_iter = 10 + + prep = NormalizingFlows._prepare_gradient( + loss, at, θ, rng, logp, sample_per_iter + ) + value, grad = NormalizingFlows._value_and_gradient( + loss, prep, at, θ, rng, logp, sample_per_iter + ) + + @test value !== nothing + @test all(grad .!= nothing) + end + end +end diff --git a/test/flow.jl b/test/flow.jl new file mode 100644 index 0000000..6968558 --- /dev/null +++ b/test/flow.jl @@ -0,0 +1,264 @@ +@testset "RealNVP flow" begin + Random.seed!(123) + + dim = 5 + nlayers = 2 + hdims = [32, 32] + for T in [Float32, Float64] + # Create a RealNVP flow + q₀ = MvNormal(zeros(T, dim), I) + @leaf MvNormal + flow = NormalizingFlows.realnvp(q₀, hdims, nlayers; paramtype=T) + + @testset "Sampling and density estimation for type: $T" begin + ys = rand(flow, 100) + ℓs = logpdf(flow, ys) + + @test size(ys) == (dim, 100) + @test length(ℓs) == 100 + + @test eltype(ys) == T + @test eltype(ℓs) == T + end + + + @testset "Inverse compatibility for type: $T" begin + x = rand(q₀) + y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x) + x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y) + + @test x ≈ x_reconstructed rtol=1e-6 + @test lj_fwd ≈ -lj_bwd rtol=1e-6 + + x_batch = rand(q₀, 10) + y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch) + x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch) + + @test 
x_batch ≈ x_batch_reconstructed rtol=1e-6
+            @test ljs_fwd ≈ -ljs_bwd rtol=1e-6
+        end
+
+
+        @testset "ELBO test for type: $T" begin
+            μ = randn(T, dim)
+            Σ = Diagonal(rand(T, dim) .+ T(1e-3))
+            target = MvNormal(μ, Σ)
+            logp(z) = logpdf(target, z)
+
+            # Compute ELBO
+            batchsize = 64
+            elbo_value = elbo(Random.default_rng(), flow, logp, batchsize)
+            elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize)
+
+            # test when batchsize == 1
+            batchsize_single = 1
+            elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single)
+
+            # test elbo_value is not NaN and not Inf
+            @test isfinite(elbo_value)
+            @test isfinite(elbo_batch_value)
+            @test isfinite(elbo_value_single)
+        end
+    end
+end
+
+@testset "Neural Spline flow" begin
+    Random.seed!(123)
+
+    dim = 5
+    nlayers = 2
+    K = 10
+    hdims = [32, 32]
+    for T in [Float32, Float64]
+        # Create an NSF flow
+        q₀ = MvNormal(zeros(T, dim), I)
+        @leaf MvNormal
+
+        B = 5one(T)
+        flow = NormalizingFlows.nsf(q₀, hdims, K, B, nlayers; paramtype=T)
+
+        @testset "Sampling and density estimation for type: $T" begin
+            ys = rand(flow, 100)
+            ℓs = logpdf(flow, ys)
+
+            @test size(ys) == (dim, 100)
+            @test length(ℓs) == 100
+
+            @test eltype(ys) == T
+            @test eltype(ℓs) == T
+        end
+
+
+        @testset "Inverse compatibility for type: $T" begin
+            x = rand(q₀)
+            y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x)
+            x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y)
+
+            @test x ≈ x_reconstructed rtol=1e-4
+            @test lj_fwd ≈ -lj_bwd rtol=1e-4
+
+            x_batch = rand(q₀, 10)
+            y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch)
+            x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch)
+
+            @test x_batch ≈ x_batch_reconstructed rtol=1e-4
+            @test ljs_fwd ≈ -ljs_bwd rtol=1e-4
+        end
+
+
+        @testset "ELBO test for type: $T" begin
+            μ = randn(T, dim)
+            Σ = Diagonal(rand(T, dim) .+ T(1e-3))
+            target = MvNormal(μ, Σ)
+            logp(z) = logpdf(target, z)
+
+            # Compute ELBO
+            batchsize = 64
+            elbo_value = elbo(Random.default_rng(), flow, logp, batchsize)
+            elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize)
+
+            # test when batchsize == 1
+            batchsize_single = 1
+            elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single)
+
+            # test elbo_value is not NaN and not Inf
+            @test isfinite(elbo_value)
+            @test isfinite(elbo_batch_value)
+            @test isfinite(elbo_value_single)
+        end
+    end
+end
+
+
+
+@testset "Planar flow" begin
+    Random.seed!(123)
+
+    dim = 5
+    nlayers = 10
+    for T in [Float32, Float64]
+        # Create a planar flow
+        q₀ = MvNormal(zeros(T, dim), I)
+        @leaf MvNormal
+
+        flow = NormalizingFlows.planarflow(q₀, nlayers; paramtype=T)
+
+        @testset "Sampling and density estimation for type: $T" begin
+            ys = rand(flow, 100)
+            ℓs = logpdf(flow, ys)
+
+            @test size(ys) == (dim, 100)
+            @test length(ℓs) == 100
+
+            @test eltype(ys) == T
+            @test eltype(ℓs) == T
+        end
+
+
+        @testset "Inverse compatibility for type: $T" begin
+            x = rand(q₀)
+            y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x)
+            x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y)
+
+            @test x ≈ x_reconstructed rtol=1e-4
+            @test lj_fwd ≈ -lj_bwd rtol=1e-4
+
+            x_batch = rand(q₀, 10)
+            y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch)
+            x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch)
+
+            @test x_batch ≈ x_batch_reconstructed rtol=1e-4
+            @test ljs_fwd ≈ -ljs_bwd rtol=1e-4
+        end
+
+
+        @testset "ELBO test for type: $T" begin
+            μ = randn(T, dim)
+            Σ = Diagonal(rand(T, dim) .+ T(1e-3))
+            target = MvNormal(μ, Σ)
+            logp(z) = logpdf(target, z)
+
+            # Compute ELBO
+            batchsize = 64
+            elbo_value = elbo(Random.default_rng(), flow, logp, batchsize)
+            elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize)
+
+            # test when batchsize == 1
+            batchsize_single = 1
+            elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single)
+
+            # test elbo_value is not NaN and not Inf
+            @test isfinite(elbo_value)
+            @test isfinite(elbo_batch_value)
+            @test isfinite(elbo_value_single)
+        end
+    end
+end
+
+
+
+@testset "Radial flow" begin
+    Random.seed!(123)
+
+    dim = 5
+    nlayers = 10
+    for T in [Float32, Float64]
+        # Create a radial flow
+        q₀ = MvNormal(zeros(T, dim), I)
+        @leaf MvNormal
+
+        flow = NormalizingFlows.radialflow(q₀, nlayers; paramtype=T)
+
+        @testset "Sampling and density estimation for type: $T" begin
+            ys = rand(flow, 100)
+            ℓs = logpdf(flow, ys)
+
+            @test size(ys) == (dim, 100)
+            @test length(ℓs) == 100
+
+            @test eltype(ys) == T
+            @test eltype(ℓs) == T
+        end
+
+
+        @testset "Inverse compatibility for type: $T" begin
+            x = rand(q₀)
+            y, lj_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x)
+            x_reconstructed, lj_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y)
+
+            @test x ≈ x_reconstructed rtol=1e-4
+            @test lj_fwd ≈ -lj_bwd rtol=1e-4
+
+            x_batch = rand(q₀, 10)
+            y_batch, ljs_fwd = Bijectors.with_logabsdet_jacobian(flow.transform, x_batch)
+            x_batch_reconstructed, ljs_bwd = Bijectors.with_logabsdet_jacobian(inverse(flow.transform), y_batch)
+
+            @test x_batch ≈ x_batch_reconstructed rtol=1e-4
+            @test ljs_fwd ≈ -ljs_bwd rtol=1e-4
+        end
+
+
+        @testset "ELBO test for type: $T" begin
+            μ = randn(T, dim)
+            Σ = Diagonal(rand(T, dim) .+ T(1e-3))
+            target = MvNormal(μ, Σ)
+            logp(z) = logpdf(target, z)
+
+            # Compute ELBO
+            batchsize = 64
+            elbo_value = elbo(Random.default_rng(), flow, logp, batchsize)
+            elbo_batch_value = elbo_batch(Random.default_rng(), flow, logp, batchsize)
+
+            # test when batchsize == 1
+            batchsize_single = 1
+            elbo_value_single = elbo(Random.default_rng(), flow, logp, batchsize_single)
+
+            # test elbo_value is not NaN and not Inf
+            @test isfinite(elbo_value)
+            @test isfinite(elbo_batch_value)
+            @test isfinite(elbo_value_single)
+        end
+    end
+end
+
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 33a9808..2808fb1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,6 +11,7 @@ import DifferentiationInterface as DI
 using Test
 
-include("ad.jl")
 include("objectives.jl")
 include("interface.jl")
+include("flow.jl")
+include("ad.jl")
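
A minimal usage sketch of the constructors and objectives added above (the dimensions, layer
sizes, and batch size are illustrative; it assumes Distributions, LinearAlgebra, and Functors
are loaded alongside NormalizingFlows):

    using Distributions, LinearAlgebra, Random
    using Functors: @leaf
    using NormalizingFlows

    @leaf MvNormal                          # keep the base distribution's parameters fixed
    q0 = MvNormal(zeros(Float32, 2), I)

    flow = realnvp(q0; paramtype=Float32)   # default: 10 layers, [32, 32] conditioners
    # flow = nsf(q0, [32, 32], 10, 5f0, 3; paramtype=Float32)  # neural spline flow variant
    # flow = planarflow(q0, 10; paramtype=Float32)             # planar flow variant

    xs = rand(flow, 128)                    # column-wise samples
    lps = logpdf(flow, xs)

    target = MvNormal(ones(Float32, 2), Diagonal(ones(Float32, 2)))
    logp(z) = logpdf(target, z)
    elbo_batch(Random.default_rng(), flow, logp, 64)  # batched Monte Carlo ELBO estimate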