diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 00000000..db82ba79
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,28 @@
+env:
+  # SECRET_CODECOV_TOKEN can be added here if needed for coverage reporting
+
+steps:
+  - label: "Julia v{{matrix.version}}, {{matrix.label}}"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "{{matrix.version}}"
+      # - JuliaCI/julia-coverage#v1:
+      #     dirs:
+      #       - src
+      #       - ext
+    command: julia --eval='println(pwd()); println(readdir()); include("test/ext/CUDA/cuda.jl")'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+    env:
+      LABEL: "{{matrix.label}}"
+      TEST_TYPE: ext
+    matrix:
+      setup:
+        version:
+          - "1"
+          - "1.10"
+        label:
+          - "cuda"
\ No newline at end of file
diff --git a/Project.toml b/Project.toml
index 0b5ec051..d6bc8216 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,12 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+NormalizingFlowsCUDAExt = "CUDA"
+
 [compat]
 ADTypes = "1"
 Bijectors = "0.12.6, 0.13, 0.14, 0.15"
diff --git a/docs/make.jl b/docs/make.jl
index 0cd188d7..304e9838 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -16,4 +16,5 @@ makedocs(;
         "Example" => "example.md",
         "Customize your own flow layer" => "customized_layer.md",
     ],
+    checkdocs=:exports,
 )
diff --git a/example/targets/banana.jl b/example/targets/banana.jl
index ee89cb58..c3d1d1ba 100644
--- a/example/targets/banana.jl
+++ b/example/targets/banana.jl
@@ -15,20 +15,19 @@ $(FIELDS)
 The banana distribution is obtained by applying a transformation ϕ to a multivariate
 normal distribution ``\\mathcal{N}(0, \\text{diag}(var, 1, 1, …, 1))``. The transformation ϕ
 is defined as
 ```math
-\phi(x_1, … , x_p) = (x_1, x_2 - B x_1^² + \text{var}*B, x_3, … , x_p)
-````
+\\phi(x_1, …, x_p) = (x_1, x_2 - B x_1^2 + \\text{var}*B, x_3, …, x_p)
+```
 which has a unit Jacobian determinant. Hence the density "fb" of a p-dimensional
 banana distribution is given by
 ```math
-fb(x_1, \dots, x_p) = \exp\left[ -\frac{1}{2}\frac{x_1^2}{\text{var}} -
-\frac{1}{2}(x_2 + B x_1^2 - \text{var}*B)^2 - \frac{1}{2}(x_3^2 + x_4^2 + \dots
-+ x_p^2) \right] / Z,
+fb(x_1, \\dots, x_p) = \\exp\\left[ -\\frac{1}{2}\\frac{x_1^2}{\\text{var}} -
+\\frac{1}{2}(x_2 + B x_1^2 - \\text{var}*B)^2 - \\frac{1}{2}(x_3^2 + x_4^2 + \\dots
++ x_p^2) \\right] / Z,
 ```
 where "B" is the "banananicity" constant, determining the curvature of a banana, and
 ``Z = \\sqrt{\\text{var} * (2\\pi)^p}`` is the normalization constant.
-
 # Reference
 Gareth O. Roberts and Jeffrey S. Rosenthal
diff --git a/example/targets/cross.jl b/example/targets/cross.jl
index 7c843f8e..f905b86e 100644
--- a/example/targets/cross.jl
+++ b/example/targets/cross.jl
@@ -11,13 +11,13 @@
 The Cross distribution is a 2-dimensional, 4-component Gaussian mixture with a shape
 that is symmetric about the x- and y-axes.
 
 The mixture is defined as
 ```math
-\begin{aligned}
-p(x) =
-& 0.25 \mathcal{N}(x | (0, \mu), (\sigma, 1)) + \\
-& 0.25 \mathcal{N}(x | (\mu, 0), (1, \sigma)) + \\
-& 0.25 \mathcal{N}(x | (0, -\mu), (\sigma, 1)) + \\
-& 0.25 \mathcal{N}(x | (-\mu, 0), (1, \sigma)))
-\end{aligned}
+\\begin{aligned}
+p(x) =
+& 0.25 \\mathcal{N}(x | (0, \\mu), (\\sigma, 1)) + \\\\
+& 0.25 \\mathcal{N}(x | (\\mu, 0), (1, \\sigma)) + \\\\
+& 0.25 \\mathcal{N}(x | (0, -\\mu), (\\sigma, 1)) + \\\\
+& 0.25 \\mathcal{N}(x | (-\\mu, 0), (1, \\sigma))
+\\end{aligned}
 ```
 where ``μ`` and ``σ`` are the mean and standard deviation of the Gaussian components,
diff --git a/example/targets/neal_funnel.jl b/example/targets/neal_funnel.jl
index 66f58b07..6cdf84a2 100644
--- a/example/targets/neal_funnel.jl
+++ b/example/targets/neal_funnel.jl
@@ -13,14 +13,13 @@ $(FIELDS)
 Neal's funnel distribution is a p-dimensional distribution with a funnel shape,
 originally proposed by Radford Neal in [2].
 The marginal distribution of ``x_1`` is Gaussian with mean "μ" and standard
-deviation "σ". The conditional distribution of ``x_2, \dots, x_p | x_1`` are independent
+deviation "σ". The conditional distributions of ``x_2, \\dots, x_p | x_1`` are independent
 Gaussian distributions with mean 0 and standard deviation ``\\exp(x_1/2)``.
 The generative process is given by
 ```math
-x_1 \sim \mathcal{N}(\mu, \sigma^2), \quad x_2, \ldots, x_p \sim \mathcal{N}(0, \exp(x_1))
+x_1 \\sim \\mathcal{N}(\\mu, \\sigma^2), \\quad x_2, \\ldots, x_p \\sim \\mathcal{N}(0, \\exp(x_1))
 ```
-
 # Reference
 [1] Stan User’s Guide: https://mc-stan.org/docs/2_18/stan-users-guide/reparameterization-section.html#ref-Neal:2003
diff --git a/example/targets/warped_gaussian.jl b/example/targets/warped_gaussian.jl
index 2179046d..a63012ed 100644
--- a/example/targets/warped_gaussian.jl
+++ b/example/targets/warped_gaussian.jl
@@ -13,13 +13,12 @@ $(FIELDS)
 The warped Gaussian distribution is obtained by applying a transformation ϕ to a
 2-dimensional normal distribution ``\\mathcal{N}(0, \\text{diag}(\\sigma_1, \\sigma_2))``.
 The transformation ϕ(x) is defined as
 ```math
-ϕ(x_1, x_2) = (r*\cos(\theta + r/2), r*\sin(\theta + r/2)),
+\\phi(x_1, x_2) = (r*\\cos(\\theta + r/2), r*\\sin(\\theta + r/2)),
 ```
-where ``r = \\sqrt{x\_1^2 + x_2^2}``, ``\\theta = \\atan(x₂, x₁)``,
-and "atan(y, x) ∈ [-π, π]" is the angle, in radians, between the positive x axis and the
+where ``r = \\sqrt{x_1^2 + x_2^2}``, ``\\theta = \\mathrm{atan}(x_2, x_1)``,
+and ``\\mathrm{atan}(y, x) \\in [-\\pi, \\pi]`` is the angle, in radians, between the positive x-axis and the
 ray to the point "(x, y)". See page 18 of [1] for reference.
-
 # Reference
 [1] Zuheng Xu, Naitong Chen, Trevor Campbell
 "MixFlows: principled variational inference via mixed flows."
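As a concrete reading of the transformation ϕ in the warped-Gaussian docstring above, here is a minimal standalone sketch; the function name `warp` is illustrative and not part of the package:

```julia
# Illustrative sketch of the warping ϕ from the warped-Gaussian docstring;
# not part of the package API.
function warp(x1::Real, x2::Real)
    r = sqrt(x1^2 + x2^2)   # radius
    θ = atan(x2, x1)        # two-argument atan, in (-π, π]
    return (r * cos(θ + r / 2), r * sin(θ + r / 2))
end
```

Since the map only rotates each point by an angle depending on its radius, it preserves volume, which is why the docstring can state the density via a unit Jacobian determinant.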
diff --git a/ext/NormalizingFlowsCUDAExt.jl b/ext/NormalizingFlowsCUDAExt.jl
new file mode 100644
index 00000000..1459cad7
--- /dev/null
+++ b/ext/NormalizingFlowsCUDAExt.jl
@@ -0,0 +1,76 @@
+module NormalizingFlowsCUDAExt
+
+using CUDA
+using NormalizingFlows
+using NormalizingFlows: Bijectors, Distributions, Random
+
+function NormalizingFlows._device_specific_rand(
+    rng::CUDA.RNG,
+    s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous},
+)
+    return _cuda_rand(rng, s)
+end
+
+function NormalizingFlows._device_specific_rand(
+    rng::CUDA.RNG,
+    s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous},
+    n::Int,
+)
+    return _cuda_rand(rng, s, n)
+end
+
+function _cuda_rand(
+    rng::CUDA.RNG,
+    s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous},
+)
+    return @inbounds Distributions.rand!(
+        rng, Distributions.sampler(s), CuArray{float(eltype(s))}(undef, size(s))
+    )
+end
+
+function _cuda_rand(
+    rng::CUDA.RNG,
+    s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous},
+    n::Int,
+)
+    return @inbounds Distributions.rand!(
+        rng, Distributions.sampler(s), CuArray{float(eltype(s))}(undef, size(s)..., n)
+    )
+end
+
+# ! this is type piracy:
+# replaces the original method, whose scalar indexing does not work on the GPU
+function Distributions._rand!(rng::CUDA.RNG, d::Distributions.MvNormal, x::CuVecOrMat)
+    Random.randn!(rng, x)
+    Distributions.unwhiten!(d.Σ, x)
+    x .+= d.μ
+    return x
+end
+
+# to enable `_device_specific_rand(rng::CUDA.RNG, flow[, num_samples])`
+function NormalizingFlows._device_specific_rand(rng::CUDA.RNG, td::Bijectors.TransformedDistribution)
+    return _cuda_rand(rng, td)
+end
+
+function NormalizingFlows._device_specific_rand(
+    rng::CUDA.RNG, td::Bijectors.TransformedDistribution, num_samples::Int
+)
+    return _cuda_rand(rng, td, num_samples)
+end
+
+function _cuda_rand(rng::CUDA.RNG, td::Bijectors.TransformedDistribution)
+    return td.transform(_cuda_rand(rng, td.dist))
+end
+
+function _cuda_rand(rng::CUDA.RNG, td::Bijectors.TransformedDistribution, num_samples::Int)
+    samples = _cuda_rand(rng, td.dist, num_samples)
+    res = reduce(
+        hcat,
+        map(axes(samples, 2)) do i
+            return td.transform(view(samples, :, i))
+        end,
+    )
+    return res
+end
+
+end
diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl
index 709f7586..7c4deccb 100644
--- a/src/NormalizingFlows.jl
+++ b/src/NormalizingFlows.jl
@@ -1,10 +1,13 @@
 module NormalizingFlows
 
+using ADTypes
 using Bijectors
+using Distributions
+using LinearAlgebra
 using Optimisers
-using LinearAlgebra, Random, Distributions, StatsBase
 using ProgressMeter
-using ADTypes
+using Random
+using StatsBase
 import DifferentiationInterface as DI
 using DocStringExtensions
 
@@ -22,7 +25,6 @@ Train the given normalizing flow `flow` by calling `optimize`.
 - `flow`: normalizing flow to be trained; we recommend defining the flow as
   `<:Bijectors.TransformedDistribution`
 - `args...`: additional arguments for `vo`
-
 # Keyword Arguments
 - `max_iters::Int=1000`: maximum number of iterations
 - `optimiser::Optimisers.AbstractRule=Optimisers.ADAM()`: optimiser to compute the steps
@@ -81,6 +83,44 @@ function train_flow(
 end
 
 include("optimize.jl")
-include("objectives.jl")
+
+# objectives
+include("objectives/elbo.jl")
+include("objectives/loglikelihood.jl") # not fully tested
+
+"""
+    _device_specific_rand
+
+By default, dispatches to `Random.rand`; overload this when the random number
+generator is device-specific (e.g. `CUDA.RNG`).
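+
+For example, with a CPU RNG this simply falls back to `Random.rand`
+(illustrative; `MvNormal(zeros(2), I)` is just a stand-in sampleable):
+
+    x  = _device_specific_rand(Random.default_rng(), MvNormal(zeros(2), I))      # length-2 Vector
+    xs = _device_specific_rand(Random.default_rng(), MvNormal(zeros(2), I), 10)  # 2×10 Matrix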
+""" +function _device_specific_rand end + +function _device_specific_rand( + rng::Random.AbstractRNG, + s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous}, +) + return Random.rand(rng, s) +end + +function _device_specific_rand( + rng::Random.AbstractRNG, + s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous}, + n::Int, +) + return Random.rand(rng, s, n) +end + +function _device_specific_rand( + rng::Random.AbstractRNG, td::Bijectors.TransformedDistribution +) + return Random.rand(rng, td) +end + +function _device_specific_rand( + rng::Random.AbstractRNG, td::Bijectors.TransformedDistribution, n::Int +) + return Random.rand(rng, td, n) +end end diff --git a/src/objectives.jl b/src/objectives.jl deleted file mode 100644 index 1d7ac5a2..00000000 --- a/src/objectives.jl +++ /dev/null @@ -1,2 +0,0 @@ -include("objectives/elbo.jl") -include("objectives/loglikelihood.jl") # not fully tested diff --git a/src/objectives/elbo.jl b/src/objectives/elbo.jl index 2751ed90..14425f4c 100644 --- a/src/objectives/elbo.jl +++ b/src/objectives/elbo.jl @@ -33,11 +33,11 @@ function elbo(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) end function elbo(rng::AbstractRNG, flow::Bijectors.MultivariateTransformed, logp, n_samples) - return elbo(flow, logp, rand(rng, flow.dist, n_samples)) + return elbo(flow, logp, _device_specific_rand(rng, flow.dist, n_samples)) end function elbo(rng::AbstractRNG, flow::Bijectors.UnivariateTransformed, logp, n_samples) - return elbo(flow, logp, rand(rng, flow.dist, n_samples)) + return elbo(flow, logp, _device_specific_rand(rng, flow.dist, n_samples)) end function elbo(flow::Bijectors.TransformedDistribution, logp, n_samples) diff --git a/test/ext/CUDA/Project.toml b/test/ext/CUDA/Project.toml new file mode 100644 index 00000000..42a8d06b --- /dev/null +++ b/test/ext/CUDA/Project.toml @@ -0,0 +1,8 @@ +[deps] +Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" \ No newline at end of file diff --git a/test/ext/CUDA/cuda.jl b/test/ext/CUDA/cuda.jl new file mode 100644 index 00000000..e8fc7596 --- /dev/null +++ b/test/ext/CUDA/cuda.jl @@ -0,0 +1,62 @@ +using Pkg +Pkg.activate(@__DIR__) +Pkg.develop(; path=joinpath(@__DIR__, "..", "..", "..")) + +using NormalizingFlows +using Bijectors, CUDA, Distributions, Flux, LinearAlgebra, Test + +@testset "rand with CUDA" begin + + # Bijectors versions use dot for broadcasting, which causes issues with CUDA. 
+    # https://github.com/TuringLang/Bijectors.jl/blob/6f0d383f73afd150a018b65a3ea4ac9306065d38/src/bijectors/planar_layer.jl#L65-L80
+    function Bijectors.get_u_hat(u::CuVector{T}, w::CuVector{T}) where {T<:Real}
+        wT_u = dot(w, u)
+        scale = (Bijectors.LogExpFunctions.log1pexp(-wT_u) - 1) / sum(abs2, w)
+        û = CUDA.broadcast(+, u, CUDA.broadcast(*, scale, w))
+        wT_û = Bijectors.LogExpFunctions.log1pexp(wT_u) - 1
+        return û, wT_û
+    end
+    function Bijectors._transform(flow::PlanarLayer, z::CuArray{T}) where {T<:Real}
+        w = CuArray(flow.w)
+        b = T(first(flow.b)) # scalar
+
+        û, wT_û = Bijectors.get_u_hat(CuArray(flow.u), w)
+        wT_z = Bijectors.aT_b(w, z)
+
+        tanh_term = CUDA.tanh.(CUDA.broadcast(+, wT_z, b))
+        transformed = CUDA.broadcast(+, z, CUDA.broadcast(*, û, tanh_term))
+
+        return (transformed=transformed, wT_û=wT_û, wT_z=wT_z)
+    end
+
+    dists = [
+        MvNormal(CUDA.zeros(2), cu(Matrix{Float64}(I, 2, 2))),
+        MvNormal(CUDA.zeros(2), cu([1.0 0.5; 0.5 1.0])),
+    ]
+
+    @testset "$dist" for dist in dists
+        CUDA.allowscalar(true)
+        x = NormalizingFlows._device_specific_rand(CUDA.default_rng(), dist)
+        xs = NormalizingFlows._device_specific_rand(CUDA.default_rng(), dist, 100)
+        @test_nowarn logpdf(dist, x)
+        @test x isa CuArray
+        @test xs isa CuArray
+    end
+
+    @testset "$dist" for dist in dists
+        CUDA.allowscalar(true)
+        pl1 = PlanarLayer(
+            identity(CUDA.rand(2)), identity(CUDA.rand(2)), identity(CUDA.rand(1))
+        )
+        pl2 = PlanarLayer(
+            identity(CUDA.rand(2)), identity(CUDA.rand(2)), identity(CUDA.rand(1))
+        )
+        flow = Bijectors.transformed(dist, ComposedFunction(pl1, pl2))
+
+        y = NormalizingFlows._device_specific_rand(CUDA.default_rng(), flow)
+        ys = NormalizingFlows._device_specific_rand(CUDA.default_rng(), flow, 100)
+        @test_nowarn logpdf(flow, y)
+        @test y isa CuArray
+        @test ys isa CuArray
+    end
+end
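Taken together, the extension makes GPU-resident sampling, and hence the ELBO estimator touched above, work end to end. A minimal sketch, assuming a working CUDA device and the PlanarLayer patches from the test file; `logp` here is a placeholder target density, not part of the package:

```julia
using CUDA, Bijectors, Distributions, LinearAlgebra
using NormalizingFlows

rng = CUDA.default_rng()
q0 = MvNormal(CUDA.zeros(2), cu(Matrix{Float64}(I, 2, 2)))
flow = Bijectors.transformed(q0, PlanarLayer(CUDA.rand(2), CUDA.rand(2), CUDA.rand(1)))

# `_device_specific_rand` dispatches on CUDA.RNG, so the samples stay on the device.
ys = NormalizingFlows._device_specific_rand(rng, flow, 100)

# The ELBO now draws its base samples through the same hook (see the elbo.jl change),
# so it can be estimated without moving data off the GPU.
CUDA.allowscalar(true)   # the MvNormal logpdf may still hit scalar indexing, as in the tests
logp(x) = logpdf(q0, x)  # placeholder target; any CUDA-compatible log density works
el = NormalizingFlows.elbo(rng, flow, logp, 32)
```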