diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 00000000..db82ba79
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,28 @@
+env:
+  # SECRET_CODECOV_TOKEN can be added here if needed for coverage reporting
+
+steps:
+  - label: "Julia v{{matrix.version}}, {{matrix.label}}"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "{{matrix.version}}"
+      # - JuliaCI/julia-coverage#v1:
+      #     dirs:
+      #       - src
+      #       - ext
+    command: julia --eval='println(pwd()); println(readdir()); include("test/ext/CUDA/cuda.jl")'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+    env:
+      LABEL: "{{matrix.label}}"
+      TEST_TYPE: ext
+    matrix:
+      setup:
+        version:
+          - "1"
+          - "1.10"
+        label:
+          - "cuda"
\ No newline at end of file
diff --git a/Project.toml b/Project.toml
index 0b5ec051..d6bc8216 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,12 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+NormalizingFlowsCUDAExt = "CUDA"
+
 [compat]
 ADTypes = "1"
 Bijectors = "0.12.6, 0.13, 0.14, 0.15"
diff --git a/docs/make.jl b/docs/make.jl
index 0cd188d7..304e9838 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -16,4 +16,5 @@ makedocs(;
         "Example" => "example.md",
         "Customize your own flow layer" => "customized_layer.md",
     ],
+    checkdocs=:exports,
 )
diff --git a/example/targets/banana.jl b/example/targets/banana.jl
index ee89cb58..c3d1d1ba 100644
--- a/example/targets/banana.jl
+++ b/example/targets/banana.jl
@@ -15,20 +15,19 @@ $(FIELDS)
 The banana distribution is obtained by applying a transformation ϕ to a multivariate
 normal distribution ``\\mathcal{N}(0, \\text{diag}(var, 1, 1, …, 1))``. The transformation ϕ
 is defined as
 ```math
-\phi(x_1, … , x_p) = (x_1, x_2 - B x_1^² + \text{var}*B, x_3, … , x_p)
-````
+\\phi(x_1, …, x_p) = (x_1, x_2 - B x_1^2 + \\text{var}*B, x_3, …, x_p)
+```
 which has a unit Jacobian determinant. Hence the density "fb" of a p-dimensional
 banana distribution is given by
 ```math
-fb(x_1, \dots, x_p) = \exp\left[ -\frac{1}{2}\frac{x_1^2}{\text{var}} -
-\frac{1}{2}(x_2 + B x_1^2 - \text{var}*B)^2 - \frac{1}{2}(x_3^2 + x_4^2 + \dots
-+ x_p^2) \right] / Z,
+fb(x_1, \\dots, x_p) = \\exp\\left[ -\\frac{1}{2}\\frac{x_1^2}{\\text{var}} -
+\\frac{1}{2}(x_2 + B x_1^2 - \\text{var}*B)^2 - \\frac{1}{2}(x_3^2 + x_4^2 + \\dots
++ x_p^2) \\right] / Z,
 ```
 where "B" is the "banananicity" constant, determining the curvature of a banana, and
 ``Z = \\sqrt{\\text{var} * (2\\pi)^p}`` is the normalization constant.
-
 # Reference
 Gareth O. Roberts and Jeffrey S. Rosenthal
diff --git a/example/targets/cross.jl b/example/targets/cross.jl
index 7c843f8e..f905b86e 100644
--- a/example/targets/cross.jl
+++ b/example/targets/cross.jl
@@ -11,13 +11,13 @@
 The Cross distribution is a 2-dimensional, 4-component Gaussian mixture with a shape
 that is symmetric about the x- and y-axes.
 
 The mixture is defined as
 ```math
-\begin{aligned}
-p(x) =
-& 0.25 \mathcal{N}(x | (0, \mu), (\sigma, 1)) + \\
-& 0.25 \mathcal{N}(x | (\mu, 0), (1, \sigma)) + \\
-& 0.25 \mathcal{N}(x | (0, -\mu), (\sigma, 1)) + \\
-& 0.25 \mathcal{N}(x | (-\mu, 0), (1, \sigma)))
-\end{aligned}
+\\begin{aligned}
+p(x) =
+& 0.25 \\mathcal{N}(x | (0, \\mu), (\\sigma, 1)) + \\\\
+& 0.25 \\mathcal{N}(x | (\\mu, 0), (1, \\sigma)) + \\\\
+& 0.25 \\mathcal{N}(x | (0, -\\mu), (\\sigma, 1)) + \\\\
+& 0.25 \\mathcal{N}(x | (-\\mu, 0), (1, \\sigma))
+\\end{aligned}
 ```
 where ``μ`` and ``σ`` are the mean and standard deviation of the Gaussian components,
diff --git a/example/targets/neal_funnel.jl b/example/targets/neal_funnel.jl
index 66f58b07..6cdf84a2 100644
--- a/example/targets/neal_funnel.jl
+++ b/example/targets/neal_funnel.jl
@@ -13,14 +13,13 @@ $(FIELDS)
 Neal's funnel distribution is a p-dimensional distribution with a funnel shape,
 originally proposed by Radford Neal in [2].
 The marginal distribution of ``x_1`` is Gaussian with mean "μ" and standard
-deviation "σ". The conditional distribution of ``x_2, \dots, x_p | x_1`` are independent
+deviation "σ". The conditional distributions of ``x_2, \\dots, x_p | x_1`` are independent
 Gaussian distributions with mean 0 and standard deviation ``\\exp(x_1/2)``.
 The generative process is given by
 ```math
-x_1 \sim \mathcal{N}(\mu, \sigma^2), \quad x_2, \ldots, x_p \sim \mathcal{N}(0, \exp(x_1))
+x_1 \\sim \\mathcal{N}(\\mu, \\sigma^2), \\quad x_2, \\ldots, x_p \\sim \\mathcal{N}(0, \\exp(x_1))
 ```
-
 # Reference
 [1] Stan User’s Guide: https://mc-stan.org/docs/2_18/stan-users-guide/reparameterization-section.html#ref-Neal:2003
diff --git a/example/targets/warped_gaussian.jl b/example/targets/warped_gaussian.jl
index 2179046d..a63012ed 100644
--- a/example/targets/warped_gaussian.jl
+++ b/example/targets/warped_gaussian.jl
@@ -13,13 +13,12 @@ $(FIELDS)
 The warped Gaussian distribution is obtained by applying a transformation ϕ to a
 2-dimensional normal distribution ``\\mathcal{N}(0, \\text{diag}(\\sigma_1, \\sigma_2))``.
 The transformation ϕ(x) is defined as
 ```math
-ϕ(x_1, x_2) = (r*\cos(\theta + r/2), r*\sin(\theta + r/2)),
+\\phi(x_1, x_2) = (r*\\cos(\\theta + r/2), r*\\sin(\\theta + r/2)),
 ```
-where ``r = \\sqrt{x\_1^2 + x_2^2}``, ``\\theta = \\atan(x₂, x₁)``,
-and "atan(y, x) ∈ [-π, π]" is the angle, in radians, between the positive x axis and the
+where ``r = \\sqrt{x_1^2 + x_2^2}``, ``\\theta = \\mathrm{atan}(x_2, x_1)``,
+and ``\\mathrm{atan}(y, x) \\in [-\\pi, \\pi]`` is the angle, in radians, between the positive x-axis and the
 ray to the point "(x, y)". See page 18 of [1] for reference.
-
 # Reference
 [1] Zuheng Xu, Naitong Chen, Trevor Campbell
 "MixFlows: principled variational inference via mixed flows."
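As a concrete reading of the transformation ϕ in the warped-Gaussian docstring above, here is a minimal standalone sketch; the function name `warp` is illustrative and not part of the package:

```julia
# Illustrative sketch of the warping ϕ from the warped-Gaussian docstring;
# not part of the package API.
function warp(x1::Real, x2::Real)
    r = sqrt(x1^2 + x2^2)   # radius
    θ = atan(x2, x1)        # two-argument atan, in (-π, π]
    return (r * cos(θ + r / 2), r * sin(θ + r / 2))
end
```

Since the map only rotates each point by an angle depending on its radius, it preserves volume, which is why the docstring can state the density via a unit Jacobian determinant.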
diff --git a/ext/NormalizingFlowsCUDAExt.jl b/ext/NormalizingFlowsCUDAExt.jl
new file mode 100644
index 00000000..1459cad7
--- /dev/null
+++ b/ext/NormalizingFlowsCUDAExt.jl
@@ -0,0 +1,76 @@
+module NormalizingFlowsCUDAExt
+
+using CUDA
+using NormalizingFlows
+using NormalizingFlows: Bijectors, Distributions, Random
+
+function NormalizingFlows._device_specific_rand(
+    rng::CUDA.RNG,
+    s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous},
+)
+    return _cuda_rand(rng, s)
+end
+
+function NormalizingFlows._device_specific_rand(
+    rng::CUDA.RNG,
+    s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous},
+    n::Int,
+)
+    return _cuda_rand(rng, s, n)
+end
+
+function _cuda_rand(
+    rng::CUDA.RNG,
+    s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous},
+)
+    return @inbounds Distributions.rand!(
+        rng, Distributions.sampler(s), CuArray{float(eltype(s))}(undef, size(s))
+    )
+end
+
+function _cuda_rand(
+    rng::CUDA.RNG,
+    s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous},
+    n::Int,
+)
+    return @inbounds Distributions.rand!(
+        rng, Distributions.sampler(s), CuArray{float(eltype(s))}(undef, size(s)..., n)
+    )
+end
+
+# ! this is type piracy:
+# replaces the original method, whose scalar indexing does not work on the GPU
+function Distributions._rand!(rng::CUDA.RNG, d::Distributions.MvNormal, x::CuVecOrMat)
+    Random.randn!(rng, x)
+    Distributions.unwhiten!(d.Σ, x)
+    x .+= d.μ
+    return x
+end
+
+# to enable `_device_specific_rand(rng::CUDA.RNG, flow[, num_samples])`
+function NormalizingFlows._device_specific_rand(rng::CUDA.RNG, td::Bijectors.TransformedDistribution)
+    return _cuda_rand(rng, td)
+end
+
+function NormalizingFlows._device_specific_rand(
+    rng::CUDA.RNG, td::Bijectors.TransformedDistribution, num_samples::Int
+)
+    return _cuda_rand(rng, td, num_samples)
+end
+
+function _cuda_rand(rng::CUDA.RNG, td::Bijectors.TransformedDistribution)
+    return td.transform(_cuda_rand(rng, td.dist))
+end
+
+function _cuda_rand(rng::CUDA.RNG, td::Bijectors.TransformedDistribution, num_samples::Int)
+    samples = _cuda_rand(rng, td.dist, num_samples)
+    res = reduce(
+        hcat,
+        map(axes(samples, 2)) do i
+            return td.transform(view(samples, :, i))
+        end,
+    )
+    return res
+end
+
+end
diff --git a/src/NormalizingFlows.jl b/src/NormalizingFlows.jl
index 709f7586..7c4deccb 100644
--- a/src/NormalizingFlows.jl
+++ b/src/NormalizingFlows.jl
@@ -1,10 +1,13 @@
 module NormalizingFlows
 
+using ADTypes
 using Bijectors
+using Distributions
+using LinearAlgebra
 using Optimisers
-using LinearAlgebra, Random, Distributions, StatsBase
 using ProgressMeter
-using ADTypes
+using Random
+using StatsBase
 import DifferentiationInterface as DI
 using DocStringExtensions
 
@@ -22,7 +25,6 @@ Train the given normalizing flow `flow` by calling `optimize`.
 - `flow`: normalizing flow to be trained; we recommend defining the flow as
   `<:Bijectors.TransformedDistribution`
 - `args...`: additional arguments for `vo`
-
 # Keyword Arguments
 - `max_iters::Int=1000`: maximum number of iterations
 - `optimiser::Optimisers.AbstractRule=Optimisers.ADAM()`: optimiser to compute the steps
@@ -81,6 +83,44 @@ function train_flow(
 end
 
 include("optimize.jl")
-include("objectives.jl")
+
+# objectives
+include("objectives/elbo.jl")
+include("objectives/loglikelihood.jl") # not fully tested
+
+"""
+    _device_specific_rand
+
+By default, dispatches to `Random.rand`; overload this when the random number
+generator is device-specific (e.g. `CUDA.RNG`).
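+
+For example, with a CPU RNG this simply falls back to `Random.rand`
+(illustrative; `MvNormal(zeros(2), I)` is just a stand-in sampleable):
+
+    x  = _device_specific_rand(Random.default_rng(), MvNormal(zeros(2), I))      # length-2 Vector
+    xs = _device_specific_rand(Random.default_rng(), MvNormal(zeros(2), I), 10)  # 2×10 Matrix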
+""" +function _device_specific_rand end + +function _device_specific_rand( + rng::Random.AbstractRNG, + s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous}, +) + return Random.rand(rng, s) +end + +function _device_specific_rand( + rng::Random.AbstractRNG, + s::Distributions.Sampleable{<:Distributions.ArrayLikeVariate,Distributions.Continuous}, + n::Int, +) + return Random.rand(rng, s, n) +end + +function _device_specific_rand( + rng::Random.AbstractRNG, td::Bijectors.TransformedDistribution +) + return Random.rand(rng, td) +end + +function _device_specific_rand( + rng::Random.AbstractRNG, td::Bijectors.TransformedDistribution, n::Int +) + return Random.rand(rng, td, n) +end end diff --git a/src/objectives.jl b/src/objectives.jl deleted file mode 100644 index 1d7ac5a2..00000000 --- a/src/objectives.jl +++ /dev/null @@ -1,2 +0,0 @@ -include("objectives/elbo.jl") -include("objectives/loglikelihood.jl") # not fully tested diff --git a/src/objectives/elbo.jl b/src/objectives/elbo.jl index 2751ed90..14425f4c 100644 --- a/src/objectives/elbo.jl +++ b/src/objectives/elbo.jl @@ -33,11 +33,11 @@ function elbo(flow::Bijectors.MultivariateTransformed, logp, xs::AbstractMatrix) end function elbo(rng::AbstractRNG, flow::Bijectors.MultivariateTransformed, logp, n_samples) - return elbo(flow, logp, rand(rng, flow.dist, n_samples)) + return elbo(flow, logp, _device_specific_rand(rng, flow.dist, n_samples)) end function elbo(rng::AbstractRNG, flow::Bijectors.UnivariateTransformed, logp, n_samples) - return elbo(flow, logp, rand(rng, flow.dist, n_samples)) + return elbo(flow, logp, _device_specific_rand(rng, flow.dist, n_samples)) end function elbo(flow::Bijectors.TransformedDistribution, logp, n_samples) diff --git a/test/ext/CUDA/Project.toml b/test/ext/CUDA/Project.toml new file mode 100644 index 00000000..42a8d06b --- /dev/null +++ b/test/ext/CUDA/Project.toml @@ -0,0 +1,8 @@ +[deps] +Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +NormalizingFlows = "50e4474d-9f12-44b7-af7a-91ab30ff6256" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" \ No newline at end of file diff --git a/test/ext/CUDA/cuda.jl b/test/ext/CUDA/cuda.jl new file mode 100644 index 00000000..e8fc7596 --- /dev/null +++ b/test/ext/CUDA/cuda.jl @@ -0,0 +1,62 @@ +using Pkg +Pkg.activate(@__DIR__) +Pkg.develop(; path=joinpath(@__DIR__, "..", "..", "..")) + +using NormalizingFlows +using Bijectors, CUDA, Distributions, Flux, LinearAlgebra, Test + +@testset "rand with CUDA" begin + + # Bijectors versions use dot for broadcasting, which causes issues with CUDA. 
+    # https://github.com/TuringLang/Bijectors.jl/blob/6f0d383f73afd150a018b65a3ea4ac9306065d38/src/bijectors/planar_layer.jl#L65-L80
+    function Bijectors.get_u_hat(u::CuVector{T}, w::CuVector{T}) where {T<:Real}
+        wT_u = dot(w, u)
+        scale = (Bijectors.LogExpFunctions.log1pexp(-wT_u) - 1) / sum(abs2, w)
+        û = CUDA.broadcast(+, u, CUDA.broadcast(*, scale, w))
+        wT_û = Bijectors.LogExpFunctions.log1pexp(wT_u) - 1
+        return û, wT_û
+    end
+    function Bijectors._transform(flow::PlanarLayer, z::CuArray{T}) where {T<:Real}
+        w = CuArray(flow.w)
+        b = T(first(flow.b)) # scalar
+
+        û, wT_û = Bijectors.get_u_hat(CuArray(flow.u), w)
+        wT_z = Bijectors.aT_b(w, z)
+
+        tanh_term = CUDA.tanh.(CUDA.broadcast(+, wT_z, b))
+        transformed = CUDA.broadcast(+, z, CUDA.broadcast(*, û, tanh_term))
+
+        return (transformed=transformed, wT_û=wT_û, wT_z=wT_z)
+    end
+
+    dists = [
+        MvNormal(CUDA.zeros(2), cu(Matrix{Float64}(I, 2, 2))),
+        MvNormal(CUDA.zeros(2), cu([1.0 0.5; 0.5 1.0])),
+    ]
+
+    @testset "$dist" for dist in dists
+        CUDA.allowscalar(true)
+        x = NormalizingFlows._device_specific_rand(CUDA.default_rng(), dist)
+        xs = NormalizingFlows._device_specific_rand(CUDA.default_rng(), dist, 100)
+        @test_nowarn logpdf(dist, x)
+        @test x isa CuArray
+        @test xs isa CuArray
+    end
+
+    @testset "$dist" for dist in dists
+        CUDA.allowscalar(true)
+        pl1 = PlanarLayer(
+            identity(CUDA.rand(2)), identity(CUDA.rand(2)), identity(CUDA.rand(1))
+        )
+        pl2 = PlanarLayer(
+            identity(CUDA.rand(2)), identity(CUDA.rand(2)), identity(CUDA.rand(1))
+        )
+        flow = Bijectors.transformed(dist, ComposedFunction(pl1, pl2))
+
+        y = NormalizingFlows._device_specific_rand(CUDA.default_rng(), flow)
+        ys = NormalizingFlows._device_specific_rand(CUDA.default_rng(), flow, 100)
+        @test_nowarn logpdf(flow, y)
+        @test y isa CuArray
+        @test ys isa CuArray
+    end
+end
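Taken together, the extension makes GPU-resident sampling, and hence the ELBO estimator touched above, work end to end. A minimal sketch, assuming a working CUDA device and the PlanarLayer patches from the test file; `logp` here is a placeholder target density, not part of the package:

```julia
using CUDA, Bijectors, Distributions, LinearAlgebra
using NormalizingFlows

rng = CUDA.default_rng()
q0 = MvNormal(CUDA.zeros(2), cu(Matrix{Float64}(I, 2, 2)))
flow = Bijectors.transformed(q0, PlanarLayer(CUDA.rand(2), CUDA.rand(2), CUDA.rand(1)))

# `_device_specific_rand` dispatches on CUDA.RNG, so the samples stay on the device.
ys = NormalizingFlows._device_specific_rand(rng, flow, 100)

# The ELBO now draws its base samples through the same hook (see the elbo.jl change),
# so it can be estimated without moving data off the GPU.
CUDA.allowscalar(true)   # the MvNormal logpdf may still hit scalar indexing, as in the tests
logp(x) = logpdf(q0, x)  # placeholder target; any CUDA-compatible log density works
el = NormalizingFlows.elbo(rng, flow, logp, 32)
```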