
Commit c4128fa

add new nsf implementation and demo; much faster than the original nsf
1 parent 99a0fed · commit c4128fa

6 files changed: +195 −42 lines

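The commit message claims the new implementation is much faster than the original nsf, but the diff itself contains no benchmark. A minimal sketch of how one might check the claim (hypothetical, not part of this commit; it assumes BenchmarkTools.jl plus the `nsf` and `new_nsf` constructors exported below):

# hypothetical benchmark sketch -- not included in this commit
using NormalizingFlows, Distributions, LinearAlgebra, BenchmarkTools

T = Float32
q0 = MvNormal(zeros(T, 2), I)

flow_old = nsf(q0; paramtype=T)      # original Bijectors.jl-based NSF
flow_new = new_nsf(q0; paramtype=T)  # new MonotonicSplines.jl-based NSF

x = rand(q0)                  # a single draw from the reference
@btime logpdf($flow_old, $x)  # density evaluation through the old flow
@btime logpdf($flow_new, $x)  # density evaluation through the new flow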

.github/workflows/Examples.yml

Lines changed: 2 additions & 0 deletions
@@ -38,5 +38,7 @@ jobs:
 include("demo_RealNVP.jl");
 @info "Running neural spline flow demo";
 include("demo_neural_spline_flow.jl");
+@info "Running new neural spline flow demo";
+include("demo_new_nsf.jl");
 @info "Running Hamiltonian flow demo";
 include("demo_hamiltonian_flow.jl");'

Project.toml

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,7 @@ DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MonotonicSplines = "568f7cb4-8305-41bc-b90d-d32b39cc99d1"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -31,6 +32,7 @@ Distributions = "0.25"
 DocStringExtensions = "0.9"
 Flux = "0.16"
 Functors = "0.5.2"
+MonotonicSplines = "0.3.3"
 Optimisers = "0.2.16, 0.3, 0.4"
 ProgressMeter = "1.0.0"
 StatsBase = "0.33, 0.34"

example/demo_neural_spline_flow.jl

Lines changed: 8 additions & 42 deletions
@@ -1,4 +1,3 @@
-using Flux
 using Bijectors
 using Bijectors: partition, combine, PartitionMask
 
@@ -19,21 +18,20 @@ rng = Random.default_rng()
 T = Float32
 
 ######################################
-# neals funnel target
+# a difficult banana target
 ######################################
-target = Funnel(2, 0.0f0, 9.0f0)
-logp = Base.Fix1(logpdf, target)
 
+target = Banana(2, one(T), 100one(T))
+logp = Base.Fix1(logpdf, target)
 ######################################
-# learn the target using Affine coupling flow
+# learn the target using Neural Spline Flow
 ######################################
 @leaf MvNormal
 q0 = MvNormal(zeros(T, 2), I)
 
-flow = nsf(q0; paramtype=Float32)
-flow_untrained = deepcopy(flow)
-
 
+flow = nsf(q0; paramtype=T)
+flow_untrained = deepcopy(flow)
 ######################################
 # start training
 ######################################
@@ -48,8 +46,8 @@ flow_trained, stats, _ = train_flow(
     flow,
     logp,
     sample_per_iter;
-    max_iters=100, # change to larger number of iterations (e.g., 50_000) for better results
-    optimiser=Optimisers.Adam(5e-5),
+    max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results
+    optimiser=Optimisers.Adam(1e-4),
     ADbackend=adtype,
     show_progress=true,
     callback=cb,
@@ -63,35 +61,3 @@ losses = map(x -> x.loss, stats)
 ######################################
 plot(losses; label="Loss", linewidth=2) # plot the loss
 compare_trained_and_untrained_flow(flow_trained, flow_untrained, target, 1000)
-
-
-
-
-
-
-
-
-
-# using MonotonicSplines, Plots, InverseFunctions, ChangesOfVariables
-
-# f = rand(RQSpline)
-# f.pX, f.pY, f.dYdX
-
-# plot(f, xlims = (-6, 6)); plot!(inverse(f), xlims = (-6, 6))
-
-# x = 1.2
-# y = f(x)
-# with_logabsdet_jacobian(f, x)
-# inverse(f)(y)
-# with_logabsdet_jacobian(inverse(f), y)
-
-
-
-# # test auto grad
-# function loss(x)
-#     y, laj = MonotonicSplines.rqs_forward(x, f.pX, f.pY, f.dYdX)
-#     return laj + 0.5 * sum((y .- 1).^2)
-# end
-
-# xx = rand()
-# val, g = DifferentiationInterface.value_and_gradient(loss, adtype, xx)

example/demo_new_nsf.jl

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+using Bijectors
+using Bijectors: partition, combine, PartitionMask
+
+using Random, Distributions, LinearAlgebra
+using Functors
+using Optimisers, ADTypes
+using Mooncake
+using NormalizingFlows
+
+include("SyntheticTargets.jl")
+include("utils.jl")
+
+##################################
+# start demo
+#################################
+Random.seed!(123)
+rng = Random.default_rng()
+T = Float32
+
+######################################
+# a difficult banana target
+######################################
+target = Banana(2, one(T), 100one(T))
+logp = Base.Fix1(logpdf, target)
+
+######################################
+# learn the target using Neural Spline Flow
+######################################
+@leaf MvNormal
+q0 = MvNormal(zeros(T, 2), I)
+
+
+flow = new_nsf(q0; paramtype=T)
+flow_untrained = deepcopy(flow)
+######################################
+# start training
+######################################
+sample_per_iter = 64
+
+# callback function to log training progress
+cb(iter, opt_stats, re, θ) = (sample_per_iter=sample_per_iter, ad=adtype)
+# TODO: mooncake has some issues with kernelabstractions?
+# adtype = ADTypes.AutoMooncake(; config = Mooncake.Config())
+adtype = ADTypes.AutoZygote()
+checkconv(iter, stat, re, θ, st) = stat.gradient_norm < one(T)/1000
+flow_trained, stats, _ = train_flow(
+    elbo_batch,
+    flow,
+    logp,
+    sample_per_iter;
+    max_iters=10, # change to larger number of iterations (e.g., 50_000) for better results
+    optimiser=Optimisers.Adam(1e-4),
+    ADbackend=adtype,
+    show_progress=true,
+    callback=cb,
+    hasconverged=checkconv,
+)
+θ, re = Optimisers.destructure(flow_trained)
+losses = map(x -> x.loss, stats)
+
+######################################
+# evaluate trained flow
+######################################
+plot(losses; label="Loss", linewidth=2) # plot the loss
+compare_trained_and_untrained_flow(flow_trained, flow_untrained, target, 1000)

src/NormalizingFlows.jl

Lines changed: 5 additions & 0 deletions
@@ -130,10 +130,15 @@ end
 include("flows/utils.jl")
 include("flows/realnvp.jl")
 include("flows/neuralspline.jl")
+# a new implementation of Neural Spline Flow based on MonotonicSplines.jl
+# the construction of the RQS seems to be more efficient than the one in Bijectors.jl
+# and supports batched operations.
+include("flows/new_nsf.jl")
 
 export create_flow
 export AffineCoupling, RealNVP_layer, realnvp
 export NeuralSplineCoupling, NSF_layer, nsf
+export NSC, new_NSF_layer, new_nsf
 
 
 end
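The comment added above is the rationale for the new file: MonotonicSplines.jl provides rational-quadratic spline (RQS) kernels that act on whole parameter batches at once. A small round-trip sketch of the primitives that new_nsf.jl wraps (adapted from the scratch snippet deleted from demo_neural_spline_flow.jl above; `rand(RQSpline)` draws a random spline whose knot parameters live in the fields pX, pY, dYdX):

using MonotonicSplines, InverseFunctions, ChangesOfVariables

f = rand(RQSpline)                        # random rational-quadratic spline
x = 1.2
y, logjac = with_logabsdet_jacobian(f, x) # forward pass with log|det J|
x_back = inverse(f)(y)                    # analytic inverse of the spline

# the batched kernel that NSC below calls directly:
y2, lj = MonotonicSplines.rqs_forward(x, f.pX, f.pY, f.dYdX)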

src/flows/new_nsf.jl

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+using MonotonicSplines
+
+struct NSC{T,A<:Flux.Chain} <: Bijectors.Bijector
+    dim::Int                # dimension of input
+    K::Int                  # number of knots
+    n_dims_transferred::Int # number of dimensions that are transformed
+    B::T                    # bound of the knots
+    nn::A                   # network that parameterizes the knots and derivatives
+    mask::Bijectors.PartitionMask
+end
+
+function NSC(
+    dim::T1,                      # dimension of input
+    hdims::AbstractVector{T1},    # dimensions of the hidden units of the nn
+    K::T1,                        # number of knots
+    B::T2,                        # bound of the knots
+    mask_idx::AbstractVector{T1}, # indices of the dimensions to transform
+    paramtype::Type{T2},          # type of the parameters, e.g., Float64 or Float32
+) where {T1<:Int,T2<:AbstractFloat}
+    num_of_transformed_dims = length(mask_idx)
+    input_dims = dim - num_of_transformed_dims
+
+    # output dim of the NN
+    output_dims = (3K - 1) * num_of_transformed_dims
+    # one big mlp that outputs all the knots and derivatives for all the transformed dimensions
+    nn = fnn(input_dims, hdims, output_dims; output_activation=nothing, paramtype=paramtype)
+
+    mask = Bijectors.PartitionMask(dim, mask_idx)
+    return NSC{T2,typeof(nn)}(dim, K, num_of_transformed_dims, B, nn, mask)
+end
+
+@functor NSC (nn,)
+
+function get_nsl_params(nsl::NSC, x::AbstractVecOrMat)
+    nnoutput = nsl.nn(x)
+    px, py, dydx = MonotonicSplines.rqs_params_from_nn(nnoutput, nsl.n_dims_transferred, nsl.B)
+    return px, py, dydx
+end
+
+function Bijectors.transform(nsl::NSC, x::AbstractVecOrMat)
+    x1, x2, x3 = Bijectors.partition(nsl.mask, x)
+    # instantiate rqs knots and derivatives
+    px, py, dydx = get_nsl_params(nsl, x2)
+    if x1 isa AbstractVector
+        x1 = reshape(x1, 1, length(x1)) # ensure x1 is a matrix
+    end
+    y1, _ = MonotonicSplines.rqs_forward(x1, px, py, dydx)
+    return Bijectors.combine(nsl.mask, y1, x2, x3)
+end
+
+function Bijectors.with_logabsdet_jacobian(nsl::NSC, x::AbstractVecOrMat)
+    x1, x2, x3 = Bijectors.partition(nsl.mask, x)
+    # instantiate rqs knots and derivatives
+    px, py, dydx = get_nsl_params(nsl, x2)
+    y1, logjac = MonotonicSplines.rqs_forward(x1, px, py, dydx)
+    return Bijectors.combine(nsl.mask, y1, x2, x3), vec(logjac)
+end
+
+function Bijectors.transform(insl::Inverse{<:NSC}, y::AbstractVecOrMat)
+    nsl = insl.orig
+    y1, y2, y3 = partition(nsl.mask, y)
+    px, py, dydx = get_nsl_params(nsl, y2)
+    x1, _ = MonotonicSplines.rqs_inverse(y1, px, py, dydx)
+    return Bijectors.combine(nsl.mask, x1, y2, y3)
+end
+
+function Bijectors.with_logabsdet_jacobian(insl::Inverse{<:NSC}, y::AbstractVecOrMat)
+    nsl = insl.orig
+    y1, y2, y3 = partition(nsl.mask, y)
+    px, py, dydx = get_nsl_params(nsl, y2)
+    x1, logjac = MonotonicSplines.rqs_inverse(y1, px, py, dydx)
+    return Bijectors.combine(nsl.mask, x1, y2, y3), logjac isa Real ? logjac : vec(logjac)
+end
+
+function (nsl::NSC)(x::AbstractVecOrMat)
+    return Bijectors.transform(nsl, x)
+end
+
+
+function new_NSF_layer(
+    dims::T1,                  # dimension of the problem
+    hdims::AbstractVector{T1}, # dimensions of the hidden units of the nn
+    K::T1,                     # number of knots
+    B::T2;                     # bound of the knots
+    paramtype::Type{T2} = Float64, # type of the parameters
+) where {T1<:Int,T2<:AbstractFloat}
+
+    mask_idx1 = 1:2:dims
+    mask_idx2 = 2:2:dims
+
+    # by default use the odd-even masking strategy
+    nsf1 = NSC(dims, hdims, K, B, mask_idx1, paramtype)
+    nsf2 = NSC(dims, hdims, K, B, mask_idx2, paramtype)
+    return reduce(∘, (nsf1, nsf2))
+end
+
+function new_nsf(
+    q0::Distribution{Multivariate,Continuous},
+    hdims::AbstractVector{Int}, # dimensions of the hidden units of the nn
+    K::Int,
+    B::T,
+    nlayers::Int;               # number of new_NSF_layers
+    paramtype::Type{T} = Float64, # type of the parameters
+) where {T<:AbstractFloat}
+
+    dims = length(q0) # dimension of the reference distribution == dim of the problem
+    Ls = [new_NSF_layer(dims, hdims, K, B; paramtype=paramtype) for _ in 1:nlayers]
+    create_flow(Ls, q0)
+end
+
+new_nsf(q0; paramtype::Type{T} = Float64) where {T<:AbstractFloat} = new_nsf(
+    q0, [32, 32], 10, 30 * one(T), 10; paramtype=paramtype
+)
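For reference, the convenience method at the bottom fixes the defaults to two hidden layers of 32 units, K = 10 knots, bound B = 30, and 10 layers. A minimal usage sketch (assuming the exports added in src/NormalizingFlows.jl above; the explicit hyperparameters for flow2 are illustrative, not recommendations):

using NormalizingFlows, Distributions, LinearAlgebra

T = Float32
q0 = MvNormal(zeros(T, 2), I)

flow = new_nsf(q0; paramtype=T)  # defaults: hdims=[32, 32], K=10, B=30, nlayers=10

# explicit hyperparameters: wider nn, fewer knots, tighter bound, 4 layers
flow2 = new_nsf(q0, [64, 64], 8, 5 * one(T), 4; paramtype=T)

xs = rand(flow, 100)        # 2x100 matrix of samples from the (untrained) flow
lp = logpdf(flow, xs[:, 1]) # density of one sample under the flow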
