|
| 1 | +####################################################### |
| 2 | +# training loop for variational objectives |
| 3 | +####################################################### |
"""
    pm_next!(pm, stats::NamedTuple)

Advance the progress meter `pm` by one step, displaying each entry of
`stats` as a `(name, value)` pair in the progress bar.
"""
function pm_next!(pm, stats::NamedTuple)
    showvalues = [(name, value) for (name, value) in pairs(stats)]
    return ProgressMeter.next!(pm; showvalues=showvalues)
end
| 7 | + |
# Wrap each extra loss argument as a `DifferentiationInterface.Constant`
# context, returning a `Vector` of wrapped values.
_wrap_in_DI_context(args...) = map(DifferentiationInterface.Constant, collect(args))
| 9 | + |
"""
    _prepare_gradient(loss, adbackend, θ, args...)

Build a `DifferentiationInterface` gradient preparation object for `loss` at `θ`.
Any extra `args` are wrapped as `DifferentiationInterface.Constant` contexts.
"""
function _prepare_gradient(loss, adbackend, θ, args...)
    # BUGFIX: previously `isempty(args...)`, which splats the varargs tuple —
    # a MethodError for zero or 2+ extra arguments, and the wrong predicate
    # (emptiness of the argument itself) for exactly one. Test the tuple.
    if isempty(args)
        return DifferentiationInterface.prepare_gradient(loss, adbackend, θ)
    end
    return DifferentiationInterface.prepare_gradient(
        loss, adbackend, θ, _wrap_in_DI_context(args...)...
    )
end
| 16 | + |
"""
    _value_and_gradient(loss, prep, adbackend, θ, args...)

Evaluate `loss` at `θ` and its gradient in one call, using the preparation
object `prep` from [`_prepare_gradient`](@ref). Extra `args` are wrapped as
`DifferentiationInterface.Constant` contexts, matching the preparation call.
"""
function _value_and_gradient(loss, prep, adbackend, θ, args...)
    # BUGFIX: previously `isempty(args...)` — splatting the varargs tuple is a
    # MethodError for zero or 2+ extra arguments; the tuple itself must be tested.
    if isempty(args)
        return DifferentiationInterface.value_and_gradient(loss, prep, adbackend, θ)
    end
    return DifferentiationInterface.value_and_gradient(
        loss, prep, adbackend, θ, _wrap_in_DI_context(args...)...
    )
end
| 23 | + |
| 24 | + |
| 25 | +""" |
| 26 | + optimize( |
| 27 | + ad::ADTypes.AbstractADType, |
| 28 | + loss, |
| 29 | + θ₀::AbstractVector{T}, |
| 30 | + re, |
| 31 | + args...; |
| 32 | + kwargs... |
| 33 | + ) |
| 34 | +
|
| 35 | +Iteratively updating the parameters `θ` of the normalizing flow `re(θ)` by calling `grad!` |
| 36 | + and using the given `optimiser` to compute the steps. |
| 37 | +
|
| 38 | +# Arguments |
| 39 | +- `ad::ADTypes.AbstractADType`: automatic differentiation backend |
| 40 | +- `loss`: a general loss function θ -> loss(θ, args...) returning a scalar loss value that will be minimised |
| 41 | +- `θ₀::AbstractVector{T}`: initial parameters for the loss function (in the context of normalizing flows, it will be the flattened flow parameters) |
| 42 | +- `re`: reconstruction function that maps the flattened parameters to the normalizing flow |
| 43 | +- `args...`: additional arguments for `loss` (will be set as DifferentiationInterface.Constant) |
| 44 | +
|
| 45 | +
|
| 46 | +# Keyword Arguments |
| 47 | +- `max_iters::Int=10000`: maximum number of iterations |
| 48 | +- `optimiser::Optimisers.AbstractRule=Optimisers.ADAM()`: optimiser to compute the steps |
| 49 | +- `show_progress::Bool=true`: whether to show the progress bar. The default |
| 50 | + information printed in the progress bar is the iteration number, the loss value, |
| 51 | + and the gradient norm. |
| 52 | +- `callback=nothing`: callback function with signature `cb(iter, opt_state, re, θ)` |
| 53 | + which returns a dictionary-like object of statistics to be displayed in the progress bar. |
| 54 | + re and θ are used for reconstructing the normalizing flow in case that user |
| 55 | + want to further axamine the status of the flow. |
| 56 | +- `hasconverged = (iter, opt_stats, re, θ, st) -> false`: function that checks whether the |
| 57 | + training has converged. The default is to always return false. |
| 58 | +- `prog=ProgressMeter.Progress( |
| 59 | + max_iters; desc="Training", barlen=31, showspeed=true, enabled=show_progress |
| 60 | + )`: progress bar configuration |
| 61 | +
|
| 62 | +# Returns |
| 63 | +- `θ`: trained parameters of the normalizing flow |
| 64 | +- `opt_stats`: statistics of the optimiser |
| 65 | +- `st`: optimiser state for potential continuation of training |
| 66 | +""" |
| 67 | +function optimize( |
| 68 | + adbackend, |
| 69 | + loss::Function, |
| 70 | + θ₀::AbstractVector{<:Real}, |
| 71 | + reconstruct::Function, |
| 72 | + args...; |
| 73 | + max_iters::Int=10000, |
| 74 | + optimiser::Optimisers.AbstractRule=Optimisers.ADAM(), |
| 75 | + show_progress::Bool=true, |
| 76 | + callback=nothing, |
| 77 | + hasconverged=(i, stats, re, θ, st) -> false, |
| 78 | + prog=ProgressMeter.Progress( |
| 79 | + max_iters; desc="Training", barlen=31, showspeed=true, enabled=show_progress |
| 80 | + ), |
| 81 | +) |
| 82 | + time_elapsed = @elapsed begin |
| 83 | + opt_stats = [] |
| 84 | + |
| 85 | + # prepare loss and autograd |
| 86 | + θ = copy(θ₀) |
| 87 | + # grad = similar(θ) |
| 88 | + prep = _prepare_gradient(loss, adbackend, θ₀, args...) |
| 89 | + |
| 90 | + |
| 91 | + # initialise optimiser state |
| 92 | + st = Optimisers.setup(optimiser, θ) |
| 93 | + |
| 94 | + # general `hasconverged(...)` approach to allow early termination. |
| 95 | + converged = false |
| 96 | + i = 1 |
| 97 | + while (i ≤ max_iters) && !converged |
| 98 | + # ls, g = DifferentiationInterface.value_and_gradient!(loss, grad, prep, adbackend, θ) |
| 99 | + ls, g = _value_and_gradient(loss, prep, adbackend, θ, args...) |
| 100 | + |
| 101 | + # Save stats |
| 102 | + stat = (iteration=i, loss=ls, gradient_norm=norm(g)) |
| 103 | + |
| 104 | + # callback |
| 105 | + if !isnothing(callback) |
| 106 | + new_stat = callback(i, opt_stats, reconstruct, θ) |
| 107 | + stat = !isnothing(new_stat) ? merge(stat, new_stat) : stat |
| 108 | + end |
| 109 | + push!(opt_stats, stat) |
| 110 | + |
| 111 | + # update optimiser state and parameters |
| 112 | + st, θ = Optimisers.update!(st, θ, g) |
| 113 | + |
| 114 | + # check convergence |
| 115 | + i += 1 |
| 116 | + converged = hasconverged(i, stat, reconstruct, θ, st) |
| 117 | + pm_next!(prog, stat) |
| 118 | + end |
| 119 | + end |
| 120 | + # return status of the optimiser for potential continuation of training |
| 121 | + return θ, map(identity, opt_stats), st, time_elapsed |
| 122 | +end |
0 commit comments