
Commit 72546b4
1 parent 909e541 commit 72546b4

6 files changed: +300 -51 lines changed

examples/discrete.jl

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
using Pkg
Pkg.activate(".")
using Revise
Pkg.develop(path="../../ForwardBackward/")
Pkg.develop(path="../")
using ForwardBackward, Flowfusion, NNlib, Flux, RandomFeatureMaps, Optimisers, Plots

struct DModel{A}
    layers::A
end

Flux.@layer DModel

function DModel(; embeddim = 64, l = 2, K = 32, layers = 5)
    embed_time = Chain(RandomFourierFeatures(1 => embeddim, 2.0f0), Dense(embeddim => embeddim, leakyrelu))
    embed_char = Dense(K => embeddim, bias = false)
    mix = Dense(l*embeddim => embeddim, leakyrelu)
    ffs = [Dense(embeddim => embeddim, leakyrelu) for _ in 1:layers]
    decode = Dense(embeddim => l*K)
    layers = (; embed_time, embed_char, mix, ffs, decode)
    DModel(layers)
end

function (f::DModel)(t, Xt)
    l = f.layers
    tXt = tensor(Xt)
    len = size(tXt)[end]
    tv = zero(similar(Float32.(tXt), 1, len)) .+ expand(t, 2)
    x = l.mix(reshape(l.embed_char(tXt), :, len)) .+ l.embed_time(tv)
    for ff in l.ffs
        x = x .+ ff(x)
    end
    reshape(l.decode(x), :, 2, len)
end

T = Float32
n_samples = 1000

sampleX1(n_samples) = Flowfusion.random_discrete_cat(n_samples)
sampleX0(n_samples) = rand(25:32, 2, n_samples)
#sampleX0(n_samples) = [33 for _ in zeros(2, n_samples)] #Required if you want to use a UniformUnmasking process

P = NoisyInterpolatingDiscreteFlow(0.1)
#P = InterpolatingDiscreteFlow()
#P = UniformUnmasking()

model = DModel(embeddim = 128, l = 2, K = 33, layers = 2)

eta = 0.005
opt_state = Flux.setup(Adam(eta), model)

iters = 4000
for i in 1:iters
    #Set up a batch of training pairs, and t
    X1 = DiscreteState(33, sampleX1(n_samples))
    X0 = DiscreteState(33, sampleX0(n_samples))
    t = rand(T, 1, n_samples)
    #Construct the bridge:
    Xt = stochastic(Float32, bridge(P, X0, X1, t))
    #Gradient
    l, g = Flux.withgradient(model) do m
        floss(P, m(t, Xt), onehot(X1), t) #CE loss - Scaling with t doesn't seem critical for this one
    end
    #Update
    Flux.update!(opt_state, model, g[1])
    if i % 10 == 0
        if i > iters - 1000
            eta *= 0.975
            Optimisers.adjust!(opt_state, eta)
        end
        println("i: $i; Loss: $l; eta: $eta")
    end
end

n_inference_samples = 10000
X0 = DiscreteState(33, sampleX0(n_inference_samples))
paths = Tracker()
samp = gen(P, X0, (t, Xt) -> softmax(model(t, onehot(Xt))), 0f0:0.001f0:1f0, tracker = paths) #Note the softmax here

pl = scatter(X0.state[1,:], X0.state[2,:], msw = 0, color = "blue", alpha = 0.4, label = "Initial", size = (400,400), legend = :topleft, xlim = (1,33), ylim = (1,33))
scatter!(samp.state[1,:], samp.state[2,:], msw = 0, color = "green", alpha = 0.04, label = :none)
scatter!([-10], [-10], msw = 0, color = "green", alpha = 0.3, label = "Sampled")
tvec = stack_tracker(paths, :t)
xttraj = stack_tracker(paths, :xt)
for i in 1:200:n_inference_samples
    plot!(xttraj[1,i,:], xttraj[2,i,:], color = "red", label = :none, alpha = 0.15)
end
plot!([-10], [-10], color = "red", label = "Trajectory", alpha = 0.4)
pl
savefig("discrete_$P.svg")

examples/torus.jl

Lines changed: 4 additions & 4 deletions
@@ -3,7 +3,7 @@ Pkg.activate(".")
 using Revise
 Pkg.develop(path="../../ForwardBackward/")
 Pkg.develop(path="../")
-using ForwardBackward, Flowfusion, NNlib, Flux, RandomFeatureMaps, Optimisers, Plots
+using ForwardBackward, Flowfusion, NNlib, Flux, RandomFeatureMaps, Optimisers, Plots, Manifolds

 #Set up a Flux model: ξhat = model(t,Xt)
 struct TModel{A}
@@ -39,8 +39,8 @@ sampleX1(n_samples) = Flowfusion.random_literal_cat(n_samples, sigma = T(0.05))[
 n_samples = 500

 M = Torus(2)
-#P = ManifoldProcess(0.2f0)
-P = Deterministic()
+P = ManifoldProcess(0.2f0)
+#P = Deterministic()

 eta = 0.01
 opt_state = Flux.setup(AdamW(eta = eta, lambda = 0.00001), model)
@@ -76,7 +76,7 @@ n_inference_samples = 2000
 X0 = ManifoldState(M, eachcol(sampleX0(n_inference_samples)))
 paths = Tracker()
 #We wrap the model, because it was predicting tangent coordinates, not the actual state:
-X1pred = (t,Xt) -> apply_tangent_coordinates(Xt, model(t,tensor(Xt)))
+X1pred = (t,Xt) -> BackwardGuide(model(t,tensor(Xt)))
 samp = gen(P, X0, X1pred, 0f0:0.002f0:1f0, tracker = paths)

 #Plot the torus, with samples, and trajectories:

src/Flowfusion.jl

Lines changed: 6 additions & 1 deletion
@@ -4,8 +4,12 @@ using ForwardBackward, OneHotArrays, Adapt, Manifolds, NNlib

 include("bridge.jl")
 include("loss.jl")
+include("processes.jl")

-export
+export
+    #Processes not in ForwardBackward.jl
+    InterpolatingDiscreteFlow,
+    NoisyInterpolatingDiscreteFlow,
     MaskedState,
     bridge,
     scalefloss,
@@ -15,6 +19,7 @@ export
     onehot,
     FProcess,
     tangent_coordinates,
+    BackwardGuide,
     apply_tangent_coordinates,
     floss,
     tcloss

src/bridge.jl

Lines changed: 67 additions & 37 deletions
@@ -17,20 +17,37 @@ process(P::Process) = P
 tscale(P::Process, t) = t
 tscale(P::FProcess, t) = P.F.(t)

+#=#####################
+Conditioning mask behavior:
+Typically, during training, the conditioning mask is constructed on the training observation, X1.
+During inference, the conditioning mask (and conditioned-upon state) has to be present on X1.
+This dictates the behavior of the masking:
+- When bridge() is called, the mask, and the state where mask=1, are inherited from X1.
+- When gen is called, the state and mask will be propagated from X0 through all of the Xts.
+=#####################
 struct MaskedState{A,B,C}
     S::A #State
     cmask::B #Conditioning mask. 1 = Xt=X1
     lmask::C #Loss mask. 1 = included in loss
 end

+#For when we want to predict the transitions instead of X1hat
+struct BackwardGuide{A}
+    H::A
+end
+ForwardBackward.:⊙(a::CategoricalLikelihood, b::BackwardGuide) = ⊙(a, copytensor!(copy(a), b.H))
+
+#⊙ itself doesn't force the masks - it just propagates them. The forcing happens elsewhere.
+ForwardBackward.:⊙(a::MaskedState, b::MaskedState; kwargs...) = MaskedState(⊙(a.S, b.S; kwargs...), a.cmask .* b.cmask, a.lmask .* b.lmask)
+
 Adapt.adapt_structure(to, S::ForwardBackward.DiscreteState) = ForwardBackward.DiscreteState(S.K, Adapt.adapt(to, S.state))
 Adapt.adapt_structure(to, S::ForwardBackward.ContinuousState) = ForwardBackward.ContinuousState(Adapt.adapt(to, S.state))
 Adapt.adapt_structure(to, S::ForwardBackward.CategoricalLikelihood) = ForwardBackward.CategoricalLikelihood(Adapt.adapt(to, S.dist), Adapt.adapt(to, S.log_norm_const))
 Adapt.adapt_structure(to, MS::MaskedState{<:State}) = MaskedState(Adapt.adapt(to, MS.S), Adapt.adapt(to, MS.cmask), Adapt.adapt(to, MS.lmask))
 Adapt.adapt_structure(to, MS::MaskedState{<:CategoricalLikelihood}) = MaskedState(Adapt.adapt(to, MS.S), Adapt.adapt(to, MS.cmask), Adapt.adapt(to, MS.lmask))
 Adapt.adapt_structure(to, S::ForwardBackward.ManifoldState) = ForwardBackward.ManifoldState(S.M, Adapt.adapt(to, S.state))

-UState = Union{State,MaskedState}
+UState = Union{State,MaskedState, BackwardGuide}

 ForwardBackward.tensor(X::MaskedState) = tensor(X.S)

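A minimal training-time sketch of the masking described above (illustrative only, not part of this commit; the mask construction and the lmask = .!cmask choice are assumptions), with X0, X1, P, and t as in examples/discrete.jl:

    known = rand(Bool, size(X1.state))     #1 = positions clamped to the observation X1
    X1m = MaskedState(X1, known, .!known)  #field order: S, cmask, lmask
    Xt = bridge(P, X0, X1m, t)             #Xt inherits X1's state (and the masks) where cmask = 1
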
@@ -63,6 +80,40 @@ cmask!(Xt, X1::MaskedState) = cmask!(Xt.S.state, X1.S.state, X1.cmask)
 cmask!(Xt, X1::MaskedState{<:CategoricalLikelihood}) = error("Cannot condition on a CategoricalLikelihood")
 cmask!(x̂₁::Tuple, x₀::Tuple) = map(cmask!, x̂₁, x₀)

+
+#copytensor! and resolveprediction are used to handle the state translation that happens in gen(...).
+#We want the user's X̂₁predictor, which is a DL model, to return a plain tensor (since that will be on the GPU, in the loss, etc).
+#This means we need to automagically create a State (typical for the continuous case) or Likelihood (typical for the discrete case) from the tensor.
+#But the user may return a State in the Discrete case (for massive state spaces with sub-linear sampling), and a Likelihood in the Continuous case (for variance matching models).
+#This also needs to handle MaskedStates (needs testing).
+#We need: X̂₁ = fix(X̂₁predictor(t, Xₜ))
+#Plan: When X̂₁predictor(t, Xₜ) is a State or Likelihood, just pass through.
+#When X̂₁predictor(t, Xₜ) is a plain tensor, we apply default conversion rules.
+
+function copytensor!(dest, src)
+    tensor(dest) .= tensor(src)
+    return dest
+end
+#copytensor!(dest::Tuple, src::Tuple) = map(copytensor!, dest, src)
+
+#resolveprediction exists to stop bridge from needing multiple definitions.
+#Tuple broadcast:
+resolveprediction(dest::Tuple, src::Tuple) = map(resolveprediction, dest, src)
+#Default if X̂₁ is a plain tensor:
+resolveprediction(X̂₁, Xₜ::DiscreteState) = copytensor!(stochastic(Xₜ), X̂₁) #Returns a Likelihood
+resolveprediction(X̂₁, Xₜ::State) = copytensor!(copy(Xₜ), X̂₁) #Returns a State - Handles Continuous and Manifold cases
+#Passthrough if the user returns a State or Likelihood
+resolveprediction(X̂₁::State, Xₜ) = X̂₁
+resolveprediction(X̂₁::State, Xₜ::State) = X̂₁
+resolveprediction(X̂₁::StateLikelihood, Xₜ) = X̂₁
+
+#Passthrough if the model returns a BackwardGuide, because we have a custom bridge for that.
+resolveprediction(G::BackwardGuide, Xₜ::DiscreteState) = G
+resolveprediction(G::BackwardGuide, Xₜ::ManifoldState) = apply_tangent_coordinates(Xₜ, G.H)
+#We could also add a case for where the guide is a tangent coordinate and X₀ is a ManifoldState.
+
+
 """
     bridge(P, X0, X1, t)
     bridge(P, X0, X1, t0, t)
@@ -82,59 +133,38 @@ end
 bridge(P, X0, X1, t) = bridge(P, X0, X1, eltype(t)(0.0), t)
 bridge(P::Tuple{Vararg{UProcess}}, X0::Tuple{Vararg{UState}}, X1::Tuple, t0, t) = bridge.(P, X0, X1, (t0,), (t, ))

+#Step is like bridge (and falls back to it where possible). But sometimes we only have enough to take an Euler step (which is ok when `s₂-s₁` is small).
+step(P, Xₜ, hat, s₁, s₂) = bridge(P, Xₜ, hat, s₁, s₂)
+step(P::Tuple{Vararg{UProcess}}, Xₜ::Tuple{Vararg{UState}}, hat::Tuple, s₁, s₂) = step.(P, Xₜ, hat, (s₁,), (s₂, ))
+#step(P::DiscreteProcess, Xₜ::DiscreteState, hat::BackwardGuide, s₁, s₂) = rand(forward(Xₜ, P, s₂ .- s₁) ⊙ hat) #<- Doesn't work


-#copytensor! and predictresolve are used handle the state translation that happens in gen(...).
-#We want the user's X̂₁predictor, which is a DL model, to return a plain tensor (since that will be on the GPU, in the loss, etc).
-#This means we need to automagically create a State (typical for the continuous case) or Likelihood (typical for the discrete case) from the tensor.
-#But the user may return a State in the Discrete case (for massive state spaces with sub-linear sampling), and a Likelihood in the Continuous case (for variance matching models)
-#This also needs to handle MaskedStates (needs testing).
-#We need: X̂₁ = fix(X̂₁predictor(t, Xₜ))
-#Plan: When X̂₁predictor(t, Xₜ) is a State or Likelihood, just pass through.
-#When X̂₁predictor(t, Xₜ) is a plain tensor, we apply default conversion rules.
-
-function copytensor!(dest, src)
-    tensor(dest) .= tensor(src)
-    return dest
-end
-#copytensor!(dest::Tuple, src::Tuple) = map(copytensor!, dest, src)
-
-#Tuple broadcast:
-resolveprediction(dest::Tuple, src::Tuple) = map(resolveprediction, dest, src)
-#Default if X̂₁ is a plain tensor:
-resolveprediction(X̂₁, X₀::DiscreteState) = copytensor!(stochastic(X₀), X̂₁) #Returns a Likelihood
-resolveprediction(X̂₁, X₀::State) = copytensor!(copy(X₀), X̂₁) #Returns a State - Handles Continuous and Manifold cases
-#Passthrough if the user returns a State or Likelihood
-resolveprediction(X̂₁::State, X₀) = X̂₁
-resolveprediction(X̂₁::State, X₀::State) = X̂₁
-resolveprediction(X̂₁::StateLikelihood, X₀) = X̂₁
 #####Add MaskedState case(s)######

 ##################################

-
-
 """
-    gen(P, X0, X̂₁predictor, steps; tracker=Returns(nothing), midpoint = false)
+    gen(P, X0, model, steps; tracker=Returns(nothing), midpoint = false)

 Constructs a sequence of (stochastic) bridges between `X0` and the predicted `X̂₁` under the process `P`.
-`P`, `X0`, can also be tuples where the Nth element of `P` will be used for the Nth elements of `X0` and `X̂₁predictor`.
-X̂₁predictor is a function that takes `t` (scalar) and `Xₜ` (optionally a tuple) and returns `X̂₁` (a `UState`, a flat tensor with the right shape, or a tuple of either).
-If `X0` is a `MaskedState` (or has a ), then anything `X̂₁` will be conditioned on `X0` where the conditioning mask `X0.cmask` is 1.
+`P` and `X0` can also be tuples, where the Nth element of `P` will be used for the Nth elements of `X0` and `model`.
+`model` is a function that takes `t` (scalar) and `Xₜ` (optionally a tuple) and returns `hat` (a `UState`, a flat tensor with the right shape, or a tuple of either if you're combining processes).
+If `X0` is a `MaskedState`, then anything in `X̂₁` will be conditioned on `X0` where the conditioning mask `X0.cmask` is 1.
 """
-function gen(P::Tuple{Vararg{UProcess}}, X₀::Tuple{Vararg{UState}}, X̂₁predictor, steps::AbstractVector; tracker::Function=Returns(nothing), midpoint = false)
+function gen(P::Tuple{Vararg{UProcess}}, X₀::Tuple{Vararg{UState}}, model, steps::AbstractVector; tracker::Function=Returns(nothing), midpoint = false)
     Xₜ = copy.(X₀)
     for (s₁, s₂) in zip(steps, steps[begin+1:end])
         t = midpoint ? (s₁ + s₂) / 2 : t = s₁
-        X̂₁ = resolveprediction(X̂₁predictor(t, Xₜ), X₀)
-        cmask!(X̂₁, X₀)
-        Xₜ = bridge(P, Xₜ, X̂₁, s₁, s₂)
-        tracker(t, Xₜ, X̂₁)
+        hat = resolveprediction(model(t, Xₜ), Xₜ)
+        Xₜ = step(P, Xₜ, hat, s₁, s₂)
+        cmask!(Xₜ, X₀)
+        tracker(t, Xₜ, hat)
     end
     return Xₜ
 end

-gen(P, X₀, X̂₁predictor, args...; kwargs...) = gen((P,), (X₀,), (t, Xₜ) -> (X̂₁predictor(t[1], Xₜ[1]),), args...; kwargs...)[1]
+
+gen(P, X₀, model, args...; kwargs...) = gen((P,), (X₀,), (t, Xₜ) -> (model(t[1], Xₜ[1]),), args...; kwargs...)[1]

 struct Tracker <: Function
     t::Vector
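
A usage sketch for the updated `gen` signature, taken from examples/discrete.jl and examples/torus.jl in this commit (the model may return a plain tensor, a State/Likelihood, or a BackwardGuide):

    paths = Tracker()
    #Discrete case: the model returns a plain probability tensor (note the softmax), which gen resolves against Xt.
    samp = gen(P, X0, (t, Xt) -> softmax(model(t, onehot(Xt))), 0f0:0.001f0:1f0, tracker = paths)
    #Manifold case: the model predicts tangent coordinates, so its output is wrapped in a BackwardGuide.
    X1pred = (t, Xt) -> BackwardGuide(model(t, tensor(Xt)))
    samp = gen(P, X0, X1pred, 0f0:0.002f0:1f0, tracker = paths)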

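For a discrete Xₜ, the default plain-tensor rule above amounts to copying the model's probabilities into a CategoricalLikelihood. An illustrative sketch of that conversion (resolveprediction is internal to Flowfusion and not exported; names follow examples/discrete.jl):

    probs = softmax(model(t, onehot(Xt)))          #plain probability tensor from the model
    X̂₁ = Flowfusion.resolveprediction(probs, Xt)   #i.e. copytensor!(stochastic(Xt), probs), a CategoricalLikelihood
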
src/loss.jl

Lines changed: 31 additions & 9 deletions
@@ -20,16 +20,17 @@ ForwardBackward.stochastic(T::Type, o::DiscreteState{<:OneHotArray}) = Categoric
 getlmask(X1::UState) = X1.lmask
 getlmask(X1::State) = nothing

-rotangle(rots::AbstractArray{T,3}) where T = acos.(clamp.((rots[1,1,:] .+ rots[2,2,:] .+ rots[3,3,:] .- 1) ./ 2, T(-0.99), T(0.99)))
-rotangle(rots::AbstractArray) = reshape(rotangle(reshape(rots, 3, 3, :)), 1, size(rots)[3:end]...)
-torangle(x, y) = mod.(y .- x .+ π, 2π) .- π
-
-
+#This is badness that doesn't work:
+#rotangle(rots::AbstractArray{T,3}) where T = acos.(clamp.((rots[1,1,:] .+ rots[2,2,:] .+ rots[3,3,:] .- 1) ./ 2, T(-0.99), T(0.99)))
+#rotangle(rots::AbstractArray) = reshape(rotangle(reshape(rots, 3, 3, :)), 1, size(rots)[3:end]...)
+#torangle(x, y) = mod.(y .- x .+ π, 2π) .- π
+#msra(X̂₁, X₁) = rotangle(batched_mul(batched_transpose(tensor(X̂₁)), tensor(X₁))).^2 #Mean Squared Angle
+#msta(X̂₁, X₁) = sum(torangle(tensor(X̂₁), tensor(X₁)), dims=1).^2 #Mean Squared Toroidal Angle

 mse(X̂₁, X₁) = abs2.(tensor(X̂₁) .- tensor(X₁)) #Mean Squared Error
 lce(X̂₁, X₁) = -sum(tensor(X₁) .* logsoftmax(tensor(X̂₁)), dims=1) #Logit Cross Entropy
-msra(X̂₁, X₁) = rotangle(batched_mul(batched_transpose(tensor(X̂₁)), tensor(X₁))).^2 #Mean Squared Angle
-msta(X̂₁, X₁) = sum(torangle(tensor(X̂₁), tensor(X₁)), dims=1).^2 #Mean Squared Toroidal Angle
+kl(P,Q) = sum(softmax(tensor(P)) .* (logsoftmax(tensor(P)) .- log.(tensor(Q))), dims=1) #Kullback-Leibler Divergence
+rkl(P,Q) = sum(tensor(Q) .* (log.(tensor(Q)) .- logsoftmax(tensor(P))), dims=1) #Reverse Kullback-Leibler Divergence

 function scaledmaskedmean(l::AbstractArray{T}, c::Union{AbstractArray, Real}, m::Union{AbstractArray, Real}) where T
     expanded_m = expand(m, ndims(l))
@@ -58,8 +59,8 @@ floss(P::fbu(BrownianMotion), X̂₁, X₁::msu(ContinuousState),
 floss(P::fbu(OrnsteinUhlenbeck), X̂₁, X₁::msu(ContinuousState), c) = scaledmaskedmean(mse(X̂₁, X₁), c, getlmask(X₁))
 floss(P::fbu(ManifoldProcess{<:Euclidean}), X̂₁, X₁::msu(ContinuousState), c) = scaledmaskedmean(mse(X̂₁, X₁), c, getlmask(X₁))
 #For a discrete process, X̂₁ will be a distribution, and X₁ will have to be a onehot before going onto the gpu.
-floss(P::fbu(ForwardBackward.DiscreteProcess), X̂₁, X₁::msu(DiscreteState{<:AbstractArray{<:Integer}}), c) = error("X₁ needs to be onehot encoded with `onehot(X₁)`. You might need to do this before moving it to the GPU.")
-floss(P::fbu(ForwardBackward.DiscreteProcess), X̂₁, X₁::msu(DiscreteState{<:OneHotArray}), c) = scaledmaskedmean(lce(X̂₁, X₁), c, getlmask(X₁))
+floss(P::fbu(DiscreteProcess), X̂₁, X₁::msu(DiscreteState{<:AbstractArray{<:Integer}}), c) = error("X₁ needs to be onehot encoded with `onehot(X₁)`. You might need to do this before moving it to the GPU.")
+floss(P::fbu(DiscreteProcess), X̂₁, X₁::msu(DiscreteState{<:OneHotArray}), c) = scaledmaskedmean(lce(X̂₁, X₁), c, getlmask(X₁))
 floss(P::fbu(ManifoldProcess{Rotations(3)}), X̂₁, X₁::msu(ManifoldState{Rotations(3)}), c) = scaledmaskedmean(msra(X̂₁, X₁), c, getlmask(X₁))
 floss(P::fbu(ManifoldProcess{SpecialOrthogonal(3)}), X̂₁, X₁::msu(ManifoldState{SpecialOrthogonal(3)}), c) = scaledmaskedmean(msra(X̂₁, X₁), c, getlmask(X₁))
 floss(P::fbu(ManifoldProcess), X̂₁, X₁::msu(ManifoldState{<:Torus}), c) = scaledmaskedmean(msta(X̂₁, X₁), c, getlmask(X₁))
@@ -70,6 +71,7 @@ floss(P::fbu(ManifoldProcess), X̂₁, X₁::msu(ManifoldState{<:Torus}), c) = s
 Where `ξhat` is the predicted tangent coordinates, and `ξ` is the true tangent coordinates.
 """
 tcloss(P::Union{fbu(ManifoldProcess), fbu(Deterministic)}, ξhat, ξ, c, mask = nothing) = scaledmaskedmean(mse(ξhat, ξ), c, mask)
+tcloss(P::fbu(DiscreteProcess), ξhat, ξ, c, mask = nothing) = scaledmaskedmean(rkl(ξhat, ξ), c, mask)

 #=If we want the model to directly predict the tangent coordinates, we use:
 - tangent_coordinates outside the gradient call to get the thing the model will predict
@@ -109,6 +111,26 @@ function apply_tangent_coordinates(Xt::ManifoldState, ξ; retraction_method=defa
 end


+#=
+#Doesn't help to do it this way
+"""
+    tangent_coordinates(P::DiscreteProcess, Xt::DiscreteState, X1)
+
+Computes (a weighted mixture of) Doob's h-transform(s) that would condition the current state Xt (which must be a discrete value)
+to end at X1 (which can be a distribution) under P. Maybe.
+"""
+function tangent_coordinates(P::DiscreteProcess, X1::DiscreteState, t)
+    #(for a single column) for state=i at 1-t, H_j(t)/H_i(t) is the rate scaling ratio per Doob's h-transform.
+    #If the model can learn this directly, we can gen.
+    H = backward(X1, P, 1 .- t)
+    scale = sum(H.dist, dims = 1)
+    H.dist ./= scale
+    return H
+end
+=#
+
+
 ########################################################################
 #Manifold-specific helper functions
 ########################################################################
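
The discrete floss methods above require a one-hot `X₁` (the Integer-state method exists only to raise that error). The corresponding training call, as used in examples/discrete.jl from this commit:

    loss = floss(P, model(t, Xt), onehot(X1), t)   #onehot-encode X1 before moving it to the GPU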

0 commit comments