Working doob, with example

murrellb · murrellb · commit 6d312a185eef · 2025-07-10T13:17:20.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -6,6 +6,7 @@ version = "0.1.2"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 ForwardBackward = "e879419d-bb0f-4252-adee-d266c51ac92d"
+LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
 Manifolds = "1cead3c2-87b3-11e9-0ccd-23c62b72b94e"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
@@ -14,6 +15,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 [compat]
 Adapt = "4.1.1"
 ForwardBackward = "0.1.0"
+LogExpFunctions = "0.3.29"
 Manifolds = "0.10.12"
 NNlib = "0.9.27"
 OneHotArrays = "0.2.6"
diff --git a/examples/doob.jl b/examples/doob.jl
@@ -0,0 +1,93 @@
+#Note: I haven't yet figured out exactly which method in the literature this corresponds to.
+using Pkg
+Pkg.activate(".")
+using ForwardBackward, Flowfusion, NNlib, Flux, RandomFeatureMaps, Optimisers, Plots
+
+struct DModel{A} #Thin wrapper; the keyword constructor stores a NamedTuple of sub-networks in `layers`
+    layers::A
+end
+
+Flux.@layer DModel #Register DModel as a Flux layer
+
+function DModel(; embeddim = 64, l = 2, K = 32, layers = 5) #NOTE(review): l looks like the number of token positions and K the number of categories - confirm
+    embed_time = Chain(RandomFourierFeatures(1 => embeddim, 2.0f0), Dense(embeddim => embeddim, leakyrelu)) #Maps a 1-row time input to embeddim features
+    embed_char = Dense(K => embeddim, bias = false) #Bias-free linear embedding of K-dimensional token inputs
+    mix = Dense(l*embeddim => embeddim, leakyrelu) #Merges l concatenated embeddings into a single embeddim vector
+    ffs = [Dense(embeddim => embeddim, leakyrelu) for _ in 1:layers] #Applied as residual blocks in the forward pass
+    decode = Dense(embeddim => l*K) #l*K raw outputs per sample
+    layers = (; embed_time, embed_char, mix, ffs, decode) #Note: rebinds the `layers` keyword (a count) to the NamedTuple of sub-networks
+    DModel(layers)
+end
+
+function (f::DModel)(t, Xt) #Forward pass: returns the decoder output reshaped to (:, 2, len)
+    l = f.layers
+    tXt = tensor(Xt)
+    len = size(tXt)[end] #Size of the last dimension of the input tensor
+    tv = zero(similar(Float32.(tXt), 1, len)) .+ expand(t, 2) #1 x len Float32 zeros (allocated like tXt) plus the broadcast times
+    x = l.mix(reshape(l.embed_char(tXt), :, len))  .+ l.embed_time(tv) #Embed tokens, flatten to one column per sample, mix, then add the time embedding
+    for ff in l.ffs
+        x = x .+ ff(x) #Residual update
+    end
+    reshape(l.decode(x), :, 2, len) #NOTE(review): the 2 is hard-coded; presumably it must equal the constructor's l - confirm
+end
+
+T = Float32
+n_samples = 1000 #Number of training pairs drawn per iteration
+
+sampleX1(n_samples) = Flowfusion.random_discrete_cat(n_samples) #Target (X1) samples
+sampleX0(n_samples) = rand(25:32, 2, n_samples) #Initial (X0) samples: 2 x n_samples tokens drawn uniformly from 25:32
+P = DoobMatchingFlow(UniformDiscrete(1f0)) #The rate of the inner process controls how noisy the paths are
+
+#If you use a UniformUnmasking process, you must start in the last token for the Doob h-transform to be defined.
+#Generally, an X0 without token overlap with the training data might give better results!
+#sampleX0(n_samples) = [33 for _ in zeros(2, n_samples)] 
+#P = DoobMatchingFlow(UniformUnmasking(1f0)) #The rate of the inner process controls how noisy the paths are
+
+model = DModel(embeddim = 128, l = 2, K = 33, layers = 2) #K = 33 matches the DiscreteState(33, ...) states used below
+
+orig_eta = eta = 0.001 #orig_eta is kept so the training loop can decay eta in steps of orig_eta/100
+opt_state = Flux.setup(AdamW(eta = eta, lambda = 0.0001), model)
+
+iters = 3500
+for i in 1:iters
+    #Sample a batch of (X0, X1) training pairs, and one time t per sample
+    X1 = DiscreteState(33, sampleX1(n_samples))
+    X0 = DiscreteState(33, sampleX0(n_samples))
+    t = rand(T, 1, n_samples)
+    #Construct the bridge:
+    Xt = bridge(P, X0, X1, t) 
+    Xt = onehot(Xt) #<-Need this for the doob loss
+    denseXt = dense(Xt) #<-Zygote doesn't like the onehot input, so we make it dense.
+    G = Guide(P, t, Xt, onehot(X1)) #This sets up the "training target rate" via a Doob h-transform
+    #Loss and its gradient with respect to the model
+    l,g = Flux.withgradient(model) do m
+        floss(P, Xt, m(t,denseXt), G, scalefloss(P,t,1)) 
+    end
+    #Update the model parameters
+    Flux.update!(opt_state, model, g[1])
+    if i % 10 == 0 #Every 10th iteration: possibly decay eta, then log
+        if i > iters - 1000 #Over the last 1000 iterations, lower eta in steps of orig_eta/100 (floored at 1e-9)
+            eta = max(eta - orig_eta/100, 1e-9)
+            Optimisers.adjust!(opt_state, eta)
+        end
+        println("i: $i; Loss: $l; eta: $eta")
+    end
+end
+
+
+n_inference_samples = 10000
+X0 = DiscreteState(33, sampleX0(n_inference_samples))
+paths = Tracker() #Passed to gen below; read back afterwards with stack_tracker
+@time samp = gen(P, X0, (t,Xt) -> model(t,onehot(Xt)), 0f0:0.005f0:1f0, tracker = paths) #Time grid runs from 0 to 1 in steps of 0.005
+
+pl = scatter(X0.state[1,:],X0.state[2,:], msw = 0, color = "blue", alpha = 0.4, label = "Initial", size = (400,400), legend = :topleft, xlim = (1,33), ylim = (1,33))
+scatter!(samp.state[1,:],samp.state[2,:], msw = 0, color = "green", alpha = 0.04, label = :none)
+scatter!([-10],[-10], msw = 0, color = "green", alpha = 0.3, label = "Sampled") #Dummy point outside the axis limits, so the legend entry is drawn at a visible alpha
+tvec = stack_tracker(paths, :t) #NOTE(review): tvec is not used in the rest of this script
+xttraj = stack_tracker(paths, :xt)
+for i in 1:200:n_inference_samples #Overlay the trajectory of every 200th sample
+    plot!(xttraj[1,i,:], xttraj[2,i,:], color = "red", label = :none, alpha = 0.15)
+end
+plot!([-10],[-10], color = "red", label = "Trajectory", alpha = 0.4) #Same dummy-entry trick for the trajectory legend
+pl
+savefig("discrete_doob.svg")
diff --git a/src/Flowfusion.jl b/src/Flowfusion.jl
@@ -22,7 +22,7 @@ Later:
 
 module Flowfusion
 
-using ForwardBackward, OneHotArrays, Adapt, Manifolds, NNlib
+using ForwardBackward, OneHotArrays, Adapt, Manifolds, NNlib, LogExpFunctions
 
 include("types.jl")
 include("mask.jl")
@@ -35,6 +35,7 @@ export
     #Processes not in ForwardBackward.jl
     InterpolatingDiscreteFlow,
     NoisyInterpolatingDiscreteFlow,
+    DoobMatchingFlow,
     MaskedState,
     Guide,
     tangent_guide,
diff --git a/src/bridge.jl b/src/bridge.jl
@@ -50,9 +50,13 @@ end
 #resolveprediction exists to stop bridge from needing multiple definitions.
 #Tuple broadcast:
 resolveprediction(dest::Tuple, src::Tuple) = map(resolveprediction, dest, src)
+
 #Default if X̂₁ is a plain tensor:
-resolveprediction(X̂₁, Xₜ::DiscreteState{<:AbstractArray{<:Signed}}) = copytensor!(stochastic(Xₜ), X̂₁) #Returns a Likelihood
-resolveprediction(X̂₁, Xₜ::DiscreteState{<:Union{OneHotArray, OneHotMatrix}}) = copytensor!(stochastic(unhot(Xₜ)), X̂₁) #Probably inefficient
+#I think these were serving processes with a faulty assumption, so I'm swapping them out to make Doob flows easier.
+#resolveprediction(X̂₁, Xₜ::DiscreteState{<:AbstractArray{<:Signed}}) = copytensor!(stochastic(Xₜ), X̂₁) #Returns a Likelihood
+#resolveprediction(X̂₁, Xₜ::DiscreteState{<:Union{OneHotArray, OneHotMatrix}}) = copytensor!(stochastic(unhot(Xₜ)), X̂₁) #Probably inefficient
+resolveprediction(X̂₁, Xₜ::DiscreteState{<:AbstractArray{<:Signed}}) = X̂₁ #<-Passes the prediction through unchanged. Need to test whether this breaks anything else
+resolveprediction(X̂₁, Xₜ::DiscreteState{<:Union{OneHotArray, OneHotMatrix}}) = X̂₁ #<-Passes the prediction through unchanged. Need to test whether this breaks anything else
 
 resolveprediction(X̂₁, Xₜ::State) = copytensor!(copy(Xₜ), X̂₁) #Returns a State - Handles Continuous and Manifold cases
 #Passthrough if the user returns a State or Likelihood
diff --git a/src/doob.jl b/src/doob.jl
@@ -1,14 +1,21 @@
-#ToDo: Incorporate FProcesses, with their schedules. The bridge behavior should already be correct,
-#and the fallback doob should be correct if delta is passed through the schedule.
-#But for the closed form we'll need to mod the velocities per the gradient, etc, and the same when stepping.
+#Note: I haven't yet figured out exactly which method in the literature this corresponds to. Not well tested!
 
-struct DoobMatchingFlow{Proc} <: Process
+struct DoobMatchingFlow{Proc, B, F} <: Process
     P::Proc
+    onescale::B #If true, guide rates are multiplied by (1 .- t) ("time remaining" scale) and divided back out when stepping; if false, the scale is 1. Need to think carefully about schedules in all this...
+    transform::F #Transforms the output of the model to the rate space. Must act on the whole tensor.
+    #Note: losses can be compared across different transforms, but not across different onescale settings.
 end
-export DoobMatchingFlow
+
+DoobMatchingFlow(P::DiscreteProcess) = DoobMatchingFlow(P, true, NNlib.softplus) #Defaults: onescale = true, softplus transform. x -> exp.(clamp.(x, -100, 11)) also works, but is scary
+DoobMatchingFlow(P::DiscreteProcess, transform::Function) = DoobMatchingFlow(P, true, transform) #Custom transform, onescale = true
+DoobMatchingFlow(P::DiscreteProcess, onescale::Bool) = DoobMatchingFlow(P, onescale, NNlib.softplus) #Custom onescale, softplus transform
+
+onescale(P::DoobMatchingFlow,t) = P.onescale ? (1 .- t)  : eltype(t)(1) #Time-remaining factor (1 .- t) when P.onescale is true, otherwise one
+mulexpand(t,x) = expand(t, ndims(x)) .* x #Multiply x by t after expanding t to ndims(x)
 
 Flowfusion.bridge(p::DoobMatchingFlow, x0::DiscreteState{<:AbstractArray{<:Signed}}, x1::DiscreteState{<:AbstractArray{<:Signed}}, t) = bridge(p.P, x0, x1, t)
-#Finite diff fallback for when we don't have a closed form for the forward positive velocities:
+
 function fallback_doob(P::DiscreteProcess, t, Xt::DiscreteState, X1::DiscreteState; delta = eltype(t)(1e-5))
     return (tensor(forward(Xt, P, delta) ⊙ backward(X1, P, (1 .- t) .- delta)) .- tensor(onehot(Xt))) ./ delta;
 end
@@ -25,24 +32,34 @@ end
 
 forward_positive_velocities(Xt::DiscreteState, P::PiQ)= (P.r .* (P.π ./ sum(P.π))) .* (1 .- tensor(onehot(Xt)))
 doob_guide(P::PiQ, t, Xt::DiscreteState, X1::DiscreteState) = closed_form_doob(P, t, Xt, X1)
-
 forward_positive_velocities(Xt::DiscreteState, P::UniformUnmasking{T}) where T = (P.μ .* T((1 ./ (Xt.K-1)))) .* (1 .- tensor(onehot(Xt)))
 doob_guide(P::UniformUnmasking, t, Xt::DiscreteState, X1::DiscreteState) = closed_form_doob(P, t, Xt, X1)
-
 forward_positive_velocities(Xt::DiscreteState, P::UniformDiscrete{T}) where T = (P.μ * T(1/(Xt.K*(1-1/Xt.K)))) .* (1 .- tensor(onehot(Xt)))
 doob_guide(P::UniformDiscrete, t, Xt::DiscreteState, X1::DiscreteState) = closed_form_doob(P, t, Xt, X1)
 
-Guide(P::DoobMatchingFlow, t, Xt::DiscreteState, X1::DiscreteState) = Flowfusion.Guide(doob_guide(P.P, t, Xt, X1))
-Guide(P::DoobMatchingFlow, t, mXt::Union{MaskedState{<:DiscreteState}, DiscreteState}, mX1::MaskedState{<:DiscreteState}) = Guide(doob_guide(P.P, t, mXt, mX1), mX1.cmask, mX1.lmask)
+Guide(P::DoobMatchingFlow, t, Xt::DiscreteState, X1::DiscreteState) = Flowfusion.Guide(mulexpand(onescale(P, t), doob_guide(P.P, t, Xt, X1))) #Doob guide of the inner process, multiplied by onescale(P, t)
+Guide(P::DoobMatchingFlow, t, mXt::Union{MaskedState{<:DiscreteState}, DiscreteState}, mX1::MaskedState{<:DiscreteState}) = Guide(mulexpand(onescale(P, t), doob_guide(P.P, t, mXt, mX1)), mX1.cmask, mX1.lmask) #Masked variant: same scaling, carrying over the masks of mX1
 
-function velo_step(Xₜ::DiscreteState{<:AbstractArray{<:Signed}}, delta_t, velocity)
+function rate_constraint(Xt, X̂₁, f) #Builds rates from raw output X̂₁; they sum to zero along dim 1, assuming Xt is one-hot along dim 1
+    posQt = f(X̂₁) .* (1 .- Xt)   #Transformed outputs, zeroed wherever Xt is 1
+    diagQt = -sum(posQt, dims = 1) .* Xt #Negative column total of posQt, placed wherever Xt is 1
+    return posQt .+ diagQt
+end
+
+function velo_step(P, Xₜ::DiscreteState{<:AbstractArray{<:Signed}}, delta_t, log_velocity, scale) #log_velocity is passed through P.transform below; scale multiplies the resulting rates
     ohXₜ = onehot(Xₜ)
+    velocity = rate_constraint(tensor(ohXₜ), log_velocity, P.transform) .* scale #Transform and constrain against the one-hot state, then rescale
     newXₜ = CategoricalLikelihood(eltype(delta_t).(tensor(ohXₜ) .+ (delta_t .* velocity)))
     clamp!(tensor(newXₜ), 0, Inf) #Because one velo will be < 0 and a large step might push Xₜ < 0
     return rand(newXₜ)
 end
 
-step(P::DoobMatchingFlow, Xₜ::DiscreteState{<:AbstractArray{<:Signed}}, veloX̂₁::Flowfusion.Guide, s₁, s₂) = velo_step(Xₜ, s₂ .- s₁, veloX̂₁.H)
-step(P::DoobMatchingFlow, Xₜ::DiscreteState{<:AbstractArray{<:Signed}}, veloX̂₁, s₁, s₂) = velo_step(Xₜ, s₂ .- s₁, veloX̂₁)
+step(P::DoobMatchingFlow, Xₜ::DiscreteState{<:AbstractArray{<:Signed}}, veloX̂₁::Flowfusion.Guide, s₁, s₂) = velo_step(P, Xₜ, s₂ .- s₁, veloX̂₁.H, expand(1 ./ onescale(P, s₁), ndims(veloX̂₁.H))) #Scale is 1 ./ onescale(P, s₁). NOTE(review): velo_step applies P.transform to veloX̂₁.H as well - confirm that is intended for Guide inputs
+step(P::DoobMatchingFlow, Xₜ::DiscreteState{<:AbstractArray{<:Signed}}, veloX̂₁, s₁, s₂) = velo_step(P, Xₜ, s₂ .- s₁, veloX̂₁, expand(1 ./ onescale(P, s₁), ndims(veloX̂₁))) #Same, for a plain (non-Guide) prediction
+
+function cgm_dloss(P, Xt, X̂₁, doobX₁) #Poisson-style loss (rate minus target * log(rate)), summed along dim 1 over entries where Xt is 0
+    Qt = P.transform(X̂₁) #Map the raw model output to rates
+    return sum((1 .- Xt) .* (Qt .- xlogy.(doobX₁, Qt)), dims = 1) #<- note, diagonals ignored; implicit zero sum
+end
 
-poisson_loss(mu, count, mask) = sum(mask .* (mu .- xlogy.(count, mu))) / sum(mask)
+floss(P::Flowfusion.fbu(DoobMatchingFlow), Xt::Flowfusion.msu(DiscreteState), X̂₁, X₁::Guide, c) = Flowfusion.scaledmaskedmean(cgm_dloss(P, tensor(Xt), tensor(X̂₁), X₁.H), c, Flowfusion.getlmask(X₁)) #Scaled, masked mean of cgm_dloss; the target rates are read from the Guide's H field