|
1 | | -#=###### |
2 | | -NOTES on what works: |
3 | | -- Euclidean state: |
4 | | -- - any compatible process, using floss |
5 | | -- Manifold state: |
6 | | -- - any compatible process, using tcloss |
7 | | -- Discrete state: |
8 | | -- - for a DiscreteProcess, only UniformUnmasking works properly. The rest have issues. |
9 | | -- - works when using the ProbabilitySimplex in a ManifoldProcess.
10 | | -- - Either: |
11 | | -- - - The process must have non-zero variance |
12 | | -- - - or X0 must be a continuous distribution (i.e. not discrete "corners") on the ProbabilitySimplex (in which case a deterministic process also works)
13 | | -=####### |
14 | | - |
15 | | -#This is badness that doesn't work: |
16 | | -#rotangle(rots::AbstractArray{T,3}) where T = acos.(clamp.((rots[1,1,:] .+ rots[2,2,:] .+ rots[3,3,:] .- 1) ./ 2, T(-0.99), T(0.99))) |
17 | | -#rotangle(rots::AbstractArray) = reshape(rotangle(reshape(rots, 3, 3, :)), 1, size(rots)[3:end]...) |
18 | | -#torangle(x, y) = mod.(y .- x .+ π, 2π) .- π |
19 | | -#msra(X̂₁, X₁) = rotangle(batched_mul(batched_transpose(tensor(X̂₁)), tensor(X₁))).^2 #Mean Squared Angle |
20 | | -#msta(X̂₁, X₁) = sum(torangle(tensor(X̂₁), tensor(X₁)), dims=1).^2 #Mean Squared Toroidal Angle |
21 | | - |
"""
    mse(X̂₁, X₁)

Elementwise squared error between the predicted and target tensors
(Mean Squared Error before masking/reduction).
"""
function mse(X̂₁, X₁)
    Δ = tensor(X̂₁) .- tensor(X₁)
    return abs2.(Δ)
end

"""
    lce(X̂₁, X₁)

Logit cross entropy: `X̂₁` holds unnormalized logits and `X₁` a (one-hot)
target distribution. Reduces over the first (category) dimension.
"""
function lce(X̂₁, X₁)
    logp = logsoftmax(tensor(X̂₁))
    return -sum(tensor(X₁) .* logp, dims = 1)
end

"""
    kl(P, Q)

Kullback-Leibler divergence along the first dimension. `P` is given as
logits (`softmax`/`logsoftmax` are applied); `Q` is only passed through
`log`. NOTE(review): the asymmetric treatment assumes `Q` is already a
normalized probability distribution — confirm against callers.
"""
function kl(P, Q)
    p = softmax(tensor(P))
    return sum(p .* (logsoftmax(tensor(P)) .- log.(tensor(Q))), dims = 1)
end
@@ -55,45 +34,11 @@ floss(P::fbu(ManifoldProcess{<:Euclidean}), X̂₁, X₁::msu(ContinuousState), |
#For a discrete process, X̂₁ is a predicted distribution and X₁ must be one-hot
#encoded (before any GPU transfer) for the cross-entropy path to apply.
function floss(P::fbu(DiscreteProcess), X̂₁, X₁::msu(DiscreteState{<:AbstractArray{<:Integer}}), c)
    #Integer-encoded targets cannot be used directly; fail loudly with instructions.
    return error("X₁ needs to be onehot encoded with `onehot(X₁)`. You might need to do this before moving it to the GPU.")
end

function floss(P::fbu(DiscreteProcess), X̂₁, X₁::msu(DiscreteState{<:OneHotArray}), c)
    #Masked, scaled mean of the logit cross entropy against the one-hot target.
    return scaledmaskedmean(lce(X̂₁, X₁), c, getlmask(X₁))
end
58 | | -#floss(P::fbu(ManifoldProcess{Rotations(3)}), X̂₁, X₁::msu(ManifoldState{Rotations(3)}), c) = scaledmaskedmean(msra(X̂₁, X₁), c, getlmask(X₁)) |
59 | | -#floss(P::fbu(ManifoldProcess{SpecialOrthogonal(3)}), X̂₁, X₁::msu(ManifoldState{SpecialOrthogonal(3)}), c) = scaledmaskedmean(msra(X̂₁, X₁), c, getlmask(X₁)) |
60 | | -#floss(P::fbu(ManifoldProcess), X̂₁, X₁::msu(ManifoldState{<:Torus}), c) = scaledmaskedmean(msta(X̂₁, X₁), c, getlmask(X₁)) |
61 | | - |
#Tuple fallbacks: apply floss componentwise across tupled processes/states
#and sum the per-component losses into a single scalar objective.
function floss(P::Tuple, X̂₁::Tuple, X₁::Tuple, c::Union{AbstractArray, Real})
    #A shared scalar/array weight is shielded from broadcasting via a 1-tuple.
    return sum(floss.(P, X̂₁, X₁, (c,)))
end

function floss(P::Tuple, X̂₁::Tuple, X₁::Tuple, c::Tuple)
    #Per-component weights: broadcast in lockstep with the other tuples.
    return sum(floss.(P, X̂₁, X₁, c))
end
64 | | - |
65 | | -#I should make a self-balancing loss that tracks the running mean/std and adaptively scales to balance against target weights. |
66 | | - |
67 | | -""" |
68 | | - tcloss(P::Union{fbu(ManifoldProcess), fbu(Deterministic)}, ξhat, ξ, c, mask = nothing) |
69 | | -
|
70 | | -Where `ξhat` is the predicted tangent coordinates, and `ξ` is the true tangent coordinates. |
71 | | -""" |
72 | 39 | floss(P::Union{fbu(ManifoldProcess), fbu(Deterministic)}, ξhat, ξ::Guide, c) = scaledmaskedmean(mse(ξhat, ξ.H), c, getlmask(ξ)) |
73 | | -#tcloss(P::fbu(DiscreteProcess), ξhat, ξ, c, mask = nothing) = scaledmaskedmean(rkl(ξhat, ξ), c, mask) |
74 | | - |
75 | | - |
76 | | - |
77 | | - |
78 | | -#= |
79 | | -#Doesn't help to do it this way |
80 | | -""" |
81 | | - tangent_coordinates(P::DiscreteProcess, Xt::DiscreteState, X1) |
82 | | -
|
83 | | -Computes (a weighted mixture of) Doob's h-transform(s) that would condition the current state Xt (which must be a discrete value) |
84 | | -to end at X1 (which can be a distribution) under P. Maybe. |
85 | | -""" |
86 | | -function tangent_coordinates(P::DiscreteProcess, X1::DiscreteState, t) |
87 | | - #(for a single column) for state=i at 1-t, H_j(t)/H_i(t) is the rate scaling ratio per Doob's h-transform. |
88 | | - #If the model can learn this directly, we can gen. |
89 | | - H = backward(X1, P, 1 .- t) |
90 | | - scale = sum(H.dist, dims = 1) |
91 | | - H.dist ./= scale |
92 | | - return H |
93 | | -end |
94 | | -=# |
95 | | - |
96 | 40 |
|
| 41 | +#I should make a self-balancing loss that tracks the running mean/std and adaptively scales to balance against target weights. |
97 | 42 |
|
98 | 43 | ######################################################################## |
99 | 44 | #Manifold-specific helper functions |
|
0 commit comments