More sophisticated convergence checks in sinkhorn (#79)

devmotion · github-actions[bot] · web-flow · commit dfcc08839cef · 2021-05-28T02:00:36.000+02:00
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -20,7 +20,6 @@ jobs:
         os:
           - ubuntu-latest
           - windows-latest
-          - macOS-latest 
         arch:
           - x64
         include:
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "OptimalTransport"
 uuid = "7e02d93a-ae51-4f58-b602-d97af76e3b33"
 authors = ["zsteve <stephenz@student.unimelb.edu.au>"]
-version = "0.3.2"
+version = "0.3.3"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
diff --git a/src/OptimalTransport.jl b/src/OptimalTransport.jl
@@ -120,65 +120,120 @@ function emd2(μ, ν, C, optimizer; plan=nothing)
 end
 
 """
-    sinkhorn_gibbs(mu, nu, K; tol=1e-9, check_marginal_step=10, maxiter=1000)
+    sinkhorn_gibbs(
+        μ, ν, K; atol=0, rtol=atol > 0 ? 0 : √eps, check_convergence=10, maxiter=1_000
+    )
+
+Compute the dual potentials for the entropically regularized optimal transport problem
+with source and target marginals `μ` and `ν` and Gibbs kernel `K` using the Sinkhorn
+algorithm.
 
-Compute dual potentials `u` and `v` for histograms `mu` and `nu` and Gibbs kernel `K` using
-the Sinkhorn algorithm (Peyre et al., 2019)
+The Gibbs kernel `K` is defined as
+```math
+K = \\exp(-C / \\varepsilon),
+```
+where ``C`` is the cost matrix and ``\\varepsilon`` the entropic regularization parameter.
+The corresponding optimal transport plan can be computed from the dual potentials ``u``
+and ``v`` as
+```math
+\\gamma = \\operatorname{diag}(u) K \\operatorname{diag}(v).
+```
 
-The Gibbs kernel `K` is given by `K = exp.(- C / eps)` where `C` is the cost matrix and
-`eps` the entropic regularization parameter. The optimal transport plan for histograms `u`
-and `v` and cost matrix `C` with regularization parameter `eps` can be computed as
-`Diagonal(u) * K * Diagonal(v)`.
+Every `check_convergence` steps it is assessed if the algorithm is converged by checking if
+the iterate of the transport plan `G` satisfies
+```julia
+isapprox(sum(G; dims=2), μ; atol=atol, rtol=rtol, norm=x -> norm(x, 1))
+```
+The default `rtol` depends on the types of `μ`, `ν`, and `K`. After `maxiter` iterations,
+the computation is stopped.
 """
-function sinkhorn_gibbs(mu, nu, K; tol=1e-9, check_marginal_step=10, maxiter=1000)
-    if !(sum(mu) ≈ sum(nu))
-        throw(ArgumentError("Error: mu and nu must lie in the simplex"))
+function sinkhorn_gibbs(
+    μ,
+    ν,
+    K;
+    tol=nothing,
+    atol=tol,
+    rtol=nothing,
+    check_marginal_step=nothing,
+    check_convergence=check_marginal_step,
+    maxiter::Int=1_000,
+)
+    if tol !== nothing
+        Base.depwarn(
+            "keyword argument `tol` is deprecated, please use `atol` and `rtol`",
+            :sinkhorn_gibbs,
+        )
     end
+    if check_marginal_step !== nothing
+        Base.depwarn(
+            "keyword argument `check_marginal_step` is deprecated, please use `check_convergence`",
+            :sinkhorn_gibbs,
+        )
+    end
+    sum(μ) ≈ sum(ν) ||
+        throw(ArgumentError("source and target marginals must have the same mass"))
+
+    # set default values of tolerances
+    T = float(Base.promote_eltype(μ, ν, K))
+    _atol = atol === nothing ? 0 : atol
+    _rtol = rtol === nothing ? (_atol > zero(_atol) ? zero(T) : sqrt(eps(T))) : rtol
 
     # initial iteration
-    temp_v = vec(sum(K; dims=2))
-    u = mu ./ temp_v
-    temp_u = K' * u
-    v = nu ./ temp_u
+    u = μ ./ vec(sum(K; dims=2)) # `vec` keeps `u` and `v` vectors instead of n×1 matrices
+    v = ν ./ (K' * u)
+    tmp1 = K * v # cached `K * v`, shared by the convergence check and the next `u` update
+    tmp2 = similar(u) # scratch buffer for the marginal check
 
+    norm_μ = sum(abs, μ) # for convergence check
     isconverged = false
+    check_step = check_convergence === nothing ? 10 : check_convergence
     for iter in 0:maxiter
-        # check mu marginal
-        if iter % check_marginal_step == 0
-            mul!(temp_v, K, v)
-            @. temp_v = abs(mu - u * temp_v)
-
-            err = maximum(temp_v)
-            @debug "Sinkhorn algorithm: iteration $iter" err
+        if iter % check_step == 0
+            # check source marginal
+            # do not overwrite `tmp1` but reuse it for computing `u` if not converged
+            @. tmp2 = u * tmp1
+            norm_uKv = sum(abs, tmp2)
+            @. tmp2 = μ - tmp2
+            norm_diff = sum(abs, tmp2)
+
+            @debug "Sinkhorn algorithm (" *
+                   string(iter) *
+                   "/" *
+                   string(maxiter) *
+                   "): absolute error of source marginal = " *
+                   string(norm_diff)
 
             # check stopping criterion
-            if err < tol
+            if norm_diff < max(_atol, _rtol * max(norm_μ, norm_uKv))
+                @debug "Sinkhorn algorithm ($iter/$maxiter): converged"
                 isconverged = true
                 break
             end
         end
 
         # perform next iteration
         if iter < maxiter
-            mul!(temp_v, K, v)
-            @. u = mu / temp_v
-            mul!(temp_u, K', u)
-            @. v = nu / temp_u
+            @. u = μ / tmp1
+            mul!(v, K', u)
+            @. v = ν / v
+            mul!(tmp1, K, v)
         end
     end
 
     if !isconverged
-        @warn "Sinkhorn algorithm did not converge"
+        @warn "Sinkhorn algorithm ($maxiter/$maxiter): not converged"
     end
 
     return u, v
 end
 
 """
-    sinkhorn(μ, ν, C, ε; tol=1e-9, check_marginal_step=10, maxiter=1_000)
+    sinkhorn(
+        μ, ν, C, ε; atol=0, rtol=atol > 0 ? 0 : √eps, check_convergence=10, maxiter=1_000
+    )
 
-Compute the optimal transport plan for the entropic regularization optimal transport problem
-with source and target marginals `μ` and `ν`, cost matrix `C` of size
+Compute the optimal transport plan for the entropically regularized optimal transport
+problem with source and target marginals `μ` and `ν`, cost matrix `C` of size
 `(length(μ), length(ν))`, and entropic regularization parameter `ε`.
 
 The optimal transport plan `γ` is of the same size as `C` and solves
@@ -189,28 +244,35 @@ The optimal transport plan `γ` is of the same size as `C` and solves
 where ``\\Omega(\\gamma) = \\sum_{i,j} \\gamma_{i,j} \\log \\gamma_{i,j}`` is the entropic
 regularization term.
 
-Every `check_marginal_step` steps a convergence check of the error of the marginal
-`μ` with absolute tolerance `tol` is performed. After `maxiter` iterations, the
-computation is stopped.
+Every `check_convergence` steps it is assessed if the algorithm is converged by checking if
+the iterate of the transport plan `G` satisfies
+```julia
+isapprox(sum(G; dims=2), μ; atol=atol, rtol=rtol, norm=x -> norm(x, 1))
+```
+The default `rtol` depends on the types of `μ`, `ν`, and `C`. After `maxiter` iterations,
+the computation is stopped.
+
+See also: [`sinkhorn2`](@ref)
 """
-function sinkhorn(mu, nu, C, eps; kwargs...)
+function sinkhorn(μ, ν, C, ε; kwargs...)
     # compute Gibbs kernel
-    K = @. exp(-C / eps)
+    K = @. exp(-C / ε) # elementwise: K[i, j] = exp(-C[i, j] / ε)
 
     # compute dual potentials
-    u, v = sinkhorn_gibbs(mu, nu, K; kwargs...)
+    u, v = sinkhorn_gibbs(μ, ν, K; kwargs...) # all keyword arguments are forwarded unchanged
 
-    return Diagonal(u) * K * Diagonal(v)
+    return K .* u .* v' # scales row i by u[i] and column j by v[j], i.e. diag(u) * K * diag(v)
 end
 
 """
     sinkhorn2(μ, ν, C, ε; regularization=false, plan=nothing, kwargs...)
 
-Solve the entropic regularization optimal transport problem with source and target
+Solve the entropically regularized optimal transport problem with source and target
 marginals `μ` and `ν`, cost matrix `C` of size `(length(μ), length(ν))`, and entropic
 regularization parameter `ε`, and return the optimal cost.
 
-A pre-computed optimal transport `plan` may be provided.
+A pre-computed optimal transport `plan` may be provided. The other keyword arguments
+supported here are the same as those in the [`sinkhorn`](@ref) function.
 
 !!! note
     As the `sinkhorn2` function in the Python Optimal Transport package, this function
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -124,20 +124,20 @@ end
 
         # compute optimal transport map (Julia implementation + POT)
         eps = 0.01
-        γ = sinkhorn(μ, ν, C, eps; maxiter=5_000)
+        γ = sinkhorn(μ, ν, C, eps; maxiter=5_000, rtol=1e-9)
         γ_pot = POT.sinkhorn(μ, ν, C, eps; numItermax=5_000, stopThr=1e-9)
-        @test norm(γ - γ_pot, Inf) < 1e-9
+        @test γ_pot ≈ γ rtol = 1e-6
 
         # compute optimal transport cost
-        c = sinkhorn2(μ, ν, C, eps; maxiter=5_000)
+        c = sinkhorn2(μ, ν, C, eps; maxiter=5_000, rtol=1e-9)
 
         # with regularization term
         c_w_regularization = sinkhorn2(μ, ν, C, eps; maxiter=5_000, regularization=true)
         @test c_w_regularization ≈ c + eps * sum(x -> iszero(x) ? x : x * log(x), γ)
 
         # compare with POT
         c_pot = POT.sinkhorn2(μ, ν, C, eps; numItermax=5_000, stopThr=1e-9)[1]
-        @test c_pot ≈ c atol = 1e-9
+        @test c_pot ≈ c
 
         # ensure that provided map is used and correct
         c2 = sinkhorn2(similar(μ), similar(ν), C, rand(); plan=γ)
@@ -159,23 +159,50 @@ end
 
         # compute optimal transport map (Julia implementation + POT)
         eps = 0.01f0
-        γ = sinkhorn(μ, ν, C, eps; maxiter=5_000)
+        γ = sinkhorn(μ, ν, C, eps; maxiter=5_000, rtol=1e-6)
         @test eltype(γ) === Float32
 
-        γ_pot = POT.sinkhorn(μ, ν, C, eps; numItermax=5_000, stopThr=1e-9)
-        @test norm(γ - γ_pot, Inf) < Base.eps(Float32)
+        γ_pot = POT.sinkhorn(μ, ν, C, eps; numItermax=5_000, stopThr=1e-6)
+        @test Float32.(γ_pot) ≈ γ rtol = 1e-3
 
         # compute optimal transport cost
-        c = sinkhorn2(μ, ν, C, eps; maxiter=5_000)
+        c = sinkhorn2(μ, ν, C, eps; maxiter=5_000, rtol=1e-6)
         @test c isa Float32
 
         # with regularization term
-        c_w_regularization = sinkhorn2(μ, ν, C, eps; maxiter=5_000, regularization=true)
+        c_w_regularization = sinkhorn2(
+            μ, ν, C, eps; maxiter=5_000, rtol=1e-6, regularization=true
+        )
         @test c_w_regularization ≈ c + eps * sum(x -> iszero(x) ? x : x * log(x), γ)
 
         # compare with POT
-        c_pot = POT.sinkhorn2(μ, ν, C, eps; numItermax=5_000, stopThr=1e-9)[1]
-        @test c_pot ≈ c atol = Base.eps(Float32)
+        c_pot = POT.sinkhorn2(μ, ν, C, eps; numItermax=5_000, stopThr=1e-6)[1]
+        @test Float32(c_pot) ≈ c rtol = 1e-3
+    end
+
+    @testset "deprecations" begin
+        # create two uniform histograms
+        μ = fill(1 / M, M)
+        ν = fill(1 / N, N)
+
+        # create random cost matrix
+        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+        # `sinkhorn2`: deprecated keywords must warn and give the same result as their replacements
+        eps = 0.01
+        c = sinkhorn2(μ, ν, C, eps; atol=1e-6)
+        @test (@test_deprecated sinkhorn2(μ, ν, C, eps; tol=1e-6)) == c
+        c = sinkhorn2(μ, ν, C, eps; check_convergence=5)
+        @test (@test_deprecated sinkhorn2(μ, ν, C, eps; check_marginal_step=5)) == c
+
+        # `sinkhorn_gibbs`: same checks on the returned dual potentials `(u, v)`
+        K = @. exp(-C / eps)
+        γ = OptimalTransport.sinkhorn_gibbs(μ, ν, K; atol=1e-6)
+        @test (@test_deprecated OptimalTransport.sinkhorn_gibbs(μ, ν, K; tol=1e-6)) == γ
+        γ = OptimalTransport.sinkhorn_gibbs(μ, ν, K; check_convergence=5)
+        @test (@test_deprecated OptimalTransport.sinkhorn_gibbs(
+            μ, ν, K; check_marginal_step=5
+        )) == γ
     end
 end