diff --git a/Project.toml b/Project.toml
index 083819e..a0f407d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,8 +4,12 @@ authors = ["Thomas Wutzler and contributors"]
 version = "1.0.0-DEV"
 
 [deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
+GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
@@ -21,9 +25,13 @@ HybridVariationalInferenceLuxExt = "Lux"
 HybridVariationalInferenceSimpleChainsExt = "SimpleChains"
 
 [compat]
+CUDA = "5.5.2"
+ChainRulesCore = "1.25"
 Combinatorics = "1.0.2"
 ComponentArrays = "0.15.19"
 Flux = "v0.15.2"
+GPUArraysCore = "0.1, 0.2"
+LinearAlgebra = "1.10.0"
 Lux = "1.4.2"
 Random = "1.10.0"
 SimpleChains = "0.4"
diff --git a/README.md b/README.md
index 8de4544..e336dc4 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 Extending Variational Inference (VI), an approximate bayesian inversion method,
 to hybrid models, i.e. models that combine mechanistic and machine-learning parts.
 
-The model inversion, inferes parametric approximations of posterior density
+The model inversion infers parametric approximations of the posterior density
 of model parameters, by comparing model outputs to uncertain observations.
 At the same time, a machine learning model is fit that predicts parameters
 of these approximations by covariates.
diff --git a/dev/Project.toml b/dev/Project.toml
index 42d3764..04a4a3a 100644
--- a/dev/Project.toml
+++ b/dev/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
@@ -11,5 +12,6 @@ SimpleChains = "de6bee2f-e2f4-4ec7-b6ed-219cc6f6e9e5"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
+TransformVariables = "84d833dd-6860-57f9-a1a7-6da5db126cff"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
diff --git a/dev/doubleMM.jl b/dev/doubleMM.jl
index 95d3c36..cb754a8 100644
--- a/dev/doubleMM.jl
+++ b/dev/doubleMM.jl
@@ -65,28 +65,245 @@ loss_g(ϕg_opt1, xM, g)
 scatterplot(vec(θMs_true), vec(loss_g(ϕg_opt1, xM, g)[2]))
 @test cor(vec(θMs_true), vec(loss_g(ϕg_opt1, xM, g)[2])) > 0.9
 
-#----------- fit g and θP to y_o
-f = gen_hybridcase_PBmodel(case; scenario)
+tmpf = () -> begin
+    #----------- fit g and θP to y_o
+    # end-to-end inversion
+    f = gen_hybridcase_PBmodel(case; scenario)
 
-int_ϕθP = ComponentArrayInterpreter(CA.ComponentVector(
-    ϕg = 1:length(ϕg0), θP = par_templates.θP))
-p = p0 = vcat(ϕg0, par_templates.θP .* 0.9); # slightly disturb θP_true
+    int_ϕθP = ComponentArrayInterpreter(CA.ComponentVector(
+        ϕg = 1:length(ϕg0), θP = par_templates.θP))
+    p = p0 = vcat(ϕg0, par_templates.θP .* 0.9); # slightly perturb θP_true
 
-# Pass the site-data for the batches as separate vectors wrapped in a tuple
-train_loader = MLUtils.DataLoader((xM, xP, y_o), batchsize = n_batch)
+    # Pass the site-data for the batches as separate vectors wrapped in a tuple
+    train_loader = MLUtils.DataLoader((xM, xP, y_o), batchsize = n_batch)
 
-loss_gf = get_loss_gf(g, f, y_global_o, int_ϕθP)
-l1 = loss_gf(p0, train_loader.data...)[1]
+    loss_gf = get_loss_gf(g, f, y_global_o, int_ϕθP)
+    l1 = loss_gf(p0, train_loader.data...)[1]
 
-optf = Optimization.OptimizationFunction((ϕ, data) -> loss_gf(ϕ, data...)[1],
+    optf = Optimization.OptimizationFunction((ϕ, data) -> loss_gf(ϕ, data...)[1],
+        Optimization.AutoZygote())
+    optprob = OptimizationProblem(optf, p0, train_loader)
+
+    res = Optimization.solve(
+        optprob, Adam(0.02), callback = callback_loss(100), maxiters = 1000);
+
+    l1, y_pred_global, y_pred, θMs = loss_gf(res.u, train_loader.data...)
+    scatterplot(vec(θMs_true), vec(θMs))
+    scatterplot(log.(vec(θMs_true)), log.(vec(θMs)))
+    scatterplot(vec(y_pred), vec(y_o))
+    hcat(par_templates.θP, int_ϕθP(res.u).θP)
+end
+
+#---------- HADVI
+# TODO think about good general initializations
+coef_logσ2_logMs = [-5.769 -3.501; -0.01791 0.007951]
+logσ2_logP = CA.ComponentVector(r0=-8.997, K2=-5.893)
+mean_σ_o_MC = 0.006042
+
+# correlation matrices
+ρsP = zeros(sum(1:(n_θP-1)))
+ρsM = zeros(sum(1:(n_θM-1)))
+
+ϕunc = CA.ComponentVector(;
+    logσ2_logP=logσ2_logP,
+    coef_logσ2_logMs=coef_logσ2_logMs,
+    ρsP,
+    ρsM)
+int_unc = ComponentArrayInterpreter(ϕunc)
+
+# for a conservative uncertainty assume logσ2 = -10 (σ2 = exp(-10) ≈ 4.5e-5) and no relationship with magnitude
+ϕunc0 = CA.ComponentVector(;
+    logσ2_logP=fill(-10.0, n_θP),
+    coef_logσ2_logMs=reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)),
+    ρsP,
+    ρsM)
+
+logσ2y = fill(2 * log(σ_o), size(y_o, 1))
+n_MC = 3
+
+
+#-------------- ADVI with g inside cost function
+using CUDA
+using TransformVariables
+
+transPMs_batch = as(
+    (P=as(Array, asℝ₊, n_θP),
+    Ms=as(Array, asℝ₊, n_θM, n_batch)))
+transPMs_all = as(
+    (P=as(Array, asℝ₊, n_θP),
+    Ms=as(Array, asℝ₊, n_θM, n_site)))
+
+ϕ_true = θ = CA.ComponentVector(;
+    μP=θP_true,
+    ϕg=ϕg_opt,
+    unc=ϕunc);
+trans_gu = as(
+    (μP=as(Array, asℝ₊, n_θP),
+    ϕg=as(Array, n_ϕg),
+    unc=as(Array, length(ϕunc))))
+trans_g = as(
+    (μP=as(Array, asℝ₊, n_θP),
+    ϕg=as(Array, n_ϕg)))
+
+const int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP,
+    θMs=CA.ComponentMatrix(
+        zeros(n_θM, n_batch), first(CA.getaxes(θM)), CA.Axis(i=1:n_batch))))
+
+interpreters = interpreters_g = map(get_concrete,(;
+    μP_ϕg_unc=ComponentArrayInterpreter(ϕ_true),
+    PMs=int_PMs_batch,
+    unc=ComponentArrayInterpreter(ϕunc)
+    ))
+
+ϕg_true_vec = CA.ComponentVector(
+    TransformVariables.inverse(trans_gu, cv2NamedTuple(ϕ_true)))
+ϕcg_true = interpreters.μP_ϕg_unc(ϕg_true_vec)
+ϕ_ini = ζ = vcat(ϕcg_true[[:μP, :ϕg]] .* 1.2, ϕcg_true[[:unc]]);
+ϕ_ini0 = ζ = vcat(ϕcg_true[:μP] .* 0.0, SimpleChains.init_params(g), ϕunc0);
+
+neg_elbo_transnorm_gf(rng, g, f, ϕcg_true, y_o[:, 1:n_batch], x_o[:, 1:n_batch],
+    transPMs_batch, map(get_concrete, interpreters);
+    n_MC=8, logσ2y)
+Zygote.gradient(ϕ -> neg_elbo_transnorm_gf(
+        rng, g, f, ϕ, y_o[:, 1:n_batch], x_o[:, 1:n_batch],
+        transPMs_batch, interpreters; n_MC=8, logσ2y), ϕcg_true)
+
+() -> begin
+    train_loader = MLUtils.DataLoader((x_o, y_o), batchsize = n_batch)
+
+    optf = Optimization.OptimizationFunction((ζg, data) -> begin
+            x_o, y_o = data
+            neg_elbo_transnorm_gf(
+                rng, g, f, ζg, y_o, x_o, transPMs_batch, map(get_concrete, interpreters_g); n_MC=5, logσ2y)
+        end,
+        Optimization.AutoZygote())
+    optprob = Optimization.OptimizationProblem(optf, CA.getdata(ϕ_ini), train_loader);
+    res = Optimization.solve(optprob, Optimisers.Adam(0.02), callback=callback_loss(50), maxiters=800);
+    #optprob = Optimization.OptimizationProblem(optf, ϕ_ini0);
+    #res = Optimization.solve(optprob, Adam(0.02), callback=callback_loss(50), maxiters=1_400);
+end
+
+#using Lux
+ϕ = ϕcg_true |> gpu;
+x_o_gpu = x_o |> gpu;
+# y_o = y_o |> gpu
+# logσ2y = logσ2y |> gpu
+n_covar = size(x_o, 1)
+g_flux = Flux.Chain(
+    # dense layer with bias that maps to n_covar * 4 outputs and applies `tanh` activation
+    Flux.Dense(n_covar => n_covar * 4, tanh),
+    Flux.Dense(n_covar * 4 => n_covar * 4, logistic),
+    # dense layer without bias that maps to n_θM outputs with `identity` activation
+    Flux.Dense(n_covar * 4 => n_θM, identity, bias=false),
+)
+() -> begin
+    using Lux
+    g_lux = Lux.Chain(
+        # dense layer with bias that maps to n_covar * 4 outputs and applies `tanh` activation
+        Lux.Dense(n_covar => n_covar * 4, tanh),
+        Lux.Dense(n_covar * 4 => n_covar * 4, logistic),
+        # dense layer without bias that maps to n_θM outputs with `identity` activation
+        Lux.Dense(n_covar * 4 => n_θM, identity, use_bias=false),
+    )
+    ps, st = Lux.setup(Random.default_rng(), g_lux)
+    ps_ca = CA.ComponentArray(ps) |> gpu
+    st = st |> gpu
+    g_luxs = StatefulLuxLayer{true}(g_lux, nothing, st)
+    g_luxs(x_o_gpu[:, 1:n_batch], ps_ca)
+    ax_g = CA.getaxes(ps_ca)
+    g_luxs(x_o_gpu[:, 1:n_batch], CA.ComponentArray(ϕ.ϕg, ax_g))
+    interpreters = (interpreters..., ϕg = ComponentArrayInterpreter(ps_ca))
+    ϕg = CA.ComponentArray(ϕ.ϕg, ax_g)
+    ϕgc = interpreters.ϕg(ϕ.ϕg)
+    g_gpu = g_luxs
+end
+g_gpu = g_flux
+
+#Zygote.gradient(ϕg -> sum(g_gpu(x_o_gpu[:, 1:n_batch],ϕg)), ϕgc)
+# Zygote.gradient(ϕg -> sum(compute_g(g_gpu, x_o_gpu[:, 1:n_batch], ϕg, interpreters)), ϕ.ϕg)
+# Zygote.gradient(ϕ -> sum(tmp_gen1(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), ϕ.ϕg)
+# Zygote.gradient(ϕ -> sum(tmp_gen2(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), CA.getdata(ϕ))
+# Zygote.gradient(ϕ -> sum(tmp_gen2(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), ϕ) |> cpu
+# Zygote.gradient(ϕ -> sum(tmp_gen3(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), ϕ) |> cpu
+# Zygote.gradient(ϕ -> sum(tmp_gen4(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)[1]), ϕ) |> cpu
+# generate_ζ(rng, g_gpu, f, ϕ, x_o_gpu[:, 1:n_batch], interpreters)
+# Zygote.gradient(ϕ -> sum(generate_ζ(rng, g_gpu, f, ϕ, x_o_gpu[:, 1:n_batch], interpreters)[1]), ϕ) |> cpu
+# include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss
+# neg_elbo_transnorm_gf(rng, g_gpu, f, ϕ, y_o[:, 1:n_batch],
+#     x_o_gpu[:, 1:n_batch], transPMs_batch, interpreters; logσ2y)
+# Zygote.gradient(ϕ -> sum(neg_elbo_transnorm_gf(rng, g_gpu, f, ϕ, y_o[:, 1:n_batch],
+#     x_o_gpu[:, 1:n_batch], transPMs_batch, interpreters; logσ2y)[1]), ϕ) |> cpu
+
+
+fcost(ϕ) = neg_elbo_transnorm_gf(rng, g_gpu, f, ϕ, y_o[:, 1:n_batch],
+    x_o_gpu[:, 1:n_batch], transPMs_batch, map(get_concrete, interpreters);
+    n_MC=8, logσ2y = logσ2y)
+fcost(ϕ)
+gr = Zygote.gradient(fcost, ϕ) |> cpu;
+Zygote.gradient(fcost, CA.getdata(ϕ))
+
+
+train_loader = MLUtils.DataLoader((x_o_gpu, y_o), batchsize = n_batch)
+
+optf = Optimization.OptimizationFunction((ζg, data) -> begin
+        x_o, y_o = data
+        neg_elbo_transnorm_gf(
+            rng, g_gpu, f, ζg, y_o, x_o, transPMs_batch, map(get_concrete, interpreters_g); n_MC=5, logσ2y)
+    end,
     Optimization.AutoZygote())
-optprob = OptimizationProblem(optf, p0, train_loader)
+optprob = Optimization.OptimizationProblem(optf, CA.getdata(ϕ_ini) |> gpu, train_loader);
+res = res_gpu = Optimization.solve(optprob, Optimisers.Adam(0.02), callback=callback_loss(50), maxiters=800);
+
+ζ_VIc = interpreters_g.μP_ϕg_unc(res.u |> cpu)
+ζMs_VI = g(x_o, ζ_VIc.ϕg)
+ϕunc_VI = int_unc(ζ_VIc.unc)
+
+hcat(θP_true, exp.(ζ_VIc.μP))
+plt = scatterplot(vec(θMs_true), vec(exp.(ζMs_VI)))
+#lineplot!(plt, 0.0, 1.1, identity)
+#
+hcat(ϕunc, ϕunc_VI) # need to compare to MC sample
+# hard to estimate for the original very small thetas, but otherwise good
+
+# test predicting the correct observation uncertainty of the predictive posterior
+n_sample_pred = 200
+intm_PMs_gen = ComponentArrayInterpreter(CA.ComponentVector(; θP,
+    θMs=CA.ComponentMatrix(
+        zeros(n_θM, n_site), first(CA.getaxes(θM)), CA.Axis(i=1:n_sample_pred))))
+
+include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss
+ζs, _ = generate_ζ(rng, g, f, res.u |> cpu, x_o,
+    (;interpreters..., PMs = intm_PMs_gen); n_MC=n_sample_pred)
+# ζ = ζs[:,1]
+θsc = stack(ζ -> CA.getdata(CA.ComponentVector(
+    TransformVariables.transform(transPMs_all, ζ))), eachcol(ζs));
+y_pred = stack(map(ζ -> first(predict_y(ζ, f, transPMs_all)), eachcol(ζs)));
+
+size(y_pred)
+σ_o_post = mapslices(std, y_pred; dims=3);
+#describe(σ_o_post)
+vcat(σ_o, mean_σ_o_MC, mean(σ_o_post), sqrt(mean(abs2, σ_o_post)))
+mean_y_pred = map(mean, eachslice(y_pred; dims=(1, 2)))
+#describe(mean_y_pred - y_o)
+histogram(vec(mean_y_pred - y_true)) # predictions centered around y_o (or y_true)
+
+# look at θP, θM1 of the first site
+intm = ComponentArrayInterpreter(int_θdoubleMM(1:length(int_θdoubleMM)), (n_sample_pred,))
+ζs1c = intm(ζs[1:length(int_θdoubleMM), :])
+vcat(θP_true, θM_true)
+histogram(exp.(ζs1c[:r0, :]))
+histogram(exp.(ζs1c[:K2, :]))
+histogram(exp.(ζs1c[:r1, :]))
+histogram(exp.(ζs1c[:K1, :]))
+# all parameters estimated too high (true values not within the CI bounds)
+scatterplot(ζs1c[:r1, :], ζs1c[:K1, :]) # r1 and K1 strongly correlated (from θM)
+scatterplot(ζs1c[:r0, :], ζs1c[:K2, :]) # r0 and K2 also correlated (from θP)
+scatterplot(ζs1c[:r0, :], ζs1c[:K1, :]) # no correlation (modeled independent)
+
+# TODO compare distributions to MC sample
+
+
+
+
-res = Optimization.solve(
-    optprob, Adam(0.02), callback = callback_loss(100), maxiters = 1000);
-l1, y_pred_global, y_pred, θMs = loss_gf(res.u, train_loader.data...)
-scatterplot(vec(θMs_true), vec(θMs))
-scatterplot(log.(vec(θMs_true)), log.(vec(θMs)))
-scatterplot(vec(y_pred), vec(y_o))
-hcat(par_templates.θP, int_ϕθP(res.u).θP)
diff --git a/src/HybridVariationalInference.jl b/src/HybridVariationalInference.jl
index d66b4d0..1d7b2bb 100644
--- a/src/HybridVariationalInference.jl
+++ b/src/HybridVariationalInference.jl
@@ -4,6 +4,10 @@ using ComponentArrays: ComponentArrays as CA
 using Random
 using StatsBase # fit ZScoreTransform
 using Combinatorics # gen_hybridcase_synthetic/combinations
+using GPUArraysCore
+using LinearAlgebra
+using CUDA
+using ChainRulesCore
 
 export ComponentArrayInterpreter, flatten1, get_concrete
 include("ComponentArrayInterpreter.jl")
@@ -25,6 +29,9 @@ include("gencovar.jl")
 
 export callback_loss
 include("util_opt.jl")
 
+#export - all internal
+include("cholesky.jl")
+
 export DoubleMM
 include("DoubleMM/DoubleMM.jl")
 
diff --git a/src/cholesky.jl b/src/cholesky.jl
new file mode 100644
index 0000000..5432d3a
--- /dev/null
+++ b/src/cholesky.jl
@@ -0,0 +1,259 @@
+sumn(n::T) where T = n*(n+one(T)) ÷ T(2)
+
+"""
+Inverse of `s = sumn(n)` for positive integer `n`.
+
+Gives an `InexactError` if the given `s` is not such a sum.
+"""
+invsumn(s::T) where T = T(-0.5 + sqrt(1 / 4 + 2 * s)) # inversion of n*(n+1)/2
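+
+# editor's sketch (not part of the original PR): a quick sanity check of the pair,
+# wrapped in a thunk like the other interactive blocks in this repo, so it is not executed
+() -> begin
+    sumn(3) == 6       # 1 + 2 + 3
+    invsumn(6) == 3    # exact inverse of sumn
+    # invsumn(5)       # throws InexactError: 5 is not a triangular number
+end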
+
+"""
+Convert a vector `v` of columnwise entries of an upper triangular matrix to an `UpperTriangular`.
+"""
+function vec2utri(v::AbstractVector{T}; n=invsumn(length(v))) where {T}
+    # works with Zygote despite doing mutation of k (see test_cholesky_structure.jl)
+    #https://groups.google.com/g/julia-users/c/UARlZBCNlng/m/6tKKxIeoEY8J
+    z = zero(T)
+    k = 0
+    m = [j >= i ? (k += 1; v[k]) : z for i = 1:n, j = 1:n]
+    UpperTriangular(m)
+end
+
+function vec2utri(v::GPUArraysCore.AbstractGPUVector{T}; n=invsumn(length(v))) where {T}
+    z = zero(T)
+    k = 0
+    m = CUDA.allowscalar() do
+        CuArray([j >= i ? (k += 1; v[k]) : z for i = 1:n, j = 1:n])
+    end
+    # m = CUDA.zeros(T,n,n)
+    # @cuda threads = 256 vec2utri_gpu!(m, v) # planned to put v into positions of m
+    return (m)
+end
+
+function vec2utri_gpu!(m, v::AbstractVector)
+    index = threadIdx().x # this example only requires linear indexing, so just use `x`
+    stride = blockDim().x
+    for i = index:stride:length(v)
+        row, col = vec2utri_pos(i)
+        @inbounds m[row, col] = v[i]
+    end
+    return nothing # important
+end
+
+"""
+Compute the (one-based) position `(row, col)` within an upper triangular matrix for a
+given (one-based) position `s` within the packed vector representation.
+"""
+function vec2utri_pos(T, s)
+    col = ceil(T, -0.5 + sqrt(1 / 4 + 2 * s)) # inversion of n*(n+1)/2
+    cl = col - one(T)
+    s1 = cl*(cl+one(T)) ÷ T(2) # number of entries before this column
+    row = s - s1
+    (row, col)
+end
+vec2utri_pos(s) = vec2utri_pos(Int, s)
+
+"""
+Compute the index in the packed vector of the entries of an upper triangular matrix.
+"""
+function utri2vec_pos(row, col)
+    @assert row <= col
+    utri2vec_pos_unsafe(row, col)
+end
+function utri2vec_pos_unsafe(row::T, col::T) where T
+    # variant that does not check for an unreasonable position away from the upper triangle
+    sc1 = T((col-one(T))*(col) ÷ T(2))
+    sc1 + row
+end
+
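+# editor's sketch (not part of the original PR): the two position helpers invert each
+# other; in the columnwise packing (1,1)->1, (1,2)->2, (2,2)->3, (1,3)->4, ...
+() -> begin
+    vec2utri_pos(4) == (1, 3)
+    utri2vec_pos(1, 3) == 4
+end
+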
+
+function vec2uutri(v::AbstractArray{T}; kwargs...) where {T}
+    m = _vec2uutri(v; kwargs...)
+    UnitUpperTriangular(m)
+    #TriangularRFP(m, :U) # no adjoint
+    #SymmetricPacked(m) # no adjoint
+    #SymmetricPacked(v) # also no adjoint - does not yet care for main diagonal
+end
+
+# """
+# Convert a vector `v` of columnwise entries of an upper triangular matrix to a `UnitUpperTriangular`.
+
+# Avoid using this repeatedly on GPU arrays, because it only works on CPU (scalar indexing).
+# There is a fallback that pulls `v` to the CPU, applies, and pushes back to GPU.
+# """
+function _vec2uutri(v::AbstractVector{T}; n=invsumn(length(v)) + one(T), diag=one(T)) where {T}
+    z = zero(T)
+    k = 0
+    m = [j > i ? (k += 1; v[k]) : i == j ? diag : z for i = 1:n, j = 1:n]
+    return (m)
+end
+
+
+#function vec2uutri(v::GPUArraysCore.AbstractGPUVector{T}; n=invsumn(length(v)) + one(T)) where {T}
+#TODO remove internal conversion to CuArray to generalize to AbstractGPUVector
+function vec2uutri(v::CuVector{T}; n=invsumn(length(v)) + 1, diag=one(T)) where {T}
+    # _one = one(T)
+    # z = zero(T)
+    # k = 0
+    # m = CUDA.allowscalar() do
+    #     CuArray([j > i ? (k += 1; v[k]) : i == j ? _one : z for i = 1:n, j = 1:n])
+    # end
+    m = CUDA.zeros(T,n,n)
+    m[1:size(m, 1)+1:end] .= diag # indexing trick for diag(m) .= one(T)
+    @cuda threads = 256 vec2uutri_gpu!(m, v)
+    return (m)
+end
+
+function vec2uutri_gpu!(m::Union{CuDeviceArray, GPUArraysCore.AbstractGPUMatrix}, v::AbstractVector)
+    index = threadIdx().x # this example only requires linear indexing, so just use `x`
+    stride = blockDim().x
+    for i = index:stride:length(v)
+        row, col = vec2utri_pos(i)
+        # for the unit-upper triangle shift the column position by one
+        @inbounds m[row, col+1] = v[i]
+    end
+    return nothing # important
+end
+
+function ChainRulesCore.rrule(::typeof(vec2uutri), v::AbstractArray; kwargs...)
+    # does not change the value or the derivative but just reshapes
+    # -> extract the components of matrix Δy into a vector
+    function vec2uutri_pullback(Δy)
+        (NoTangent(), uutri2vec(Δy))
+    end
+    return vec2uutri(v; kwargs...), vec2uutri_pullback
+end
+
+"""
+Extract the entries of an `UpperTriangular` matrix into a columnwise vector.
+"""
+function utri2vec(X::AbstractMatrix{T}) where {T}
+    n = size(X, 1)
+    lv = sumn(n)
+    i = 0
+    j = 1
+    [
+        begin
+            if i == j
+                i = 0
+                j += 1
+            end
+            i += 1
+            X[i, j]
+        end for _ in 1:lv
+    ]
+end
+
+
+
+"""
+Extract the above-diagonal entries of a `UnitUpperTriangular` matrix into a columnwise vector.
+"""
+function uutri2vec(X::AbstractMatrix{T}) where {T}
+    n = size(X, 1) - 1
+    lv = sumn(n)
+    i = 0
+    j = 2
+    [
+        begin
+            if i == j - 1
+                i = 0
+                j += 1
+            end
+            i += 1
+            X[i, j]
+        end for _ in 1:lv
+    ]
+end
+
+function ChainRulesCore.rrule(::typeof(uutri2vec), X::AbstractMatrix{T}) where T
+    # does not change the value or the derivative but just reshapes
+    # -> put the components of vector Δy into a matrix
+    # make sure that the gradient of the main diagonal is zero rather than one
+    function uutri2vec_pullback(Δy)
+        (NoTangent(), vec2uutri(Δy; diag=zero(T)))
+    end
+    return uutri2vec(X), uutri2vec_pullback
+end
+
+#function uutri2vec(X::GPUArraysCore.AbstractGPUMatrix; kwargs...)
+#TODO remove internal coercion to CuArray to extend to other AbstractGPUMatrix
+# function uutri2vec(X::CuArray; kwargs...)
+#     n = size(X, 1) - 1
+#     lv = sumn(n)
+#     i = 0
+#     j = 2
+#     CUDA.allowscalar() do
+#         CuArray([
+#             begin
+#                 if i == j - 1
+#                     i = 0
+#                     j += 1
+#                 end
+#                 i += 1
+#                 X[i, j]
+#             end for _ in 1:lv
+#         ])
+#     end
+# end
+function uutri2vec(X::CuMatrix{T}; kwargs...) where T
+    lv = sumn(size(X,1)-1)
+    v = CUDA.zeros(T,lv)
+    @cuda threads=(16, 16) uutri2vec_gpu!(v, X)
+    return v
+end
+
+function uutri2vec_gpu!(v::Union{CuVector,CuDeviceVector}, X::AbstractMatrix)
+    x = threadIdx().x
+    y = threadIdx().y
+    stride_x = blockDim().x
+    stride_y = blockDim().y
+    for row = x:stride_x:size(X,1)
+        for col = y:stride_y:size(X,2)
+            if row < col
+                i = utri2vec_pos_unsafe(row, col-1)
+                @inbounds v[i] = X[row, col]
+            end
+        end
+    end
+    return nothing # important
+end
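+
+# editor's sketch (not part of the original PR): vec2uutri and uutri2vec are inverse
+# reshapes, and their rrules shuttle cotangents between the two layouts without scaling
+() -> begin
+    v = collect(1.0:6.0)
+    U = vec2uutri(v)       # 4x4 UnitUpperTriangular
+    uutri2vec(U) == v      # round trip recovers the packed vector
+end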
+
+"""
+Takes a vector of entries of a `UnitUpperTriangular` matrix
+and transforms it to an `UpperTriangular` `U` that satisfies
+`diag(U' * U) = 1`.
+
+This can be used to fit parameters that yield an upper Cholesky factor
+of a covariance matrix.
+
+It uses the upper triangular matrix rather than the lower one, because it
+involves a sum across columns, whereas the lower-triangular alternative
+sums across rows.
+Summing across columns is often faster, because the entries of a column are contiguous.
+"""
+function transformU_cholesky1(v::AbstractVector;
+    n=invsumn(length(v)) + 1
+    )
+    U_scaled = vec2uutri(v; n)
+    #Sc_inv = sqrt.(sum(abs2, U_scaled, dims=1))
+    #U_scaled * Diagonal(1 ./ vec(Sc_inv))
+    #U = U_scaled ./ Sc_inv
+    U = U_scaled ./ sqrt.(sum(abs2, U_scaled, dims=1))
+    return (UpperTriangular(U))
+end
+function transformU_cholesky1(v::GPUArraysCore.AbstractGPUVector; n=invsumn(length(v)) + 1)
+    U_scaled = vec2uutri(v; n)
+    U = U_scaled ./ sqrt.(sum(abs2, U_scaled, dims=1))
+    # do not convert to UpperTriangular on the GPU, but return the full matrix
+    #return (UpperTriangular(U))
+    return U
+end
+
+() -> begin
+    tmp = sqrt.(sum(abs2, U_scaled, dims=1))
+    tmp2 = sum(abs2, U_scaled, dims=1) .^ (-1 / 2)
+    U_scaled * tmp'
+end
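+
+# editor's sketch (not part of the original PR): the returned factor has unit-norm
+# columns, so U'U is a valid correlation matrix, cf. test_cholesky_structure.jl
+() -> begin
+    U = transformU_cholesky1(collect(1.0:6.0))
+    all(diag(U' * U) .≈ 1)
+end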
diff --git a/src/gencovar.jl b/src/gencovar.jl
index 46c613e..69206c8 100644
--- a/src/gencovar.jl
+++ b/src/gencovar.jl
@@ -4,7 +4,7 @@ are a linear combination of the uncorrelated underlying principal factors
 and their binary combinations.
 
 In addition provide a SimpleChains model of adequate complexity to
-fit this realationship θMs_true = f(x_o)
+fit this relationship θMs_true = f(x_o)
 """
 function gen_cov_pred(rng::AbstractRNG, T::DataType,
     n_covar_pc, n_covar, n_site, n_θM::Integer;
diff --git a/test/Project.toml b/test/Project.toml
index f3437a2..3006d5b 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -4,6 +4,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 OptimizationOptimisers = "42dfb2eb-d2b4-4451-abcd-913932933ac1"
diff --git a/test/runtests.jl b/test/runtests.jl
index ee4dbc1..49f0f37 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,6 +11,8 @@ const GROUP = get(ENV, "GROUP", "All") # defined in in CI.yml
     @time @safetestset "test_SimpleChains" include("test_SimpleChains.jl")
     #@safetestset "test" include("test/test_doubleMM.jl")
     @time @safetestset "test_doubleMM" include("test_doubleMM.jl")
+    #@safetestset "test" include("test/test_cholesky_structure.jl")
+    @time @safetestset "test_cholesky_structure" include("test_cholesky_structure.jl")
     #
     #@safetestset "test" include("test/test_Flux.jl")
     @time @safetestset "test_Flux" include("test_Flux.jl")
@@ -22,7 +24,9 @@ end
 @time begin
     if GROUP == "All" || GROUP == "Aqua"
         #@safetestset "test" include("test/test_aqua.jl")
-        @time @safetestset "test_aqua" include("test_aqua.jl")
+        if VERSION >= VersionNumber("1.11.2")
+            @time @safetestset "test_aqua" include("test_aqua.jl")
+        end
     end
 end
 
diff --git a/test/test_cholesky_structure.jl b/test/test_cholesky_structure.jl
new file mode 100644
index 0000000..ad188f4
--- /dev/null
+++ b/test/test_cholesky_structure.jl
@@ -0,0 +1,254 @@
+using LinearAlgebra, Test
+using HybridVariationalInference
+using HybridVariationalInference: HybridVariationalInference as CP
+using Zygote
+using OptimizationOptimisers
+using ComponentArrays: ComponentArrays as CA
+#using SymmetricFormats
+using GPUArraysCore: GPUArraysCore
+#using Flux
+using CUDA
+
+A = [1.0 2.0 3.0
+    2.0 1.0 4.0
+    3.0 4.0 1.0] + I(3) .* 10
+LA = cholesky(A).L
+
+B = rand(3, 3) ./ 10
+
+C = [1.0 2.0 3.2
+    2.0 1.0 4.2
+    3.2 4.2 1.0] + I(3) .* 10
+LC = cholesky(C).L
+
+Z = zeros(3, 3)
+
+@testset "cholesky of blockdiagonal" begin
+    # the Cholesky factor of a BlockDiagonal equals the BlockDiagonal
+    # of the Cholesky factors of its blocks
+    X = [A Z; Z' C]
+    LX = cholesky(X).L
+    @test LX == [LA Z; Z LC]
+end;
+
+@testset "cholesky of edge structure" begin
+    # the structure of the Cholesky factorization of a BlockDiagonal with edges does not carry over
+    X = [A B B; B' C Z; B' Z C]
+    LX = cholesky(X).L
+    LX[4:6, 4:6] ≈ LX[7:9, 7:9]
+    X = [A B B B;
+        B' C Z Z;
+        B' Z C Z;
+        B' Z Z C]
+    LX = cholesky(X).L
+    LX[4:6, 4:6] ≈ LX[7:9, 7:9]
+    # but non-zero off-diagonals at the Z positions and different edge entries
+    @test_broken LX[7:9, 4:6] ≈ Z
+    @test_broken LX[9:12, 4:6] ≈ LX[9:12, 7:9]
+end;
+
+@testset "invsumn" begin
+    ns_orig = [1, 2, 3, 6]
+    s = map(n -> sum(1:n), ns_orig)
+    ns = CP.invsumn.(s)
+    @test ns == ns_orig
+    @test eltype(ns) == Int
+    #
+    @test_throws Exception CP.invsumn(5) # 5 is not the result of sum(1:n)
+end;
+
+@testset "vec2utri" begin
+    v_orig = 1.0:6.0
+    Uv = CP.vec2utri(v_orig)
+    @test Uv isa UpperTriangular
+    Zygote.gradient(v -> sum(CP.vec2utri(v)), v_orig)[1] # works nicely
+    #
+    v2 = CP.utri2vec(Uv)
+    @test v2 == v_orig
+    Zygote.gradient(Uv -> sum(CP.utri2vec(Uv)), Uv)[1] # works nicely
+end;
+
+@testset "vec2uutri" begin
+    v = v_orig = 1.0:6.0
+    vcpu = collect(v_orig)
+    n = CP.invsumn(length(v)) + 1
+    T = eltype(v)
+    U1v = CP.vec2uutri(v_orig)
+    @test U1v isa UnitUpperTriangular
+    @test size(U1v, 1) == 4
+    gr = Zygote.gradient(v -> sum(abs2.(CP.vec2uutri(v))), vcpu)[1] # works nicely
+    # test providing the keyword argument
+    gr = Zygote.gradient(v -> sum(abs2.(CP.vec2uutri(v; n=4))), vcpu)[1] # works nicely
+    #
+    v2 = CP.uutri2vec(U1v)
+    @test v2 == v_orig
+    gr = Zygote.gradient(U1v -> sum(CP.uutri2vec(U1v) .* (1.0:6.0)), U1v)[1] # works nicely
+end;
+
+@testset "utri2vec_pos" begin
+    @test CP.utri2vec_pos(1,1) == 1
+    @test CP.utri2vec_pos(1,2) == 2
+    @test CP.utri2vec_pos(2,2) == 3
+    @test CP.utri2vec_pos(1,3) == 4
+    @test CP.utri2vec_pos(1,4) == 7
+    @test CP.utri2vec_pos(5,5) == 15
+    @test typeof(CP.utri2vec_pos(5,5)) == Int
+    @test typeof(CP.utri2vec_pos(Int32(5),Int32(5))) == Int32
+    @test_throws AssertionError CP.utri2vec_pos(2,1)
+end
+
+@testset "vec2uutri gpu" begin
+    if CUDA.functional() # only run the test if CUDA is working (not on GitHub CI)
+        v_orig = 1.0f0:6.0f0
+        v = CuArray(collect(v_orig))
+        U1v = CP.vec2uutri(v)
+        @test !(U1v isa UnitUpperTriangular) # on CUDA work with a plain matrix
+        @test U1v isa GPUArraysCore.AbstractGPUMatrix
+        @test size(U1v, 1) == 4
+        @test Array(U1v) == CP.vec2uutri(v_orig)
+        gr = Zygote.gradient(v -> sum(abs2.(CP.vec2uutri(v))), v)[1] # works nicely
+        @test gr isa GPUArraysCore.AbstractGPUArray
+        @test Array(gr) == (1:6) .* 2.0
+        #
+        v2 = CP.uutri2vec(U1v)
+        @test v2 isa GPUArraysCore.AbstractGPUVector
+        @test eltype(v2) == eltype(U1v)
+        @test Array(v2) == v_orig
+        gr = Zygote.gradient(U1v -> sum(CP.uutri2vec(U1v .* 2)), U1v)[1] # works nicely
+        @test gr isa GPUArraysCore.AbstractGPUArray
+        @test all(diag(gr) .== 0)
+        @test Array(CP.uutri2vec(gr)) == fill(2.0f0, length(v_orig))
+    end
+end;
+
+@testset "transformU_cholesky1 gpu" begin
+    v_orig = 1.0f0:6.0f0
+    vcpu = collect(v_orig)
+    ch = CP.transformU_cholesky1(vcpu)
+    gr1 = Zygote.gradient(v -> sum(CP.transformU_cholesky1(v)), vcpu)[1] # works nicely
+    @test all(diag(ch' * ch) .≈ 1)
+    if CUDA.functional() # only run the test if CUDA is working (not on GitHub CI)
+        v = CuArray(collect(v_orig))
+        U1v = CP.transformU_cholesky1(v)
+        @test !(U1v isa UnitUpperTriangular) # on CUDA work with a plain matrix
+        @test U1v isa GPUArraysCore.AbstractGPUMatrix
+        @test Array(U1v) ≈ ch
+        gr2 = Zygote.gradient(v -> sum(CP.transformU_cholesky1(v)), v)[1] # works nicely
+        @test Array(gr2) == gr1
+    end
+end;
+
+() -> begin
+    # setup for the fitting in the interactive blocks below
+    _X = rand(3, 3)
+    S = _X * _X' # known to be Hermitian
+    stdS = sqrt.(diag(S))
+    C = Diagonal(1 ./ stdS) * S * Diagonal(1 ./ stdS)
+    @test Diagonal(stdS) * C * Diagonal(stdS) ≈ S
+
+    SL = cholesky(S).L
+    SU = cholesky(S).U
+    # CU = cholesky(C).U # fails: it is not recognized as Symmetric
+    # CU = cholesky(UnitLowerTriangular(C)).U
+    CU = cholesky(Symmetric(C)).U
+    CL = cholesky(Symmetric(C)).L
+    @test CU * Diagonal(stdS) ≈ SU
+    @test Diagonal(stdS) * CL ≈ SL
+end
+
+() -> begin
+    # find entries of U so that Cpred = U'*U = C
+    n_x = 20
+    xs = rand(3, n_x)
+    ys = xs' * CU
+    n_U = size(C, 1)
+    fcost = (Uvec) -> begin
+        U = vec2utri(Uvec; n = n_U)
+        y_pred = xs' * U
+        sum(abs2, ys .- y_pred)
+    end
+    Uvec_true = utri2vec(CU)
+    Uvec0 = Uvec_true .* 1.2
+    fcost(Uvec0)
+
+    optf = Optimization.OptimizationFunction((x, p) -> fcost(x), Optimization.AutoZygote())
+    optprob = Optimization.OptimizationProblem(optf, Uvec0)
+    res = Optimization.solve(optprob, OptimizationOptimisers.Adam(0.02),
+        callback = callback_loss(50), maxiters = 600)
+
+    #hcat(Uvec_true, res.u, Uvec0)
+    Upred = vec2utri(res.u)
+    @test Upred ≈ CU
+    Cpred = Upred' * Upred
+    #hcat(C, Cpred)
+    @test Cpred ≈ C
+end
+
+() -> begin
+    # find the Cholesky factor so that U' * U = C, given that diag(C) == 1
+    n_x = 20
+    xs = rand(3, n_x)
+    ys = ysC = xs' * CU
+    # Us1vec = 1.0:6.0
+    n_U = size(S, 1)
+
+    fcost = fcostCT = (Us1vec) -> begin
+        U = transformU_cholesky1(Us1vec; n = n_U)
+        y_pred = xs' * U
+        sum(abs2, ys .- y_pred)
+    end
+    # cannot infer the true U_scaled any more
+    Unscaled0 = CU ./ diag(CU)
+    Us1vec0 = uutri2vec(Unscaled0)
+    fcost(Us1vec0)
+
+    optf = Optimization.OptimizationFunction((x, p) -> fcost(x), Optimization.AutoZygote())
+    optprob = Optimization.OptimizationProblem(optf, Us1vec0)
+    res = resCT = Optimization.solve(optprob, OptimizationOptimisers.Adam(0.02),
+        callback = callback_loss(50), maxiters = 1_600)
+
+    Upred = transformU_cholesky1(res.u; n = n_U)
+    #hcat(CU', Upred)
+    @test Upred ≈ CU
+    fcostCT(resCT.u)
+end
+
+@testset "fit cholesky correlation" begin
+    # find a transformed Cholesky factor that recovers the covariance S given its diagonal
+    _X = rand(3, 3)
+    S = _X * _X' # known to be Hermitian
+    n_x = 200
+    xs = rand(3, n_x)
+    SU = cholesky(S).U
+    σ_o = 0.05
+    ys_true = ysS = xs' * SU
+    ys = ys_true .+ randn(n_x) .* σ_o
+
+    Dσ = Diagonal(sqrt.(diag(S))) # assume given
+    n_U = size(S, 1)
+
+    fcost = fcostS = (Us1vec) -> begin
+        U = CP.transformU_cholesky1(Us1vec; n = n_U)
+        y_pred = (xs' * U) * Dσ
+        sum(abs2, ys .- y_pred)
+    end
+    # cannot infer the true U_scaled any more
+    Unscaled0 = S ./ diag(S)
+    Us1vec0 = CP.uutri2vec(Unscaled0)
+    fcost(Us1vec0)
+    #fcostS(resCT.u) # the cost at u optimized for the correlation should be small if the same xs are used
+
+    optf = Optimization.OptimizationFunction((x, p) -> fcost(x), Optimization.AutoZygote())
+    optprob = Optimization.OptimizationProblem(optf, Us1vec0)
+    res = Optimization.solve(optprob, OptimizationOptimisers.Adam(0.02),
+        #callback=callback_loss(50),
+        maxiters = 800)
+
+    Upred = CP.transformU_cholesky1(res.u; n = n_U)
+    #@test Upred ≈ CU
+    SUpred = Upred * Dσ
+    #hcat(SUpred, SU)
+    @test SUpred≈SU atol=1e-1
+    S_pred = Dσ' * Upred' * Upred * Dσ
+    @test S_pred≈S atol=1e-1
+end
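+
+# editor's sketch (not part of the original PR): ChainRulesTestUtils was added to
+# dev/Project.toml, so the hand-written rrules can also be checked against finite
+# differences; kept as a thunk like the interactive blocks above, since test_rrule
+# on structured-matrix wrappers may need extra tangent hints
+#using ChainRulesTestUtils
+() -> begin
+    v = collect(1.0:6.0)
+    test_rrule(CP.vec2uutri, v)       # pullback extracts the strict upper triangle
+    X = collect(CP.vec2uutri(v))
+    test_rrule(CP.uutri2vec, X)       # pullback re-embeds Δ with a zero diagonal
+end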