diff --git a/Project.toml b/Project.toml
index 083819e..a0f407d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,8 +4,12 @@ authors = ["Thomas Wutzler and contributors"]
 version = "1.0.0-DEV"
 
 [deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
+GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
@@ -21,9 +25,13 @@ HybridVariationalInferenceLuxExt = "Lux"
 HybridVariationalInferenceSimpleChainsExt = "SimpleChains"
 
 [compat]
+CUDA = "5.5.2"
+ChainRulesCore = "1.25"
 Combinatorics = "1.0.2"
 ComponentArrays = "0.15.19"
 Flux = "v0.15.2"
+GPUArraysCore = "0.1, 0.2"
+LinearAlgebra = "1.10.0"
 Lux = "1.4.2"
 Random = "1.10.0"
 SimpleChains = "0.4"
diff --git a/README.md b/README.md
index 8de4544..e336dc4 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 Extending Variational Inference (VI), an approximate bayesian inversion method,
 to hybrid models, i.e. models that combine mechanistic and machine-learning parts.
 
-The model inversion, inferes parametric approximations of posterior density
+The model inversion infers parametric approximations of the posterior density
 of model parameters, by comparing model outputs to uncertain observations.
 At the same time, a machine learning model is fit that predicts parameters
 of these approximations by covariates.
diff --git a/dev/Project.toml b/dev/Project.toml
index 42d3764..04a4a3a 100644
--- a/dev/Project.toml
+++ b/dev/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
@@ -11,5 +12,6 @@ SimpleChains = "de6bee2f-e2f4-4ec7-b6ed-219cc6f6e9e5"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
+TransformVariables = "84d833dd-6860-57f9-a1a7-6da5db126cff"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
diff --git a/dev/doubleMM.jl b/dev/doubleMM.jl
index 95d3c36..cb754a8 100644
--- a/dev/doubleMM.jl
+++ b/dev/doubleMM.jl
@@ -65,28 +65,245 @@ loss_g(ϕg_opt1, xM, g)
 scatterplot(vec(θMs_true), vec(loss_g(ϕg_opt1, xM, g)[2]))
 @test cor(vec(θMs_true), vec(loss_g(ϕg_opt1, xM, g)[2])) > 0.9
 
-#----------- fit g and θP to y_o
-f = gen_hybridcase_PBmodel(case; scenario)
+tmpf = () -> begin
+    #----------- fit g and θP to y_o
+    # end-to-end inversion
+    f = gen_hybridcase_PBmodel(case; scenario)
 
-int_ϕθP = ComponentArrayInterpreter(CA.ComponentVector(
-    ϕg = 1:length(ϕg0), θP = par_templates.θP))
-p = p0 = vcat(ϕg0, par_templates.θP .* 0.9); # slightly disturb θP_true
+    int_ϕθP = ComponentArrayInterpreter(CA.ComponentVector(
+        ϕg = 1:length(ϕg0), θP = par_templates.θP))
+    p = p0 = vcat(ϕg0, par_templates.θP .* 0.9); # slightly perturb θP_true
 
-# Pass the site-data for the batches as separate vectors wrapped in a tuple
-train_loader = MLUtils.DataLoader((xM, xP, y_o), batchsize = n_batch)
+    # Pass the site-data for the batches as separate vectors wrapped in a tuple
+    train_loader = MLUtils.DataLoader((xM, xP, y_o), batchsize = n_batch)
 
-loss_gf = get_loss_gf(g, f, y_global_o, int_ϕθP)
-l1 = loss_gf(p0, train_loader.data...)[1]
+    loss_gf = get_loss_gf(g, f, y_global_o, int_ϕθP)
+    l1 = loss_gf(p0, train_loader.data...)[1]
 
-optf = Optimization.OptimizationFunction((ϕ, data) -> loss_gf(ϕ, data...)[1],
+    optf = Optimization.OptimizationFunction((ϕ, data) -> loss_gf(ϕ, data...)[1],
+        Optimization.AutoZygote())
+    optprob = OptimizationProblem(optf, p0, train_loader)
+
+    res = Optimization.solve(
+        optprob, Adam(0.02), callback = callback_loss(100), maxiters = 1000);
+
+    l1, y_pred_global, y_pred, θMs = loss_gf(res.u, train_loader.data...)
+    scatterplot(vec(θMs_true), vec(θMs))
+    scatterplot(log.(vec(θMs_true)), log.(vec(θMs)))
+    scatterplot(vec(y_pred), vec(y_o))
+    hcat(par_templates.θP, int_ϕθP(res.u).θP)
+end
+
+#---------- HADVI
+# TODO think about good general initializations
+coef_logσ2_logMs = [-5.769 -3.501; -0.01791 0.007951]
+logσ2_logP = CA.ComponentVector(r0=-8.997, K2=-5.893)
+mean_σ_o_MC = 0.006042
+
+# correlation matrices
+ρsP = zeros(sum(1:(n_θP-1)))
+ρsM = zeros(sum(1:(n_θM-1)))
+
+ϕunc = CA.ComponentVector(;
+    logσ2_logP=logσ2_logP,
+    coef_logσ2_logMs=coef_logσ2_logMs,
+    ρsP,
+    ρsM)
+int_unc = ComponentArrayInterpreter(ϕunc)
+
+# for a conservative uncertainty assume logσ2 = -10 (σ2 = exp(-10) ≈ 4.5e-5) and no relationship with magnitude
+ϕunc0 = CA.ComponentVector(;
+    logσ2_logP=fill(-10.0, n_θP),
+    coef_logσ2_logMs=reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)),
+    ρsP,
+    ρsM)
+
+logσ2y = fill(2 * log(σ_o), size(y_o, 1))
+n_MC = 3
+
+
+#-------------- ADVI with g inside cost function
+using CUDA
+using TransformVariables
+
+transPMs_batch = as(
+    (P=as(Array, asℝ₊, n_θP),
+    Ms=as(Array, asℝ₊, n_θM, n_batch)))
+transPMs_all = as(
+    (P=as(Array, asℝ₊, n_θP),
+    Ms=as(Array, asℝ₊, n_θM, n_site)))
+
+ϕ_true = θ = CA.ComponentVector(;
+    μP=θP_true,
+    ϕg=ϕg_opt,
+    unc=ϕunc);
+trans_gu = as(
+    (μP=as(Array, asℝ₊, n_θP),
+    ϕg=as(Array, n_ϕg),
+    unc=as(Array, length(ϕunc))))
+trans_g = as(
+    (μP=as(Array, asℝ₊, n_θP),
+    ϕg=as(Array, n_ϕg)))
+
+const int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP,
+    θMs=CA.ComponentMatrix(
+        zeros(n_θM, n_batch), first(CA.getaxes(θM)), CA.Axis(i=1:n_batch))))
+
+interpreters = interpreters_g = map(get_concrete,(;
+    μP_ϕg_unc=ComponentArrayInterpreter(ϕ_true),
+    PMs=int_PMs_batch,
+    unc=ComponentArrayInterpreter(ϕunc)
+    ))
+
+ϕg_true_vec = CA.ComponentVector(
+    TransformVariables.inverse(trans_gu, cv2NamedTuple(ϕ_true)))
+ϕcg_true = interpreters.μP_ϕg_unc(ϕg_true_vec)
+ϕ_ini = ζ = vcat(ϕcg_true[[:μP, :ϕg]] .* 1.2, ϕcg_true[[:unc]]);
+ϕ_ini0 = ζ = vcat(ϕcg_true[:μP] .* 0.0, SimpleChains.init_params(g), ϕunc0);
+
+neg_elbo_transnorm_gf(rng, g, f, ϕcg_true, y_o[:, 1:n_batch], x_o[:, 1:n_batch],
+    transPMs_batch, map(get_concrete, interpreters);
+    n_MC=8, logσ2y)
+Zygote.gradient(ϕ -> neg_elbo_transnorm_gf(
+        rng, g, f, ϕ, y_o[:, 1:n_batch], x_o[:, 1:n_batch],
+        transPMs_batch, interpreters; n_MC=8, logσ2y), ϕcg_true)
+
+() -> begin
+    train_loader = MLUtils.DataLoader((x_o, y_o), batchsize = n_batch)
+
+    optf = Optimization.OptimizationFunction((ζg, data) -> begin
+            x_o, y_o = data
+            neg_elbo_transnorm_gf(
+                rng, g, f, ζg, y_o, x_o, transPMs_batch, map(get_concrete, interpreters_g); n_MC=5, logσ2y)
+        end,
+        Optimization.AutoZygote())
+    optprob = Optimization.OptimizationProblem(optf, CA.getdata(ϕ_ini), train_loader);
+    res = Optimization.solve(optprob, Optimisers.Adam(0.02), callback=callback_loss(50), maxiters=800);
+    #optprob = Optimization.OptimizationProblem(optf, ϕ_ini0);
+    #res = Optimization.solve(optprob, Adam(0.02), callback=callback_loss(50), maxiters=1_400);
+end
+
+#using Lux
+ϕ = ϕcg_true |> gpu;
+x_o_gpu = x_o |> gpu;
+# y_o = y_o |> gpu
+# logσ2y = logσ2y |> gpu
+n_covar = size(x_o, 1)
+g_flux = Flux.Chain(
+    # dense layer with bias that maps to n_covar * 4 outputs and applies `tanh` activation
+    Flux.Dense(n_covar => n_covar * 4, tanh),
+    Flux.Dense(n_covar * 4 => n_covar * 4, logistic),
+    # dense layer without bias that maps to n_θM outputs with `identity` activation
+    Flux.Dense(n_covar * 4 => n_θM, identity, bias=false),
+)
+() -> begin
+    using Lux
+    g_lux = Lux.Chain(
+        # dense layer with bias that maps to n_covar * 4 outputs and applies `tanh` activation
+        Lux.Dense(n_covar => n_covar * 4, tanh),
+        Lux.Dense(n_covar * 4 => n_covar * 4, logistic),
+        # dense layer without bias that maps to n_θM outputs with `identity` activation
+        Lux.Dense(n_covar * 4 => n_θM, identity, use_bias=false),
+    )
+    ps, st = Lux.setup(Random.default_rng(), g_lux)
+    ps_ca = CA.ComponentArray(ps) |> gpu
+    st = st |> gpu
+    g_luxs = StatefulLuxLayer{true}(g_lux, nothing, st)
+    g_luxs(x_o_gpu[:, 1:n_batch], ps_ca)
+    ax_g = CA.getaxes(ps_ca)
+    g_luxs(x_o_gpu[:, 1:n_batch], CA.ComponentArray(ϕ.ϕg, ax_g))
+    interpreters = (interpreters..., ϕg = ComponentArrayInterpreter(ps_ca))
+    ϕg = CA.ComponentArray(ϕ.ϕg, ax_g)
+    ϕgc = interpreters.ϕg(ϕ.ϕg)
+    g_gpu = g_luxs
+end
+g_gpu = g_flux
+
+#Zygote.gradient(ϕg -> sum(g_gpu(x_o_gpu[:, 1:n_batch],ϕg)), ϕgc)
+# Zygote.gradient(ϕg -> sum(compute_g(g_gpu, x_o_gpu[:, 1:n_batch], ϕg, interpreters)), ϕ.ϕg)
+# Zygote.gradient(ϕ -> sum(tmp_gen1(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), ϕ.ϕg)
+# Zygote.gradient(ϕ -> sum(tmp_gen2(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), CA.getdata(ϕ))
+# Zygote.gradient(ϕ -> sum(tmp_gen2(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), ϕ) |> cpu
+# Zygote.gradient(ϕ -> sum(tmp_gen3(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), ϕ) |> cpu
+# Zygote.gradient(ϕ -> sum(tmp_gen4(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)[1]), ϕ) |> cpu
+# generate_ζ(rng, g_gpu, f, ϕ, x_o_gpu[:, 1:n_batch], interpreters)
+# Zygote.gradient(ϕ -> sum(generate_ζ(rng, g_gpu, f, ϕ, x_o_gpu[:, 1:n_batch], interpreters)[1]), ϕ) |> cpu
+# include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss
+# neg_elbo_transnorm_gf(rng, g_gpu, f, ϕ, y_o[:, 1:n_batch],
+#     x_o_gpu[:, 1:n_batch], transPMs_batch, interpreters; logσ2y)
+# Zygote.gradient(ϕ -> sum(neg_elbo_transnorm_gf(rng, g_gpu, f, ϕ, y_o[:, 1:n_batch],
+#     x_o_gpu[:, 1:n_batch], transPMs_batch, interpreters; logσ2y)[1]), ϕ) |> cpu
+
+
+fcost(ϕ) = neg_elbo_transnorm_gf(rng, g_gpu, f, ϕ, y_o[:, 1:n_batch],
+    x_o_gpu[:, 1:n_batch], transPMs_batch, map(get_concrete, interpreters);
+    n_MC=8, logσ2y = logσ2y)
+fcost(ϕ)
+gr = Zygote.gradient(fcost, ϕ) |> cpu;
+Zygote.gradient(fcost, CA.getdata(ϕ))
+
+
+train_loader = MLUtils.DataLoader((x_o_gpu, y_o), batchsize = n_batch)
+
+optf = Optimization.OptimizationFunction((ζg, data) -> begin
+        x_o, y_o = data
+        neg_elbo_transnorm_gf(
+            rng, g_gpu, f, ζg, y_o, x_o, transPMs_batch, map(get_concrete, interpreters_g); n_MC=5, logσ2y)
+    end,
     Optimization.AutoZygote())
-optprob = OptimizationProblem(optf, p0, train_loader)
+optprob = Optimization.OptimizationProblem(optf, CA.getdata(ϕ_ini) |> gpu, train_loader);
+res = res_gpu = Optimization.solve(optprob, Optimisers.Adam(0.02), callback=callback_loss(50), maxiters=800);
+
+ζ_VIc = interpreters_g.μP_ϕg_unc(res.u |> cpu)
+ζMs_VI = g(x_o, ζ_VIc.ϕg)
+ϕunc_VI = int_unc(ζ_VIc.unc)
+
+hcat(θP_true, exp.(ζ_VIc.μP))
+plt = scatterplot(vec(θMs_true), vec(exp.(ζMs_VI)))
+#lineplot!(plt, 0.0, 1.1, identity)
+#
+hcat(ϕunc, ϕunc_VI) # need to compare to MC sample
+# hard to estimate for the original very small thetas, but otherwise good
+
+# test predicting the correct observation uncertainty of the predictive posterior
+n_sample_pred = 200
+intm_PMs_gen = ComponentArrayInterpreter(CA.ComponentVector(; θP,
+    θMs=CA.ComponentMatrix(
+        zeros(n_θM, n_site), first(CA.getaxes(θM)), CA.Axis(i=1:n_sample_pred))))
+
+include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss
+ζs, _ = generate_ζ(rng, g, f, res.u |> cpu, x_o,
+    (;interpreters..., PMs = intm_PMs_gen); n_MC=n_sample_pred)
+# ζ = ζs[:,1]
+θsc = stack(ζ -> CA.getdata(CA.ComponentVector(
+    TransformVariables.transform(transPMs_all, ζ))), eachcol(ζs));
+y_pred = stack(map(ζ -> first(predict_y(ζ, f, transPMs_all)), eachcol(ζs)));
+
+size(y_pred)
+σ_o_post = mapslices(std, y_pred; dims=3);
+#describe(σ_o_post)
+vcat(σ_o, mean_σ_o_MC, mean(σ_o_post), sqrt(mean(abs2, σ_o_post)))
+mean_y_pred = map(mean, eachslice(y_pred; dims=(1, 2)))
+#describe(mean_y_pred - y_o)
+histogram(vec(mean_y_pred - y_true)) # predictions centered around y_o (or y_true)
+
+# look at θP, θM1 of the first site
+intm = ComponentArrayInterpreter(int_θdoubleMM(1:length(int_θdoubleMM)), (n_sample_pred,))
+ζs1c = intm(ζs[1:length(int_θdoubleMM), :])
+vcat(θP_true, θM_true)
+histogram(exp.(ζs1c[:r0, :]))
+histogram(exp.(ζs1c[:K2, :]))
+histogram(exp.(ζs1c[:r1, :]))
+histogram(exp.(ζs1c[:K1, :]))
+# all parameters estimated too high (true values not within the CI bounds)
+scatterplot(ζs1c[:r1, :], ζs1c[:K1, :]) # r1 and K1 strongly correlated (from θM)
+scatterplot(ζs1c[:r0, :], ζs1c[:K2, :]) # r0 and K2 also correlated (from θP)
+scatterplot(ζs1c[:r0, :], ζs1c[:K1, :]) # no correlation (modeled independent)
+
+# TODO compare distributions to MC sample
+
+
+
+
-res = Optimization.solve(
-    optprob, Adam(0.02), callback = callback_loss(100), maxiters = 1000);
-l1, y_pred_global, y_pred, θMs = loss_gf(res.u, train_loader.data...)
-scatterplot(vec(θMs_true), vec(θMs))
-scatterplot(log.(vec(θMs_true)), log.(vec(θMs)))
-scatterplot(vec(y_pred), vec(y_o))
-hcat(par_templates.θP, int_ϕθP(res.u).θP)
diff --git a/src/HybridVariationalInference.jl b/src/HybridVariationalInference.jl
index d66b4d0..1d7b2bb 100644
--- a/src/HybridVariationalInference.jl
+++ b/src/HybridVariationalInference.jl
@@ -4,6 +4,10 @@ using ComponentArrays: ComponentArrays as CA
 using Random
 using StatsBase # fit ZScoreTransform
 using Combinatorics # gen_hybridcase_synthetic/combinations
+using GPUArraysCore
+using LinearAlgebra
+using CUDA
+using ChainRulesCore
 
 export ComponentArrayInterpreter, flatten1, get_concrete
 include("ComponentArrayInterpreter.jl")
@@ -25,6 +29,9 @@ include("gencovar.jl")
 
 export callback_loss
 include("util_opt.jl")
 
+#export - all internal
+include("cholesky.jl")
+
 export DoubleMM
 include("DoubleMM/DoubleMM.jl")
 
diff --git a/src/cholesky.jl b/src/cholesky.jl
new file mode 100644
index 0000000..5432d3a
--- /dev/null
+++ b/src/cholesky.jl
@@ -0,0 +1,259 @@
+sumn(n::T) where T = n*(n+one(T)) ÷ T(2)
+
+"""
+Inverse of `s = sumn(n)` for positive integer `n`.
+
+Gives an `InexactError` if the given `s` is not such a sum.
+"""
+invsumn(s::T) where T = T(-0.5 + sqrt(1 / 4 + 2 * s)) # inversion of n*(n+1)/2
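+
+# editor's sketch (not part of the original PR): a quick sanity check of the pair,
+# wrapped in a thunk like the other interactive blocks in this repo, so it is not executed
+() -> begin
+    sumn(3) == 6       # 1 + 2 + 3
+    invsumn(6) == 3    # exact inverse of sumn
+    # invsumn(5)       # throws InexactError: 5 is not a triangular number
+end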
+
+"""
+Convert a vector `v` of columnwise entries of an upper triangular matrix to an `UpperTriangular`.
+"""
+function vec2utri(v::AbstractVector{T}; n=invsumn(length(v))) where {T}
+    # works with Zygote despite doing mutation of k (see test_cholesky_structure.jl)
+    #https://groups.google.com/g/julia-users/c/UARlZBCNlng/m/6tKKxIeoEY8J
+    z = zero(T)
+    k = 0
+    m = [j >= i ? (k += 1; v[k]) : z for i = 1:n, j = 1:n]
+    UpperTriangular(m)
+end
+
+function vec2utri(v::GPUArraysCore.AbstractGPUVector{T}; n=invsumn(length(v))) where {T}
+    z = zero(T)
+    k = 0
+    m = CUDA.allowscalar() do
+        CuArray([j >= i ? (k += 1; v[k]) : z for i = 1:n, j = 1:n])
+    end
+    # m = CUDA.zeros(T,n,n)
+    # @cuda threads = 256 vec2utri_gpu!(m, v) # planned to put v into positions of m
+    return (m)
+end
+
+function vec2utri_gpu!(m, v::AbstractVector)
+    index = threadIdx().x # this example only requires linear indexing, so just use `x`
+    stride = blockDim().x
+    for i = index:stride:length(v)
+        row, col = vec2utri_pos(i)
+        @inbounds m[row, col] = v[i]
+    end
+    return nothing # important
+end
+
+"""
+Compute the (one-based) position `(row, col)` within an upper triangular matrix for a
+given (one-based) position `s` within the packed vector representation.
+"""
+function vec2utri_pos(T, s)
+    col = ceil(T, -0.5 + sqrt(1 / 4 + 2 * s)) # inversion of n*(n+1)/2
+    cl = col - one(T)
+    s1 = cl*(cl+one(T)) ÷ T(2) # number of entries before this column
+    row = s - s1
+    (row, col)
+end
+vec2utri_pos(s) = vec2utri_pos(Int, s)
+
+"""
+Compute the index in the packed vector of the entries of an upper triangular matrix.
+"""
+function utri2vec_pos(row, col)
+    @assert row <= col
+    utri2vec_pos_unsafe(row, col)
+end
+function utri2vec_pos_unsafe(row::T, col::T) where T
+    # variant that does not check for an unreasonable position away from the upper triangle
+    sc1 = T((col-one(T))*(col) ÷ T(2))
+    sc1 + row
+end
+
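+# editor's sketch (not part of the original PR): the two position helpers invert each
+# other; in the columnwise packing (1,1)->1, (1,2)->2, (2,2)->3, (1,3)->4, ...
+() -> begin
+    vec2utri_pos(4) == (1, 3)
+    utri2vec_pos(1, 3) == 4
+end
+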
+
+function vec2uutri(v::AbstractArray{T}; kwargs...) where {T}
+    m = _vec2uutri(v; kwargs...)
+    UnitUpperTriangular(m)
+    #TriangularRFP(m, :U) # no adjoint
+    #SymmetricPacked(m) # no adjoint
+    #SymmetricPacked(v) # also no adjoint - does not yet care for main diagonal
+end
+
+# """
+# Convert a vector `v` of columnwise entries of an upper triangular matrix to a `UnitUpperTriangular`.
+
+# Avoid using this repeatedly on GPU arrays, because it only works on CPU (scalar indexing).
+# There is a fallback that pulls `v` to the CPU, applies, and pushes back to GPU.
+# """
+function _vec2uutri(v::AbstractVector{T}; n=invsumn(length(v)) + one(T), diag=one(T)) where {T}
+    z = zero(T)
+    k = 0
+    m = [j > i ? (k += 1; v[k]) : i == j ? diag : z for i = 1:n, j = 1:n]
+    return (m)
+end
+
+
+#function vec2uutri(v::GPUArraysCore.AbstractGPUVector{T}; n=invsumn(length(v)) + one(T)) where {T}
+#TODO remove internal conversion to CuArray to generalize to AbstractGPUVector
+function vec2uutri(v::CuVector{T}; n=invsumn(length(v)) + 1, diag=one(T)) where {T}
+    # _one = one(T)
+    # z = zero(T)
+    # k = 0
+    # m = CUDA.allowscalar() do
+    #     CuArray([j > i ? (k += 1; v[k]) : i == j ? _one : z for i = 1:n, j = 1:n])
+    # end
+    m = CUDA.zeros(T,n,n)
+    m[1:size(m, 1)+1:end] .= diag # indexing trick for diag(m) .= one(T)
+    @cuda threads = 256 vec2uutri_gpu!(m, v)
+    return (m)
+end
+
+function vec2uutri_gpu!(m::Union{CuDeviceArray, GPUArraysCore.AbstractGPUMatrix}, v::AbstractVector)
+    index = threadIdx().x # this example only requires linear indexing, so just use `x`
+    stride = blockDim().x
+    for i = index:stride:length(v)
+        row, col = vec2utri_pos(i)
+        # for the unit-upper triangle shift the column position by one
+        @inbounds m[row, col+1] = v[i]
+    end
+    return nothing # important
+end
+
+function ChainRulesCore.rrule(::typeof(vec2uutri), v::AbstractArray; kwargs...)
+    # does not change the value or the derivative but just reshapes
+    # -> extract the components of matrix Δy into a vector
+    function vec2uutri_pullback(Δy)
+        (NoTangent(), uutri2vec(Δy))
+    end
+    return vec2uutri(v; kwargs...), vec2uutri_pullback
+end
+
+"""
+Extract the entries of an `UpperTriangular` matrix into a columnwise vector.
+"""
+function utri2vec(X::AbstractMatrix{T}) where {T}
+    n = size(X, 1)
+    lv = sumn(n)
+    i = 0
+    j = 1
+    [
+        begin
+            if i == j
+                i = 0
+                j += 1
+            end
+            i += 1
+            X[i, j]
+        end for _ in 1:lv
+    ]
+end
+
+
+
+"""
+Extract the above-diagonal entries of a `UnitUpperTriangular` matrix into a columnwise vector.
+"""
+function uutri2vec(X::AbstractMatrix{T}) where {T}
+    n = size(X, 1) - 1
+    lv = sumn(n)
+    i = 0
+    j = 2
+    [
+        begin
+            if i == j - 1
+                i = 0
+                j += 1
+            end
+            i += 1
+            X[i, j]
+        end for _ in 1:lv
+    ]
+end
+
+function ChainRulesCore.rrule(::typeof(uutri2vec), X::AbstractMatrix{T}) where T
+    # does not change the value or the derivative but just reshapes
+    # -> put the components of vector Δy into a matrix
+    # make sure that the gradient of the main diagonal is zero rather than one
+    function uutri2vec_pullback(Δy)
+        (NoTangent(), vec2uutri(Δy; diag=zero(T)))
+    end
+    return uutri2vec(X), uutri2vec_pullback
+end
+
+#function uutri2vec(X::GPUArraysCore.AbstractGPUMatrix; kwargs...)
+#TODO remove internal coercion to CuArray to extend to other AbstractGPUMatrix
+# function uutri2vec(X::CuArray; kwargs...)
+#     n = size(X, 1) - 1
+#     lv = sumn(n)
+#     i = 0
+#     j = 2
+#     CUDA.allowscalar() do
+#         CuArray([
+#             begin
+#                 if i == j - 1
+#                     i = 0
+#                     j += 1
+#                 end
+#                 i += 1
+#                 X[i, j]
+#             end for _ in 1:lv
+#         ])
+#     end
+# end
+function uutri2vec(X::CuMatrix{T}; kwargs...) where T
+    lv = sumn(size(X,1)-1)
+    v = CUDA.zeros(T,lv)
+    @cuda threads=(16, 16) uutri2vec_gpu!(v, X)
+    return v
+end
+
+function uutri2vec_gpu!(v::Union{CuVector,CuDeviceVector}, X::AbstractMatrix)
+    x = threadIdx().x
+    y = threadIdx().y
+    stride_x = blockDim().x
+    stride_y = blockDim().y
+    for row = x:stride_x:size(X,1)
+        for col = y:stride_y:size(X,2)
+            if row < col
+                i = utri2vec_pos_unsafe(row, col-1)
+                @inbounds v[i] = X[row, col]
+            end
+        end
+    end
+    return nothing # important
+end
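+
+# editor's sketch (not part of the original PR): vec2uutri and uutri2vec are inverse
+# reshapes, and their rrules shuttle cotangents between the two layouts without scaling
+() -> begin
+    v = collect(1.0:6.0)
+    U = vec2uutri(v)       # 4x4 UnitUpperTriangular
+    uutri2vec(U) == v      # round trip recovers the packed vector
+end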
+
+"""
+Takes a vector of entries of a `UnitUpperTriangular` matrix
+and transforms it to an `UpperTriangular` `U` that satisfies
+`diag(U' * U) = 1`.
+
+This can be used to fit parameters that yield an upper Cholesky factor
+of a covariance matrix.
+
+It uses the upper triangular matrix rather than the lower one, because it
+involves a sum across columns, whereas the lower-triangular alternative
+sums across rows.
+Summing across columns is often faster, because the entries of a column are contiguous.
+"""
+function transformU_cholesky1(v::AbstractVector;
+    n=invsumn(length(v)) + 1
+    )
+    U_scaled = vec2uutri(v; n)
+    #Sc_inv = sqrt.(sum(abs2, U_scaled, dims=1))
+    #U_scaled * Diagonal(1 ./ vec(Sc_inv))
+    #U = U_scaled ./ Sc_inv
+    U = U_scaled ./ sqrt.(sum(abs2, U_scaled, dims=1))
+    return (UpperTriangular(U))
+end
+function transformU_cholesky1(v::GPUArraysCore.AbstractGPUVector; n=invsumn(length(v)) + 1)
+    U_scaled = vec2uutri(v; n)
+    U = U_scaled ./ sqrt.(sum(abs2, U_scaled, dims=1))
+    # do not convert to UpperTriangular on the GPU, but return the full matrix
+    #return (UpperTriangular(U))
+    return U
+end
+
+() -> begin
+    tmp = sqrt.(sum(abs2, U_scaled, dims=1))
+    tmp2 = sum(abs2, U_scaled, dims=1) .^ (-1 / 2)
+    U_scaled * tmp'
+end
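+
+# editor's sketch (not part of the original PR): the returned factor has unit-norm
+# columns, so U'U is a valid correlation matrix, cf. test_cholesky_structure.jl
+() -> begin
+    U = transformU_cholesky1(collect(1.0:6.0))
+    all(diag(U' * U) .≈ 1)
+end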
diff --git a/src/gencovar.jl b/src/gencovar.jl
index 46c613e..69206c8 100644
--- a/src/gencovar.jl
+++ b/src/gencovar.jl
@@ -4,7 +4,7 @@ are a linear combination of the uncorrelated underlying principal factors
 and their binary combinations.
 
 In addition provide a SimpleChains model of adequate complexity to
-fit this realationship θMs_true = f(x_o)
+fit this relationship θMs_true = f(x_o)
 """
 function gen_cov_pred(rng::AbstractRNG, T::DataType,
     n_covar_pc, n_covar, n_site, n_θM::Integer;
diff --git a/test/Project.toml b/test/Project.toml
index f3437a2..3006d5b 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -4,6 +4,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 OptimizationOptimisers = "42dfb2eb-d2b4-4451-abcd-913932933ac1"
diff --git a/test/runtests.jl b/test/runtests.jl
index ee4dbc1..49f0f37 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,6 +11,8 @@ const GROUP = get(ENV, "GROUP", "All") # defined in in CI.yml
     @time @safetestset "test_SimpleChains" include("test_SimpleChains.jl")
     #@safetestset "test" include("test/test_doubleMM.jl")
     @time @safetestset "test_doubleMM" include("test_doubleMM.jl")
+    #@safetestset "test" include("test/test_cholesky_structure.jl")
+    @time @safetestset "test_cholesky_structure" include("test_cholesky_structure.jl")
     #
     #@safetestset "test" include("test/test_Flux.jl")
     @time @safetestset "test_Flux" include("test_Flux.jl")
@@ -22,7 +24,9 @@ end
 @time begin
     if GROUP == "All" || GROUP == "Aqua"
         #@safetestset "test" include("test/test_aqua.jl")
-        @time @safetestset "test_aqua" include("test_aqua.jl")
+        if VERSION >= VersionNumber("1.11.2")
+            @time @safetestset "test_aqua" include("test_aqua.jl")
+        end
     end
 end
 
diff --git a/test/test_cholesky_structure.jl b/test/test_cholesky_structure.jl
new file mode 100644
index 0000000..ad188f4
--- /dev/null
+++ b/test/test_cholesky_structure.jl
@@ -0,0 +1,254 @@
+using LinearAlgebra, Test
+using HybridVariationalInference
+using HybridVariationalInference: HybridVariationalInference as CP
+using Zygote
+using OptimizationOptimisers
+using ComponentArrays: ComponentArrays as CA
+#using SymmetricFormats
+using GPUArraysCore: GPUArraysCore
+#using Flux
+using CUDA
+
+A = [1.0 2.0 3.0
+    2.0 1.0 4.0
+    3.0 4.0 1.0] + I(3) .* 10
+LA = cholesky(A).L
+
+B = rand(3, 3) ./ 10
+
+C = [1.0 2.0 3.2
+    2.0 1.0 4.2
+    3.2 4.2 1.0] + I(3) .* 10
+LC = cholesky(C).L
+
+Z = zeros(3, 3)
+
+@testset "cholesky of blockdiagonal" begin
+    # the Cholesky factor of a BlockDiagonal equals the BlockDiagonal
+    # of the Cholesky factors of its blocks
+    X = [A Z; Z' C]
+    LX = cholesky(X).L
+    @test LX == [LA Z; Z LC]
+end;
+
+@testset "cholesky of edge structure" begin
+    # the structure of the Cholesky factorization of a BlockDiagonal with edges does not carry over
+    X = [A B B; B' C Z; B' Z C]
+    LX = cholesky(X).L
+    LX[4:6, 4:6] ≈ LX[7:9, 7:9]
+    X = [A B B B;
+        B' C Z Z;
+        B' Z C Z;
+        B' Z Z C]
+    LX = cholesky(X).L
+    LX[4:6, 4:6] ≈ LX[7:9, 7:9]
+    # but non-zero off-diagonals at the Z positions and different edge entries
+    @test_broken LX[7:9, 4:6] ≈ Z
+    @test_broken LX[9:12, 4:6] ≈ LX[9:12, 7:9]
+end;
+
+@testset "invsumn" begin
+    ns_orig = [1, 2, 3, 6]
+    s = map(n -> sum(1:n), ns_orig)
+    ns = CP.invsumn.(s)
+    @test ns == ns_orig
+    @test eltype(ns) == Int
+    #
+    @test_throws Exception CP.invsumn(5) # 5 is not the result of sum(1:n)
+end;
+
+@testset "vec2utri" begin
+    v_orig = 1.0:6.0
+    Uv = CP.vec2utri(v_orig)
+    @test Uv isa UpperTriangular
+    Zygote.gradient(v -> sum(CP.vec2utri(v)), v_orig)[1] # works nicely
+    #
+    v2 = CP.utri2vec(Uv)
+    @test v2 == v_orig
+    Zygote.gradient(Uv -> sum(CP.utri2vec(Uv)), Uv)[1] # works nicely
+end;
+
+@testset "vec2uutri" begin
+    v = v_orig = 1.0:6.0
+    vcpu = collect(v_orig)
+    n = CP.invsumn(length(v)) + 1
+    T = eltype(v)
+    U1v = CP.vec2uutri(v_orig)
+    @test U1v isa UnitUpperTriangular
+    @test size(U1v, 1) == 4
+    gr = Zygote.gradient(v -> sum(abs2.(CP.vec2uutri(v))), vcpu)[1] # works nicely
+    # test providing the keyword argument
+    gr = Zygote.gradient(v -> sum(abs2.(CP.vec2uutri(v; n=4))), vcpu)[1] # works nicely
+    #
+    v2 = CP.uutri2vec(U1v)
+    @test v2 == v_orig
+    gr = Zygote.gradient(U1v -> sum(CP.uutri2vec(U1v) .* (1.0:6.0)), U1v)[1] # works nicely
+end;
+
+@testset "utri2vec_pos" begin
+    @test CP.utri2vec_pos(1,1) == 1
+    @test CP.utri2vec_pos(1,2) == 2
+    @test CP.utri2vec_pos(2,2) == 3
+    @test CP.utri2vec_pos(1,3) == 4
+    @test CP.utri2vec_pos(1,4) == 7
+    @test CP.utri2vec_pos(5,5) == 15
+    @test typeof(CP.utri2vec_pos(5,5)) == Int
+    @test typeof(CP.utri2vec_pos(Int32(5),Int32(5))) == Int32
+    @test_throws AssertionError CP.utri2vec_pos(2,1)
+end
+
+@testset "vec2uutri gpu" begin
+    if CUDA.functional() # only run the test if CUDA is working (not on GitHub CI)
+        v_orig = 1.0f0:6.0f0
+        v = CuArray(collect(v_orig))
+        U1v = CP.vec2uutri(v)
+        @test !(U1v isa UnitUpperTriangular) # on CUDA work with a plain matrix
+        @test U1v isa GPUArraysCore.AbstractGPUMatrix
+        @test size(U1v, 1) == 4
+        @test Array(U1v) == CP.vec2uutri(v_orig)
+        gr = Zygote.gradient(v -> sum(abs2.(CP.vec2uutri(v))), v)[1] # works nicely
+        @test gr isa GPUArraysCore.AbstractGPUArray
+        @test Array(gr) == (1:6) .* 2.0
+        #
+        v2 = CP.uutri2vec(U1v)
+        @test v2 isa GPUArraysCore.AbstractGPUVector
+        @test eltype(v2) == eltype(U1v)
+        @test Array(v2) == v_orig
+        gr = Zygote.gradient(U1v -> sum(CP.uutri2vec(U1v .* 2)), U1v)[1] # works nicely
+        @test gr isa GPUArraysCore.AbstractGPUArray
+        @test all(diag(gr) .== 0)
+        @test Array(CP.uutri2vec(gr)) == fill(2.0f0, length(v_orig))
+    end
+end;
+
+@testset "transformU_cholesky1 gpu" begin
+    v_orig = 1.0f0:6.0f0
+    vcpu = collect(v_orig)
+    ch = CP.transformU_cholesky1(vcpu)
+    gr1 = Zygote.gradient(v -> sum(CP.transformU_cholesky1(v)), vcpu)[1] # works nicely
+    @test all(diag(ch' * ch) .≈ 1)
+    if CUDA.functional() # only run the test if CUDA is working (not on GitHub CI)
+        v = CuArray(collect(v_orig))
+        U1v = CP.transformU_cholesky1(v)
+        @test !(U1v isa UnitUpperTriangular) # on CUDA work with a plain matrix
+        @test U1v isa GPUArraysCore.AbstractGPUMatrix
+        @test Array(U1v) ≈ ch
+        gr2 = Zygote.gradient(v -> sum(CP.transformU_cholesky1(v)), v)[1] # works nicely
+        @test Array(gr2) == gr1
+    end
+end;
+
+() -> begin
+    # setup for the fitting in the interactive blocks below
+    _X = rand(3, 3)
+    S = _X * _X' # known to be Hermitian
+    stdS = sqrt.(diag(S))
+    C = Diagonal(1 ./ stdS) * S * Diagonal(1 ./ stdS)
+    @test Diagonal(stdS) * C * Diagonal(stdS) ≈ S
+
+    SL = cholesky(S).L
+    SU = cholesky(S).U
+    # CU = cholesky(C).U # fails: it is not recognized as Symmetric
+    # CU = cholesky(UnitLowerTriangular(C)).U
+    CU = cholesky(Symmetric(C)).U
+    CL = cholesky(Symmetric(C)).L
+    @test CU * Diagonal(stdS) ≈ SU
+    @test Diagonal(stdS) * CL ≈ SL
+end
+
+() -> begin
+    # find entries of U so that Cpred = U'*U = C
+    n_x = 20
+    xs = rand(3, n_x)
+    ys = xs' * CU
+    n_U = size(C, 1)
+    fcost = (Uvec) -> begin
+        U = vec2utri(Uvec; n = n_U)
+        y_pred = xs' * U
+        sum(abs2, ys .- y_pred)
+    end
+    Uvec_true = utri2vec(CU)
+    Uvec0 = Uvec_true .* 1.2
+    fcost(Uvec0)
+
+    optf = Optimization.OptimizationFunction((x, p) -> fcost(x), Optimization.AutoZygote())
+    optprob = Optimization.OptimizationProblem(optf, Uvec0)
+    res = Optimization.solve(optprob, OptimizationOptimisers.Adam(0.02),
+        callback = callback_loss(50), maxiters = 600)
+
+    #hcat(Uvec_true, res.u, Uvec0)
+    Upred = vec2utri(res.u)
+    @test Upred ≈ CU
+    Cpred = Upred' * Upred
+    #hcat(C, Cpred)
+    @test Cpred ≈ C
+end
+
+() -> begin
+    # find the Cholesky factor so that U' * U = C, given that diag(C) == 1
+    n_x = 20
+    xs = rand(3, n_x)
+    ys = ysC = xs' * CU
+    # Us1vec = 1.0:6.0
+    n_U = size(S, 1)
+
+    fcost = fcostCT = (Us1vec) -> begin
+        U = transformU_cholesky1(Us1vec; n = n_U)
+        y_pred = xs' * U
+        sum(abs2, ys .- y_pred)
+    end
+    # cannot infer the true U_scaled any more
+    Unscaled0 = CU ./ diag(CU)
+    Us1vec0 = uutri2vec(Unscaled0)
+    fcost(Us1vec0)
+
+    optf = Optimization.OptimizationFunction((x, p) -> fcost(x), Optimization.AutoZygote())
+    optprob = Optimization.OptimizationProblem(optf, Us1vec0)
+    res = resCT = Optimization.solve(optprob, OptimizationOptimisers.Adam(0.02),
+        callback = callback_loss(50), maxiters = 1_600)
+
+    Upred = transformU_cholesky1(res.u; n = n_U)
+    #hcat(CU', Upred)
+    @test Upred ≈ CU
+    fcostCT(resCT.u)
+end
+
+@testset "fit cholesky correlation" begin
+    # find a transformed Cholesky factor that recovers the covariance S given its diagonal
+    _X = rand(3, 3)
+    S = _X * _X' # known to be Hermitian
+    n_x = 200
+    xs = rand(3, n_x)
+    SU = cholesky(S).U
+    σ_o = 0.05
+    ys_true = ysS = xs' * SU
+    ys = ys_true .+ randn(n_x) .* σ_o
+
+    Dσ = Diagonal(sqrt.(diag(S))) # assume given
+    n_U = size(S, 1)
+
+    fcost = fcostS = (Us1vec) -> begin
+        U = CP.transformU_cholesky1(Us1vec; n = n_U)
+        y_pred = (xs' * U) * Dσ
+        sum(abs2, ys .- y_pred)
+    end
+    # cannot infer the true U_scaled any more
+    Unscaled0 = S ./ diag(S)
+    Us1vec0 = CP.uutri2vec(Unscaled0)
+    fcost(Us1vec0)
+    #fcostS(resCT.u) # the cost at u optimized for the correlation should be small if the same xs are used
+
+    optf = Optimization.OptimizationFunction((x, p) -> fcost(x), Optimization.AutoZygote())
+    optprob = Optimization.OptimizationProblem(optf, Us1vec0)
+    res = Optimization.solve(optprob, OptimizationOptimisers.Adam(0.02),
+        #callback=callback_loss(50),
+        maxiters = 800)
+
+    Upred = CP.transformU_cholesky1(res.u; n = n_U)
+    #@test Upred ≈ CU
+    SUpred = Upred * Dσ
+    #hcat(SUpred, SU)
+    @test SUpred≈SU atol=1e-1
+    S_pred = Dσ' * Upred' * Upred * Dσ
+    @test S_pred≈S atol=1e-1
+end
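+
+# editor's sketch (not part of the original PR): ChainRulesTestUtils was added to
+# dev/Project.toml, so the hand-written rrules can also be checked against finite
+# differences; kept as a thunk like the interactive blocks above, since test_rrule
+# on structured-matrix wrappers may need extra tangent hints
+#using ChainRulesTestUtils
+() -> begin
+    v = collect(1.0:6.0)
+    test_rrule(CP.vec2uutri, v)       # pullback extracts the strict upper triangle
+    X = collect(CP.vec2uutri(v))
+    test_rrule(CP.uutri2vec, X)       # pullback re-embeds Δ with a zero diagonal
+end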