diff --git a/dev/doubleMM.jl b/dev/doubleMM.jl
index f0fd939..74b63bf 100644
--- a/dev/doubleMM.jl
+++ b/dev/doubleMM.jl
@@ -55,7 +55,7 @@ scatterplot(θMs_true[2,:], θMs[2,:])
 prob1o.θP
 scatterplot(vec(y_true), vec(y_pred))
-# still overestimating θMs
+# still overestimating θMs and θP
 () -> begin
     # with more iterations?
     prob2 = prob1o
diff --git a/src/AbstractHybridProblem.jl b/src/AbstractHybridProblem.jl
index f4c1cc6..00a9759 100644
--- a/src/AbstractHybridProblem.jl
+++ b/src/AbstractHybridProblem.jl
@@ -1,19 +1,25 @@
 """
 Type to dispatch constructing data and network structures
-for different cases of hybrid problem setups
+for different cases of hybrid problem setups.

 For a specific problem, provide functions that specify the details:
 - `get_hybridproblem_MLapplicator`
+- `get_hybridproblem_transforms`
 - `get_hybridproblem_PBmodel`
 - `get_hybridproblem_neg_logden_obs`
 - `get_hybridproblem_par_templates`
-- `get_hybridproblem_transforms`
+- `get_hybridproblem_ϕunc`
 - `get_hybridproblem_train_dataloader` (default depends on `gen_hybridcase_synthetic`)

 optionally
 - `gen_hybridcase_synthetic`
 - `get_hybridproblem_n_covar` (defaults to the number of rows in `xM` in the train_dataloader)
 - `get_hybridproblem_float_type` (defaults to `eltype(θM)`)
-- `get_hybridproblem_cor_starts` (defaults to include all correlations: `(P=(1,), M=(1,))`)
+- `get_hybridproblem_cor_ends` (defaults to a single block including all correlations: `(P=[length(θP)], M=[length(θM)])`)
+
+The initial values of the parameters to estimate are spread over several functions:
+- `ϕg`: parameters of the MLapplicator, returned by `get_hybridproblem_MLapplicator`
+- `ζP`: means of the PBmodel parameters, returned by `get_hybridproblem_par_templates`
+- `ϕunc`: additional parameters of the approximate posterior, returned by `get_hybridproblem_ϕunc`
 """
 abstract type AbstractHybridProblem end;
@@ -64,6 +70,13 @@ Provide tuple of templates of ComponentVectors `θP` and `θM`.
 """
 function get_hybridproblem_par_templates end

+"""
+    get_hybridproblem_ϕunc(::AbstractHybridProblem; scenario)
+
+Provide a ComponentArray of the initial additional parameters of the approximate posterior.
+"""
+function get_hybridproblem_ϕunc end
+
 """
     get_hybridproblem_transforms(::AbstractHybridProblem; scenario)
@@ -143,7 +156,7 @@ function get_hybridproblem_train_dataloader(prob::AbstractHybridProblem; scenari
 end

 """
-    get_hybridproblem_cor_starts(prob::AbstractHybridProblem; scenario)
+    get_hybridproblem_cor_ends(prob::AbstractHybridProblem; scenario)

 Specify blocks in correlation matrices among parameters. Returns a NamedTuple.
@@ -159,6 +172,7 @@ then the first subrange starts at position 1 and the second subrange starts at p
-If there is only single block of all ML-predicted parameters being correlated with each other
-then this block starts at position 1: `(P=(1,3), M=(1,))`.
+If there is only a single block of all ML-predicted parameters correlated with each other,
+then this block ends at the last position, e.g. `M = [length(θM)]`.
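+
+# Example
+
+A minimal sketch (hypothetical sizes): five global parameters, of which the first
+three form one correlated block and the last two a second block, while all
+ML-predicted parameters share a single block:
+
+```julia
+cor_ends = (P = [3, 5], M = [length(θM)])
+```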
""" -function get_hybridproblem_cor_starts(prob::AbstractHybridProblem; scenario = ()) - (P = (1,), M = (1,)) +function get_hybridproblem_cor_ends(prob::AbstractHybridProblem; scenario = ()) + pt = get_hybridproblem_par_templates(prob; scenario) + (P = [length(pt.θP)], M = [length(pt.θM)]) end diff --git a/src/HybridProblem.jl b/src/HybridProblem.jl index ff03ef6..a946c6c 100644 --- a/src/HybridProblem.jl +++ b/src/HybridProblem.jl @@ -7,7 +7,7 @@ struct HybridProblem <: AbstractHybridProblem py transP transM - cor_starts # = (P=(1,),M=(1,)) + cor_ends # = (P=(1,),M=(1,)) get_train_loader # inner constructor to constrain the types function HybridProblem( @@ -20,8 +20,8 @@ struct HybridProblem <: AbstractHybridProblem #train_loader::DataLoader, # return a function that constructs the trainloader based on n_batch get_train_loader::Function, - cor_starts::NamedTuple = (P = (1,), M = (1,))) - new(θP, θM, f, g, ϕg, py, transM, transP, cor_starts, get_train_loader) + cor_ends::NamedTuple = (P = [length(θP)], M = [length(θM)])) + new(θP, θM, f, g, ϕg, py, transM, transP, cor_ends, get_train_loader) end end @@ -45,8 +45,8 @@ function HybridProblem(prob::AbstractHybridProblem; scenario = ()) get_hybridproblem_train_dataloader(rng::AbstractRNG, prob; scenario, kwargs...) end end - cor_starts = get_hybridproblem_cor_starts(prob; scenario) - HybridProblem(θP, θM, g, ϕg, f, py, transP, transM, get_train_loader, cor_starts) + cor_ends = get_hybridproblem_cor_ends(prob; scenario) + HybridProblem(θP, θM, g, ϕg, f, py, transP, transM, get_train_loader, cor_ends) end function update(prob::HybridProblem; @@ -58,7 +58,7 @@ function update(prob::HybridProblem; transM::Union{Function, Bijectors.Transform} = prob.transM, transP::Union{Function, Bijectors.Transform} = prob.transP, get_train_loader::Function = prob.get_train_loader, - cor_starts::NamedTuple = prob.cor_starts) + cor_ends::NamedTuple = prob.cor_ends) # prob.θP = θP # prob.θM = θM # prob.f = f @@ -67,15 +67,19 @@ function update(prob::HybridProblem; # prob.py = py # prob.transM = transM # prob.transP = transP - # prob.cor_starts = cor_starts + # prob.cor_ends = cor_ends # prob.get_train_loader = get_train_loader - HybridProblem(θP, θM, g, ϕg, f, py, transP, transM, get_train_loader, cor_starts) + HybridProblem(θP, θM, g, ϕg, f, py, transP, transM, get_train_loader, cor_ends) end function get_hybridproblem_par_templates(prob::HybridProblem; scenario::NTuple = ()) (; θP = prob.θP, θM = prob.θM) end +function get_hybridproblem_ϕunc(prob::HybridProblem; scenario::NTuple = ()) + prob.ϕunc +end + function get_hybridproblem_neg_logden_obs(prob::HybridProblem; scenario::NTuple = ()) prob.py end @@ -102,8 +106,8 @@ function get_hybridproblem_train_dataloader(rng::AbstractRNG, prob::HybridProble return prob.get_train_loader(rng; kwargs...) end -function get_hybridproblem_cor_starts(prob::HybridProblem; scenario = ()) - prob.cor_starts +function get_hybridproblem_cor_ends(prob::HybridProblem; scenario = ()) + prob.cor_ends end # function get_hybridproblem_float_type(prob::HybridProblem; scenario::NTuple = ()) diff --git a/src/HybridSolver.jl b/src/HybridSolver.jl index db8a4b7..9e08a3f 100644 --- a/src/HybridSolver.jl +++ b/src/HybridSolver.jl @@ -49,17 +49,19 @@ function CommonSolve.solve(prob::AbstractHybridProblem, solver::HybridPosteriorS scenario, rng = Random.default_rng(), kwargs...) 
     par_templates = get_hybridproblem_par_templates(prob; scenario)
     (; θP, θM) = par_templates
+    cor_ends = get_hybridproblem_cor_ends(prob; scenario)
     g, ϕg0 = get_hybridproblem_MLapplicator(prob; scenario);
     (; transP, transM) = get_hybridproblem_transforms(prob; scenario)
     (; ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs) = init_hybrid_params(
-        θP, θM, ϕg0, solver.n_batch; transP, transM);
+        θP, θM, cor_ends, ϕg0, solver.n_batch; transP, transM);
     use_gpu = (:use_Flux ∈ scenario)
     ϕ0 = use_gpu ? CuArray(ϕ) : ϕ # TODO replace CuArray by something more general
     train_loader = get_hybridproblem_train_dataloader(rng, prob; scenario, solver.n_batch)
     f = get_hybridproblem_PBmodel(prob; scenario)
     py = get_hybridproblem_neg_logden_obs(prob; scenario)
     y_global_o = Float32[] # TODO
-    loss_elbo = get_loss_elbo(g, transPMs_batch, f, py, y_global_o, interpreters; solver.n_MC)
+    loss_elbo = get_loss_elbo(
+        g, transPMs_batch, f, py, y_global_o, interpreters; solver.n_MC, cor_ends)
     # test loss function once
     l0 = loss_elbo(ϕ0, rng, first(train_loader)...)
     optf = Optimization.OptimizationFunction((ϕ, data) -> loss_elbo(ϕ, rng, data...)[1],
@@ -84,12 +86,12 @@ The loss function takes in addition to ϕ, data that changes with minibatch
 - xP: drivers for the process model: Iterator of size n_site
 - y_o, y_unc: matrix of observations and uncertainties, sites in columns
 """
-function get_loss_elbo(g, transPMs, f, py, y_o_global, interpreters; n_MC)
-    let g = g, transPMs = transPMs, f = f, py=py, y_o_global = y_o_global, n_MC = n_MC
-        interpreters = map(get_concrete, interpreters)
+function get_loss_elbo(g, transPMs, f, py, y_o_global, interpreters; n_MC, cor_ends)
+    let g = g, transPMs = transPMs, f = f, py=py, y_o_global = y_o_global, n_MC = n_MC,
+        cor_ends = cor_ends, interpreters = map(get_concrete, interpreters)
         function loss_elbo(ϕ, rng, xM, xP, y_o, y_unc)
             neg_elbo_transnorm_gf(rng, ϕ, g, transPMs, f, py,
-                xM, xP, y_o, y_unc, interpreters; n_MC)
+                xM, xP, y_o, y_unc, interpreters; n_MC, cor_ends)
         end
     end
 end
diff --git a/src/HybridVariationalInference.jl b/src/HybridVariationalInference.jl
index a4999a6..53b1f21 100644
--- a/src/HybridVariationalInference.jl
+++ b/src/HybridVariationalInference.jl
@@ -31,6 +31,7 @@ export AbstractHybridProblem, get_hybridproblem_MLapplicator, get_hybridproblem_
     get_hybridproblem_par_templates, get_hybridproblem_transforms,
     get_hybridproblem_train_dataloader, get_hybridproblem_neg_logden_obs,
     get_hybridproblem_n_covar,
+    get_hybridproblem_cor_ends,
     #update,
     gen_cov_pred
 include("AbstractHybridProblem.jl")
@@ -53,13 +54,13 @@ include("util_ca.jl")
 export neg_logden_indep_normal, entropy_MvNormal
 include("logden_normal.jl")

-export get_ca_starts
+export get_ca_starts, get_ca_ends, get_cor_count
 include("cholesky.jl")

 export neg_elbo_transnorm_gf, predict_gf
 include("elbo.jl")

-export init_hybrid_params
+export init_hybrid_params, init_hybrid_ϕunc
 include("init_hybrid_params.jl")

 export AbstractHybridSolver, HybridPointSolver, HybridPosteriorSolver
diff --git a/src/cholesky.jl b/src/cholesky.jl
index 3e3efb9..27c5ca7 100644
--- a/src/cholesky.jl
+++ b/src/cholesky.jl
@@ -1,12 +1,11 @@
-sumn(n::T) where T = n*(n+one(T)) ÷ T(2)
+sumn(n::T) where {T} = n * (n + one(T)) ÷ T(2)

 """
 Inverse of s = sumn(n) for positive integer `n`.
 Throws an `InexactError` if the given `s` is not such a sum.
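+
+# Example
+
+A minimal sketch of the round trip (hypothetical values):
+
+```julia
+sumn(4)      # 10 == 1 + 2 + 3 + 4
+invsumn(10)  # 4
+invsumn(5)   # throws InexactError: 5 is not a sum 1 + 2 + ... + n
+```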
""" -invsumn(s::T) where T = T(-0.5 + sqrt(1 / 4 + 2 * s)) # inversion of n*(n+1)/2 - +invsumn(s::T) where {T} = T(-0.5 + sqrt(1 / 4 + 2 * s)) # inversion of n*(n+1)/2 """ Convert vector v columnwise entries of upper diagonal matrix to UppterTriangular @@ -16,7 +15,7 @@ function vec2utri(v::AbstractVector{T}; n=invsumn(length(v))) where {T} #https://groups.google.com/g/julia-users/c/UARlZBCNlng/m/6tKKxIeoEY8J z = zero(T) k = 0 - m = [j >= i ? (k += 1; v[k]) : z for i = 1:n, j = 1:n] + m = [j >= i ? (k += 1; v[k]) : z for i in 1:n, j in 1:n] UpperTriangular(m) end @@ -24,17 +23,17 @@ function vec2utri(v::GPUArraysCore.AbstractGPUVector{T}; n=invsumn(length(v))) w z = zero(T) k = 0 m = CUDA.allowscalar() do - CuArray([j >= i ? (k += 1; v[k]) : z for i = 1:n, j = 1:n]) + CuArray([j >= i ? (k += 1; v[k]) : z for i in 1:n, j in 1:n]) end # m = CUDA.zeros(T,n,n) # @cuda threads = 256 vec2utri_gpu!(m, v) # planned to put v into positions of m return (m) end -function vec2utri_gpu!(m, v::AbstractVector) +function vec2utri_gpu!(m, v::AbstractVector) index = threadIdx().x # this example only requires linear indexing, so just use `x` stride = blockDim().x - for i = index:stride:length(v) + for i in index:stride:length(v) row, col = vec2utri_pos(i) @inbounds m[row, col] = v[i] end @@ -45,14 +44,14 @@ end Compute the (one-based) position `(row, col)` within an upper tridiagonal matrix for given (one-based) position, `s` within a packed vector representation. """ -function vec2utri_pos(T, s) +function vec2utri_pos(T, s) col = ceil(T, -0.5 + sqrt(1 / 4 + 2 * s)) # inversion of n*(n+1)/2 cl = col - one(T) - s1 = cl*(cl+one(T)) ÷ T(2) # first number in column - row = s - s1 + s1 = cl * (cl + one(T)) ÷ T(2) # first number in column + row = s - s1 (row, col) end -vec2utri_pos(s) = vec2utri_pos(Int, s) +vec2utri_pos(s) = vec2utri_pos(Int, s) """ Compute the index in the vector of entries in an upper tridiagonal matrix @@ -61,13 +60,12 @@ function utri2vec_pos(row, col) @assert row <= col utri2vec_pos_unsafe(row, col) end -function utri2vec_pos_unsafe(row::T, col::T) where T +function utri2vec_pos_unsafe(row::T, col::T) where {T} # variant that does not check for unreasonable position away from upper tridiagonal - sc1 = T((col-one(T))*(col) ÷ T(2)) + sc1 = T((col - one(T)) * (col) ÷ T(2)) sc1 + row end - function vec2uutri(v::AbstractArray{T}; kwargs...) where {T} m = _vec2uutri(v; kwargs...) UnitUpperTriangular(m) @@ -82,14 +80,14 @@ end # Avoid using this repeatedly on GPU arrays, because it only works on CPU (scalar indexing). # There is a fallback that pulls `v` to the CPU, applies, and pushes back to GPU. # """ -function _vec2uutri(v::AbstractVector{T}; n=invsumn(length(v)) + one(T), diag=one(T)) where {T} +function _vec2uutri( + v::AbstractVector{T}; n=invsumn(length(v)) + one(T), diag=one(T)) where {T} z = zero(T) k = 0 - m = [j > i ? (k += 1; v[k]) : i == j ? diag : z for i = 1:n, j = 1:n] + m = [j > i ? (k += 1; v[k]) : i == j ? diag : z for i in 1:n, j in 1:n] return (m) end - #function vec2uutri(v::GPUArraysCore.AbstractGPUVector{T}; n=invsumn(length(v)) + one(T)) where {T} #TODO remove internal conversion to CuArray to generalize to AbstractGPUVector function vec2uutri(v::CuVector{T}; n=invsumn(length(v)) + 1, diag=one(T)) where {T} @@ -99,16 +97,17 @@ function vec2uutri(v::CuVector{T}; n=invsumn(length(v)) + 1, diag=one(T)) where # m = CUDA.allowscalar() do # CuArray([j > i ? (k += 1; v[k]) : i == j ? 
     #                _one : z for i = 1:n, j = 1:n])
     # end
-    m = CUDA.zeros(T,n,n)
-    m[1:size(m, 1)+1:end] .= diag # indexing trick for diag(m) .= one(T)
+    m = CUDA.zeros(T, n, n)
+    m[1:(size(m, 1)+1):end] .= diag # indexing trick for diag(m) .= one(T)
     @cuda threads = 256 vec2uutri_gpu!(m, v)
     return (m)
 end

-function vec2uutri_gpu!(m::Union{CuDeviceArray, GPUArraysCore.AbstractGPUMatrix}, v::AbstractVector)
+function vec2uutri_gpu!(
+    m::Union{CuDeviceArray,GPUArraysCore.AbstractGPUMatrix}, v::AbstractVector)
     index = threadIdx().x # this example only requires linear indexing, so just use `x`
     stride = blockDim().x
-    for i = index:stride:length(v)
+    for i in index:stride:length(v)
         row, col = vec2utri_pos(i)
         # for unit-upper triangle shift the column position by one
         @inbounds m[row, col+1] = v[i]
@@ -119,7 +118,7 @@ end
 function ChainRulesCore.rrule(::typeof(vec2uutri), v::AbstractArray; kwargs...)
     # does not change the value or the derivative but just reshapes
     # -> extract the components of matrix Δy into vector
-    function vec2uutri_pullback(Δy)
+    function vec2uutri_pullback(Δy)
         (NoTangent(), uutri2vec(Δy))
     end
     return vec2uutri(v; kwargs...), vec2uutri_pullback
@@ -141,12 +140,11 @@ function utri2vec(X::AbstractMatrix{T}) where {T}
             end
             i += 1
             X[i, j]
-        end for _ in 1:lv
+        end
+        for _ in 1:lv
     ]
 end

-
-
 """
 Extract the above-diagonal entries of a `UnitUpperTriangular` matrix into a columnwise vector.
 """
 function uutri2vec(X::AbstractMatrix{T}) where {T}
@@ -163,15 +161,16 @@
             end
             i += 1
             X[i, j]
-        end for _ in 1:lv
+        end
+        for _ in 1:lv
     ]
 end

-function ChainRulesCore.rrule(::typeof(uutri2vec), X::AbstractMatrix{T}) where T
+function ChainRulesCore.rrule(::typeof(uutri2vec), X::AbstractMatrix{T}) where {T}
     # does not change the value or the derivative but just reshapes
     # -> put the components of vector Δy into matrix
     # make sure that the gradient of main diagonal is zero rather than one
-    function uutri2vec_pullback(Δy)
+    function uutri2vec_pullback(Δy)
         (NoTangent(), vec2uutri(Δy; diag=zero(T)))
     end
     return uutri2vec(X), uutri2vec_pullback
@@ -197,30 +196,29 @@ end
 #             ])
 #     end
 # end
-function uutri2vec(X::CuMatrix{T}; kwargs...) where T
-    lv = sumn(size(X,1)-1)
-    v = CUDA.zeros(T,lv)
-    @cuda threads=(16, 16) uutri2vec_gpu!(v, X)
+function uutri2vec(X::CuMatrix{T}; kwargs...) where {T}
+    lv = sumn(size(X, 1) - 1)
+    v = CUDA.zeros(T, lv)
+    @cuda threads = (16, 16) uutri2vec_gpu!(v, X)
     return v
 end

-function uutri2vec_gpu!(v::Union{CuVector,CuDeviceVector}, X::AbstractMatrix)
+function uutri2vec_gpu!(v::Union{CuVector,CuDeviceVector}, X::AbstractMatrix)
     x = threadIdx().x
     y = threadIdx().y
-    stride_x= blockDim().x
+    stride_x = blockDim().x
     stride_y = blockDim().y
-    for row = x:stride_x:size(X,1)
-        for col = y:stride_y:size(X,2)
+    for row in x:stride_x:size(X, 1)
+        for col in y:stride_y:size(X, 2)
             if row < col
-                i = utri2vec_pos_unsafe(row, col-1)
+                i = utri2vec_pos_unsafe(row, col - 1)
                 @inbounds v[i] = X[row, col]
-            end
+            end
         end
     end
     return nothing # important
 end

-
 """
 Takes a vector of entries of a `UnitUpperTriangular` matrix
 and transforms it to an UpperTriangular that satisfies
@@ -234,9 +232,9 @@ involves a sum across columns, whereas the alternative of a lower triangular
 uses sum across rows. Sum across columns is often faster, because entries
 of columns are contiguous.
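+
+# Example
+
+A minimal sketch (hypothetical values): three entries parameterize the strict
+upper triangle of a 3x3 factor, whose columns are then normalized:
+
+```julia
+v = [0.2f0, 0.3f0, 0.4f0]
+U = transformU_cholesky1(v)   # 3x3 UpperTriangular
+diag(U' * U) ≈ ones(3)        # U'U is a valid correlation matrix
+```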
""" -function transformU_cholesky1(v::AbstractVector; +function transformU_cholesky1(v::AbstractVector; n=invsumn(length(v)) + 1 - ) +) U_scaled = vec2uutri(v; n) #Sc_inv = sqrt.(sum(abs2, U_scaled, dims=1)) #U_scaled * Diagonal(1 ./ vec(Sc_inv)) @@ -244,7 +242,8 @@ function transformU_cholesky1(v::AbstractVector; U = U_scaled ./ sqrt.(sum(abs2, U_scaled, dims=1)) return (UpperTriangular(U)) end -function transformU_cholesky1(v::GPUArraysCore.AbstractGPUVector; n=invsumn(length(v)) + 1) +function transformU_cholesky1( + v::GPUArraysCore.AbstractGPUVector; n=invsumn(length(v)) + 1) U_scaled = vec2uutri(v; n) U = U_scaled ./ sqrt.(sum(abs2, U_scaled, dims=1)) # do not convert to UpperTrinangular on GPU, but full matrix @@ -259,7 +258,6 @@ end # U = _create_blockdiag(v[first(keys(v))], blocks) # v only for dispatch: plain matrix for gpu # end - """ get_ca_starts(vc::ComponentVector) @@ -267,40 +265,87 @@ Return a tuple with starting positions of components in vc. Useful for providing information on correlactions among subranges in a vector. """ function get_ca_starts(vc::CA.ComponentVector) - (1, (1 .+ cumsum((length(vc[k]) for k in front(keys(vc)))))...) + (1, (1 .+ cumsum((length(vc[k]) for k in front(keys(vc)))))...) end "omit the last n elements of an iterator" -front(itr, n=1) = Iterators.take(itr, length(itr)-n) +front(itr, n=1) = Iterators.take(itr, length(itr) - n) + """ - transformU_block_cholesky1(v::AbstractVector, cor_starts = (1,)) + get_ca_ends(vc::ComponentVector) + +Return a Vector with ending positions of components in vc. +Useful for providing information on correlactions among subranges in a vector. +""" +function get_ca_ends(vc::CA.ComponentVector) + #(cumsum(length(vc[k]) for k in keys(vc))...,) + length(vc) == 0 ? Int[] : cumsum(length(vc[k]) for k in keys(vc)) +end + + +""" + get_cor_count(cor_ends::AbstractVector) + get_cor_count(n_par::Integer) + +Return number of correlation coefficients for a correlation matrix of size `(npar x npar)` +With blocks starting a positions given with tuple `cor_ends`. +""" +function get_cor_count(cor_ends::AbstractVector) + sum(get_cor_counts(cor_ends)) +end +function get_cor_counts(cor_ends::AbstractVector{T}) where {T} + isempty(cor_ends) && return (zero(T)) + cnt_blocks = ( + begin + i == 1 ? cor_ends[i] : cor_ends[i] - cor_ends[i-1] + end for i in 1:length(cor_ends) + ) + get_cor_count.(cnt_blocks) +end +function get_cor_count(n_par::T) where T<:Number # <: Integer causes problems with AD + sumn(n_par - one(T)) +end + + +""" + transformU_block_cholesky1(v::AbstractVector, cor_ends) Transform a parameterization v of a blockdiagonal of upper triangular matrices into the this matrix. -`cor_starts` is a NTuple of Integeres specifying the first column of each block. -E.g. For a matrix with a 3x3, a 2x2, and another block, -the blocks start at columns (1,4,6). It defaults to a single entire block. +`cor_ends` is an AbstractVector of Integers specifying the last column of each block. +E.g. For a matrix with a 3x3, a 2x2, and another single-entry block, +the blocks start at columns (3,5,6). It defaults to a single entire block. 
""" -function transformU_block_cholesky1(v::AbstractVector, cor_starts = (1,)) - cor_starts_end = (cor_starts..., length(v)+1) +function transformU_block_cholesky1( + v::AbstractVector{T}, cor_ends::AbstractVector{TI}=Int[]) where {T,TI<:Integer} + #@show v, cor_ends + if length(cor_ends) <= 1 # if there is only one block, return it + return transformU_cholesky1(v) + end + cor_counts = get_cor_counts(cor_ends) # number of correlation parameters + #@show cor_counts ranges = ChainRulesCore.@ignore_derivatives ( - cor_starts_end[i]:(cor_starts_end[i+1]-1) for i in 1:length(cor_starts)) + begin + cor_start = (i == 1 ? one(TI) : cor_counts[i-1] + one(TI)) + cor_start:cor_counts[i] + end for i in 1:length(cor_counts) + ) + #@show collect(ranges) blocks = [transformU_cholesky1(v[r]) for r in ranges] U = _create_blockdiag(v, blocks) # v only for dispatch: plain matrix for gpu - return(U) + return (U) end -function _create_blockdiag(::AbstractArray, blocks) +function _create_blockdiag(::AbstractArray{T}, blocks::AbstractArray) where {T} BlockDiagonal(blocks) end -function _create_blockdiag(::GPUArraysCore.AbstractGPUArray, blocks) + +function _create_blockdiag(::GPUArraysCore.AbstractGPUArray, blocks::AbstractArray) # impose no special structure cat(blocks...; dims=(1, 2)) end - - () -> begin tmp = sqrt.(sum(abs2, U_scaled, dims=1)) tmp2 = sum(abs2, U_scaled, dims=1) .^ (-1 / 2) diff --git a/src/elbo.jl b/src/elbo.jl index 66b429b..c23d95b 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -6,7 +6,7 @@ expected value of the likelihood of observations. ## Arguments - `rng`: random number generator (ignored on CUDA, if ϕ is a AbstractGPUArray) -- `ϕ`: flat vector of parameters, interpreted by interpreters +- `ϕ`: flat vector of parameters interpreted by interpreters.μP_ϕg_unc and interpreters.PMs - `g`: machine learning model - `transPMs`: Transformations as generated by get_transPMs returned from init_hybrid_params @@ -29,9 +29,9 @@ function neg_elbo_transnorm_gf(rng, ϕ::AbstractVector, g, transPMs, f, py, xM::AbstractMatrix, xP, y_ob, y_unc, interpreters::NamedTuple; n_MC=3, gpu_data_handler = get_default_GPUHandler(), - cor_starts=(P=(1,),M=(1,)) + cor_ends # =(P=(1,),M=(1,)) ) - ζs, σ = generate_ζ(rng, g, ϕ, xM, interpreters; n_MC, cor_starts) + ζs, σ = generate_ζ(rng, g, ϕ, xM, interpreters; n_MC, cor_ends) ζs_cpu = gpu_data_handler(ζs) # differentiable fetch to CPU in Flux package extension #ζi = first(eachcol(ζs_cpu)) nLy = reduce(+, map(eachcol(ζs_cpu)) do ζi @@ -57,13 +57,15 @@ Prediction function for hybrid model. Returns an Array `(n_obs, n_site, n_sample function predict_gf(rng, g, f, ϕ::AbstractVector, xM::AbstractMatrix, xP, interpreters; get_transPMs, get_ca_int_PMs, n_sample_pred=200, gpu_data_handler=get_default_GPUHandler(), - cor_starts=(P=(1,),M=(1,))) + cor_ends + #cor_ends=(P=(1,),M=(1,)) + ) n_site = size(xM, 2) intm_PMs_gen = get_ca_int_PMs(n_site) trans_PMs_gen = get_transPMs(n_site) interpreters_gen = (; interpreters..., PMs = intm_PMs_gen) ζs, _ = generate_ζ(rng, g, CA.getdata(ϕ), CA.getdata(xM), - interpreters_gen; n_MC = n_sample_pred, cor_starts) + interpreters_gen; n_MC = n_sample_pred, cor_ends) ζs_cpu = gpu_data_handler(ζs) # y_pred = stack(map(ζ -> first(predict_y( ζ, xP, f, trans_PMs_gen, interpreters_gen.PMs)), eachcol(ζs_cpu))); @@ -79,13 +81,13 @@ to the means extracted from parameters and predicted by the machine learning model. 
""" function generate_ζ(rng, g, ϕ::AbstractVector, xM::AbstractMatrix, - interpreters::NamedTuple; n_MC=3, cor_starts=(P=(1,),M=(1,))) + interpreters::NamedTuple; n_MC=3, cor_ends) # see documentation of neg_elbo_transnorm_gf ϕc = interpreters.μP_ϕg_unc(CA.getdata(ϕ)) μ_ζP = ϕc.μP ϕg = ϕc.ϕg μ_ζMs0 = g(xM, ϕg) # TODO provide μ_ζP to g - ζ_resid, σ = sample_ζ_norm0(rng, μ_ζP, μ_ζMs0, ϕc.unc; n_MC, cor_starts) + ζ_resid, σ = sample_ζ_norm0(rng, μ_ζP, μ_ζMs0, ϕc.unc; n_MC, cor_ends) #ζ_resid, σ = sample_ζ_norm0(rng, ϕ[1:2], reshape(ϕ[2 .+ (1:20)],2,:), ϕ[(end-length(interpreters.unc)+1):end], interpreters.unc; n_MC) # @show size(ζ_resid) # @show length(interpreters.PMs) @@ -108,20 +110,22 @@ together with the vector of standard deviations, σ. ρsP, ρsM, logσ2_logP, coef_logσ2_logMs(intercept + slope), """ function sample_ζ_norm0(rng::Random.AbstractRNG, ζP::AbstractVector, ζMs::AbstractMatrix, - args...; n_MC, cor_starts) + args...; n_MC, cor_ends) n_θP, n_θMs = length(ζP), length(ζMs) urand = _create_random(rng, CA.getdata(ζP), n_θP + n_θMs, n_MC) - sample_ζ_norm0(urand, ζP, ζMs, args...; cor_starts) + sample_ζ_norm0(urand, ζP, ζMs, args...; cor_ends) end function sample_ζ_norm0(urand::AbstractMatrix, ζP::AbstractVector{T}, ζMs::AbstractMatrix, - ϕunc::AbstractVector, int_unc = ComponentArrayInterpreter(ϕunc); cor_starts + ϕunc::AbstractVector, int_unc = ComponentArrayInterpreter(ϕunc); cor_ends ) where {T} ϕuncc = int_unc(CA.getdata(ϕunc)) n_θP, n_θMs, (n_θM, n_batch) = length(ζP), length(ζMs), size(ζMs) # make sure to not create a UpperTriangular Matrix of an CuArray in transformU_cholesky1 - UP = transformU_block_cholesky1(ϕuncc.ρsP, cor_starts.P) - UM = transformU_block_cholesky1(ϕuncc.ρsM, cor_starts.M) + ρsP = isempty(ϕuncc.ρsP) ? similar(ϕuncc.ρsP) : ϕuncc.ρsP # required by zygote + UP = transformU_block_cholesky1(ρsP, cor_ends.P) + ρsM = isempty(ϕuncc.ρsM) ? similar(ϕuncc.ρsM) : ϕuncc.ρsM # required by zygote + UM = transformU_block_cholesky1(ρsM, cor_ends.M) cf = ϕuncc.coef_logσ2_logMs logσ2_logMs = vec(cf[1, :] .+ cf[2, :] .* ζMs) logσ2_logP = vec(CA.getdata(ϕuncc.logσ2_logP)) diff --git a/src/init_hybrid_params.jl b/src/init_hybrid_params.jl index c048c2f..45defed 100644 --- a/src/init_hybrid_params.jl +++ b/src/init_hybrid_params.jl @@ -12,39 +12,38 @@ Returns a NamedTuple of # Arguments - `θP`, `θM`: Template ComponentVectors of global parameters and ML-predicted parameters +- `cor_ends`: NamedTuple with entries, `P`, and `M`, respectively with + integer vectors of ending columns of parameters blocks - `ϕg`: vector of parameters to optimize, as returned by `get_hybridproblem_MLapplicator` - `n_batch`: the number of sites to predicted in each mini-batch - `transP`, `transM`: the Bijector.Transformations for the global and site-dependent parameters, e.g. `Stacked(elementwise(identity), elementwise(exp), elementwise(exp))`. Its the transformation froing from unconstrained to constrained space: θ = Tinv(ζ), because this direction is used much more often. 
+- `ϕunc0`: initial uncertainty parameters, a ComponentVector in the format
+  returned by `init_hybrid_ϕunc`
 """
-function init_hybrid_params(θP, θM, ϕg, n_batch;
-    transP=elementwise(identity), transM=elementwise(identity))
+function init_hybrid_params(θP::AbstractVector{FT}, θM::AbstractVector{FT},
+        cor_ends::NamedTuple, ϕg::AbstractVector{FT}, n_batch;
+        transP = elementwise(identity), transM = elementwise(identity),
+        ϕunc0 = init_hybrid_ϕunc(cor_ends, zero(FT))) where {FT}
     n_θP = length(θP)
     n_θM = length(θM)
+    @assert cor_ends.P[end] == n_θP
+    @assert cor_ends.M[end] == n_θM
     n_ϕg = length(ϕg)
     # check translating parameters - can match length?
     _ = Bijectors.inverse(transP)(θP)
     _ = Bijectors.inverse(transM)(θM)
-    FT = eltype(θM)
-    # zero correlation matrices
-    ρsP = zeros(FT, sum(1:(n_θP - 1)))
-    ρsM = zeros(FT, sum(1:(n_θM - 1)))
-    ϕunc0 = CA.ComponentVector(;
-        logσ2_logP = fill(FT(-10.0), n_θP),
-        coef_logσ2_logMs = reduce(hcat, (FT[-10.0, 0.0] for _ in 1:n_θM)),
-        ρsP,
-        ρsM)
     ϕ = CA.ComponentVector(;
-        μP = apply_preserve_axes(inverse(transP),θP),
+        μP = apply_preserve_axes(inverse(transP), θP),
         ϕg = ϕg,
-        unc = ϕunc0);
+        unc = ϕunc0)
     #
-    get_transPMs = let transP=transP, transM=transM, n_θP=n_θP, n_θM=n_θM
+    get_transPMs = let transP = transP, transM = transM, n_θP = n_θP, n_θM = n_θM
         function get_transPMs_inner(n_site)
             transMs = ntuple(i -> transM, n_site)
-            ranges = vcat([1:n_θP], [(n_θP + i0*n_θM) .+ (1:n_θM) for i0 in 0:(n_site-1)])
+            ranges = vcat(
+                [1:n_θP], [(n_θP + i0 * n_θM) .+ (1:n_θM) for i0 in 0:(n_site - 1)])
             transPMs = Stacked((transP, transMs...), ranges)
             transPMs
         end
     end
@@ -54,20 +53,54 @@ function init_hybrid_params(θP, θM, ϕg, n_batch;
     # inv_trans_gu = Stacked(
     #     (inverse(transP), elementwise(identity), elementwise(identity)), values(ranges))
     # ϕ = inv_trans_gu(CA.getdata(ϕt))
-    get_ca_int_PMs = let
+    get_ca_int_PMs = let
         function get_ca_int_PMs_inner(n_site)
-            ComponentArrayInterpreter(CA.ComponentVector(; P=θP,
-                Ms = CA.ComponentMatrix(
-                    zeros(n_θM, n_site), first(CA.getaxes(θM)), CA.Axis(i = 1:n_site))))
+            ComponentArrayInterpreter(CA.ComponentVector(; P = θP,
+                Ms = CA.ComponentMatrix(
+                    zeros(n_θM, n_site), first(CA.getaxes(θM)), CA.Axis(i = 1:n_site))))
         end
-    end
+    end
     interpreters = map(get_concrete,
-        (;
-        μP_ϕg_unc = ComponentArrayInterpreter(ϕ),
-        PMs = get_ca_int_PMs(n_batch),
-        unc = ComponentArrayInterpreter(ϕunc0)
-        ))
-    (;ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs)
+        (;
+            μP_ϕg_unc = ComponentArrayInterpreter(ϕ),
+            PMs = get_ca_int_PMs(n_batch),
+            unc = ComponentArrayInterpreter(ϕunc0)
+        ))
+    (; ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs)
 end

+"""
+    init_hybrid_ϕunc(cor_ends, ρ0 = 0f0, coef_logσ2_logM = [-10.0, 0.0];
+        logσ2_logP, coef_logσ2_logMs, ρsP, ρsM)
+
+Initialize the vector of additional parameters of the approximate posterior.
+
+Arguments:
+- `cor_ends`: NamedTuple with entries `P` and `M`, each an integer vector of the
+  ending columns of the parameter blocks
+- `ρ0`: default entry for `ρsP` and `ρsM`, defaults to `0f0`
+- `coef_logσ2_logM`: default column for `coef_logσ2_logMs`, defaults to `[-10.0, 0.0]`
+
+Returns a `ComponentVector` of
+- `logσ2_logP`: vector of log-variances of ζP, each entry defaulting to `-10`
+- `coef_logσ2_logMs`: offset and slope for the log-variances of ζM, scaling with
+  its value, one column per parameter in ζM, each defaulting to `[-10, 0]`
+- `ρsP` and `ρsM`: parameterization of the upper triangular Cholesky factor
+  of the correlation matrices of ζP and ζM, defaulting to all entries `ρ0`, which
+  defaults to zero.
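+
+# Example
+
+A minimal sketch (hypothetical sizes): two correlated global parameters and
+three correlated ML-predicted parameters:
+
+```julia
+cor_ends = (P = [2], M = [3])
+ϕunc0 = init_hybrid_ϕunc(cor_ends, 0.0f0)
+length(ϕunc0.ρsP)  # 1 == get_cor_count(cor_ends.P)
+length(ϕunc0.ρsM)  # 3 == get_cor_count(cor_ends.M)
+size(ϕunc0.coef_logσ2_logMs)  # (2, 3): offset and slope per ζM parameter
+```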
+""" +function init_hybrid_ϕunc( + cor_ends::NamedTuple, + ρ0::FT = 0.0f0, + coef_logσ2_logM::AbstractVector{FT} = FT[-10.0, 0.0]; + logσ2_logP::AbstractVector{FT} = fill(FT(-10.0), cor_ends.P[end]), + coef_logσ2_logMs::AbstractMatrix{FT} = reduce( + hcat, (coef_logσ2_logM for _ in 1:cor_ends.M[end])), + ρsP = fill(ρ0, get_cor_count(cor_ends.P)), + ρsM = fill(ρ0, get_cor_count(cor_ends.M)), +) where {FT} + CA.ComponentVector(; + logσ2_logP, + coef_logσ2_logMs, + ρsP, + ρsM) +end diff --git a/src/util_ca.jl b/src/util_ca.jl index 212f05d..be9641f 100644 --- a/src/util_ca.jl +++ b/src/util_ca.jl @@ -7,7 +7,7 @@ function cpu_ca end # define in FluxExt function apply_preserve_axes(f, ca::CA.ComponentArray) - CA.ComponentArray(f(ca), CA.getaxes(ca)) + CA.ComponentArray(f(CA.getdata(ca)), CA.getaxes(ca)) end diff --git a/test/runtests.jl b/test/runtests.jl index 4d716e1..7599ade 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,6 @@ const GROUP = get(ENV, "GROUP", "All") # defined in in CI.yml @time begin if GROUP == "All" || GROUP == "Basic" - @time @safetestset "test_HybridProblem" include("test_HybridProblem.jl") #@safetestset "test" include("test/test_ComponentArrayInterpreter.jl") @time @safetestset "test_ComponentArrayInterpreter" include("test_ComponentArrayInterpreter.jl") #@safetestset "test" include("test/test_gencovar.jl") diff --git a/test/test_HybridProblem.jl b/test/test_HybridProblem.jl index d8ca77e..9564459 100644 --- a/test/test_HybridProblem.jl +++ b/test/test_HybridProblem.jl @@ -18,7 +18,7 @@ construct_problem = () -> begin θM = CA.ComponentVector{FT}(r1 = 0.5, K1 = 0.2) transP = elementwise(exp) transM = Stacked(elementwise(identity), elementwise(exp)) - cov_starts = (P = (1, 2), M = (1)) # assume r0 independent of K2 + cor_ends = (P = 1:length(θP), M = [length(θM)]) # assume r0 independent of K2 n_covar = 5 n_batch = 10 int_θdoubleMM = get_concrete(ComponentArrayInterpreter( @@ -57,7 +57,7 @@ construct_problem = () -> begin end end HybridProblem(θP, θM, g_chain, f_doubleMM_with_global, py, - transM, transP, get_train_loader, cov_starts) + transM, transP, get_train_loader, cor_ends) end prob = construct_problem(); scenario = (:default,) @@ -111,21 +111,23 @@ import Flux (θP0, θM0) = get_hybridproblem_par_templates(prob) (; transP, transM) = get_hybridproblem_transforms(prob) py = get_hybridproblem_neg_logden_obs(prob) + cor_ends = get_hybridproblem_cor_ends(prob) (; ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs) = init_hybrid_params( - θP0, θM0, ϕg0, n_batch; transP, transM) + θP0, θM0, cor_ends, ϕg0, n_batch; transP, transM) ϕ_ini = ϕ + ϕ.unc py = get_hybridproblem_neg_logden_obs(prob) cost = neg_elbo_transnorm_gf(rng, ϕ_ini, g, transPMs_batch, f, py, xM, xP, y_o, y_unc, map(get_concrete, interpreters); - n_MC = 8) + n_MC = 8, cor_ends) @test cost isa Float64 gr = Zygote.gradient( ϕ -> neg_elbo_transnorm_gf(rng, ϕ, g, transPMs_batch, f, py, xM, xP, y_o, y_unc, map(get_concrete, interpreters); - n_MC = 8), + n_MC = 8, cor_ends), CA.getdata(ϕ_ini)) @test gr[1] isa Vector @@ -148,12 +150,12 @@ import Flux xMg = CuArray(xM) cost = neg_elbo_transnorm_gf(rng, ϕ, g, transPMs_batch, f, py, xMg, xP, y_o, y_unc, map(get_concrete, interpreters); - n_MC = 8) + n_MC = 8, cor_ends) @test cost isa Float64 gr = Zygote.gradient( ϕ -> neg_elbo_transnorm_gf(rng, ϕ, g, transPMs_batch, f, py, xMg, xP, y_o, y_unc, map(get_concrete, interpreters); - n_MC = 8), + n_MC = 8, cor_ends), ϕ) @test gr[1] isa CuVector @test eltype(gr[1]) == 
            get_hybridproblem_float_type(prob)
diff --git a/test/test_cholesky_structure.jl b/test/test_cholesky_structure.jl
index c6e6e51..36a1b60 100644
--- a/test/test_cholesky_structure.jl
+++ b/test/test_cholesky_structure.jl
@@ -56,6 +56,19 @@ end;
     @test_throws Exception invsumn(5) # 5 is not result of sum(1:n)
 end;

+@testset "get_cor_count" begin
+    @test get_cor_count(Int[]) == 0 # case of no physical parameters
+    @test get_cor_count([1]) == 0
+    @test get_cor_count([2]) == 1
+    @test get_cor_count([3]) == 3
+    @test get_cor_count([4]) == 6
+    @test get_cor_count(4) == 6
+    @test get_cor_count([1,4]) == 0 + 3
+    @test get_cor_count([2,4]) == 1 + 1
+    @test get_cor_count([3,4]) == 3 + 0
+    @test get_cor_count([2,5]) == 1 + 3
+end;
+
 @testset "vec2utri" begin
     v_orig = 1.0:6.0
     Uv = CP.vec2utri(v_orig)
@@ -138,35 +151,47 @@ end;
 end;

 @testset "transformU_block_cholesky1 gpu" begin
-    vc = CA.ComponentVector(b1 = [1.0f0], b2 = [2.0f0:5.0f0])
-    vc = CA.ComponentVector(b1 = [1.0f0:3.0f0])
+    vc = CA.ComponentVector(b1 = 1.0f0:3.0f0)
     vc = CA.ComponentVector(b1 = 1.0f0:3.0f0, b2 = [5.0f0])
     v = CA.getdata(vc)
-    cor_starts = get_ca_starts(vc)
+    cor_ends = get_ca_ends(vc)
     #ns=(CP.invsumn(length(v[k])) + 1 for k in keys(v))
     #collect(ns)
-    U = CP.transformU_block_cholesky1(v, cor_starts)
-    @test diag(U' * U) ≈ ones(5)
-    @test U[1:3, 4:5] ≈ zeros(3, 2)
-    gr1 = Zygote.gradient(v -> sum(CP.transformU_block_cholesky1(v, cor_starts)), v)[1] # works nice
+    ρ = collect(1f0:get_cor_count(cor_ends))
+    U = CP.transformU_block_cholesky1(ρ)
+    U = CP.transformU_block_cholesky1(v, cor_ends)
+    @test diag(U' * U) ≈ ones(4)
+    @test U[1:3, 4:4] ≈ zeros(3, 1)
+    gr1 = Zygote.gradient(v -> sum(CP.transformU_block_cholesky1(v, cor_ends)), v)[1]; # works nice
+    # degenerate case of no correlations
+    vc0 = CA.ComponentVector{Float32}()
+    cor_ends0 = get_ca_ends(vc0)
+    ρ0 = collect(1f0:get_cor_count(cor_ends0))
+    #ns=(CP.invsumn(length(v[k])) + 1 for k in keys(v))
+    #collect(ns)
+    U = CP.transformU_block_cholesky1(CA.getdata(ρ0), cor_ends0)
+    @test diag(U) == [1f0]
+    gr1 = Zygote.gradient(v -> sum(CP.transformU_block_cholesky1(ρ0, cor_ends0)), v)[1]; # works nice
+
     if CUDA.functional()
         # only run the test if CUDA is working (not on GitHub CI)
         vc = v_orig = CA.ComponentVector(b1 = CuArray(1.0f0:3.0f0), b2 = CuArray([5.0f0]))
         v = CA.getdata(vc)
-        cor_starts = get_ca_starts(vc)
-        U = CP.transformU_block_cholesky1(v, cor_starts)
+        cor_ends = get_ca_ends(vc)
+        ρ = CuArray(1f0:get_cor_count(cor_ends))
+        U = CP.transformU_block_cholesky1(ρ, cor_ends)
         @test U isa CuArray
-        @test diag(Array(U' * U)) ≈ ones(5)
-        @test Array(U[1:3, 4:5]) ≈ zeros(3, 2)
-        gr1 = Zygote.gradient(v -> sum(CP.transformU_block_cholesky1(v, cor_starts)), v)[1] # works nice
+        @test diag(Array(U' * U)) ≈ ones(4)
+        @test Array(U[1:3, 4:4]) ≈ zeros(3, 1)
+        gr1 = Zygote.gradient(v -> sum(CP.transformU_block_cholesky1(v, cor_ends)), v)[1] # works nice
+        #
+        cor_ends0 = Int64[]
+        ρ0 = CuArray(1f0:get_cor_count(cor_ends0))
+        U = CP.transformU_block_cholesky1(ρ0, cor_ends0)
+        @test U isa CuArray
+        @test Array(diag(U)) == [1f0]
     end
 end;

-() -> begin
-    cor_starts = (1,)
-    cor_starts_end = (cor_starts..., length(v) + 1)
-    [cor_starts_end[i]:(cor_starts_end[i + 1] - 1) for i in 1:length(cor_starts)]
-end
-
 () -> begin
     #setup for fitting of interactive blocks below
     _X = rand(3, 3)
diff --git a/test/test_elbo.jl b/test/test_elbo.jl
index 5723007..1936c1a 100644
--- a/test/test_elbo.jl
+++ b/test/test_elbo.jl
@@ -35,22 +35,24 @@ py = neg_logden_indep_normal

 n_MC = 3
 (; transP, transM) =
     get_hybridproblem_transforms(prob; scenario)
+cor_ends = get_hybridproblem_cor_ends(prob; scenario)
 # transP = elementwise(exp)
 # transM = Stacked(elementwise(identity), elementwise(exp))
 #transM = Stacked(elementwise(identity), elementwise(exp), elementwise(exp)) # test mismatch
+ϕunc0 = init_hybrid_ϕunc(cor_ends, zero(FT))
 (; ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs) = init_hybrid_params(
-    θP_true, θMs_true[:, 1], ϕg0, n_batch; transP, transM);
+    θP_true, θMs_true[:, 1], cor_ends, ϕg0, n_batch; transP, transM);
 ϕ_ini = ϕ

 @testset "generate_ζ" begin
     ζ, σ = CP.generate_ζ(
         rng, g, ϕ_ini, xM[:, 1:n_batch], map(get_concrete, interpreters);
-        n_MC = 8)
+        n_MC = 8, cor_ends)
     @test ζ isa Matrix
     gr = Zygote.gradient(
         ϕ -> sum(CP.generate_ζ(
             rng, g, ϕ, xM[:, 1:n_batch], map(get_concrete, interpreters);
-            n_MC = 8)[1]),
+            n_MC = 8, cor_ends)[1]),
         CA.getdata(ϕ_ini))
     @test gr[1] isa Vector
 end;
@@ -69,13 +71,13 @@ if CUDA.functional()
     xMg_batch = CuArray(xM[:, 1:n_batch])
     ζ, σ = CP.generate_ζ(
         rng, g_flux, ϕ, xMg_batch, map(get_concrete, interpreters);
-        n_MC = 8)
+        n_MC = 8, cor_ends)
     @test ζ isa CuMatrix
     @test eltype(ζ) == FT
     gr = Zygote.gradient(
         ϕ -> sum(CP.generate_ζ(
             rng, g_flux, ϕ, xMg_batch, map(get_concrete, interpreters);
-            n_MC = 8)[1]),
+            n_MC = 8, cor_ends)[1]),
         ϕ)
     @test gr[1] isa CuVector
 end
@@ -85,13 +87,13 @@ end
     cost = neg_elbo_transnorm_gf(rng, ϕ_ini, g, transPMs_batch, f, py,
         xM[:, 1:n_batch], xP[1:n_batch], y_o[:, 1:n_batch], y_unc[:, 1:n_batch],
         map(get_concrete, interpreters);
-        n_MC = 8)
+        n_MC = 8, cor_ends)
     @test cost isa Float64
     gr = Zygote.gradient(
         ϕ -> neg_elbo_transnorm_gf(rng, ϕ, g, transPMs_batch, f, py,
             xM[:, 1:n_batch], xP[1:n_batch], y_o[:, 1:n_batch], y_unc[:, 1:n_batch],
             map(get_concrete, interpreters);
-            n_MC = 8),
+            n_MC = 8, cor_ends),
         CA.getdata(ϕ_ini))
     @test gr[1] isa Vector
 end;
@@ -104,13 +106,13 @@ if CUDA.functional()
     cost = neg_elbo_transnorm_gf(rng, ϕ, g_flux, transPMs_batch, f, py,
         xMg_batch, xP_batch, y_o[:, 1:n_batch], y_unc[:, 1:n_batch],
         map(get_concrete, interpreters);
-        n_MC = 8)
+        n_MC = 8, cor_ends)
     @test cost isa Float64
     gr = Zygote.gradient(
         ϕ -> neg_elbo_transnorm_gf(rng, ϕ, g_flux, transPMs_batch, f, py,
             xMg_batch, xP_batch, y_o[:, 1:n_batch], y_unc[:, 1:n_batch],
             map(get_concrete, interpreters);
-            n_MC = 8),
+            n_MC = 8, cor_ends),
         ϕ)
     @test gr[1] isa CuVector
     @test eltype(gr[1]) == FT
@@ -124,7 +126,7 @@ end
     @test length(intm_PMs_gen) == 402
     @test trans_PMs_gen.length_in == 402
     y_pred = predict_gf(rng, g, f, ϕ_ini, xM, xP, map(get_concrete, interpreters);
-        get_transPMs, get_ca_int_PMs, n_sample_pred)
+        get_transPMs, get_ca_int_PMs, n_sample_pred, cor_ends)
     @test y_pred isa Array
     @test size(y_pred) == (size(y_o)..., n_sample_pred)
 end
@@ -135,7 +137,7 @@ if CUDA.functional()
     ϕ = CuArray(CA.getdata(ϕ_ini))
     xMg = CuArray(xM)
     y_pred = predict_gf(rng, g_flux, f, ϕ, xMg, xP, map(get_concrete, interpreters);
-        get_transPMs, get_ca_int_PMs, n_sample_pred)
+        get_transPMs, get_ca_int_PMs, n_sample_pred, cor_ends)
     @test y_pred isa Array
     @test size(y_pred) == (size(y_o)..., n_sample_pred)
 end
diff --git a/test/test_sample_zeta.jl b/test/test_sample_zeta.jl
index 062706d..5579d26 100644
--- a/test/test_sample_zeta.jl
+++ b/test/test_sample_zeta.jl
@@ -11,6 +11,7 @@ using Random
 #using SimpleChains
 using ComponentArrays: ComponentArrays as CA
 using Bijectors
+using StableRNGs

 #CUDA.device!(4)
 rng = StableRNG(111)
@@ -23,35 +24,38 @@ n_θM, n_θP = length.(values(get_hybridproblem_par_templates(prob; scenario)))
 (; xM, n_site,
     θP_true, θMs_true, xP, y_global_true, y_true, y_global_o, y_o
 ) = gen_hybridcase_synthetic(rng, prob; scenario)

+FT = get_hybridproblem_float_type(prob; scenario)
+
 # set to 0.02 rather than zero for debugging non-zero correlations
-ρsP = zeros(sum(1:(n_θP-1))) .+ 0.02
-ρsM = zeros(sum(1:(n_θM-1))) .+ 0.02
+cor_ends = (P = 1:n_θP, M = [n_θM])
+ρsP = zeros(FT, get_cor_count(cor_ends.P)) .+ FT(0.02)
+ρsM = zeros(FT, get_cor_count(cor_ends.M)) .+ FT(0.02)
 ϕunc = CA.ComponentVector(;
-    logσ2_logP=fill(-10.0, n_θP),
-    coef_logσ2_logMs=reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)),
+    logσ2_logP = fill(FT(-10.0), n_θP),
+    coef_logσ2_logMs = reduce(hcat, (FT[-10.0, 0.0] for _ in 1:n_θM)),
     ρsP, ρsM)
 θ_true = θ = CA.ComponentVector(;
-    P=θP_true,
-    Ms=θMs_true)
+    P = θP_true,
+    Ms = θMs_true)
 transPMs = elementwise(exp) # all parameters on LogNormal scale
 ζ_true = inverse(transPMs)(θ_true)
-ϕ_true = vcat(ζ_true, CA.ComponentVector(unc=ϕunc))
-ϕ_cpu = vcat(ζ_true .+ 0.01, CA.ComponentVector(unc=ϕunc))
+ϕ_true = vcat(ζ_true, CA.ComponentVector(unc = ϕunc))
+ϕ_cpu = vcat(ζ_true .+ FT(0.01), CA.ComponentVector(unc = ϕunc))

-interpreters = (; pmu=ComponentArrayInterpreter(ϕ_true)) #, M=int_θM, PMs=int_θPMs)
+interpreters = (; pmu = ComponentArrayInterpreter(ϕ_true)) #, M=int_θM, PMs=int_θPMs)
 n_MC = 3

 @testset "sample_ζ_norm0 cpu" begin
     ϕ = CA.getdata(ϕ_cpu)
     ϕc = interpreters.pmu(ϕ)
-    cor_starts=(P=(1,),M=(1,))
-    ζ_resid, σ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC, cor_starts)
+    ζ_resid, σ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC, cor_ends)
     @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC)
-    gr = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(
-        rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC, cor_starts)[1]), ϕc)[1]
+    gr = Zygote.gradient(
+        ϕc -> sum(CP.sample_ζ_norm0(
+            rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC, cor_ends)[1]), ϕc)[1]
     @test length(gr) == length(ϕ)
 end
 #
@@ -59,26 +63,32 @@ if CUDA.functional()
     @testset "sample_ζ_norm0 gpu" begin
         ϕ = CuArray(CA.getdata(ϕ_cpu))
-        cor_starts=(P=(1,),M=(1,))
         #tmp = ϕ[1:6]
         #vec2uutri(tmp)
-        ϕc = interpreters.pmu(ϕ);
+        ϕc = interpreters.pmu(ϕ)
         @test CA.getdata(ϕc) isa GPUArraysCore.AbstractGPUArray
         #ζP, ζMs, ϕunc = ϕc.P, ϕc.Ms, ϕc.unc
         #urand = CUDA.randn(length(ϕc.P) + length(ϕc.Ms), n_MC) |> gpu
         #include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss
         #ζ_resid, σ = sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
         #Zygote.gradient(ϕc -> sum(sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)[1]), ϕc)[1];
-        ζ_resid, σ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC, cor_starts)
+        int_unc = ComponentArrayInterpreter(ϕc.unc)
+        ζ_resid, σ = CP.sample_ζ_norm0(
+            rng, CA.getdata(ϕc.P), CA.getdata(ϕc.Ms), CA.getdata(ϕc.unc), int_unc;
+            n_MC, cor_ends)
         @test ζ_resid isa GPUArraysCore.AbstractGPUArray
         @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC)
         gr = Zygote.gradient(
-            ϕc -> sum(CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC, cor_starts)[1]), ϕc)[1];
+            ϕc -> sum(CP.sample_ζ_norm0(
+                rng, CA.getdata(ϕc.P), CA.getdata(ϕc.Ms), CA.getdata(ϕc.unc), int_unc;
+                n_MC, cor_ends)[1]), ϕc)[1];
         @test length(gr) == length(ϕ)
+        @test CA.getdata(gr) isa GPUArraysCore.AbstractGPUArray
+        Array(gr)
         int_unc = ComponentArrayInterpreter(ϕc.unc)
         gr2 = Zygote.gradient(
             ϕc -> sum(CP.sample_ζ_norm0(rng, CA.getdata(ϕc.P), CA.getdata(ϕc.Ms),
-                CA.getdata(ϕc.unc), int_unc; n_MC, cor_starts)[1]),
+                CA.getdata(ϕc.unc), int_unc; n_MC, cor_ends)[1]),
             ϕc)[1];
     end
 end
@@ -94,4 +104,3 @@
 # ζs, _ = CP.generate_ζ(rng, g, ϕ, xM, interpreters; n_MC=n_sample_pred)
 # end;
-