
Commit eb5eb7f

Add better testing and update on multi-output GPs (#109)
* Fixing issues with multi-output
* Passing tests
* Fix test data/utils
* Use Plots.jl
1 parent e0411a8 commit eb5eb7f

15 files changed: +147 -157 lines changed


docs/examples/heteroscedastic.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ model = VGP(
4141
deepcopy(kernel),
4242
HeteroscedasticLikelihood(λ),
4343
AnalyticVI();
44-
optimiser = true, # We optimise both the mean parameters and kernel hyperparameters
45-
mean = μ₀,
46-
verbose = 1
44+
optimiser=true, # We optimise both the mean parameters and kernel hyperparameters
45+
mean=μ₀,
46+
verbose=1,
4747
)
4848

4949
# Model training, we train for around 100 iterations to wait for the convergence of the hyperparameters
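
For reference, a minimal self-contained sketch of what the full `VGP` call looks like in this keyword style; the toy data, `λ`, and `μ₀` below are placeholders of my own, not the values used in the example file:

using AugmentedGaussianProcesses, KernelFunctions

X = collect(range(-3, 3; length=50))   # placeholder inputs
y = randn(50)                          # placeholder targets
kernel = SqExponentialKernel()
λ = 3.0                                # placeholder likelihood parameter
μ₀ = 0.0                               # placeholder constant prior mean

model = VGP(
    X,
    y,
    deepcopy(kernel),
    HeteroscedasticLikelihood(λ),
    AnalyticVI();
    optimiser=true, # optimise both the mean parameters and kernel hyperparameters
    mean=μ₀,
    verbose=1,
)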

src/data/utils.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ function view_y(l::AbstractLikelihood, d::MODataContainer, i::AbstractVector)
77
return view_y.(l, output(d), Ref(i))
88
end
99
function view_y(
10-
l::AbstractVector{<:AbstractLikelihood}, d::MODataContainer, i::AbstractVector
10+
l::Tuple{Vararg{<:AbstractLikelihood}}, d::MODataContainer, i::AbstractVector
1111
)
1212
return view_y.(l, output(d), Ref(i))
1313
end
@@ -22,7 +22,7 @@ function wrap_data(X, y, likelihood::AbstractLikelihood)
2222
return wrap_data(X, y)
2323
end
2424

25-
function wrap_data(X, y, likelihoods::AbstractVector{<:AbstractLikelihood})
25+
function wrap_data(X, y, likelihoods::Tuple{Vararg{<:AbstractLikelihood}})
2626
ys = map(check_data!, y, likelihoods)
2727
return wrap_modata(X, ys)
2828
end
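
The method bodies are unchanged because broadcasting treats tuples and vectors the same way; a tiny plain-Julia illustration of that point (no AGP types involved):

f(a, b) = a + b
ls = (1, 2, 3)        # a Tuple standing in for the per-output likelihoods
ys = (10, 20, 30)     # standing in for output(d)
f.(ls, ys)            # broadcasting over Tuples returns a Tuple: (11, 22, 33)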

src/hyperparameter/autotuning_utils.jl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,16 @@ function update_kernel!(opt, x::AbstractArray, g::AbstractArray, state)
6767
end
6868

6969
## Updating inducing points
70-
# function update_Z!(opt, Z::Union{ColVecs,RowVecs}, Z_grads::NamedTuple, state)
71-
# return Z.X .+= Optimise.apply!(opt, Z.X, Z_grads.X)
72-
# end
73-
7470
function update_Z!(opt, Z::AbstractVector, Z_grads::AbstractVector, state)
7571
return map(Z, Z_grads, state) do z, zgrad, st
7672
st, ΔZ = Optimisers.apply(opt, st, z, zgrad)
7773
z .+= ΔZ
7874
return st
7975
end
8076
end
77+
78+
function update_Z!(opt, Z::Union{ColVecs,RowVecs}, Z_grads::NamedTuple, state)
79+
st, Δ = Optimisers.apply(opt, state, Z.X, Z_grads.X)
80+
Z.X .+= Δ
81+
return st
82+
end
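
The new `ColVecs`/`RowVecs` method applies one optimiser step directly to the underlying matrix `Z.X`. Below is a sketch of the same kind of single optimiser step on a plain matrix, using the documented Optimisers.jl `setup`/`update!` API; the package code instead calls the lower-level `Optimisers.apply` and manages the sign convention and state itself:

using Optimisers

Z = rand(3, 10)            # stand-in for the inducing-point matrix Z.X
Z_grad = randn(3, 10)      # stand-in for the gradient Z_grads.X
state = Optimisers.setup(Adam(0.001), Z)
state, Z = Optimisers.update!(state, Z, Z_grad)   # one gradient step on Z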

src/inference/analyticVI.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ end
9595
mean_f(m, state.kernel_matrices),
9696
var_f(m, state.kernel_matrices),
9797
) # Compute the local updates given the expectations of f
98-
state = merge(state, (;local_vars))
98+
state = merge(state, (; local_vars))
9999
natural_gradient!.(
100100
m.f,
101101
∇E_μ(m, y, state),

src/likelihood/heteroscedastic.jl

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,9 @@ function local_updates!(
7878
0.5 * l.invlink.λ[1] * local_vars.ϕ * safe_expcosh(-0.5 * μ[2], 0.5 * local_vars.c)
7979
@. local_vars.θ = 0.5 * (0.5 + local_vars.γ) / local_vars.c * tanh(0.5 * local_vars.c)
8080
@. local_vars.σg = expectation(logistic, μ[2], diagΣ[2])
81-
l.invlink.λ .= max(0.5 * length(local_vars.ϕ) / dot(local_vars.ϕ, local_vars.σg), l.invlink.λ[1])
81+
l.invlink.λ .= max(
82+
0.5 * length(local_vars.ϕ) / dot(local_vars.ϕ, local_vars.σg), l.invlink.λ[1]
83+
)
8284
return local_vars
8385
end
8486

@@ -130,7 +132,9 @@ function heteroscedastic_expectations!(
130132
Σ::AbstractVector,
131133
)
132134
@. local_vars.σg = expectation(logistic, μ, Σ)
133-
l.invlink.λ .= max(0.5 * length(local_vars.ϕ) / dot(local_vars.ϕ, local_vars.σg), l.invlink.λ[1])
135+
l.invlink.λ .= max(
136+
0.5 * length(local_vars.ϕ) / dot(local_vars.ϕ, local_vars.σg), l.invlink.λ[1]
137+
)
134138
return local_vars
135139
end
136140

@@ -153,11 +157,16 @@ end
153157
end
154158

155159
function compute_proba(
156-
l::HeteroscedasticGaussianLikelihood, μs::Tuple{<:AbstractVector,<:AbstractVector}, σs::Tuple{<:AbstractVector,<:AbstractVector}) where {T<:Real}
160+
l::HeteroscedasticGaussianLikelihood,
161+
μs::Tuple{<:AbstractVector,<:AbstractVector},
162+
σs::Tuple{<:AbstractVector,<:AbstractVector},
163+
) where {T<:Real}
157164
return μs[1], σs[1] + expectation.(Ref(l.invlink), μs[2], σs[2])
158165
end
159166

160-
function predict_y(::HeteroscedasticGaussianLikelihood, μs::Tuple{<:AbstractVector,<:AbstractVector})
167+
function predict_y(
168+
::HeteroscedasticGaussianLikelihood, μs::Tuple{<:AbstractVector,<:AbstractVector}
169+
)
161170
return first(μs) # For predict_y the variance is ignored
162171
end
163172
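
The reformatting above does not change the update itself: λ is only ever increased, to max(0.5 * N / ⟨ϕ, σg⟩, current λ). A standalone plain-Julia sketch of that update with stand-in arrays (no AGP types):

using LinearAlgebra

ϕ = rand(100)    # stand-in for local_vars.ϕ
σg = rand(100)   # stand-in for local_vars.σg
λ = [2.0]        # stored as a one-element vector, like l.invlink.λ
λ .= max(0.5 * length(ϕ) / dot(ϕ, σg), λ[1])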

src/likelihood/poisson.jl

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,25 @@ end
8686

8787
### Global Updates ###
8888

89-
@inline function ∇E_μ(::PoissonLikelihood{<:ScaledLogistic}, ::AOptimizer, y::AbstractVector, state)
89+
@inline function ∇E_μ(
90+
::PoissonLikelihood{<:ScaledLogistic}, ::AOptimizer, y::AbstractVector, state
91+
)
9092
return (0.5 * (y - state.γ),)
9193
end
92-
@inline ∇E_Σ(::PoissonLikelihood{<:ScaledLogistic}, ::AOptimizer, y::AbstractVector, state) = (0.5 * state.θ,)
94+
@inline function ∇E_Σ(
95+
::PoissonLikelihood{<:ScaledLogistic}, ::AOptimizer, y::AbstractVector, state
96+
)
97+
return (0.5 * state.θ,)
98+
end
9399

94100
## ELBO Section ##
95101
function expec_loglikelihood(
96-
l::PoissonLikelihood{<:ScaledLogistic}, ::AnalyticVI, y, μ::AbstractVector, Σ::AbstractVector, state
102+
l::PoissonLikelihood{<:ScaledLogistic},
103+
::AnalyticVI,
104+
y,
105+
μ::AbstractVector,
106+
Σ::AbstractVector,
107+
state,
97108
)
98109
tot = 0.5 * (dot(μ, (y - state.γ)) - dot(state.θ, abs2.(μ)) - dot(state.θ, Σ))
99110
tot += Zygote.@ignore(
@@ -106,7 +117,9 @@ function AugmentedKL(l::PoissonLikelihood{<:ScaledLogistic}, state, y)
106117
return PoissonKL(l, state) + PolyaGammaKL(l, state, y)
107118
end
108119

109-
PoissonKL(l::PoissonLikelihood{<:ScaledLogistic}, state) = PoissonKL(state.γ, l.invlink.λ[1])
120+
function PoissonKL(l::PoissonLikelihood{<:ScaledLogistic}, state)
121+
return PoissonKL(state.γ, l.invlink.λ[1])
122+
end
110123

111124
function PolyaGammaKL(::PoissonLikelihood{<:ScaledLogistic}, state, y)
112125
return PolyaGammaKL(y + state.γ, state.c, state.θ)

src/likelihood/regression.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ include("heteroscedastic.jl")
77
include("matern.jl")
88

99
### Return the labels in a vector of vectors for multiple outputs
10-
function treat_labels!(y::AbstractVector{T}, ::Union{RegressionLikelihood,HeteroscedasticGaussianLikelihood}) where {T}
10+
function treat_labels!(
11+
y::AbstractVector{T}, ::Union{RegressionLikelihood,HeteroscedasticGaussianLikelihood}
12+
) where {T}
1113
T <: Real || throw(ArgumentError("For regression target(s) should be real valued"))
1214
return y
1315
end

src/mean/affinemean.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,11 @@ end
3535

3636
function (μ₀::AffineMean{T})(x::AbstractVector) where {T<:Real}
3737
# μ₀.nDim == size(x, 1) || error(
38-
# "Number of dimensions of prior weight W (",
39-
# size(μ₀.w),
40-
# ") and X (",
41-
# size(x),
42-
# ") do not match",
38+
# "Number of dimensions of prior weight W (",
39+
# size(μ₀.w),
40+
# ") and X (",
41+
# size(x),
42+
# ") do not match",
4343
# )
4444
return dot.(x, Ref(μ₀.w)) .+ first(μ₀.b)
4545
end

src/models/MOSVGP.jl

Lines changed: 40 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,15 @@
44
Multi-Output Sparse Variational Gaussian Process
55
66
## Arguments
7-
- `X::AbstractArray` : : Input features, if `X` is a matrix the choice of colwise/rowwise is given by the `obsdim` keyword
8-
- `y::AbstractVector{<:AbstractVector}` : Output labels, each vector corresponds to one output dimension
97
- `kernel::Union{Kernel,AbstractVector{<:Kernel}` : covariance function or vector of covariance functions, can be either a single kernel or a collection of kernels for multiclass and multi-outputs models
10-
- `likelihood::Union{AbstractLikelihood,Vector{<:Likelihood}` : Likelihood or vector of likelihoods of the model. For compatibilities, see [`Likelihood Types`](@ref likelihood_user)
8+
- `likelihoods::Union{AbstractLikelihood,Vector{<:Likelihood}` : Likelihood or vector of likelihoods of the model. For compatibilities, see [`Likelihood Types`](@ref likelihood_user)
119
- `inference` : Inference for the model, for compatibilities see the [`Compatibility Table`](@ref compat_table))
1210
- `nLatent::Int` : Number of latent GPs
1311
- `nInducingPoints` : number of inducing points, or collection of inducing points locations
1412
1513
## Keyword arguments
1614
- `verbose::Int` : How much does the model print (0:nothing, 1:very basic, 2:medium, 3:everything)
17-
- `optimiser` : Optimiser used for the kernel parameters. Should be an Optimiser object from the [Flux.jl](https://github.com/FluxML/Flux.jl) library, see list here [Optimisers](https://fluxml.ai/Flux.jl/stable/training/optimisers/) and on [this list](https://github.com/theogf/AugmentedGaussianProcesses.jl/tree/master/src/inference/optimisers.jl). Default is `ADAM(0.001)`
15+
- `optimiser` : Optimiser used for the kernel parameters. Should be an Optimiser object from the [Optimisers.jl](https://github.com/FluxML/Optimisers.jl) library. Default is `ADAM(0.001)`
1816
- `Zoptimiser` : Optimiser used for the inducing points locations
1917
- `Aoptimiser` : Optimiser used for the mixing parameters.
2018
- `atfrequency::Int=1` : Choose how many variational parameters iterations are between hyperparameters optimization
@@ -23,64 +21,46 @@ Multi-Output Sparse Variational Gaussian Process
2321
"""
2422
mutable struct MOSVGP{
2523
T<:Real,
26-
TLikelihood<:AbstractLikelihood,
24+
TLikelihood,
2725
TInference<:AbstractInference,
28-
TData<:AbstractDataContainer,
29-
N,
30-
Q,
31-
} <: AbstractGPModel{T,TLikelihood,TInference,N}
32-
data::TData
33-
nFeatures::Vector{Int64} # Number of features of the GP (equal to number of points)
34-
nf_per_task::Vector{Int64}
26+
N, # Number of tasks
27+
Q, # Number of latent GPs
28+
} <: AbstractGPModel{T,AbstractLikelihood,TInference,N}
29+
nf_per_task::NTuple{N,Int}
3530
f::NTuple{Q,SparseVarLatent}
36-
likelihood::Vector{TLikelihood}
31+
likelihood::TLikelihood
3732
inference::TInference
3833
A::Vector{Vector{Vector{T}}}
3934
A_opt::Any
40-
verbose::Int64
41-
atfrequency::Int64
35+
verbose::Int
36+
atfrequency::Int
4237
trained::Bool
4338
end
4439

4540
function MOSVGP(
46-
X::AbstractArray,
47-
y::AbstractVector{<:AbstractVector},
4841
kernel::Union{Kernel,AbstractVector{<:Kernel}},
49-
likelihood::Union{AbstractLikelihood,AbstractVector{<:AbstractLikelihood}},
42+
likelihoods::Union{
43+
AbstractVector{<:AbstractLikelihood},Tuple{Vararg{<:AbstractLikelihood}}
44+
},
5045
inference::AbstractInference,
51-
nLatent::Int,
52-
nInducingPoints::Union{Int,AbstractVector};
46+
Zs::AbstractVector;
5347
verbose::Int=0,
5448
atfrequency::Int=1,
5549
mean::Union{<:Real,AbstractVector{<:Real},PriorMean}=ZeroMean(),
56-
variance::Real=1.0,
5750
optimiser=ADAM(0.01),
5851
Aoptimiser=ADAM(0.01),
5952
Zoptimiser=false,
60-
obsdim::Int=1,
53+
T::DataType=Float64,
6154
)
62-
@assert length(y) > 0 "y should not be an empty vector"
63-
nTask = length(y)
55+
likelihoods = likelihoods isa AbstractVector ? tuple(likelihoods...) : likelihoods
6456

65-
X, T = wrap_X(X, obsdim)
66-
67-
likelihoods = if likelihood isa AbstractLikelihood
68-
likelihoods = [deepcopy(likelihood) for _ in 1:nTask]
69-
else
70-
likelihood
71-
end
72-
73-
nf_per_task = zeros(Int64, nTask)
74-
corrected_y = Vector(undef, nTask)
75-
for i in 1:nTask
76-
corrected_y[i], nf_per_task[i], likelihoods[i] = check_data!(y[i], likelihoods[i])
77-
end
57+
n_task = length(likelihoods)
58+
nf_per_task = n_latent.(likelihoods)
7859

7960
inference isa AnalyticVI || error("The inference object should be of type `AnalyticVI`")
80-
all(implemented.(likelihood, Ref(inference))) ||
81-
error("The $likelihood is not compatible or implemented with the $inference")
82-
83-
data = wrap_data(X, corrected_y)
61+
all(implemented.(likelihoods, Ref(inference))) || error(
62+
"One (or more) of the likelihoods $likelihoods are not compatible or implemented with the $inference",
63+
)
8464

8565
if mean isa Real
8666
mean = ConstantMean(mean)
@@ -92,74 +72,37 @@ function MOSVGP(
9272
optimiser = optimiser ? ADAM(0.01) : nothing
9373
end
9474

75+
if isa(Zoptimiser, Bool)
76+
Zoptimiser = Zoptimiser ? ADAM(0.001) : nothing
77+
end
78+
9579
if isa(Aoptimiser, Bool)
9680
Aoptimiser = Aoptimiser ? ADAM(0.01) : nothing
9781
end
9882

9983
kernel = if kernel isa Kernel
100-
[kernel]
84+
(kernel,)
10185
else
102-
length(kernel) == nLatent ||
86+
length(kernel) == n_task ||
10387
error("Number of kernels should be equal to the number of tasks")
10488
kernel
10589
end
106-
nKernel = length(kernel)
107-
108-
nInducingPoints =
109-
if nInducingPoints isa AbstractVector{<:AbstractVector{<:AbstractVector}}
110-
nInducingPoints
111-
elseif nInducingPoints isa AbstractVector{<:AbstractVector}
112-
[deepcopy(nInducingPoints) for _ in 1:nLatent]
113-
elseif nInducingPoints isa Int
114-
Zref = InducingPoints(KMeansAlg(nInducingPoints), X)
115-
[deepcopy(Zref) for _ in 1:nLatent]
116-
end
117-
118-
nFeatures = size.(Z, 1)
119-
120-
_nMinibatch = nSamples(data)
121-
if is_stochastic(inference)
122-
0 < nMinibatch(inference) < nSamples || error(
123-
"The size of mini-batch $(nMinibatch(inference)) is incorrect (negative or bigger than number of samples), please set nMinibatch correctly in the inference object",
124-
)
125-
_nMinibatch = nMinibatch(inference)
90+
91+
n_kernel = length(kernel)
92+
93+
num_latent = length(Zs)
94+
95+
latent_f = ntuple(num_latent) do i
96+
SparseVarLatent(T, Zs[i], kernel[mod1(i, n_kernel)], mean, optimiser, Zoptimiser)
12697
end
12798

128-
latent_f = ntuple(
129-
i -> _SVGP{T}(
130-
nFeatures[i],
131-
_nMinibatch,
132-
Z[mod(i, nLatent) + 1],
133-
kernel[mod(i, nKernel) + 1],
134-
mean,
135-
optimiser,
136-
Zoptimiser,
137-
),
138-
nLatent,
139-
)
99+
function normalize(x)
100+
return x / sqrt(sum(abs2, x))
101+
end
102+
A = [[normalize(randn(T, num_latent)) for i in 1:nf_per_task[j]] for j in 1:n_task]
140103

141-
A = [
142-
[x -> x / sqrt(sum(abs2, x))(randn(T, nLatent)) for i in 1:nf_per_task[j]] for
143-
j in 1:nTask
144-
]
145-
146-
likelihoods .=
147-
init_likelihood.(likelihoods, inference, nf_per_task, _nMinibatch, nFeatures)
148-
xview = view_x(data, collect(range(1, _nMinibatch; step=1)))
149-
yview = view_y(likelihood, data, 1:nSamples(data))
150-
inference = tuple_inference(
151-
inference, nLatent, nFeatures, nSamples(data), _nMinibatch, xview, yview
152-
)
153104

154-
return MOSVGP{T,eltype(likelihoods),typeof(inference),nTask,nLatent}(
155-
X,
156-
corrected_y,
157-
nSamples,
158-
nDim,
159-
nFeatures,
160-
nLatent,
161-
nX,
162-
nTask,
105+
return MOSVGP{T,typeof(likelihoods),typeof(inference),n_task,num_latent}(
163106
nf_per_task,
164107
latent_f,
165108
likelihoods,
@@ -181,6 +124,6 @@ end
181124

182125
@traitimpl IsMultiOutput{MOSVGP}
183126

184-
nOutput(::MOSVGP{<:Real,<:AbstractLikelihood,<:AbstractInference,N,Q}) where {N,Q} = Q
127+
n_output(::MOSVGP{T,L,I,N,Q}) where {T,L,I,N,Q} = Q
185128
Zviews(m::MOSVGP) = Zview.(m.f)
186129
objective(m::MOSVGP, state, y) = ELBO(m, state, y)
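
Putting the new constructor signature together: the model is now built from kernels, likelihoods, an inference object, and the inducing-point locations only; the training data is no longer a constructor argument. A hedged construction sketch follows; the use of `ColVecs` for each element of `Zs` and the toy sizes are my assumptions, not something this diff pins down:

using AugmentedGaussianProcesses, KernelFunctions

kernel = SqExponentialKernel()
likelihoods = (GaussianLikelihood(), GaussianLikelihood())  # one likelihood per task
inference = AnalyticVI()
Zs = [ColVecs(rand(2, 10)) for _ in 1:2]  # one set of 10 inducing points (2-D inputs) per latent GP

model = MOSVGP(kernel, likelihoods, inference, Zs; verbose=1)

The data is then presumably supplied at training time (e.g. to `train!`) rather than to the constructor, but that call is not part of this diff.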
