diff --git a/NEWS.md b/NEWS.md index c46f64194..935584ffa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +- Additional methods for pre-allocated result arrays and `*Config` instances have been added to the ForwardDiff extension. [#871]. + MixedModels v5.2.0 Release Notes ============================== - The use of the `wts` keyword argument has been deprecated in favor of the keyword argument `weights`, in line with the deprecation in GLM.jl v1.9.1. The usage (and subsequent interpretation) remains otherwise unchanged. [#873] @@ -714,4 +716,5 @@ Package dependencies [#864]: https://github.com/JuliaStats/MixedModels.jl/issues/864 [#865]: https://github.com/JuliaStats/MixedModels.jl/issues/865 [#867]: https://github.com/JuliaStats/MixedModels.jl/issues/867 +[#871]: https://github.com/JuliaStats/MixedModels.jl/issues/871 [#873]: https://github.com/JuliaStats/MixedModels.jl/issues/873 diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index c3263e620..149bfe355 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -30,7 +30,10 @@ using LinearAlgebra: LinearAlgebra, using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file -using ForwardDiff: ForwardDiff +using ForwardDiff: ForwardDiff, + Chunk, + GradientConfig, + HessianConfig using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -59,6 +62,16 @@ const FORWARDDIFF = """ should be included is currently still being decided. """ +##### +##### Gradients +##### + +function ForwardDiff.GradientConfig( + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) +) where {T} + return GradientConfig(fd_deviance(model), x, chunk) +end + """ ForwardDiff.gradient(model::LinearMixedModel) @@ -68,9 +81,29 @@ values. $(FORWARDDIFF) """ function ForwardDiff.gradient( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T,CHK} + return ForwardDiff.gradient!(similar(model.θ), model, θ, cfg, check) +end + +function ForwardDiff.gradient!(result::AbstractArray, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T,CHK} + return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) +end + +##### +##### Hessians +##### + +function ForwardDiff.HessianConfig( + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) ) where {T} - return ForwardDiff.gradient(fd_deviance(model), θ) + return HessianConfig(fd_deviance(model), x, chunk) end """ @@ -82,9 +115,20 @@ values. 
$(FORWARDDIFF) """ function ForwardDiff.hessian( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ -) where {T} - return ForwardDiff.hessian(fd_deviance(model), θ) + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T,CHK} + n = length(θ) + return ForwardDiff.hessian!(Matrix{T}(undef, n, n), model, θ, cfg, check) +end + +function ForwardDiff.hessian!(result::AbstractArray, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T,CHK} + return ForwardDiff.hessian!(result, fd_deviance(model), θ, cfg, check) end ##### diff --git a/gradients/.gitignore b/gradients/.gitignore new file mode 100644 index 000000000..7d064f72a --- /dev/null +++ b/gradients/.gitignore @@ -0,0 +1,4 @@ +*.html +*\~ +*.swp + diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd new file mode 100644 index 000000000..ac652277d --- /dev/null +++ b/gradients/GradientEvaluation.qmd @@ -0,0 +1,481 @@ +--- +title: "Gradient of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. +format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +A comparison of algorithms for estimation of variance components given in the supplemental materials for @Zhou03042019 shows the Fisher scoring algorithm taking the fewest iterations to convergence compared to an EM algorithm and the minorization-maximization (MM) algorithm presented in that paper. +The model being simulated in @Zhou03042019, sec 3.2 is relatively simple, with random effects for two factors and their interaction in a balanced crossed design. + +The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and in [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. +The optimization itself is performed by a derivative-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. + +Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. + +The purpose of this note is to provide an alternative derivation for the gradient of the profiled log-likelihood and of the REML criterion for linear mixed-effects models, along with concise algorithms for evaluation of these gradients. 
+ +### Model definition and evaluation of the objective and gradient + +The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as + +$$ +\begin{aligned} + \mathbfcal{B}&\sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma})\\ + (\mathbfcal{Y}|\mathbfcal{B}=\mathbf{b})& + \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right) +\end{aligned} +$$ {#eq-dists} + +where $\mathbf{X}$ is an $n\times p$ model matrix for the fixed-effects parameter vector, $\boldsymbol{\beta}$, and $\mathbf{Z}$ is an $n\times q$ model matrix for the random effects, $\mathbf{b}$. +Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is positive semi-definite. +We express it as + +$$ +\boldsymbol{\Sigma} = \sigma^2 +\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} +$$ {#eq-Sigma} + +for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. + +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower-triangular Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship + +$$ +\begin{aligned} +\boldsymbol{\Omega_\theta}&= +\begin{bmatrix} +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}& +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} & +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top y}\\ +\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{X^\top X} & +\mathbf{X^\top y}\\ +\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{y^\top X} & +\mathbf{y^\top y}\\ +\end{bmatrix}\\ +&=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\ +&= +\begin{bmatrix} +\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ +\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ +\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} +\end{bmatrix} +\begin{bmatrix} +\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ +\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ +\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} +\end{bmatrix}^\top +\end{aligned} +$$ {#eq-blockedOmega} + +where the diagonal elements of $\mathbf{L}_\theta$ are chosen to be positive. +(It is assumed that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.) + +(In `MixedModels.jl` the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) + +As shown in @bates2025mixed, the objective to be optimized, on the scale of the deviance, which is negative twice the profiled log-likelihood, can be expressed as + +$$ +\begin{aligned} +-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})&= +\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right]\\ +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c_\ell\\ +&=2\sum_{j=1}^q\log L_{j,j} + 2n \log L_{q+p+1,q+p+1} + c_\ell +\end{aligned} +$$ {#eq-objective} + +where $c_\ell$ is a constant. 
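For a fitted model this identity is easy to check numerically; a minimal sketch (not evaluated here), assuming a `LinearMixedModel` `m` that has been fit by maximum likelihood, with `sparseL` extracting the full blocked Cholesky factor as in the examples below:

```{julia}
#| eval: false
using LinearAlgebra, MixedModels

# `m` is a placeholder for a previously fitted LinearMixedModel (ML fit, not REML)
d = diag(sparseL(m; full=true))    # diagonal of the blocked Cholesky factor L
n = nobs(m)                        # number of observations
q = length(d) - length(m.β) - 1    # number of columns of Z
c = n * (1 + log(2π / n))          # the constant c_ℓ from the first line above
2sum(log, d[1:q]) + 2n * log(last(d)) + c ≈ objective(m)
```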
+That is, the objective is an affine function (a linear function plus a constant) of the logarithms of the diagonal elements of $\mathbf{L}_\boldsymbol{\theta}$. +It happens that the gradient of the objective, as a function of $\boldsymbol{\theta}$, expressed in this form is straightforward to evaluate, as shown in @sec-Cholesky_derivative. + +As shown in @bates.maechler.etal:2015, sec 3.4 the REML criterion, which some prefer for parameter estimation, can be written as + +$$ +\begin{aligned} +-2\mathcal{L}_R(\boldsymbol{\theta}|\mathbf{y})&= +\log\left(\left|\mathbf{L_{ZZ}}\right|^2\left|\mathbf{L_{XX}}\right|^2\right) + (n-p) \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n-p}\right)\right]\\ +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + \log\left|\mathbf{L_{XX}}\right|^2 + (n-p)\log\ell^2_{\mathbf{yy}} + c_r\\ +&=2\sum_{j=1}^{q+p}\log L_{j,j} + 2(n-p)\log L_{q+p+1,q+p+1} + c_r +\end{aligned} +$$ {#eq-objective} + +where $c_r$ is likewise a constant. +This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_{ZZ}}$. + +### General expressions for differentiating a Cholesky factor {#sec-Cholesky_derivative} + +@murray2016differentiation, section 3.1, provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega. + +Repeating his derivation, with minor changes in notation, we express the relationship between the infinitesimals $d\boldsymbol{\Omega}$ and $d\mathbf{L}$ as + +$$ +d\boldsymbol{\Omega}=d\mathbf{L}\mathbf{L}^\top + \mathbf{L} d\mathbf{L}^\top +$$ {#eq-infinitesimal} + +Pre-multiplying @eq-infinitesimal by $\mathbf{L}^{-1}$ and post-multiplying by $\mathbf{L}^{-\top}$ gives + +$$ +\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}=\mathbf{L}^{-1}d\mathbf{L} + d\mathbf{L}^\top\mathbf{L}^{-\top} +$$ {#eq-LOmegaLT} + +The first addend on the right-hand side of @eq-LOmegaLT is lower triangular and the second addend is the transpose of the first. +Thus, the diagonal of the left-hand side is exactly the result we wish to evaluate, twice the infinitesimal of the logarithms of the diagonal elements of $\mathbf{L}$. + +For completeness, we provide the conclusion of the derivation in @murray2016differentiation but we don't need the more general result of $d\mathbf{L}$ - we only need the particular result from the left-hand side of @eq-LOmegaLT. + +To evaluate $d\mathbf{L}$ we must isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, on the right-hand side of @eq-LOmegaLT, which we do with the $\Phi$ transformation applied to a symmetric matrix. +This transformation preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle. +Applied to the right-hand side of @eq-LOmegaLT, the $\Phi$ transformation isolates the first addend, providing + +$$ +\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right)=\mathbf{L}^{-1}d\mathbf{L} +$$ {#eq-Phi_dOmega} + +or + +$$ +d\mathbf{L}=\mathbf{L}\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right) +$$ {#eq-dL} + +## Examples + +To aid in understanding the structure of these equations we consider the structure of the various matrices and their blocks in some simple examples. 
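Before turning to specific data sets, note that the $\Phi$ transformation and @eq-dL translate directly into code; a minimal sketch (the helper names `Phi` and `dL` are ours, not part of MixedModels.jl):

```{julia}
#| eval: false
using LinearAlgebra

# Illustrative helpers, not part of MixedModels.jl
# Φ: keep the strict lower triangle, halve the diagonal, zero the strict upper triangle
function Phi(M::AbstractMatrix)
    P = tril(Matrix(M))    # lower triangle, including the diagonal
    for i in axes(P, 1)
        P[i, i] /= 2       # halve the diagonal elements
    end
    return P
end

# dL = L Φ(L⁻¹ dΩ L⁻ᵀ) from @eq-dL; only diag(L⁻¹ dΩ L⁻ᵀ) is needed for the gradient
dL(L::LowerTriangular, dΩ::AbstractMatrix) = L * Phi(L \ (dΩ / L'))
```

In the gradient evaluations that follow only the diagonal of $\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}$ is used, so `dL` itself is never formed.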
+ +Load the packages to be used + +```{julia} +#| label: load_packages +#| warning: false +#| output: false +using CairoMakie +using FiniteDiff +using LinearAlgebra +using MixedModels +using MixedModelsDatasets: dataset +using TypedTables: Table + +const progress = isinteractive() # suppress progress bars in non-interactive sessions +CairoMakie.activate!(type="svg") # use svg graphics output +``` + +### Dyestuff - a single, scalar random-effects term + +The `dyestuff` data set provides the yield of dyestuff in each of 5 samples from each of 6 batches of an intermediate product in the process of producing a dye. + +```{julia} +#| label: dyestuff_data +dyestuff = Table(dataset(:dyestuff)) +``` + +A mixed-effects model for these data includes an overall "intercept" term (whose estimate will be the sample mean because of the balanced design) and random effects for each level of `batch`. + +```{julia} +#| label: dyestuff_model +m01 = fit(MixedModel, @formula(yield ~ 1 + (1|batch)), dyestuff; progress) +θ = m01.θ # retain a copy of the estimate of θ +print(m01) +``` + +#### The objective as a function of $\theta_1$ + +@fig-obj_graph shows the objective (negative twice the log-likelihood) of this model as a function of $\theta_1$, the relative covariance parameter. + +```{julia} +#| label: fig-obj_graph +#| fig-cap: "Graph of the objective for model m01 as a function of θ₁" +#| code-fold: true +let f = Figure() + ax = Axis(f[1,1], xlabel="θ₁", ylabel="objective") + lines!(ax, -0.25..1.5, objective!(m01)) + f +end +``` + +Notice that the objective is well-defined for negative values of $\theta_1$ and that it is an even function, in the sense that $f(-\theta_1)=f(\theta_1)\,\forall\theta_1$. + +This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. + +At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$ + +```{julia} +#| label: dyestuff_theta +only(θ) +``` + +the derivative should also be zero (in practice, close to zero). + +We can see from @fig-obj_graph that the derivative at $\theta_1=1.$ will be positive. + +To show the evaluation of the gradient at $\boldsymbol{\theta}=[1.]$ we reset the parameter in the model object to $[1.]$ + +```{julia} +#| output: false +updateL!(setθ!(m01, ones(1))); +``` + +#### Evaluating the gradient terms + +In `MixedModels.jl` the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of + +```{julia} +ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) +Int.(ZXy) # Int for more concise printing +``` + +is stored as the $\mathbf{A}$ property of the model object. + +Both $\mathbf{A}$ and $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern + +```{julia} +BlockDescription(m01) +``` + +In @eq-Sigma the matrix $\boldsymbol{\Lambda_\theta}$ is defined as a $6\times6$ diagonal matrix. +Here, for convenience, we extend it to an $8\times8$ matrix with a trailing diagonal block that is the identity, so that multiplication by $\boldsymbol{\Lambda_\theta}$ applies to the full matrix $\mathbf{A}$. + +```{julia} +Λ(θ) = Diagonal(vcat(fill(only(θ), 6), ones(eltype(θ), 2))) +Λ([1.]) +``` + +The derivative of $\Lambda$ with respect to the first (and only) element of $\boldsymbol\theta$ is + +```{julia} +dΛdθ1 = Diagonal(vcat(ones(6), zeros(2))) +``` + +The matrix $\mathbf{A}$ from which $\boldsymbol{\Omega_\theta}$ is generated is stored in blocks. 
+We assemble these into a sparse matrix as + +```{julia} +A = sparse(hvcat(2, first(m01.A), m01.A[2]', m01.A[2], last(m01.A))) +``` + +and check that these blocks are indeed the Gram matrix of the columns of $\mathbf{[ZXy]}$ +```{julia} +A == ZXy'ZXy +``` + +The matrix $\boldsymbol{\Omega_\theta}$ is + +```{julia} +Ω(θ) = Λ(θ) * A * Λ(θ)' + Diagonal(vcat(ones(6), zeros(2))) +Ω([1.]) +``` + +The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix as + +```{julia} +L = LowerTriangular(sparseL(m01; full=true)) +``` + +We can check that it is indeed the lower Cholesky factor + +```{julia} +L * L' ≈ Ω([1.]) +``` + +The derivative of $\boldsymbol{\Omega}$ with respect to $\theta$ is + +```{julia} +dΩdθ1(θ) = dΛdθ1 * A * Λ(θ)' + Λ(θ) * A * dΛdθ1 +dΩdθ1([1.]) +``` + +Notice that this matrix, like $\mathbf{A}$, is symmetric and has the same block structure as $\mathbf{A}$. +In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. + +Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to + +```{julia} +prePhi = rdiv!(ldiv!(L, dΩdθ1([1.])), L') +``` + +The derivative of the objective at $\theta_1=1$ is, therefore + +```{julia} +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) +``` + +which we can compare to the finite-difference evaluation + +```{julia} +FiniteDiff.finite_difference_gradient(objective!(m01), [1.]) +``` + +If we repeat these steps at the parameter estimate we have +```{julia} +#| output: false +updateL!(setθ!(m01, θ)) # reset the value of θ in the model +L = LowerTriangular(sparseL(m01; full=true)) +prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') +``` + +with the derivative being evaluated as + +```{julia} +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) +``` + +### Penicillin - two completely crossed scalar random-effects terms + +The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. + +```{julia} +#| label: penicillin_data +penicillin = Table(dataset(:penicillin)) +``` + +We construct a `LinearMixedModel` struct with a single fixed-effect parameter, representing the average diameter in the balanced design, and random effects for each `plate` and each `sample`, + +```{julia} +#| label: m02 +#| output: false +#| warn: false +m02 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +θ = m02.θ +print(m02) +``` + +for which the concatenated matrix $\left[\mathbf{ZXy}\right]$ is + +```{julia} +#| label: m02ZXy +Int.(hcat(collect(first(m02.reterms)), collect(last(m02.reterms)), m02.X, m02.y)) +``` + +in which the first 24 columns are the indicators for `plate`, the next 6 columns are the indicators for `sample`, the second-to-last column is the single column of the fixed-effects model matrix, $\mathbf{X}$, and the last column is $\mathbf{y}$. 
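The sequence used for the dyestuff model above (solve with $\mathbf{L}$, solve with $\mathbf{L}^\top$, take the diagonal, and form a weighted sum) is the same for every scalar parameter, so it can be collected into a small helper; a minimal sketch (the name `grad_component` and its argument list are ours, not part of MixedModels.jl):

```{julia}
#| eval: false
using LinearAlgebra

# Illustrative helper, not part of MixedModels.jl.
# One component of the gradient of the ML objective: L is the full lower Cholesky factor,
# dΩ the derivative of Ω with respect to one element of θ, q the number of columns of Z,
# p the number of columns of X, and n the number of observations.
function grad_component(L::LowerTriangular, dΩ::AbstractMatrix, q, p, n)
    d = diag(rdiv!(ldiv!(L, Matrix(dΩ)), L'))    # diagonal of L⁻¹ dΩ L⁻ᵀ
    return dot(d, vcat(ones(q), zeros(p), n))    # weights: 1 for Z rows, 0 for X rows, n for ℓ_yy
end
```

For model `m01`, `grad_component(L, dΩdθ1(θ), 6, 1, size(dyestuff, 1))` reproduces the dot product evaluated above, and the same weighted sum appears again, written out explicitly, for each of the two components below.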
+ +The Cholesky factor, $\mathbf{L}$, at the initial value $\boldsymbol\theta=\left[1,1\right]^\top$, can be expressed as a lower-triangular sparse matrix as + +```{julia} +#| label: m02L +Lsparse = LowerTriangular(sparseL(updateL!(setθ!(m02, ones(2))); full=true)) +``` + +In practice, the $\mathbf{L}$ matrix is stored in a blocked form + +```{julia} +#| label: m02_blocks +BlockDescription(m02) +``` + +from which the profiled objective (negative twice the log-likelihood) can be evaluated as + +```{julia} +#| label: m02L_initial_objective +objective(m02) +``` + +#### Evaluating terms in the gradient + +Again, we create the full Gram matrix $\mathbf{A}$ from the blocks stored in the `A` property of the model + +```{julia} +A2 = let blk = m02.A + hvcat(3, first(blk), blk[2]', blk[4]', blk[2], blk[3], blk[5]', blk[4], blk[5], blk[6]) +end +``` + +The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol(\theta))$ is + +```{julia} +function Λ2(θ::Vector{Float64}) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6), ones(2))) +end +Λ2([1.,1.]) +``` + +producing $\boldsymbol{\Omega}$ as + +```{julia} +function Ω2(θ) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Λ2(θ) * A2 * Λ2(θ)' + Diagonal(vcat(ones(30), zeros(2))) +end +Ω2([1.,1.]) +``` + +The partial derivatives of `Ω2` are constants + +```{julia} +#| output: false +dΛ2dθ1 = Diagonal(vcat(ones(24), zeros(8))) +dΛ2dθ2 = Diagonal(vcat(zeros(24), ones(6), zeros(2))) +``` + +For the ML objective (i.e. negative twice the log-likelihood) a finite difference gradient at the initial $\boldsymbol{\theta}=[1,1]^\top$ is + +```{julia} +FiniteDiff.finite_difference_gradient(objective!(m02), ones(2)) +``` + +To evaluate this quantity from the formula we create + +```{julia} +dΩ2dθ1(θ) = dΛ2dθ1 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ1' +Int.(dΩ2dθ1([1., 1.])) # Int to save space when printing +``` + + +```{julia} +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ1([1.,1.])), Lsparse') +``` + +yielding the first component of the gradient as + +```{julia} +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) +``` + +For the second component of the gradient + +```{julia} +dΩ2dθ2(θ) = dΛ2dθ2 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ2' +Int.(dΩ2dθ2([1., 1.])) +``` + +```{julia} +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ2([1.,1.])), Lsparse') +``` + +```{julia} +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) +``` + +### References {.unnumbered} + +::: {#refs} +::: diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd new file mode 100644 index 000000000..d3167307f --- /dev/null +++ b/gradients/Gradient_based_optimization.qmd @@ -0,0 +1,190 @@ +--- +title: "Gradient-based Optimization of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. 
+format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +Before devoting too much effort to efficient evaluation of the gradient of the profiled log-likelihood, we should check if using gradient-based optimization requires sufficiently fewer evaluations of the objective, and the gradient, than does derivative-free optimization. + +Here we fit a few models using automatic differentiation from [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) and the `ForwardDiff` extension to [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) to optimize the profiled log-likelihood with the `LD_LBFGS` optimizer from [NLopt.jl](https://github.com/jump-dev/NLopt.jl), instead of the default `LN_NEWUOA` which does not use gradients. + +The results are more-or-less a toss-up when using `ForwardDiff` to evaluate the gradient. +A more efficient evaluation of the gradient, taking advantage of the sparse-blocked structure of the Cholesky factorization to evaluate the profiled log-likelihood, may tip the balance in favor of gradient-based methods. + +## Preliminaries {#sec-prelim} + +Load the packages to be used + +```{julia} +#| label: load_packages +using ForwardDiff +using MixedModels +using MixedModels: fd_deviance +using MixedModelsDatasets: dataset +using NLopt +using Tables: table +using TypedTables: Table + +const progress = false +``` + +## Examples {#sec-examples} + +We create a function to take a `LinearMixedModel` that has been fit and refit it using the `:LD_LBFGS` optimizer applied to an objective function that evaluates the gradient using `ForwardDiff`. + +```{julia} +addinds(ch::Char, n::Integer) = Symbol.(lpad.(string.(ch, Base.OneTo(n)), ndigits(n), '0')) +function gr_refit!(m::LinearMixedModel{T}) where {T} + θ = copy(m.optsum.initial) + k = length(θ) + fitlog = sizehint!(T[], 50 * k) + grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) + function obj(θ::Vector{Float64}, grad::Vector{Float64}) + val = objective(updateL!(setθ!(m, θ))) + push!(fitlog, val) + append!(fitlog, θ) + if !isempty(grad) + ForwardDiff.gradient!(grad, m, θ, grad_config) + append!(fitlog, grad) + else + append!(fitlog, fill(NaN, k)) # never called with empty grad but just in case + end + return val + end + opt = NLopt.Opt(:LD_LBFGS, k) + NLopt.ftol_rel!(opt, 1.e-12) + NLopt.ftol_abs!(opt, 1.e-8) + NLopt.min_objective!(opt, obj) + min_f, min_x, ret = NLopt.optimize(opt, θ) + header = vcat([:obj], addinds('θ', k), addinds('g', k)) + return Table(table(transpose(reshape(fitlog, 2k + 1, :)); header)) +end +``` + +### Penicillin data {#sec-penicillin} + +Define a model for the `penicillin` data + +```{julia} +#| label: m1 +m1 = fit( + MixedModel, + @formula(diameter ~ 1 + (1|plate) + (1|sample)), + dataset(:penicillin); + progress, +) +print(m1) +``` + +for which the optimization summary is + +```{julia} +m1.optsum +``` + +and refit the model using ForwardDiff gradient evaluations. 
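The in-place methods that `gr_refit!` relies on can also be called directly; a minimal sketch (not evaluated here) using the `GradientConfig` and `ForwardDiff.gradient!` methods provided by the extension, applied to the model `m1` fit above:

```{julia}
#| eval: false
cfg = ForwardDiff.GradientConfig(m1, m1.θ)    # reusable configuration for this model and parameter length
g = similar(m1.θ)                             # pre-allocated result vector
ForwardDiff.gradient!(g, m1, m1.θ, cfg)       # gradient of the profiled deviance at the current θ
```

The refit with `gr_refit!` follows.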
+ +```{julia} +fitlog = gr_refit!(m1) +``` + +The objective at convergence is + +```{julia} +last(fitlog.obj) +``` + +and the last few evaluations are + +```{julia} +last(fitlog, 5) +``` + +### Sleepstudy {#sec-sleepstudy} + +```{julia} +m2 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) +print(m2) +``` + +```{julia} +m2.optsum +``` + +```{julia} +fitlog = gr_refit!(m2) +``` + +```{julia} +last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` + +### Kronmueller-Barr 2007 {#sec-kb07} + +```{julia} +# this model is very overparameterized, but it's a test example +m3 = fit( + MixedModel, + @formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), + dataset(:kb07); + progress, +) +print(m3) +``` + +```{julia} +m3.optsum +``` + +Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector + +```{julia} +findall(iszero, m3.θ) +``` + +```{julia} +fitlog = gr_refit!(m3) +``` + +```{julia} +last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` \ No newline at end of file diff --git a/gradients/Project.toml b/gradients/Project.toml new file mode 100644 index 000000000..a05f9e152 --- /dev/null +++ b/gradients/Project.toml @@ -0,0 +1,14 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" +MixedModelsDatasets = "7e9fb7ac-9f67-43bf-b2c8-96ba0796cbb6" +NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" + +[sources] +MixedModels = {path = ".."} diff --git a/gradients/bibliography.bib b/gradients/bibliography.bib new file mode 100644 index 000000000..decf33365 --- /dev/null +++ b/gradients/bibliography.bib @@ -0,0 +1,44 @@ +@article{Zhou03042019, + author = {Hua Zhou and Liuyi Hu and Jin Zhou and Kenneth Lange}, + title = {MM Algorithms for Variance Components Models}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {28}, + number = {2}, + pages = {350--361}, + year = {2019}, + publisher = {ASA Website}, + doi = {10.1080/10618600.2018.1529601}, + note ={PMID: 31592195}, + URL = {https://doi.org/10.1080/10618600.2018.1529601}, + eprint = {https://doi.org/10.1080/10618600.2018.1529601} +} + +@Article{bates.maechler.etal:2015, + author = {Bates, Douglas and Maechler, Martin and Bolker, Benjamin M. 
and Walker, Steven},
+  title = {Fitting Linear Mixed-Effects Models using lme4},
+  doi = {10.18637/jss.v067.i01},
+  number = {1},
+  pages = {1--48},
+  volume = {67},
+  journal = {Journal of Statistical Software},
+  year = {2015},
+}
+
+@article{bates2025mixed,
+  title = {Mixed-model Log-likelihood Evaluation via a Blocked Cholesky Factorization},
+  author = {Bates, Douglas and Alday, Phillip M. and Kokandakar, Ajinkya H.},
+  journal = {arXiv preprint arXiv:2505.11674},
+  year = {2025},
+  url = {https://arxiv.org/pdf/2505.11674}
+}
+
+@article{murray2016differentiation,
+  title = {Differentiation of the Cholesky decomposition},
+  author = {Murray, Iain},
+  journal = {arXiv preprint arXiv:1602.07527},
+  year = {2016},
+  url = {https://arxiv.org/pdf/1602.07527}
+}