From a2cbbaeb2ed655fe2c8940f67519a7ab12899d12 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 28 Dec 2025 22:20:31 -0600 Subject: [PATCH 01/15] write up on gradients --- experiments/.gitignore | 4 + experiments/GradientEvaluation.qmd | 383 ++++++++++++++++++++ experiments/Gradient_based_optimization.qmd | 197 ++++++++++ experiments/Project.toml | 10 + experiments/bibliography.bib | 36 ++ 5 files changed, 630 insertions(+) create mode 100644 experiments/.gitignore create mode 100644 experiments/GradientEvaluation.qmd create mode 100644 experiments/Gradient_based_optimization.qmd create mode 100644 experiments/Project.toml create mode 100644 experiments/bibliography.bib diff --git a/experiments/.gitignore b/experiments/.gitignore new file mode 100644 index 000000000..7d064f72a --- /dev/null +++ b/experiments/.gitignore @@ -0,0 +1,4 @@ +*.html +*\~ +*.swp + diff --git a/experiments/GradientEvaluation.qmd b/experiments/GradientEvaluation.qmd new file mode 100644 index 000000000..776221549 --- /dev/null +++ b/experiments/GradientEvaluation.qmd @@ -0,0 +1,383 @@ +--- +title: "Evaluation of the Gradient of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. +format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +A comparison of algorithms for estimation of variance components given in the supplemental materials for @Zhou03042019 shows the Fisher scoring algorithm taking the fewest iterations to convergence compared to an EM algorithm and the minorization-maximization (MM) algorithm presented in that paper. +The model being simulated in @Zhou03042019, sec 3.2 is relatively simple, with random effects for two factors and their interaction in a balanced crossed design. + +The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. +The optimization itself is performed by a gradient-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. + +Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. + +The purpose of this note is to explore whether these expressions can be implemented effectively, even if just for the variance components model, which, for our purposes, is a model in which all the random effects terms are simple, scalar terms. 
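In `MixedModels.jl` formula notation this means every random-effects term has the form `(1|g)`.
A small sketch, using datasets from `MixedModelsDatasets.jl` purely for illustration (the model names here are arbitrary and the models are constructed but not fitted):

```{julia}
using MixedModels
using MixedModelsDatasets: dataset

# all random-effects terms are simple, scalar (intercept-only) terms: a variance components model
vc = LinearMixedModel(@formula(yield ~ 1 + (1 | batch)), dataset(:dyestuff))

# the vector-valued term (1 + days | subj) puts this model outside the class considered here
notvc = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days | subj)), dataset(:sleepstudy))

map(length, (vc.θ, notvc.θ))  # one θ element for the scalar term versus three for the 2×2 factor
```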
+ +### Expressions for the gradient terms + +The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as + +$$ +\begin{aligned} + (\mathbfcal{Y}|\mathbfcal{B}=\mathbf{b})& + \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right)\\ + \mathbfcal{B}& + \sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma}) +\end{aligned} +$$ {#eq-dists} + +where $\mathbf{X}$ is an $n\times p$ model matrix for the fixed-effects parameter vector, $\boldsymbol{\beta}$, and $\mathbf{Z}$ is an $n\times q$ model matrix for the random effects, $\mathbf{b}$. +Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is positive semi-definite. +We express it as + +$$ +\boldsymbol{\Sigma} = \sigma^2 +\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} +$$ {#eq-Sigma} + +for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. + +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, of + +$$ +\boldsymbol{\Omega_\theta} = +\begin{bmatrix} +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}& +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} & +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top y}\\ +\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{X^\top X} & +\mathbf{X^\top y}\\ +\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{y^\top X} & +\mathbf{y^\top y}\\ +\end{bmatrix} +$$ {#eq-blockedOmega} + +where $\mathbf{L}_\theta$ has a similar blocked structure + +$$ +\mathbf{L}_\boldsymbol{\theta} = +\begin{bmatrix} +\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ +\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ +\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} +\end{bmatrix} +$$ {#eq-blockedL} + +(In the actual computational methods the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) + +The objective to be optimized is negative twice the profiled log-likelihood, + +$$ +-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y}) = +\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right] +$$ {#eq-objective} + +which is on the scale of the deviance (if we were able to define a deviance for these models). 
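As a concrete check of @eq-objective, the following sketch fits a small model by maximum likelihood and reassembles the objective from `logdet`, which returns $\log\left|\mathbf{L_{ZZ}}\right|^2$, and the penalized residual sum of squares $\ell^2_{\mathbf{yy}}$, assumed here to be available through the `pwrss` accessor (the `nobs` accessor and the `dyestuff` data are likewise used only for illustration):

```{julia}
using MixedModels
using MixedModelsDatasets: dataset

fm = fit(MixedModel, @formula(yield ~ 1 + (1 | batch)), dataset(:dyestuff); progress=false)
n = nobs(fm)  # for an unweighted ML fit the denominator in @eq-objective is n
logdet(fm) + n * (1 + log(2π * MixedModels.pwrss(fm) / n)) ≈ objective(fm)
```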
+ +As shown in @bates.maechler.etal:2015, sec 3.5 the gradient of the first summand in @eq-objective is + +$$ +\begin{aligned} +\nabla\log\left|\mathbf{L_ZZ}\right|^2 &= \nabla\log\left(\left|\mathbf{L_{ZZ}L_{ZZ}}^\top\right|\right)\\ +&=\nabla\log\left(\left|\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right|\right)\\ +&=\operatorname{tr}\left[\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right) +\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right)^{-1}\right]\\ +&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1} +\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right) +\mathbf{L_{ZZ}}^{-\top} +\right]\\ +&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1} +\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right) +\mathbf{L_{ZZ}}^{-\top} +\right] +\end{aligned} +$$ {#eq-delterm1} + +For the models that we wish to consider the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple in that they are block diagonal with a single non-zero diagonal block, which is an identity matrix. + +## Examples + +To aid in understanding the structure of these equations we consider the structure of the various matrices and their blocks in some simple examples. + +Load the packages to be used + +```{julia} +#| label: load_packages +#| warning: false +#| output: false +using FiniteDiff +using LinearAlgebra +using MixedModels +using MixedModelsDatasets: dataset +using TypedTables: Table +``` + +### Penicillin - two completely crossed scalar random-effects terms + +The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. + +```{julia} +#| label: penicillin_data +const penicillin = Table(dataset(:penicillin)) +``` + +We construct a `LinearMixedModel` struct with a single fixed-effect parameter, representing the average diameter in the balanced design, and random effects for each `plate` and each `sample`, + +```{julia} +#| label: m02 +#| output: false +#| warn: false +m02 = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +``` + +for which the concatenated matrix $\left[\mathbf{ZXy}\right]$ is + +```{julia} +#| label: m02ZXy +Int.(hcat(collect(first(m02.reterms)), collect(last(m02.reterms)), m02.X, m02.y)) +``` + +in which the first 24 columns are the indicators for `plate`, the next 6 columns are the indicators for `sample`, the second-to-last column is the single column of the fixed-effects model matrix, $\mathbf{X}$, and the last column is $\mathbf{y}$. 
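A quick sanity check of that layout, as a sketch reusing the objects just created: in this balanced design each `plate` indicator column should sum to 6 and each `sample` indicator column to 24.

```{julia}
ZXy = hcat(collect(first(m02.reterms)), collect(last(m02.reterms)), m02.X, m02.y)
size(ZXy) == (144, 32) &&
    all(==(6), sum(ZXy[:, 1:24]; dims=1)) &&   # plate indicator columns
    all(==(24), sum(ZXy[:, 25:30]; dims=1))    # sample indicator columns
```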
+ +The Cholesky factor, $\mathbf{L}$, at the initial value $\boldsymbol\theta=\left[1,1\right]^\top$, can be expressed as a lower-triangular sparse matrix as + +```{julia} +#| label: m02L +Lsparse = LowerTriangular(sparseL(updateL!(m02); full=true)) +``` + +In practice, the full $\mathbf{L}$ matrix is stored in a blocked form + +```{julia} +#| label: m02_blocks +BlockDescription(m02) +``` + +from which the profiled objective (negative twice the log-likelihood) can be evaluated as + +```{julia} +#| label: m02L_initial_objective +objective(m02) +``` + +#### Evaluating terms in the gradient + +For illustration of the gradient evaluation we create the lower-triangular sparse submatrix $\mathbf{L_{ZZ}}$ as + +```{julia} +#| label: m02LZZ +LZZsparse = LowerTriangular(sparseL(m02)) +``` + +from which $\log\left|\mathbf{L_{ZZ}}\right|^2$ can be evaluated as + +```{julia} +#| label: logdet_m02 +2. * sum(log, diag(LZZsparse)) +``` + +In practice we use the `logdet` function + +```{julia} +#| label: logdet__m02 +logdet(m02) +``` + +which evaluates this quantity from the blocked representation of $\mathbf{L}$. + +A finite-difference approximation to the gradient of the `logdet` at this value of $\boldsymbol{\theta}$ is + +```{julia} +ldfun(x::Vector{Float64}) = logdet(updateL!(setθ!(m02, x))) +FiniteDiff.finite_difference_gradient(ldfun, [1., 1.]) +``` + +The matrix $\mathbf{A_{ZZ}}=\mathbf{Z}^\top\mathbf{Z}$ for this model, as a dense matrix, is + +```{julia} +#| label: denseA +A = Int.(hvcat(2, first(m02.A), m02.A[2]', m02.A[2], m02.A[3])) +``` + +and the first face of $\nabla{\boldsymbol{\Lambda}}$ is + +```{julia} +#| label: nabla_Lambda +nabla1 = Int.(Diagonal(vcat(ones(Int, 24), zeros(Int, 6)))) +``` + +With $\boldsymbol{\Lambda(\theta)}$ being + +```{julia} +Λ(θ) = Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6))) +θ = ones(2) # initial parameter vector +Int.(Λ(θ)) # initial value of Λ +``` + +the first face of $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is + +```{julia} +#| label: symprod +symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 +Int.(symprod) +``` + +producing the matrix whose trace is desired as + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') # overwrites the value of symprod +``` + +yielding the trace as + +```{julia} +sum(diag(symprod)) +``` + +One point to notice here is that the $[1,1]$ block of this matrix is diagonal, with elements of + +```{julia} +((2 * first(θ)) .* first(m02.A).diag) ./ abs2.(first(m02.L).diag) +``` + +which can be used to simplify the evaluation of the first gradient term. +In particular, the gradient of a model with a single, scalar random-effects term is, unsurprisingly, straightforward. + +For the second element of the gradient we define + +```{julia} +nabla2 = Diagonal(vcat(zeros(24), ones(6))) +Int.(nabla2) +``` + +and + +```{julia} +symprod = nabla2 * A * Λ(ones(2)) + Λ(ones(2)) * A * nabla2 +``` + +The matrix whose trace is required is + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +``` + +producing the second element of the gradient of `ldfun` as + +```{julia} +sum(diag(symprod)) +``` + +Notice that the entire $[1,1]$ block of this matrix is zero and will not need to be evaluated explicitly. 
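Wrapping the two computations in a small helper (the name `ld_grad_term` is ours) makes the comparison with the finite-difference gradient of `ldfun` at $\boldsymbol{\theta}=[1,1]^\top$ explicit; this sketch reuses `A`, `Λ`, `nabla1`, `nabla2`, and `LZZsparse` as defined above.

```{julia}
function ld_grad_term(nabla, θvec)
    S = nabla * A * Λ(θvec) + Λ(θvec) * A * nabla  # one face of ∇Λ'Z'ZΛ + Λ'Z'Z∇Λ
    return sum(diag(rdiv!(ldiv!(LZZsparse, Matrix(S)), LZZsparse')))  # tr(L⁻¹ S L⁻ᵀ)
end
analytic = [ld_grad_term(nabla1, ones(2)), ld_grad_term(nabla2, ones(2))]
hcat(analytic, FiniteDiff.finite_difference_gradient(ldfun, ones(2)))  # columns should agree to finite-difference accuracy
```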
+ +We evaluate $\boldsymbol{\hat{\theta}}$ using a derivative-free optimizer as + +```{julia} +θ = refit!(m02).θ +``` + +after which the first face of `symprod` becomes + +```{julia} +symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 +``` + +`LZZsparse` becomes + +```{julia} +LZZsparse = LowerTriangular(sparseL(m02)) +``` + +and the matrix whose trace is required is + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +``` + +yielding the gradient term + +```{julia} +sum(diag(symprod)) +``` + +which can be compared to the finite difference value + +```{julia} +FiniteDiff.finite_difference_gradient(ldfun, θ) +``` + +(Note, this is the first element of the gradient of the `logdet` term only, not the gradient of the objective which is near zero + +```{julia} +FiniteDiff.finite_difference_gradient(objective!(m02), θ) +``` + +as it should be at the optimum.) + +For the second element of the gradient of `ldfun` we have + +```{julia} +symprod = nabla2 * A * Λ(θ) + Λ(θ) * A * nabla2 +``` + +After pre- and post-division by `LZZsparse`, this becomes + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +``` + +yielding the second element of the gradient of `ldfun` as + +```{julia} +sum(diag(symprod)) +``` + +#### Factoring the symmetric matrix + +The matrix $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is symmetric and has the same sparsity structure as $\mathbf{Z^\top Z}$, which is positive semi-definite. +However, it is not clear that the non-zero blocks in $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ will be positive semi-definite in the general case. +In the case of a single variance component it will be positive definite when $\theta_1>0$ because it is $2\theta_1\mathbf{A}$. + + +### References {.unnumbered} + +::: {#refs} +::: diff --git a/experiments/Gradient_based_optimization.qmd b/experiments/Gradient_based_optimization.qmd new file mode 100644 index 000000000..ae2d0a47b --- /dev/null +++ b/experiments/Gradient_based_optimization.qmd @@ -0,0 +1,197 @@ +--- +title: "Gradient-based Optimization of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. +format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +Before devoting too much effort to efficient evaluation of the gradient of the profiled log-likelihood, we should check if using gradient-based optimization requires sufficiently fewer evaluations of the objective, and the gradient, than does derivative-free optimization. 
+ +Here we fit a few models using automatic differentiation from [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) and the `ForwardDiff` extension to [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) to optimize the profiled log-likelihood with the `LD_LBFGS` optimizer from [NLopt.jl](https://github.com/jump-dev/NLopt.jl), instead of the default `LN_NEWUOA` which does not use gradients. + +The results are more-or-less a toss-up when using `ForwardDiff` to evaluate the gradient. +A more efficient evaluation of the gradient, taking advantage of the sparse-blocked structure of the Cholesky factorization to evaluate the profiled log-likelihood, may tip the balance in favor of gradient-based methods. + +## Preliminaries {#sec-prelim} + +Load the packages to be used + +```{julia} +#| label: load_packages +using BenchmarkTools +using ForwardDiff +using MixedModels +using MixedModelsDatasets: dataset +using NLopt +using Tables: table +using TypedTables: Table +``` + +## Examples {#sec-examples} + +### Penicillin data {#sec-penicillin} + +Load the `penicillin` dataset +```{julia} +penicillin = Table(dataset(:penicillin)) +``` + +and define a model + +```{julia} +#| label: const_defs +#| output: false +m = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +θ = copy(m.θ) +k = length(θ) +const fitlog = sizehint!(Vector{Float64}(undef, 0), 200) +``` + +with an NLopt-compatible objective function + +```{julia} +#| label: obj_def +function obj(θ::Vector{Float64}, grad::Vector{Float64}) + val = objective(updateL!(setθ!(m, θ))) + push!(fitlog, val) + append!(fitlog, θ) + if !isempty(grad) + copyto!(grad, ForwardDiff.gradient(m, θ)) + append!(fitlog, grad) + end + return val +end +``` + +A benchmark of evaluating the objective only + +```{julia} +#| label: benchmark_obj +@benchmark objective(updateL!(setθ!($m, $θ))) seconds=1 +``` + +compared to evaluating both the objective and its gradient + +```{julia} +#| label: benchmark_obj_grad +let gr = Vector{Float64}(undef, k) + @benchmark obj($θ, $gr) seconds=1 +end +``` + +Notice that evaluating both the objective and the gradient, using `ForwardDiff.jl`, results in considerably more allocation of storage than does evaluating the objective alone. +(In part this is a reflection of the fact that considerable work has gone into tuning the objective evaluation so that it doesn't allocate a lot of memory.) 
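One way to reduce that overhead, sketched below, is to construct the `ForwardDiff.GradientConfig` once and reuse it, together with a preallocated gradient vector, across evaluations; the sketch reuses the model `m` and parameter vector `θ` defined above, and `fd_deviance` is an internal (unexported) MixedModels helper used by the ForwardDiff extension, so it relies on package internals that may change.

```{julia}
dev = MixedModels.fd_deviance(m)          # internal helper wrapping the deviance as a function of θ
cfg = ForwardDiff.GradientConfig(dev, θ)  # build the dual-number workspace once
gr = similar(θ)
@benchmark ForwardDiff.gradient!($gr, $dev, $θ, $cfg) seconds=1
```

Reusing the configuration avoids rebuilding the dual-number work buffers on every call, which is typically where much of the extra allocation comes from.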
+ +Fitting the model using a derivative-free optimizer gives + +```{julia} +m.optsum.ftol_rel = 1.e-8 # for fair comparison with the gradient-based method +print(refit!(m)) +``` + +for which the optimization summary is + +```{julia} +m.optsum +``` + +The objective at the estimate is + +```{julia} +m.optsum.fmin +``` + +We set up and run the gradient-based optimizer L-BFGS as + +```{julia} +opt = NLopt.Opt(:LD_LBFGS, 2) +NLopt.ftol_rel!(opt, 1.e-8) +NLopt.min_objective!(opt, obj) +empty!(fitlog) +min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +``` + +where the fitlog is + +```{julia} +header = [:obj, :θ₁, :θ₂, :g₁, :g₂] +fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) +``` + +```{julia} +last(fltbl, 3) +``` + +### Sleepstudy {#sec-sleepstudy} + +```{julia} +sleepstudy = Table(dataset(:sleepstudy)) +``` + +```{julia} +m = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), sleepstudy) +θ = copy(m.θ) +@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 +``` + + +```{julia} +@benchmark obj($θ, $(similar(θ))) seconds=1 +``` + +```{julia} +print(refit!(m)) +``` + +```{julia} +m.optsum +``` + +```{julia} +opt = NLopt.Opt(:LD_LBFGS, 3) +NLopt.ftol_rel!(opt, 1.e-8) +NLopt.min_objective!(opt, obj) +empty!(fitlog) +min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +``` + + +```{julia} +header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] +fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) +``` + + +```{julia} +last(fltbl, 3) +``` \ No newline at end of file diff --git a/experiments/Project.toml b/experiments/Project.toml new file mode 100644 index 000000000..45e93db75 --- /dev/null +++ b/experiments/Project.toml @@ -0,0 +1,10 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" +MixedModelsDatasets = "7e9fb7ac-9f67-43bf-b2c8-96ba0796cbb6" +NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" diff --git a/experiments/bibliography.bib b/experiments/bibliography.bib new file mode 100644 index 000000000..2397e3095 --- /dev/null +++ b/experiments/bibliography.bib @@ -0,0 +1,36 @@ +@article{Zhou03042019, + author = {Hua Zhou and Liuyi Hu and Jin Zhou and Kenneth Lange}, + title = {MM Algorithms for Variance Components Models}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {28}, + number = {2}, + pages = {350--361}, + year = {2019}, + publisher = {ASA Website}, + doi = {10.1080/10618600.2018.1529601}, + note ={PMID: 31592195}, + URL = {https://doi.org/10.1080/10618600.2018.1529601}, + eprint = {https://doi.org/10.1080/10618600.2018.1529601} +} + +@Article{bates.maechler.etal:2015, + author = {Bates, Douglas and Maechler, Martin and Bolker, Benjamin M. 
and Walker, Steven}, + title = {Fitting Linear Mixed-Effects Models using lme4}, + doi = {10.18637/jss.v067.i01}, + number = {1}, + pages = {1--48}, + volume = {67}, + date-added = {2020-03-24}, + date-modified = {2016-02-12 06:52:06 +0000}, + file = {:2015/bates.maechler.etal_2015 - Fitting Linear Mixed.pdf:PDF}, + journal = {Journal of Statistical Software}, + year = {2015}, +} + +@article{bates2025mixed, + title={Mixed-model Log-likelihood Evaluation Via a Blocked Cholesky Factorization}, + author={Bates, Douglas and Alday, Phillip M and Kokandakar, Ajinkya H}, + journal={arXiv preprint arXiv:2505.11674}, + year={2025} +} + From b55d6222d71a9c70dca015867a24dd289b0140e1 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 28 Dec 2025 23:19:21 -0600 Subject: [PATCH 02/15] slight optimization of gradient computation --- ext/MixedModelsForwardDiffExt.jl | 21 ++++++++++++++++--- {experiments => gradients}/.gitignore | 0 .../GradientEvaluation.qmd | 0 .../Gradient_based_optimization.qmd | 9 +++++--- {experiments => gradients}/Project.toml | 3 +++ {experiments => gradients}/bibliography.bib | 0 6 files changed, 27 insertions(+), 6 deletions(-) rename {experiments => gradients}/.gitignore (100%) rename {experiments => gradients}/GradientEvaluation.qmd (100%) rename {experiments => gradients}/Gradient_based_optimization.qmd (94%) rename {experiments => gradients}/Project.toml (92%) rename {experiments => gradients}/bibliography.bib (100%) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index c3263e620..757c4b7d2 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -30,7 +30,7 @@ using LinearAlgebra: LinearAlgebra, using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file -using ForwardDiff: ForwardDiff +using ForwardDiff: ForwardDiff, DiffResult, GradientConfig, Chunk using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -68,9 +68,24 @@ values. 
$(FORWARDDIFF) """ function ForwardDiff.gradient( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, x) ) where {T} - return ForwardDiff.gradient(fd_deviance(model), θ) + return ForwardDiff.gradient(model, θ, cfg, check) +end + +# gradient!(::Union{AbstractArray, DiffResults.DiffResult}, ::F, ::Vector{T}, +# ::ForwardDiff.GradientConfig{T}, ::Val{CHK}) where {T, T, CHK, F<:LinearMixedModel{T}} + +function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, x) +) where {T} + return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg) +end + +function ForwardDiff.GradientConfig(model::LinearMixedModel, x::AbstractArray, chunk::Chunk = Chunk(x)) + return GradientConfig(fd_deviance(model), x, chunk) end """ diff --git a/experiments/.gitignore b/gradients/.gitignore similarity index 100% rename from experiments/.gitignore rename to gradients/.gitignore diff --git a/experiments/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd similarity index 100% rename from experiments/GradientEvaluation.qmd rename to gradients/GradientEvaluation.qmd diff --git a/experiments/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd similarity index 94% rename from experiments/Gradient_based_optimization.qmd rename to gradients/Gradient_based_optimization.qmd index ae2d0a47b..140ec6fbf 100644 --- a/experiments/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -51,6 +51,7 @@ Load the packages to be used using BenchmarkTools using ForwardDiff using MixedModels +using MixedModels: fd_deviance using MixedModelsDatasets: dataset using NLopt using Tables: table @@ -81,12 +82,13 @@ with an NLopt-compatible objective function ```{julia} #| label: obj_def +grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) function obj(θ::Vector{Float64}, grad::Vector{Float64}) val = objective(updateL!(setθ!(m, θ))) push!(fitlog, val) append!(fitlog, θ) if !isempty(grad) - copyto!(grad, ForwardDiff.gradient(m, θ)) + copyto!(grad, ForwardDiff.gradient!(grad, m, θ, grad_config)) append!(fitlog, grad) end return val @@ -161,6 +163,7 @@ sleepstudy = Table(dataset(:sleepstudy)) ```{julia} m = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), sleepstudy) θ = copy(m.θ) +grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) @benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 ``` @@ -178,7 +181,7 @@ m.optsum ``` ```{julia} -opt = NLopt.Opt(:LD_LBFGS, 3) +opt = NLopt.Opt(:LD_LBFGS, length(θ)) NLopt.ftol_rel!(opt, 1.e-8) NLopt.min_objective!(opt, obj) empty!(fitlog) @@ -194,4 +197,4 @@ fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) ```{julia} last(fltbl, 3) -``` \ No newline at end of file +``` diff --git a/experiments/Project.toml b/gradients/Project.toml similarity index 92% rename from experiments/Project.toml rename to gradients/Project.toml index 45e93db75..220557789 100644 --- a/experiments/Project.toml +++ b/gradients/Project.toml @@ -8,3 +8,6 @@ MixedModelsDatasets = "7e9fb7ac-9f67-43bf-b2c8-96ba0796cbb6" NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" + +[sources] +MixedModels = {path = ".."} diff --git a/experiments/bibliography.bib b/gradients/bibliography.bib similarity index 100% rename from 
experiments/bibliography.bib rename to gradients/bibliography.bib From 4aa750e26512da743c52cc1c9b58fed0dcf32e0b Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 28 Dec 2025 23:21:36 -0600 Subject: [PATCH 03/15] kb07 --- gradients/Gradient_based_optimization.qmd | 43 ++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 140ec6fbf..53a96eac2 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -88,7 +88,7 @@ function obj(θ::Vector{Float64}, grad::Vector{Float64}) push!(fitlog, val) append!(fitlog, θ) if !isempty(grad) - copyto!(grad, ForwardDiff.gradient!(grad, m, θ, grad_config)) + ForwardDiff.gradient!(grad, m, θ, grad_config) append!(fitlog, grad) end return val @@ -198,3 +198,44 @@ fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) ```{julia} last(fltbl, 3) ``` + +### Kronmueller-Barr 2007 {#sec-kb07} + +```{julia} +kb07 = Table(dataset(:kb07)) +``` + +```{julia} +# this model is very overparameterized, but it's a test example +m = LinearMixedModel(@formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), kb07) +θ = copy(m.θ) +grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) +@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 +``` + + +```{julia} +@benchmark obj($θ, $(similar(θ))) seconds=5 +``` + +```{julia} +print(refit!(m)) +``` + +```{julia} +m.optsum +``` + +```{julia} +opt = NLopt.Opt(:LD_LBFGS, length(θ)) +NLopt.ftol_rel!(opt, 1.e-8) +NLopt.min_objective!(opt, obj) +empty!(fitlog) +min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +``` + + +```{julia} +header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] +fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) +``` From 2765eef6cd698979ed03921959ca141e132045bf Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Tue, 30 Dec 2025 09:06:03 -0600 Subject: [PATCH 04/15] Spelling mistakes? --- ext/MixedModelsForwardDiffExt.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index 757c4b7d2..ef4d4421f 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -69,7 +69,7 @@ $(FORWARDDIFF) """ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, x) + cfg::GradientConfig=GradientConfig(model, θ) ) where {T} return ForwardDiff.gradient(model, θ, cfg, check) end @@ -79,7 +79,7 @@ end function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, x) + cfg::GradientConfig=GradientConfig(model, θ) ) where {T} return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg) end From 644027406527ade5db64ed2110f26fff521d7c63 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Tue, 30 Dec 2025 11:29:41 -0600 Subject: [PATCH 05/15] Still not passing tests. In write-up made method comparisons fairer. 
--- ext/MixedModelsForwardDiffExt.jl | 2 +- gradients/Gradient_based_optimization.qmd | 167 ++++++++-------------- 2 files changed, 57 insertions(+), 112 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index ef4d4421f..324186065 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -71,7 +71,7 @@ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ) ) where {T} - return ForwardDiff.gradient(model, θ, cfg, check) + return ForwardDiff.gradient(model, θ, cfg)#, check) end # gradient!(::Union{AbstractArray, DiffResults.DiffResult}, ::F, ::Vector{T}, diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 53a96eac2..06b27c2f9 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -48,7 +48,6 @@ Load the packages to be used ```{julia} #| label: load_packages -using BenchmarkTools using ForwardDiff using MixedModels using MixedModels: fd_deviance @@ -56,186 +55,132 @@ using MixedModelsDatasets: dataset using NLopt using Tables: table using TypedTables: Table -``` - -## Examples {#sec-examples} -### Penicillin data {#sec-penicillin} - -Load the `penicillin` dataset -```{julia} -penicillin = Table(dataset(:penicillin)) +const progress = false ``` -and define a model - -```{julia} -#| label: const_defs -#| output: false -m = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) -θ = copy(m.θ) -k = length(θ) -const fitlog = sizehint!(Vector{Float64}(undef, 0), 200) -``` +## Examples {#sec-examples} -with an NLopt-compatible objective function +We create a function to take a `LinearMixedModel` that has been fit and refit it using the `:LD_LBFGS` optimizer applied to an objective function that evaluates the gradient using `ForwardDiff`. 
```{julia} -#| label: obj_def -grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) -function obj(θ::Vector{Float64}, grad::Vector{Float64}) +addinds(ch::Char, n::Integer) = Symbol.(lpad.(string.(ch, Base.OneTo(n)), ndigits(n), '0')) +function gr_refit!(m::LinearMixedModel{T}) where {T} + θ = copy(m.optsum.initial) + k = length(θ) + fitlog = sizehint!(T[], 50 * k) + grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) + function obj(θ::Vector{Float64}, grad::Vector{Float64}) val = objective(updateL!(setθ!(m, θ))) push!(fitlog, val) append!(fitlog, θ) if !isempty(grad) - ForwardDiff.gradient!(grad, m, θ, grad_config) - append!(fitlog, grad) + ForwardDiff.gradient!(grad, m, θ, grad_config) + append!(fitlog, grad) + else + append!(fitlog, fill(NaN, k)) # never called with empty grad but just in case end return val + end + opt = NLopt.Opt(:LD_LBFGS, k) + NLopt.ftol_rel!(opt, 1.e-12) + NLopt.ftol_abs!(opt, 1.e-8) + NLopt.min_objective!(opt, obj) + min_f, min_x, ret = NLopt.optimize(opt, θ) + header = vcat([:obj], addinds('θ', k), addinds('g', k)) + return Table(table(transpose(reshape(fitlog, 2k + 1, :)); header)) end ``` -A benchmark of evaluating the objective only - -```{julia} -#| label: benchmark_obj -@benchmark objective(updateL!(setθ!($m, $θ))) seconds=1 -``` - -compared to evaluating both the objective and its gradient - -```{julia} -#| label: benchmark_obj_grad -let gr = Vector{Float64}(undef, k) - @benchmark obj($θ, $gr) seconds=1 -end -``` - -Notice that evaluating both the objective and the gradient, using `ForwardDiff.jl`, results in considerably more allocation of storage than does evaluating the objective alone. -(In part this is a reflection of the fact that considerable work has gone into tuning the objective evaluation so that it doesn't allocate a lot of memory.) +### Penicillin data {#sec-penicillin} -Fitting the model using a derivative-free optimizer gives +Define a model for the `penicillin` data ```{julia} -m.optsum.ftol_rel = 1.e-8 # for fair comparison with the gradient-based method -print(refit!(m)) +#| label: const_defs +#| output: false +m1 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), dataset(:penicillin); progress) +print(m1) ``` for which the optimization summary is ```{julia} -m.optsum +m1.optsum ``` -The objective at the estimate is +and refit the model using ForwardDiff gradient evaluations. 
```{julia} -m.optsum.fmin +fitlog = gr_refit!(m1) ``` -We set up and run the gradient-based optimizer L-BFGS as +The objective at convergence is ```{julia} -opt = NLopt.Opt(:LD_LBFGS, 2) -NLopt.ftol_rel!(opt, 1.e-8) -NLopt.min_objective!(opt, obj) -empty!(fitlog) -min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +last(fitlog.obj) ``` -where the fitlog is - -```{julia} -header = [:obj, :θ₁, :θ₂, :g₁, :g₂] -fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) -``` +and the last few evaluations are ```{julia} -last(fltbl, 3) +last(fitlog, 5) ``` ### Sleepstudy {#sec-sleepstudy} ```{julia} -sleepstudy = Table(dataset(:sleepstudy)) -``` - -```{julia} -m = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), sleepstudy) -θ = copy(m.θ) -grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) -@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 -``` - - -```{julia} -@benchmark obj($θ, $(similar(θ))) seconds=1 +m2 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) +print(m2) ``` ```{julia} -print(refit!(m)) +m2.optsum ``` ```{julia} -m.optsum +fitlog = gr_refit!(m2) ``` ```{julia} -opt = NLopt.Opt(:LD_LBFGS, length(θ)) -NLopt.ftol_rel!(opt, 1.e-8) -NLopt.min_objective!(opt, obj) -empty!(fitlog) -min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +last(fitlog.obj) ``` - ```{julia} -header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] -fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) -``` - - -```{julia} -last(fltbl, 3) +last(fitlog, 5) ``` ### Kronmueller-Barr 2007 {#sec-kb07} -```{julia} -kb07 = Table(dataset(:kb07)) -``` - ```{julia} # this model is very overparameterized, but it's a test example -m = LinearMixedModel(@formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), kb07) -θ = copy(m.θ) -grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) -@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 +m3 = fit( + MixedModel, + @formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), + dataset(:kb07); + progress, +) +print(m3) ``` - ```{julia} -@benchmark obj($θ, $(similar(θ))) seconds=5 +m3.optsum ``` +Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector + ```{julia} -print(refit!(m)) +findall(iszero, m3.θ) ``` ```{julia} -m.optsum +fitlog = gr_refit!(m3) ``` ```{julia} -opt = NLopt.Opt(:LD_LBFGS, length(θ)) -NLopt.ftol_rel!(opt, 1.e-8) -NLopt.min_objective!(opt, obj) -empty!(fitlog) -min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +last(fitlog.obj) ``` - ```{julia} -header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] -fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) -``` +last(fitlog, 5) +``` \ No newline at end of file From 16cbd8e6ce2dfabe79691159297ab9b8fd2320db Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 12:53:44 -0600 Subject: [PATCH 06/15] test fix --- ext/MixedModelsForwardDiffExt.jl | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index 324186065..a65f6c1df 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -30,7 +30,10 @@ using LinearAlgebra: LinearAlgebra, using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file -using 
ForwardDiff: ForwardDiff, DiffResult, GradientConfig, Chunk +using ForwardDiff: ForwardDiff, + Chunk, + DiffResult, + GradientConfig using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -59,6 +62,12 @@ const FORWARDDIFF = """ should be included is currently still being decided. """ +function ForwardDiff.GradientConfig( + model::LinearMixedModel, x::AbstractArray=model.θ, chunk::Chunk=Chunk(x) +) + return GradientConfig(fd_deviance(model), x, chunk) +end + """ ForwardDiff.gradient(model::LinearMixedModel) @@ -69,23 +78,18 @@ $(FORWARDDIFF) """ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, θ) -) where {T} - return ForwardDiff.gradient(model, θ, cfg)#, check) + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.gradient!(similar(model.θ), fd_deviance(model), θ, cfg, check) end -# gradient!(::Union{AbstractArray, DiffResults.DiffResult}, ::F, ::Vector{T}, -# ::ForwardDiff.GradientConfig{T}, ::Val{CHK}) where {T, T, CHK, F<:LinearMixedModel{T}} - function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, θ) -) where {T} - return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg) -end - -function ForwardDiff.GradientConfig(model::LinearMixedModel, x::AbstractArray, chunk::Chunk = Chunk(x)) - return GradientConfig(fd_deviance(model), x, chunk) + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) end """ From f184c8834f5e5b0fe352f752968b520466f13d9e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:04:24 -0600 Subject: [PATCH 07/15] methods for HessianConfig and hessian! --- ext/MixedModelsForwardDiffExt.jl | 42 +++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index a65f6c1df..de2d80c54 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -32,8 +32,8 @@ using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file using ForwardDiff: ForwardDiff, Chunk, - DiffResult, - GradientConfig + GradientConfig, + HessianConfig using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -62,9 +62,13 @@ const FORWARDDIFF = """ should be included is currently still being decided. 
""" +##### +##### Gradients +##### + function ForwardDiff.GradientConfig( - model::LinearMixedModel, x::AbstractArray=model.θ, chunk::Chunk=Chunk(x) -) + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) +) where {T} return GradientConfig(fd_deviance(model), x, chunk) end @@ -81,10 +85,10 @@ function ForwardDiff.gradient( cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), ) where {T, CHK} - return ForwardDiff.gradient!(similar(model.θ), fd_deviance(model), θ, cfg, check) + return ForwardDiff.gradient!(similar(model.θ), model, θ, cfg, check) end -function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, +function ForwardDiff.gradient!(result::AbstractArray, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), @@ -92,6 +96,16 @@ function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) end +##### +##### Hessians +##### + +function ForwardDiff.HessianConfig( + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) +) where {T} + return HessianConfig(fd_deviance(model), x, chunk) +end + """ ForwardDiff.hessian(model::LinearMixedModel) @@ -101,9 +115,19 @@ values. $(FORWARDDIFF) """ function ForwardDiff.hessian( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ -) where {T} - return ForwardDiff.hessian(fd_deviance(model), θ) + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.hessian!(similar(model.θ), model, θ, cfg, check) +end + +function ForwardDiff.hessian!(result::AbstractArray, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.hessian!(result, fd_deviance(model), θ, cfg, check) end ##### From 8e8ee8532ad850bd56f3e67f8e1397def5b665a0 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:04:52 -0600 Subject: [PATCH 08/15] format --- ext/MixedModelsForwardDiffExt.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index de2d80c54..15f850a86 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -84,7 +84,7 @@ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return ForwardDiff.gradient!(similar(model.θ), model, θ, cfg, check) end @@ -92,7 +92,7 @@ function ForwardDiff.gradient!(result::AbstractArray, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) end @@ -118,7 +118,7 @@ function ForwardDiff.hessian( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::HessianConfig=HessianConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return ForwardDiff.hessian!(similar(model.θ), model, θ, cfg, check) end @@ -126,7 +126,7 @@ function ForwardDiff.hessian!(result::AbstractArray, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::HessianConfig=HessianConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return 
ForwardDiff.hessian!(result, fd_deviance(model), θ, cfg, check) end From 072bd5d25de0c0d41d67d7e3506e25b7bb658e32 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:07:11 -0600 Subject: [PATCH 09/15] NEWS --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 0a63d124f..eb00af618 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +- Additional methods for pre-allocated result arrays and `*Config` instances have been added to the ForwardDiff extension. [#871]. + MixedModels v5.1.0 Release Notes ============================== - Nesting checks for the likelihoodratio test have been slightly tweaked to be more robust, at the cost of being slightly slower. In particular, the comparison of models with pre-centered variables with those with variables centered via StandardizedPredictors.jl was previously incorrectly rejected as non-nested, but should be correctly accepted as nested now. Additionally, some further logging messages are emitted when a nesting check fails. [#867] @@ -710,3 +712,4 @@ Package dependencies [#864]: https://github.com/JuliaStats/MixedModels.jl/issues/864 [#865]: https://github.com/JuliaStats/MixedModels.jl/issues/865 [#867]: https://github.com/JuliaStats/MixedModels.jl/issues/867 +[#871]: https://github.com/JuliaStats/MixedModels.jl/issues/871 From 03009d15ce84e73685ee40776e4bd7cf64dfc8c3 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:29:37 -0600 Subject: [PATCH 10/15] oops --- ext/MixedModelsForwardDiffExt.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index 15f850a86..149bfe355 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -119,7 +119,8 @@ function ForwardDiff.hessian( cfg::HessianConfig=HessianConfig(model, θ), check::Val{CHK}=Val(true), ) where {T,CHK} - return ForwardDiff.hessian!(similar(model.θ), model, θ, cfg, check) + n = length(θ) + return ForwardDiff.hessian!(Matrix{T}(undef, n, n), model, θ, cfg, check) end function ForwardDiff.hessian!(result::AbstractArray, From 0bfd467925118c03289fcff0cc5766ee4166da4e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:42:19 -0600 Subject: [PATCH 11/15] docs fix: AoG update --- docs/Project.toml | 1 + docs/src/ecosystem.md | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index c396b21bc..4857b499c 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -26,6 +26,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" [compat] +AlgebraOfGraphics = "0.12" BenchmarkTools = "1" DataFrames = "1" Documenter = "1.3" diff --git a/docs/src/ecosystem.md b/docs/src/ecosystem.md index 53d0ba028..82aa5fe45 100644 --- a/docs/src/ecosystem.md +++ b/docs/src/ecosystem.md @@ -8,7 +8,7 @@ Several packages extend the functionality of MixedModels.jl, both in ways specif ```@example Ecosystem using MixedModels -progress = false +progress = isinteractive() ``` ```@example Ecosystem @@ -182,16 +182,16 @@ Effects are particularly nice for visualizing the model fit and its predictions. 
using AlgebraOfGraphics # like ggplot2, but an algebra instead of a grammar using CairoMakie -plt1 = data(eff_logit) * - mapping(:age, :use; color=:anych) * - (visual(Lines) + mapping(; lower=:lower, upper=:upper) * visual(LinesFill)) +plt1 = data(eff_logit) * mapping(:age; color=:anych) * + (mapping(:use) * visual(Lines) + + mapping(:lower, :upper) * visual(Band; alpha=0.3)) draw(plt1) ``` ```@example Ecosystem -plt2 = data(eff_prob) * - mapping(:age, :use; color=:anych => "children") * - (visual(Lines) + mapping(; lower=:lower, upper=:upper) * visual(LinesFill)) +plt2 = data(eff_prob) * mapping(:age; color=:anych) * + (mapping(:use) * visual(Lines) + + mapping(:lower, :upper) * visual(Band; alpha=0.3)) draw(plt2) ``` From 463a7d66bfebd62aaa119bc26d5e0357c1a3842c Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Sat, 3 Jan 2026 12:32:21 -0600 Subject: [PATCH 12/15] Add information on gradient evaluation --- gradients/GradientEvaluation.qmd | 301 ++++++++++++++++++++-- gradients/Gradient_based_optimization.qmd | 10 +- gradients/Project.toml | 1 + gradients/bibliography.bib | 6 + 4 files changed, 296 insertions(+), 22 deletions(-) diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index 776221549..a3ee8ae03 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -1,5 +1,5 @@ --- -title: "Evaluation of the Gradient of the Profiled log-likelihood" +title: "Gradient of the Profiled log-likelihood" author: - name: Douglas Bates email: dmbates@gmail.com @@ -38,8 +38,8 @@ format: A comparison of algorithms for estimation of variance components given in the supplemental materials for @Zhou03042019 shows the Fisher scoring algorithm taking the fewest iterations to convergence compared to an EM algorithm and the minorization-maximization (MM) algorithm presented in that paper. The model being simulated in @Zhou03042019, sec 3.2 is relatively simple, with random effects for two factors and their interaction in a balanced crossed design. -The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. -The optimization itself is performed by a gradient-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. +The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and in [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. +The optimization itself is performed by a derivative-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. @@ -69,10 +69,11 @@ $$ {#eq-Sigma} for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. 
-In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, of +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship $$ -\boldsymbol{\Omega_\theta} = +\begin{aligned} +\boldsymbol{\Omega_\theta}&= \begin{bmatrix} \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}& \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} & @@ -83,30 +84,37 @@ $$ \mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & \mathbf{y^\top X} & \mathbf{y^\top y}\\ -\end{bmatrix} -$$ {#eq-blockedOmega} - -where $\mathbf{L}_\theta$ has a similar blocked structure - -$$ -\mathbf{L}_\boldsymbol{\theta} = +\end{bmatrix}\\ +&=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\ +&= \begin{bmatrix} \mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ \mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ \mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} \end{bmatrix} -$$ {#eq-blockedL} +\begin{bmatrix} +\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ +\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ +\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} +\end{bmatrix}^\top +\end{aligned} +$$ {#eq-blockedOmega} + +where the diagonal elements of $\mathbf{L}_\theta$ are positive. -(In the actual computational methods the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) +(In the `MixedModels.jl` implementation the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) -The objective to be optimized is negative twice the profiled log-likelihood, +The objective to be optimized, on the scale of the deviance, is negative twice the profiled log-likelihood, $$ --2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y}) = -\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right] +\begin{aligned} +-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})&= +\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right]\\ +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c +\end{aligned} $$ {#eq-objective} -which is on the scale of the deviance (if we were able to define a deviance for these models). +where $c$ is a constant. As shown in @bates.maechler.etal:2015, sec 3.5 the gradient of the first summand in @eq-objective is @@ -128,7 +136,41 @@ $$ \end{aligned} $$ {#eq-delterm1} -For the models that we wish to consider the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple in that they are block diagonal with a single non-zero diagonal block, which is an identity matrix. +For the models that we wish to consider the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple. 
+The partial derivatives are zeroes except for a single diagonal block, which is an identity matrix. + +### General expressions for differentiating a Cholesky factor + +@murray2016differentiation section 3.1 provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega. + +Repeating his derivation, with minor changes in notation, we express the relationship between the infinitesimals $d\boldsymbol{\Omega}$ and $d\mathbf{L}$ as + +$$ +d\boldsymbol{\Omega}=d\mathbf{L}\mathbf{L}^\top + \mathbf{L} d\mathbf{L}^\top +$$ {#eq-infinitesimal} + +Pre-multiplying @eq-infinitesimal by $\mathbf{L}^{-1}$ and post-multiplying by $\mathbf{L}^{-\top}$ gives + +$$ +\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}=\mathbf{L}^{-1}d\mathbf{L} + d\mathbf{L}^\top\mathbf{L}^{-\top} +$$ {#eq-LOmegaLT} + +The first addend on the right-hand side of @eq-LOmegaLT is lower triangular and the second addend is the transpose of the first. +We wish to isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, which we do with the $\Phi$ transformation applied to a symmetric matrix, which preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle. +Applied to the right-hand side of @eq-LOmegaLT, the $\Phi$ transformation isolates the first addend, providing + +$$ +\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right)=\mathbf{L}^{-1}d\mathbf{L} +$$ {#eq-Phi_dOmega} + +or + +$$ +d\mathbf{L}=\mathbf{L}\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right) +$$ {#eq-dL} + +As we shall see, because we only need the derivative of the logarithms of the diagonal elements of $d\mathbf{L}$ to obtain the gradient of the profiled log-likelihood, and because $\mathbf{L}$ is lower triangular, we can stop at @eq-LOmegaLT. +Furthermore, several of the blocks in $\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}$ are either zero or exactly equal to the corresponding block of $\boldsymbol{\Omega_\theta}$. ## Examples @@ -140,13 +182,234 @@ Load the packages to be used #| label: load_packages #| warning: false #| output: false +using CairoMakie using FiniteDiff using LinearAlgebra using MixedModels using MixedModelsDatasets: dataset using TypedTables: Table + +const progress = isinteractive() # to suppress progress bars in non-interactive sessions +CairoMakie.activate!(type="svg") # use svg graphics output +``` + +### Dyestuff - a single, scalar random-effects term + +The `dyestuff` data set provides the yield of dyestuff in each of 5 samples from each of 6 batches of an intermediate product in the process of producing a dye. + +```{julia} +#| label: dyestuff_data +dyestuff = Table(dataset(:dyestuff)) +``` + +A mixed-effects model for these data includes an overall "intercept" term (whose estimate will be the sample mean because of the balanced design) and random effects for each level of `batch`. + +```{julia} +#| label: dyestuff_model +m01 = fit(MixedModel, @formula(yield ~ 1 + (1|batch)), dyestuff; progress) +θ = m01.θ # retain a copy of the estimate of θ +print(m01) +``` + +#### The objective as a function of $\theta_1$ + +@fig-obj_graph shows the objective (negative twice the log-likelihood) of this model as a function of $\theta_1$, the relative covariance parameter. 
+ +```{julia} +#| label: fig-obj_graph +#| fig-cap: "Graph of the objective for model m01 as a function of θ₁" +#| code-fold: true +let f = Figure() + ax = Axis(f[1,1], xlabel="θ₁", ylabel="objective") + lines!(ax, -0.25..1.5, objective!(m01)) + f +end +``` + +Notice that the objective is well-defined for negative values of $\theta_1$ and that it is an even function, in the sense that $f(-\theta_1)=f(\theta_1)\,\forall\theta_1$. + +This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. + +The maximum likelihood estimate (i.e. the minimizer of the objective) of $\boldsymbol{\theta}$ for this model is + +```{julia} +#| label: dyestuff_theta +updateL!(setθ!(m01, θ)) # restore the estimate of θ and the Cholesky factor +θ +``` + +#### Evaluating the gradient terms + +Both $\mathbf{A}$ and $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern + +```{julia} +BlockDescription(m01) +``` + +In @eq-Sigma the matrix $\boldsymbol{\Lambda_\theta}$ is defined as a $6\times6$ diagonal matrix. +Here, for convenience, we extend it to an $8\times8$ matrix with a trailing diagonal block that is the identity, so that multiplication by $\boldsymbol{\Lambda_\theta}$ applies to the full matrix $\mathbf{A}$. + +```{julia} +Λ(θ) = Diagonal(vcat(fill(only(θ), 6), ones(eltype(θ), 2))) +Λ(θ) +``` + +The derivative of $\Lambda$ with respect to the first (and only) element of $\boldsymbol\theta$ is + +```{julia} +dΛdθ1 = Diagonal(vcat(ones(6), zeros(2))) +``` + +The matrix $\mathbf{A}$ from which $\boldsymbol{\Omega_\theta}$ is generated is stored in blocks. +We assemble these into a sparse matrix as + +```{julia} +A = sparse(hvcat(2, first(m01.A), m01.A[2]', m01.A[2], last(m01.A))) +``` + +which could be generated as the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of + +```{julia} +ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) +Int.(ZXy) # Int for more concise printing +``` + +::: {.callout-note collapse=true} +#### Printing Int. of a matrix with integer entries + +When we know that all the entries in a floating point matrix, `X`, will be integers, we convert it to integer elements as `Int.(X)` to save space in the output. +::: + +```{julia} +A == ZXy'ZXy +``` + +The matrix $\boldsymbol{\Omega_\theta}$ is + +```{julia} +Ω(θ) = Λ(θ) * A * Λ(θ)' + Diagonal(vcat(ones(6), zeros(2))) +Ω(θ) +``` + +The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix with + +```{julia} +L = LowerTriangular(sparseL(m01; full=true)) +``` + +We can check that it is indeed the lower Cholesky factor + +```{julia} +L * L' ≈ Ω(θ) +``` + +The derivative of $\boldsymbol{\Omega}$ with respect to $\theta$ is + +```{julia} +dΩdθ1(θ) = dΛdθ1 * A * Λ(θ)' + Λ(θ) * A * dΛdθ1 +dΩdθ1(θ) +``` + +Notice that this matrix, like $\mathbf{A}$ is symmetric and has the same block structure as $\mathbf{A}$. +In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. 
+ +Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to + +```{julia} +prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') +``` + +We note that @eq-delterm1 is the sum of the first 6 diagonal elements of `prePhi`, + +```{julia} +sum(prePhi[i,i] for i in 1:6) +``` + +which should equal + +```{julia} +ldfun(x::Float64) = logdet(updateL!(setθ!(m01, [x]))) +FiniteDiff.finite_difference_derivative(ldfun, only(θ)) +``` + +Similarly the gradient of the other non-constant term in @eq-logdet is + +```{julia} +size(m01.y, 1) * last(prePhi) ``` +compared to + +```{julia} +n_log_lyy_sq(x::Float64) = length(m01.y) * log(abs2(last(last(updateL!(setθ!(m01, [x])).L)))) +n_log_lyy_sq(only(θ)) +``` + +```{julia} +FiniteDiff.finite_difference_derivative(n_log_lyy_sq, only(θ)) +``` + +```{julia} +#| output: false +updateL!(setθ!(m01, θ)) # reset the value of θ in the model +``` + +If we wish to continue the evaluation of all of the elements of $d\mathbf{L}/d\theta_1$ we would use +the $\Phi$ transformation and premultiplication by $\mathbf{L}$ + +```{julia} +function Φ(S) + val = tril(S) # extract the lower triangle + for i in diagind(val) + val[i] *= 0.5 # halve the diagonal elements + end + return LowerTriangular(val) +end +dLdθ1 = L * Φ(prePhi) +``` + +We can check these results against results from finite difference methods. +First we check the $[1,1]$ entry of $\frac{d\mathbf{L}}{d\theta}$ and its derivative + +```{julia} +l11(θ::T) where {T<:Real} = first(first(updateL!(setθ!(m01, [θ])).L)) +l11(only(θ)) +``` + +and its finite-difference derivative + +```{julia} +FiniteDiff.finite_difference_derivative(l11, only(θ)) +``` + +which matches the leading diagonal elements of `dLdθ1`. + +Next we check the last diagonal element of $\mathbf{L}$, which is the square root of the penalized residual sum-of-squares. +```{julia} +rtprss(θ::T) where {T<:Real} = last(last(updateL!(setθ!(m01, [θ])).L)) +rtprss(only(θ)) +``` + +```{julia} +FiniteDiff.finite_difference_derivative(rtprss, only(θ)) +``` + +These calculations verify the values of diagonal elements of $\frac{d\mathbf{L}}{d\theta}$ that will be used to evaluate the gradient of the profiled log-likelihood. + +The derivative of the objective is more easily obtained by rearranging the terms in the objective to isolate constants. +First, because the determinant of a triangular matrix is the product of its diagonal elements, and hence the log of the determinant is the sum of the logs of its diagonal elements, + +$$ +\begin{aligned} +\frac{\partial\log\left|\mathbf{L_{ZZ}}\right|^2}{\partial\theta_i}&= +2\frac{\partial\sum_{j=1}^q \log\left(L_{jj}\right)}{\partial\theta_i}\\ +&=2\sum_{j=1}^q\frac{1}{L_{jj}}\frac{\partial{L_{jj}}}{\partial\theta_i} +\end{aligned} +$$ {#eq-logdet} + +Again, because $\mathbf{L}$ is lower triangular, the division by $L_{jj}$ in the summands of @eq-logdet exactly cancels the effect of multiplication on the left by $\mathbf{L}$ in @eq-dL in these diagonal terms. +Also the multiplier of $2$ in @eq-logdet cancels the halving of the diagonal in @eq-Phi_dOmega, leaving only @eq-delterm1. + ### Penicillin - two completely crossed scalar random-effects terms The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. 
diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 06b27c2f9..d3167307f 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -97,9 +97,13 @@ end Define a model for the `penicillin` data ```{julia} -#| label: const_defs -#| output: false -m1 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), dataset(:penicillin); progress) +#| label: m1 +m1 = fit( + MixedModel, + @formula(diameter ~ 1 + (1|plate) + (1|sample)), + dataset(:penicillin); + progress, +) print(m1) ``` diff --git a/gradients/Project.toml b/gradients/Project.toml index 220557789..a05f9e152 100644 --- a/gradients/Project.toml +++ b/gradients/Project.toml @@ -1,5 +1,6 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" diff --git a/gradients/bibliography.bib b/gradients/bibliography.bib index 2397e3095..25109d8e4 100644 --- a/gradients/bibliography.bib +++ b/gradients/bibliography.bib @@ -34,3 +34,9 @@ @article{bates2025mixed year={2025} } +@article{murray2016differentiation, + title={Differentiation of the Cholesky decomposition}, + author={Murray, Iain}, + journal={arXiv preprint arXiv:1602.07527}, + year={2016} +} From ab0a7cb9b865d731baffd71cb01edcac4d25376f Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Mon, 5 Jan 2026 18:54:25 -0600 Subject: [PATCH 13/15] Short-cut method of gradient evaluation --- gradients/GradientEvaluation.qmd | 383 +++++++++---------------------- gradients/bibliography.bib | 6 +- 2 files changed, 113 insertions(+), 276 deletions(-) diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index a3ee8ae03..ac652277d 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -43,18 +43,17 @@ The optimization itself is performed by a derivative-free optimizer, usually eit Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. -The purpose of this note is to explore whether these expressions can be implemented effectively, even if just for the variance components model, which, for our purposes, is a model in which all the random effects terms are simple, scalar terms. +The purpose of this note is to provide an alternative derivation for the gradient of the profiled log-likelihood and of the REML criterion for linear mixed-effects models, along with concise algorithms for evaluation of these gradients. 
-### Expressions for the gradient terms +### Model definition and evaluation of the objective and gradient The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as $$ \begin{aligned} + \mathbfcal{B}&\sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma})\\ (\mathbfcal{Y}|\mathbfcal{B}=\mathbf{b})& - \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right)\\ - \mathbfcal{B}& - \sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma}) + \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right) \end{aligned} $$ {#eq-dists} @@ -69,7 +68,7 @@ $$ {#eq-Sigma} for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. -In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower-triangular Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship $$ \begin{aligned} @@ -100,48 +99,43 @@ $$ \end{aligned} $$ {#eq-blockedOmega} -where the diagonal elements of $\mathbf{L}_\theta$ are positive. +where the diagonal elements of $\mathbf{L}_\theta$ are chosen to be positive. +(It is assumed that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.) -(In the `MixedModels.jl` implementation the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) +(In `MixedModels.jl` the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) -The objective to be optimized, on the scale of the deviance, is negative twice the profiled log-likelihood, +As shown in @bates2025mixed, the objective to be optimized, on the scale of the deviance, which is negative twice the profiled log-likelihood, can be expressed as $$ \begin{aligned} -2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})&= \log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right]\\ -&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c_\ell\\ +&=2\sum_{j=1}^q\log L_{j,j} + 2n \log L_{q+p+1,q+p+1} + c_\ell \end{aligned} $$ {#eq-objective} -where $c$ is a constant. +where $c_\ell$ is a constant. +That is, the objective is an affine function (a linear function plus a constant) of the logarithms of the diagonal elements of $\mathbf{L}_\boldsymbol{\theta}$. 
+It happens that the gradient of the objective, as a function of $\boldsymbol{\theta}$, expressed in this form is straightforward to evaluate, as shown in @sec-Cholesky_derivative. -As shown in @bates.maechler.etal:2015, sec 3.5 the gradient of the first summand in @eq-objective is +As shown in @bates.maechler.etal:2015, sec 3.4 the REML criterion, which some prefer for parameter estimation, can be written as $$ \begin{aligned} -\nabla\log\left|\mathbf{L_ZZ}\right|^2 &= \nabla\log\left(\left|\mathbf{L_{ZZ}L_{ZZ}}^\top\right|\right)\\ -&=\nabla\log\left(\left|\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right|\right)\\ -&=\operatorname{tr}\left[\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right) -\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right)^{-1}\right]\\ -&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1} -\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right) -\mathbf{L_{ZZ}}^{-\top} -\right]\\ -&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1} -\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right) -\mathbf{L_{ZZ}}^{-\top} -\right] +-2\mathcal{L}_R(\boldsymbol{\theta}|\mathbf{y})&= +\log\left(\left|\mathbf{L_{ZZ}}\right|^2\left|\mathbf{L_{XX}}\right|^2\right) + (n-p) \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n-p}\right)\right]\\ +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + \log\left|\mathbf{L_{XX}}\right|^2 + (n-p)\log\ell^2_{\mathbf{yy}} + c_r\\ +&=2\sum_{j=1}^{q+p}\log L_{j,j} + 2(n-p)\log L_{q+p+1,q+p+1} + c_r \end{aligned} -$$ {#eq-delterm1} +$$ {#eq-objective} -For the models that we wish to consider the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple. -The partial derivatives are zeroes except for a single diagonal block, which is an identity matrix. +where $c_r$ is likewise a constant. +This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_{ZZ}}$. -### General expressions for differentiating a Cholesky factor +### General expressions for differentiating a Cholesky factor {#sec-Cholesky_derivative} -@murray2016differentiation section 3.1 provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega. +@murray2016differentiation, section 3.1, provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega. Repeating his derivation, with minor changes in notation, we express the relationship between the infinitesimals $d\boldsymbol{\Omega}$ and $d\mathbf{L}$ as @@ -156,7 +150,12 @@ $$ $$ {#eq-LOmegaLT} The first addend on the right-hand side of @eq-LOmegaLT is lower triangular and the second addend is the transpose of the first. -We wish to isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, which we do with the $\Phi$ transformation applied to a symmetric matrix, which preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle. +Thus, the diagonal of the left-hand side is exactly the result we wish to evaluate, twice the infinitesimal of the logarithms of the diagonal elements of $\mathbf{L}$. 
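+Combining this observation with @eq-objective gives, writing $\theta_k$ for the $k$th component of $\boldsymbol{\theta}$, the expression that is evaluated in the examples below (the ML case is shown here; for the REML criterion the upper limit $q$ becomes $q+p$ and the weight $n$ becomes $n-p$)
+
+$$
+\frac{\partial\left[-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})\right]}{\partial\theta_k}=
+\sum_{j=1}^q\left[\mathbf{L}^{-1}\frac{\partial\boldsymbol{\Omega_\theta}}{\partial\theta_k}\mathbf{L}^{-\top}\right]_{j,j}+
+n\left[\mathbf{L}^{-1}\frac{\partial\boldsymbol{\Omega_\theta}}{\partial\theta_k}\mathbf{L}^{-\top}\right]_{q+p+1,q+p+1}
+$$ {#eq-MLgrad}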
+ +For completeness, we provide the conclusion of the derivation in @murray2016differentiation but we don't need the more general result of $d\mathbf{L}$ - we only need the particular result from the left-hand side of @eq-LOmegaLT. + +To evaluate $d\mathbf{L}$ we must isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, on the right-hand side of @eq-LOmegaLT, which we do with the $\Phi$ transformation applied to a symmetric matrix. +This transformation preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle. Applied to the right-hand side of @eq-LOmegaLT, the $\Phi$ transformation isolates the first addend, providing $$ @@ -169,9 +168,6 @@ $$ d\mathbf{L}=\mathbf{L}\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right) $$ {#eq-dL} -As we shall see, because we only need the derivative of the logarithms of the diagonal elements of $d\mathbf{L}$ to obtain the gradient of the profiled log-likelihood, and because $\mathbf{L}$ is lower triangular, we can stop at @eq-LOmegaLT. -Furthermore, several of the blocks in $\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}$ are either zero or exactly equal to the corresponding block of $\boldsymbol{\Omega_\theta}$. - ## Examples To aid in understanding the structure of these equations we consider the structure of the various matrices and their blocks in some simple examples. @@ -189,7 +185,7 @@ using MixedModels using MixedModelsDatasets: dataset using TypedTables: Table -const progress = isinteractive() # to suppress progress bars in non-interactive sessions +const progress = isinteractive() # suppress progress bars in non-interactive sessions CairoMakie.activate!(type="svg") # use svg graphics output ``` @@ -230,16 +226,35 @@ Notice that the objective is well-defined for negative values of $\theta_1$ and This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. -The maximum likelihood estimate (i.e. the minimizer of the objective) of $\boldsymbol{\theta}$ for this model is +At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$ ```{julia} #| label: dyestuff_theta -updateL!(setθ!(m01, θ)) # restore the estimate of θ and the Cholesky factor -θ +only(θ) +``` + +the derivative should also be zero (in practice, close to zero). + +We can see from @fig-obj_graph that the derivative at $\theta_1=1.$ will be positive. + +To show the evaluation of the gradient at $\boldsymbol{\theta}=[1.]$ we reset the parameter in the model object to $[1.]$ + +```{julia} +#| output: false +updateL!(setθ!(m01, ones(1))); ``` #### Evaluating the gradient terms +In `MixedModels.jl` the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of + +```{julia} +ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) +Int.(ZXy) # Int for more concise printing +``` + +is stored as the $\mathbf{A}$ property of the model object. 
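+The blocks of the `A` property can be examined individually; for example, the leading block, corresponding to $\mathbf{Z^\top Z}$, is held as a `Diagonal` matrix for this model
+
+```{julia}
+first(m01.A)
+```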
+ Both $\mathbf{A}$ and $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern ```{julia} @@ -251,7 +266,7 @@ Here, for convenience, we extend it to an $8\times8$ matrix with a trailing diag ```{julia} Λ(θ) = Diagonal(vcat(fill(only(θ), 6), ones(eltype(θ), 2))) -Λ(θ) +Λ([1.]) ``` The derivative of $\Lambda$ with respect to the first (and only) element of $\boldsymbol\theta$ is @@ -267,19 +282,7 @@ We assemble these into a sparse matrix as A = sparse(hvcat(2, first(m01.A), m01.A[2]', m01.A[2], last(m01.A))) ``` -which could be generated as the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of - -```{julia} -ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) -Int.(ZXy) # Int for more concise printing -``` - -::: {.callout-note collapse=true} -#### Printing Int. of a matrix with integer entries - -When we know that all the entries in a floating point matrix, `X`, will be integers, we convert it to integer elements as `Int.(X)` to save space in the output. -::: - +and check that these blocks are indeed the Gram matrix of the columns of $\mathbf{[ZXy]}$ ```{julia} A == ZXy'ZXy ``` @@ -288,10 +291,10 @@ The matrix $\boldsymbol{\Omega_\theta}$ is ```{julia} Ω(θ) = Λ(θ) * A * Λ(θ)' + Diagonal(vcat(ones(6), zeros(2))) -Ω(θ) +Ω([1.]) ``` -The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix with +The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix as ```{julia} L = LowerTriangular(sparseL(m01; full=true)) @@ -300,123 +303,58 @@ L = LowerTriangular(sparseL(m01; full=true)) We can check that it is indeed the lower Cholesky factor ```{julia} -L * L' ≈ Ω(θ) +L * L' ≈ Ω([1.]) ``` The derivative of $\boldsymbol{\Omega}$ with respect to $\theta$ is ```{julia} dΩdθ1(θ) = dΛdθ1 * A * Λ(θ)' + Λ(θ) * A * dΛdθ1 -dΩdθ1(θ) +dΩdθ1([1.]) ``` -Notice that this matrix, like $\mathbf{A}$ is symmetric and has the same block structure as $\mathbf{A}$. +Notice that this matrix, like $\mathbf{A}$, is symmetric and has the same block structure as $\mathbf{A}$. In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. 
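+The $[1,1]$ block, on the other hand, is $2\theta_1$ times the $[1,1]$ block of $\mathbf{A}$; at $\theta_1=1$ that is simply twice the block, which we can check directly (a quick sketch, using the $8\times8$ arrangement of `A` constructed above)
+
+```{julia}
+dΩdθ1([1.])[1:6, 1:6] == 2 .* A[1:6, 1:6]
+```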
Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to ```{julia} -prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') -``` - -We note that @eq-delterm1 is the sum of the first 6 diagonal elements of `prePhi`, - -```{julia} -sum(prePhi[i,i] for i in 1:6) +prePhi = rdiv!(ldiv!(L, dΩdθ1([1.])), L') ``` -which should equal +The derivative of the objective at $\theta_1=1$ is, therefore ```{julia} -ldfun(x::Float64) = logdet(updateL!(setθ!(m01, [x]))) -FiniteDiff.finite_difference_derivative(ldfun, only(θ)) +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) ``` -Similarly the gradient of the other non-constant term in @eq-logdet is +which we can compare to the finite-difference evaluation ```{julia} -size(m01.y, 1) * last(prePhi) -``` - -compared to - -```{julia} -n_log_lyy_sq(x::Float64) = length(m01.y) * log(abs2(last(last(updateL!(setθ!(m01, [x])).L)))) -n_log_lyy_sq(only(θ)) -``` - -```{julia} -FiniteDiff.finite_difference_derivative(n_log_lyy_sq, only(θ)) +FiniteDiff.finite_difference_gradient(objective!(m01), [1.]) ``` +If we repeat these steps at the parameter estimate we have ```{julia} #| output: false updateL!(setθ!(m01, θ)) # reset the value of θ in the model +L = LowerTriangular(sparseL(m01; full=true)) +prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') ``` -If we wish to continue the evaluation of all of the elements of $d\mathbf{L}/d\theta_1$ we would use -the $\Phi$ transformation and premultiplication by $\mathbf{L}$ - -```{julia} -function Φ(S) - val = tril(S) # extract the lower triangle - for i in diagind(val) - val[i] *= 0.5 # halve the diagonal elements - end - return LowerTriangular(val) -end -dLdθ1 = L * Φ(prePhi) -``` - -We can check these results against results from finite difference methods. -First we check the $[1,1]$ entry of $\frac{d\mathbf{L}}{d\theta}$ and its derivative - -```{julia} -l11(θ::T) where {T<:Real} = first(first(updateL!(setθ!(m01, [θ])).L)) -l11(only(θ)) -``` - -and its finite-difference derivative - -```{julia} -FiniteDiff.finite_difference_derivative(l11, only(θ)) -``` - -which matches the leading diagonal elements of `dLdθ1`. - -Next we check the last diagonal element of $\mathbf{L}$, which is the square root of the penalized residual sum-of-squares. -```{julia} -rtprss(θ::T) where {T<:Real} = last(last(updateL!(setθ!(m01, [θ])).L)) -rtprss(only(θ)) -``` +with the derivative being evaluated as ```{julia} -FiniteDiff.finite_difference_derivative(rtprss, only(θ)) +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) ``` -These calculations verify the values of diagonal elements of $\frac{d\mathbf{L}}{d\theta}$ that will be used to evaluate the gradient of the profiled log-likelihood. - -The derivative of the objective is more easily obtained by rearranging the terms in the objective to isolate constants. -First, because the determinant of a triangular matrix is the product of its diagonal elements, and hence the log of the determinant is the sum of the logs of its diagonal elements, - -$$ -\begin{aligned} -\frac{\partial\log\left|\mathbf{L_{ZZ}}\right|^2}{\partial\theta_i}&= -2\frac{\partial\sum_{j=1}^q \log\left(L_{jj}\right)}{\partial\theta_i}\\ -&=2\sum_{j=1}^q\frac{1}{L_{jj}}\frac{\partial{L_{jj}}}{\partial\theta_i} -\end{aligned} -$$ {#eq-logdet} - -Again, because $\mathbf{L}$ is lower triangular, the division by $L_{jj}$ in the summands of @eq-logdet exactly cancels the effect of multiplication on the left by $\mathbf{L}$ in @eq-dL in these diagonal terms. 
-Also the multiplier of $2$ in @eq-logdet cancels the halving of the diagonal in @eq-Phi_dOmega, leaving only @eq-delterm1. - ### Penicillin - two completely crossed scalar random-effects terms The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. ```{julia} #| label: penicillin_data -const penicillin = Table(dataset(:penicillin)) +penicillin = Table(dataset(:penicillin)) ``` We construct a `LinearMixedModel` struct with a single fixed-effect parameter, representing the average diameter in the balanced design, and random effects for each `plate` and each `sample`, @@ -425,7 +363,9 @@ We construct a `LinearMixedModel` struct with a single fixed-effect parameter, r #| label: m02 #| output: false #| warn: false -m02 = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +m02 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +θ = m02.θ +print(m02) ``` for which the concatenated matrix $\left[\mathbf{ZXy}\right]$ is @@ -441,10 +381,10 @@ The Cholesky factor, $\mathbf{L}$, at the initial value $\boldsymbol\theta=\left ```{julia} #| label: m02L -Lsparse = LowerTriangular(sparseL(updateL!(m02); full=true)) +Lsparse = LowerTriangular(sparseL(updateL!(setθ!(m02, ones(2))); full=true)) ``` -In practice, the full $\mathbf{L}$ matrix is stored in a blocked form +In practice, the $\mathbf{L}$ matrix is stored in a blocked form ```{julia} #| label: m02_blocks @@ -460,186 +400,81 @@ objective(m02) #### Evaluating terms in the gradient -For illustration of the gradient evaluation we create the lower-triangular sparse submatrix $\mathbf{L_{ZZ}}$ as +Again, we create the full Gram matrix $\mathbf{A}$ from the blocks stored in the `A` property of the model ```{julia} -#| label: m02LZZ -LZZsparse = LowerTriangular(sparseL(m02)) -``` - -from which $\log\left|\mathbf{L_{ZZ}}\right|^2$ can be evaluated as - -```{julia} -#| label: logdet_m02 -2. * sum(log, diag(LZZsparse)) -``` - -In practice we use the `logdet` function - -```{julia} -#| label: logdet__m02 -logdet(m02) -``` - -which evaluates this quantity from the blocked representation of $\mathbf{L}$. 
- -A finite-difference approximation to the gradient of the `logdet` at this value of $\boldsymbol{\theta}$ is - -```{julia} -ldfun(x::Vector{Float64}) = logdet(updateL!(setθ!(m02, x))) -FiniteDiff.finite_difference_gradient(ldfun, [1., 1.]) -``` - -The matrix $\mathbf{A_{ZZ}}=\mathbf{Z}^\top\mathbf{Z}$ for this model, as a dense matrix, is - -```{julia} -#| label: denseA -A = Int.(hvcat(2, first(m02.A), m02.A[2]', m02.A[2], m02.A[3])) -``` - -and the first face of $\nabla{\boldsymbol{\Lambda}}$ is - -```{julia} -#| label: nabla_Lambda -nabla1 = Int.(Diagonal(vcat(ones(Int, 24), zeros(Int, 6)))) -``` - -With $\boldsymbol{\Lambda(\theta)}$ being - -```{julia} -Λ(θ) = Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6))) -θ = ones(2) # initial parameter vector -Int.(Λ(θ)) # initial value of Λ -``` - -the first face of $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is - -```{julia} -#| label: symprod -symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 -Int.(symprod) -``` - -producing the matrix whose trace is desired as - -```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') # overwrites the value of symprod -``` - -yielding the trace as - -```{julia} -sum(diag(symprod)) -``` - -One point to notice here is that the $[1,1]$ block of this matrix is diagonal, with elements of - -```{julia} -((2 * first(θ)) .* first(m02.A).diag) ./ abs2.(first(m02.L).diag) -``` - -which can be used to simplify the evaluation of the first gradient term. -In particular, the gradient of a model with a single, scalar random-effects term is, unsurprisingly, straightforward. - -For the second element of the gradient we define - -```{julia} -nabla2 = Diagonal(vcat(zeros(24), ones(6))) -Int.(nabla2) -``` - -and - -```{julia} -symprod = nabla2 * A * Λ(ones(2)) + Λ(ones(2)) * A * nabla2 -``` - -The matrix whose trace is required is - -```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') -``` - -producing the second element of the gradient of `ldfun` as - -```{julia} -sum(diag(symprod)) +A2 = let blk = m02.A + hvcat(3, first(blk), blk[2]', blk[4]', blk[2], blk[3], blk[5]', blk[4], blk[5], blk[6]) +end ``` -Notice that the entire $[1,1]$ block of this matrix is zero and will not need to be evaluated explicitly. - -We evaluate $\boldsymbol{\hat{\theta}}$ using a derivative-free optimizer as +The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol(\theta))$ is ```{julia} -θ = refit!(m02).θ +function Λ2(θ::Vector{Float64}) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6), ones(2))) +end +Λ2([1.,1.]) ``` -after which the first face of `symprod` becomes +producing $\boldsymbol{\Omega}$ as ```{julia} -symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 +function Ω2(θ) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Λ2(θ) * A2 * Λ2(θ)' + Diagonal(vcat(ones(30), zeros(2))) +end +Ω2([1.,1.]) ``` -`LZZsparse` becomes +The partial derivatives of `Ω2` are constants ```{julia} -LZZsparse = LowerTriangular(sparseL(m02)) +#| output: false +dΛ2dθ1 = Diagonal(vcat(ones(24), zeros(8))) +dΛ2dθ2 = Diagonal(vcat(zeros(24), ones(6), zeros(2))) ``` -and the matrix whose trace is required is +For the ML objective (i.e. 
negative twice the log-likelihood) a finite difference gradient at the initial $\boldsymbol{\theta}=[1,1]^\top$ is ```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +FiniteDiff.finite_difference_gradient(objective!(m02), ones(2)) ``` -yielding the gradient term +To evaluate this quantity from the formula we create ```{julia} -sum(diag(symprod)) +dΩ2dθ1(θ) = dΛ2dθ1 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ1' +Int.(dΩ2dθ1([1., 1.])) # Int to save space when printing ``` -which can be compared to the finite difference value ```{julia} -FiniteDiff.finite_difference_gradient(ldfun, θ) +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ1([1.,1.])), Lsparse') ``` -(Note, this is the first element of the gradient of the `logdet` term only, not the gradient of the objective which is near zero +yielding the first component of the gradient as ```{julia} -FiniteDiff.finite_difference_gradient(objective!(m02), θ) +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) ``` -as it should be at the optimum.) - -For the second element of the gradient of `ldfun` we have +For the second component of the gradient ```{julia} -symprod = nabla2 * A * Λ(θ) + Λ(θ) * A * nabla2 +dΩ2dθ2(θ) = dΛ2dθ2 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ2' +Int.(dΩ2dθ2([1., 1.])) ``` -After pre- and post-division by `LZZsparse`, this becomes - ```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ2([1.,1.])), Lsparse') ``` -yielding the second element of the gradient of `ldfun` as - ```{julia} -sum(diag(symprod)) +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) ``` -#### Factoring the symmetric matrix - -The matrix $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is symmetric and has the same sparsity structure as $\mathbf{Z^\top Z}$, which is positive semi-definite. -However, it is not clear that the non-zero blocks in $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ will be positive semi-definite in the general case. -In the case of a single variance component it will be positive definite when $\theta_1>0$ because it is $2\theta_1\mathbf{A}$. 
- - ### References {.unnumbered} ::: {#refs} diff --git a/gradients/bibliography.bib b/gradients/bibliography.bib index 25109d8e4..decf33365 100644 --- a/gradients/bibliography.bib +++ b/gradients/bibliography.bib @@ -31,12 +31,14 @@ @article{bates2025mixed title={Mixed-model Log-likelihood Evaluation Via a Blocked Cholesky Factorization}, author={Bates, Douglas and Alday, Phillip M and Kokandakar, Ajinkya H}, journal={arXiv preprint arXiv:2505.11674}, - year={2025} + year={2025}, + url={https://arxiv.org/pdf/2505.11674} } @article{murray2016differentiation, title={Differentiation of the Cholesky decomposition}, author={Murray, Iain}, journal={arXiv preprint arXiv:1602.07527}, - year={2016} + year={2016}, + url={https://arxiv.org/pdf/1602.07527} } From 2e8ac3787d9ba2beff9de3880ca4e66c37b75ee0 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Fri, 9 Jan 2026 11:53:18 -0600 Subject: [PATCH 14/15] Partial gradient for vector-valued r.e.'s --- gradients/GradientEvaluation.qmd | 125 +++++++++++++++++++++++++++---- gradients/gradient.jl | 39 ++++++++++ 2 files changed, 148 insertions(+), 16 deletions(-) create mode 100644 gradients/gradient.jl diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index ac652277d..174c786c4 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -45,7 +45,7 @@ Expressions for the gradient of the profiled log-likelihood were given in sec. 3 The purpose of this note is to provide an alternative derivation for the gradient of the profiled log-likelihood and of the REML criterion for linear mixed-effects models, along with concise algorithms for evaluation of these gradients. -### Model definition and evaluation of the objective and gradient +### Model definition and evaluation of the objective The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as @@ -58,31 +58,27 @@ $$ $$ {#eq-dists} where $\mathbf{X}$ is an $n\times p$ model matrix for the fixed-effects parameter vector, $\boldsymbol{\beta}$, and $\mathbf{Z}$ is an $n\times q$ model matrix for the random effects, $\mathbf{b}$. -Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is positive semi-definite. +Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is symmetric and positive semi-definite. We express it as $$ -\boldsymbol{\Sigma} = \sigma^2 -\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} +\boldsymbol{\Sigma} = \sigma^2\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} $$ {#eq-Sigma} for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. 
In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower-triangular Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship + $$ \begin{aligned} \boldsymbol{\Omega_\theta}&= \begin{bmatrix} \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}& -\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} & +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X}& \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top y}\\ -\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & -\mathbf{X^\top X} & -\mathbf{X^\top y}\\ -\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & -\mathbf{y^\top X} & -\mathbf{y^\top y}\\ +\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & \mathbf{X^\top X} & \mathbf{X^\top y}\\ +\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & \mathbf{y^\top X} & \mathbf{y^\top y}\\ \end{bmatrix}\\ &=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\ &= @@ -100,9 +96,7 @@ $$ $$ {#eq-blockedOmega} where the diagonal elements of $\mathbf{L}_\theta$ are chosen to be positive. -(It is assumed that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.) - -(In `MixedModels.jl` the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) +(We assume that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.) As shown in @bates2025mixed, the objective to be optimized, on the scale of the deviance, which is negative twice the profiled log-likelihood, can be expressed as @@ -131,7 +125,49 @@ $$ $$ {#eq-objective} where $c_r$ is likewise a constant. -This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_{ZZ}}$. +This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_\boldsymbol{\theta}}$. + +### Reformulation for evaluation of derivatives + +When differentiating the ML or REML objective with respect to elements of $\boldsymbol{\theta}$, it is convenient to amalgamate the blocks of $\boldsymbol{\Omega_\theta}$ derived from $\mathbf{X}$ and $\mathbf{y}$ and to re-express @eq-blockedOmega as + +$$ +\begin{aligned} +\boldsymbol{\Omega_\theta}&= +\begin{bmatrix} +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}& +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top[Xy]}\\ +\mathbf{[Xy]^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{[Xy]^\top[Xy]} +\end{bmatrix} + +\begin{bmatrix} +\mathbf{I} & \mathbf{0}\\ +\mathbf{0} & \mathbf{0} +\end{bmatrix}\\ +&=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\ +&= +\begin{bmatrix} +\mathbf{L_{Z,Z}} & \mathbf{0}\\ +\mathbf{L_{Xy,Z}} & \mathbf{L_{Xy,Xy}} +\end{bmatrix} +\begin{bmatrix} +\mathbf{L_{Z,Z}} & \mathbf{0} \\ +\mathbf{L_{Xy,Z}} & \mathbf{L_{Xy,Xy}} +\end{bmatrix}^\top +\end{aligned} +$$ {#eq-blockedOmega_mod} + +where $\mathbf{[Xy]}$ represents the $n\times(p+1)$ matrix that is the horizontal concatenation of $\mathbf{X}$ and $\mathbf{y}$. 
+The matrices $\mathbf{A_{11}}=\mathbf{Z^\top Z}$, $\mathbf{A_{21}}=\mathbf{[Xy]^\top Z}$ and $\mathbf{A_{22}}=\mathbf{[Xy]^\top[Xy]}$, assembled as + +$$ +\mathbf{A}=\begin{bmatrix} +\mathbf{A_{11}} & \mathbf{A_{21}^\top}\\ +\mathbf{A_{21}} & \mathbf{A_{22}} +\end{bmatrix} , +$$ {#eq-Amat} + +are precomputed and stored as the `A` property in a `LinearMixedModel` object when random effects are associated with a single grouping factor. ### General expressions for differentiating a Cholesky factor {#sec-Cholesky_derivative} @@ -178,8 +214,10 @@ Load the packages to be used #| label: load_packages #| warning: false #| output: false +using BenchmarkTools using CairoMakie using FiniteDiff +using ForwardDiff using LinearAlgebra using MixedModels using MixedModelsDatasets: dataset @@ -408,7 +446,7 @@ A2 = let blk = m02.A end ``` -The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol(\theta))$ is +The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol\theta)$ is ```{julia} function Λ2(θ::Vector{Float64}) @@ -475,6 +513,61 @@ prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ2([1.,1.])), Lsparse') dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) ``` +### Sleepstudy - a single vector-valued random-effects term + +```{julia} +m03 = fit(MixedModel, @formula(reaction ~ 1 + days + (1+days|subj)), dataset(:sleepstudy); progress) +θ03 = m03.θ +print(m03) +``` + +For this model we create a function to evaluate the gradient components. +First, the gradient of the logarithm of the square of the determinant of $\mathbf{L_{1,1}}$ + +```{julia} +ldfun(θ::Vector{Float64}) = logdet(updateL!(setθ!(m03, θ))) +ldfun(θ03) +``` + +A finite-difference approximation to the gradient is + +```{julia} +FiniteDiff.finite_difference_gradient(ldfun, θ03) +``` + +with the corresponding analytic value + +```{julia} +include("gradient.jl") +grad_comp(updateL!(setθ!(m03, θ03))) # need to reset θ after the finite_difference operation +``` + + +#### Speed of evaluation + +First, the evaluation of the log-determinant of $\mathbf{L_{1,1}}$ + +```{julia} +@benchmark logdet(updateL!(setθ!($m03, $θ03))) seconds=1 +``` + +Then the subsequent evaluation of the gradient of the log-determinant + +```{julia} +@benchmark grad_comp($m03) seconds=1 +``` + +For comparison, the finite-difference evaluation +```{julia} +@benchmark FiniteDiff.finite_difference_gradient($ldfun, $θ03) seconds=1 +``` + +and the forward-difference version + +```{julia} +@benchmark ForwardDiff.gradient($m03, $θ03) seconds=1 +``` + ### References {.unnumbered} ::: {#refs} diff --git a/gradients/gradient.jl b/gradients/gradient.jl new file mode 100644 index 000000000..8b0a8ff79 --- /dev/null +++ b/gradients/gradient.jl @@ -0,0 +1,39 @@ +using LinearAlgebra +using MixedModels +using MixedModelsDatasets: dataset + +""" + grad_comp(m::LinearMixedModel) + +Returns the gradient of the log-determinant part of the objective for `m`, +which must have a single, vector-valued random-effects term. 
+""" +function grad_comp(m::LinearMixedModel{T}) where {T} + (; reterms, parmap, A, L) = m + A11 = first(A).data + L11 = first(L).data + λ = only(reterms).λ # checks that there is exactly one random-effects term + λdot = similar(λ) + face = similar(λ.data) + grad = zeros(T, length(parmap)) + for (p, pm) in enumerate(parmap) + fill!(λdot, zero(T)) + λdot[pm[2], pm[3]] = one(T) + for k in axes(A11, 3) # loop over faces of A[1].data + rmul!(lmul!(λ', copyto!(face, view(A11, :, :, k))), λdot) + for i in axes(face, 1) # symmetrize the face and double the diagonal + for j in 1:(i - 1) + ijsum = face[i, j] + face[j, i] + face[j, i] = face[i, j] = ijsum + end + face[i, i] *= 2 + end + Lface = LowerTriangular(view(L11, :, :, k)) + rdiv!(ldiv!(Lface, face), Lface') + for i in diagind(face) + grad[p] += face[i] + end + end + end + return grad +end \ No newline at end of file From cfba091042cc8f3f8ec4dd4af3e2e3b5883d46a1 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Tue, 13 Jan 2026 19:04:52 -0600 Subject: [PATCH 15/15] Expand docs, start src/gradient.jl --- gradients/GradientEvaluation.qmd | 41 ++++++-- gradients/Gradient_based_optimization.qmd | 123 ++++++++++++++++++---- src/MixedModels.jl | 1 + src/gradient.jl | 56 ++++++++++ test/grad.jl | 73 +++++++++++++ test/runtests.jl | 1 + 6 files changed, 269 insertions(+), 26 deletions(-) create mode 100644 src/gradient.jl create mode 100644 test/grad.jl diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index 174c786c4..9c11192ff 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -220,6 +220,7 @@ using FiniteDiff using ForwardDiff using LinearAlgebra using MixedModels +using MixedModels: Omega_dot_diag_block! using MixedModelsDatasets: dataset using TypedTables: Table @@ -251,11 +252,13 @@ print(m01) ```{julia} #| label: fig-obj_graph -#| fig-cap: "Graph of the objective for model m01 as a function of θ₁" +#| fig-cap: "Graph of the objective for model m01 as a function of θ₁. The light blue horizontal line is at the minimum of the objective. The vertical line is at the parameter estimate." #| code-fold: true let f = Figure() ax = Axis(f[1,1], xlabel="θ₁", ylabel="objective") lines!(ax, -0.25..1.5, objective!(m01)) + hlines!(objective(updateL!(setθ!(m01, θ))); alpha=0.4) + vlines!(only(θ); alpha=0.4) f end ``` @@ -264,7 +267,7 @@ Notice that the objective is well-defined for negative values of $\theta_1$ and This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. -At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$ +At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$, which is the ratio of the standard deviation of the random effects to the residual standard deviation, is ```{julia} #| label: dyestuff_theta @@ -352,12 +355,16 @@ dΩdθ1([1.]) ``` Notice that this matrix, like $\mathbf{A}$, is symmetric and has the same block structure as $\mathbf{A}$. -In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. +In fact, the $[2,1]$ block of this matrix is exactly the same as the $[2,1]$ block of $\mathbf{A}$. 
-Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to +We do the premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ in two stages, to check the form of the intermediate result ```{julia} -prePhi = rdiv!(ldiv!(L, dΩdθ1([1.])), L') +prePhi = ldiv!(L, dΩdθ1([1.])) +``` + +```{julia} +rdiv!(prePhi, L') ``` The derivative of the objective at $\theta_1=1$ is, therefore @@ -386,6 +393,25 @@ with the derivative being evaluated as dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) ``` +#### Using blocked factors + +It would not be practical to create the `sparseL` matrix and a dense copy of $\mathbf{A}$, as is done here, for general cases. +The whole point of using a blocked Cholesky factor for evaluating the objective is that the $[1,1]$ block is either diagonal or uniform-block-diagonal, which can result in considerable savings of object size and execution time when evaluating the objective. + +We would want to retain this time and memory savings when evaluating the gradient by creating a blocked computation for evaluation of the gradient. + +The blocked representations of both the $\mathbf{A}$ and $\mathbf{L}$ arrays store only the lower triangle. +It appears that it will be necessary to store the upper triangle in addition to the lower triangle to be able to evaluate the gradient. +There may be a clever way around this but right now I can't see one. + +To create the blocked representation of $\Omega$ we begin with the diagonal block in which the parameter component to be differentiated occurs. +It is assumed that the block has already been allocated. + +```{julia} +dΩdθ1_blk = Omega_dot_diag_block!(similar(first(m01.A)), m01, 1) +``` + + ### Penicillin - two completely crossed scalar random-effects terms The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. 
@@ -444,6 +470,7 @@ Again, we create the full Gram matrix $\mathbf{A}$ from the blocks stored in the A2 = let blk = m02.A hvcat(3, first(blk), blk[2]', blk[4]', blk[2], blk[3], blk[5]', blk[4], blk[5], blk[6]) end +Int.(A2) ``` The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol\theta)$ is @@ -453,7 +480,7 @@ function Λ2(θ::Vector{Float64}) length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) return Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6), ones(2))) end -Λ2([1.,1.]) +Int.(Λ2([1.,1.])) ``` producing $\boldsymbol{\Omega}$ as @@ -463,7 +490,7 @@ function Ω2(θ) length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) return Λ2(θ) * A2 * Λ2(θ)' + Diagonal(vcat(ones(30), zeros(2))) end -Ω2([1.,1.]) +Int.(Ω2([1.,1.])) ``` The partial derivatives of `Ω2` are constants diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index d3167307f..96ee4eb08 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -48,6 +48,8 @@ Load the packages to be used ```{julia} #| label: load_packages +#| output: false +using BenchmarkTools using ForwardDiff using MixedModels using MixedModels: fd_deviance @@ -56,7 +58,7 @@ using NLopt using Tables: table using TypedTables: Table -const progress = false +const progress = isinteractive() ``` ## Examples {#sec-examples} @@ -64,6 +66,7 @@ const progress = false We create a function to take a `LinearMixedModel` that has been fit and refit it using the `:LD_LBFGS` optimizer applied to an objective function that evaluates the gradient using `ForwardDiff`. ```{julia} +#| output: false addinds(ch::Char, n::Integer) = Symbol.(lpad.(string.(ch, Base.OneTo(n)), ndigits(n), '0')) function gr_refit!(m::LinearMixedModel{T}) where {T} θ = copy(m.optsum.initial) @@ -83,8 +86,9 @@ function gr_refit!(m::LinearMixedModel{T}) where {T} return val end opt = NLopt.Opt(:LD_LBFGS, k) - NLopt.ftol_rel!(opt, 1.e-12) + NLopt.ftol_rel!(opt, 1.e-10) NLopt.ftol_abs!(opt, 1.e-8) + NLopt.initial_step!(opt, fill(0.5, k)) NLopt.min_objective!(opt, obj) min_f, min_x, ret = NLopt.optimize(opt, θ) header = vcat([:obj], addinds('θ', k), addinds('g', k)) @@ -98,31 +102,32 @@ Define a model for the `penicillin` data ```{julia} #| label: m1 -m1 = fit( +pnm01 = fit( MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), dataset(:penicillin); progress, ) -print(m1) +pnm01_obj = objective(pnm01) +print(pnm01) ``` for which the optimization summary is ```{julia} -m1.optsum +pnm01.optsum ``` and refit the model using ForwardDiff gradient evaluations. 
```{julia} -fitlog = gr_refit!(m1) +fitlog = gr_refit!(pnm01) ``` -The objective at convergence is +The objective at convergence, compared to the derivative-free optimum ```{julia} -last(fitlog.obj) +pnm01_obj - last(fitlog.obj) ``` and the last few evaluations are @@ -131,23 +136,89 @@ and the last few evaluations are last(fitlog, 5) ``` +### Pastes {#sec-pastes} + +```{julia} +psm01 = fit(MixedModel, @formula(strength ~ 1 + (1|batch/cask)), dataset(:pastes); progress) +psm01_obj = objective(psm01) +print(psm01) +``` + +```{julia} +psm01.optsum +``` + +```{julia} +fitlog = gr_refit!(psm01) +``` + +```{julia} +psm01_obj - last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` + +### Insteval {#sec-insteval} + +```{julia} +insteval = dataset(:insteval) +contrasts = Dict(:service => EffectsCoding()) +iem01 = fit( + MixedModel, + @formula(y ~ 1 + service + (1|s) + (1|d) + (1|dept)), + insteval; + progress, contrasts, +) +iem01_obj = objective(iem01) +print(iem01) +``` + +```{julia} +iem01.optsum +``` + +```{julia} +fitlog = gr_refit!(iem01) +``` + +```{julia} +iem01_obj - last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` + +This is an example where the number of evaluations to convergence is lower when using the gradient but the time to fit the model is much greater - primarily because the ForwardDiff gradient allocates so much memory. + +```{julia} +@benchmark refit!($iem01; progress=false) seconds=10 +``` + +```{julia} +@benchmark gr_refit!($iem01) seconds=30 +``` + ### Sleepstudy {#sec-sleepstudy} ```{julia} -m2 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) -print(m2) +slm01 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) +slm01_obj = objective(slm01) +print(slm01) ``` ```{julia} -m2.optsum +slm01.optsum ``` ```{julia} -fitlog = gr_refit!(m2) +fitlog = gr_refit!(slm01) ``` ```{julia} -last(fitlog.obj) +slm01_obj - last(fitlog.obj) ``` ```{julia} @@ -158,33 +229,47 @@ last(fitlog, 5) ```{julia} # this model is very overparameterized, but it's a test example -m3 = fit( +kbm01 = fit( MixedModel, @formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), dataset(:kb07); progress, ) -print(m3) +print(kbm01) ``` ```{julia} -m3.optsum +kbm01.optsum ``` Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector ```{julia} -findall(iszero, m3.θ) +findall(iszero, kbm01.θ) ``` +Refitting with the gradient takes a very long time because ForwardDiff is poorly suited to optimization problems with many parameters. ```{julia} -fitlog = gr_refit!(m3) +#| eval: false +fitlog = gr_refit!(kbm01) ``` ```{julia} +#| eval: false last(fitlog.obj) ``` ```{julia} +#| eval: false last(fitlog, 5) -``` \ No newline at end of file +``` + +## Conclusions {#sec-conclusions} + +Generally the gradient-based optimizers converge in fewer evaluations than the derivative-free optimizers (`psm01` in @sec-pastes is an exception). +Although the `ftol_rel` criterion is looser for the gradient-based optimizer it usually achieves a lower optimum value, as shown by the differences like `pnm01_obj - last(fitlog.obj)` being positive. + +I think the most interesting result is for the `insteval` data where the three-parameter optimization takes 81 function evaluations for `LN_NEWUOA` but 34 evaluations for `LD_LBFGS`. 
+However, the ForwardDiff gradient evaluation takes much longer because it allocates so much memory (and it may be using a non-BLAS Cholesky factorization of $1128\times1128$ symmetric matrix). + +I think this is a case where an analytic gradient could be useful. diff --git a/src/MixedModels.jl b/src/MixedModels.jl index 61d184c84..f3400bc98 100644 --- a/src/MixedModels.jl +++ b/src/MixedModels.jl @@ -214,6 +214,7 @@ include("blockdescription.jl") include("grouping.jl") include("mimeshow.jl") include("serialization.jl") +include("gradient.jl") include("profile/profile.jl") include("MixedModelsNLoptExt.jl") using .MixedModelsNLoptExt diff --git a/src/gradient.jl b/src/gradient.jl new file mode 100644 index 000000000..b6c68bdbb --- /dev/null +++ b/src/gradient.jl @@ -0,0 +1,56 @@ +# Evaluate analytic gradient of the objective for ML or REML fitting of a LinearMixedModel + +""" + Omega_dot_diag_block!(blk, m::LinearMixedModel, p::Integer) + +Fill `blk` as the non-zero diagonal block of ∂Ω/∂θₚ for parameter number `p` of model `m`. + +For any `p` only one diagonal block of ∂Ω/∂θₚ will be non-zero. +""" +function Omega_dot_diag_block!( + blk::Diagonal{T, Vector{T}}, + m::LinearMixedModel{T}, + p::Integer, +) where {T} + (; parmap, A, reterms) = m + b, i, j = parmap[p] + isone(i) && isone(j) || throw(ArgumentError("parameter $b should be from a scalar r.e. term")) + blk_diag = blk.diag + A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal + length(blk_diag) == length(A_diag) || throw(DimensionMismatch("A_diag and blk_diag have different lengths")) + λ = only(reterms[b].λ) # will throw an error if reterms[b] is not of size (1,1) + for k in eachindex(blk_diag, A_diag) + blk_diag[k] = T(2) * λ * A_diag[k] + end + return blk +end + +function Omega_dot_diag_block!( + blk::UniformBlockDiagonal{T}, + m::LinearMixedModel{T}, + p::Integer, +) where {T} + (; parmap, A, reterms) = m + b, i, j = parmap[p] + Ablk = A[kp1choose2(b)] + if !isa(Ablk, UniformBlockDiagonal{T}) + throw(ArgumentError("parmap[p] = $(parmap[p]) but A[$(kp1choose2(b))] is not UniformBlockDiagonal")) + end + blk_dat = fill!(blk.data, zero(T)) + Ablk_dat = Ablk.data + λ = reterms[b].λ + for k in axes(blk_dat, 3) + # right multiply by λ-dot-transpose, which is zeros except for a single 1 at the i'th column and j'th row + # thus we copy the i'th column of the k'th face of Ablk_dat into the j'th column of the k'th face of blk_dat + copyto!(view(blk_dat, :, i, k), view(Ablk_dat, :, j, k)) + lmul!(λ, view(blk_dat, :, :, k)) # left-multiply by λ + for jj in axes(λ, 2) # symmetrize the face while multiplying the diagonal by 2 + for ii in 1:(jj - 1) + val = blk_dat[ii, jj, k] + blk_dat[jj, ii, k] + blk_dat[ii, jj, k] = blk_dat[jj, ii, k] = val + end + blk_dat[jj, jj, k] *= T(2) + end + end + return blk +end diff --git a/test/grad.jl b/test/grad.jl new file mode 100644 index 000000000..1e036be7d --- /dev/null +++ b/test/grad.jl @@ -0,0 +1,73 @@ +# using FiniteDiff +using LinearAlgebra +using MixedModels +using Test +using MixedModels: Omega_dot_diag_block! + +using MixedModelsDatasets: dataset + +include("modelcache.jl") + +@testset "gradient" begin + @testset "single_scalar" begin + fm1 = only(models(:dyestuff2)) + θ = fm1.θ + blk = Omega_dot_diag_block!(similar(first(fm1.A)), fm1, 1) + @test all(≈(10. 
* only(θ)), blk.diag) + Omega_dot_diag_block!(blk, updateL!(setθ!(fm1, ones(1))), 1) + @test all(==(10.), blk.diag) + updateL!(setθ!(fm1, θ)) # restore the estimated parameter values + + fm2 = first(models(:pastes)) + θ = fm2.θ + blk = Omega_dot_diag_block!(similar(first(fm2.A)), fm2, 1) + @test all(≈(4. * only(θ)), blk.diag) + Omega_dot_diag_block!(blk, updateL!(setθ!(fm2, ones(1))), 1) + @test all(==(4.), blk.diag) + updateL!(setθ!(fm2, θ)) + + fm3 = last(models(:pastes)) # first of two nested, scalar r.e. terms + θ = fm3.θ + blk = Omega_dot_diag_block!(similar(first(fm3.A)), fm3, 1) + @test all(≈(4. * first(θ)), blk.diag) + + fm4 = only(models(:penicillin)) + blk = Omega_dot_diag_block!(similar(first(fm4.A)), fm4, 1) + @test all(≈(12. * first(fm4.θ)), blk.diag) + + fm5 = first(models(:sleepstudy)) + blk = Omega_dot_diag_block!(similar(first(fm5.A)), fm5, 1) + @test all(≈(20. * only(fm5.θ)), blk.diag) + end + @testset "single_vector" begin + fm6 = last(models(:sleepstudy)) + λ = only(fm6.reterms).λ + θ = fm6.θ + blk = Omega_dot_diag_block!(UniformBlockDiagonal(similar(first(fm6.L).data)), fm6, 1) + blk_dat = blk.data + A11_dat = first(fm6.A).data + @test all(≈(20. * first(θ)), view(blk_dat, 1, 1, :)) + @test all(iszero, view(blk_dat, 2, 2, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk_dat, 2, 1, :)) + odiag = dot(view(λ, 2, :), view(A11_dat, :, 1, 1)) + @test all(≈(odiag), view(blk_dat, 1, 2, :)) + + Omega_dot_diag_block!(blk, fm6, 2) + @test all(iszero, view(blk_dat, 1, 1, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk, 2, 1, :)) # result should be symmetric + @test all(==(10. * first(θ)), view(blk_dat, 1, 2, :)) + diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 1, 1)) + @test all(≈(diag2), view(blk_dat, 2, 2, :)) + + Omega_dot_diag_block!(blk, fm6, 3) + @test all(iszero, view(blk_dat, 1, 1, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk, 2, 1, :)) # faces of result should be symmetric + @test all(≈(45. * first(θ)), view(blk_dat, 1, 2, :)) + diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 2, 1)) + @test all(≈(diag2), view(blk_dat, 2, 2, :)) + +# FiniteDiff.finite_difference_gradient(objective!(fm6), θ) +# ldfun(m::LinearMixedModel, θ::Vector{Float64}) = logdet(updateL!(setθ!(m, θ))) + + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 0631ca3a0..20cb6d4f5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -47,6 +47,7 @@ include("mime.jl") include("optsummary.jl") include("predict.jl") include("sigma.jl") +include("grad.jl") @testset "PRIMA" include("prima.jl") @testset "ForwardDiff" include("forwarddiff.jl")