From a2cbbaeb2ed655fe2c8940f67519a7ab12899d12 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 28 Dec 2025 22:20:31 -0600 Subject: [PATCH 01/15] write up on gradients --- experiments/.gitignore | 4 + experiments/GradientEvaluation.qmd | 383 ++++++++++++++++++++ experiments/Gradient_based_optimization.qmd | 197 ++++++++++ experiments/Project.toml | 10 + experiments/bibliography.bib | 36 ++ 5 files changed, 630 insertions(+) create mode 100644 experiments/.gitignore create mode 100644 experiments/GradientEvaluation.qmd create mode 100644 experiments/Gradient_based_optimization.qmd create mode 100644 experiments/Project.toml create mode 100644 experiments/bibliography.bib diff --git a/experiments/.gitignore b/experiments/.gitignore new file mode 100644 index 000000000..7d064f72a --- /dev/null +++ b/experiments/.gitignore @@ -0,0 +1,4 @@ +*.html +*\~ +*.swp + diff --git a/experiments/GradientEvaluation.qmd b/experiments/GradientEvaluation.qmd new file mode 100644 index 000000000..776221549 --- /dev/null +++ b/experiments/GradientEvaluation.qmd @@ -0,0 +1,383 @@ +--- +title: "Evaluation of the Gradient of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. +format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +A comparison of algorithms for estimation of variance components given in the supplemental materials for @Zhou03042019 shows the Fisher scoring algorithm taking the fewest iterations to convergence compared to an EM algorithm and the minorization-maximization (MM) algorithm presented in that paper. +The model being simulated in @Zhou03042019, sec 3.2 is relatively simple, with random effects for two factors and their interaction in a balanced crossed design. + +The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. +The optimization itself is performed by a gradient-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. + +Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. + +The purpose of this note is to explore whether these expressions can be implemented effectively, even if just for the variance components model, which, for our purposes, is a model in which all the random effects terms are simple, scalar terms. 
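In `MixedModels.jl` formula notation this means every random-effects term has the form `(1|g)`.
A small sketch, using datasets from `MixedModelsDatasets.jl` purely for illustration (the model names here are arbitrary and the models are constructed but not fitted):

```{julia}
using MixedModels
using MixedModelsDatasets: dataset

# all random-effects terms are simple, scalar (intercept-only) terms: a variance components model
vc = LinearMixedModel(@formula(yield ~ 1 + (1 | batch)), dataset(:dyestuff))

# the vector-valued term (1 + days | subj) puts this model outside the class considered here
notvc = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days | subj)), dataset(:sleepstudy))

map(length, (vc.θ, notvc.θ))  # one θ element for the scalar term versus three for the 2×2 factor
```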
+ +### Expressions for the gradient terms + +The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as + +$$ +\begin{aligned} + (\mathbfcal{Y}|\mathbfcal{B}=\mathbf{b})& + \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right)\\ + \mathbfcal{B}& + \sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma}) +\end{aligned} +$$ {#eq-dists} + +where $\mathbf{X}$ is an $n\times p$ model matrix for the fixed-effects parameter vector, $\boldsymbol{\beta}$, and $\mathbf{Z}$ is an $n\times q$ model matrix for the random effects, $\mathbf{b}$. +Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is positive semi-definite. +We express it as + +$$ +\boldsymbol{\Sigma} = \sigma^2 +\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} +$$ {#eq-Sigma} + +for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. + +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, of + +$$ +\boldsymbol{\Omega_\theta} = +\begin{bmatrix} +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}& +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} & +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top y}\\ +\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{X^\top X} & +\mathbf{X^\top y}\\ +\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{y^\top X} & +\mathbf{y^\top y}\\ +\end{bmatrix} +$$ {#eq-blockedOmega} + +where $\mathbf{L}_\theta$ has a similar blocked structure + +$$ +\mathbf{L}_\boldsymbol{\theta} = +\begin{bmatrix} +\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ +\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ +\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} +\end{bmatrix} +$$ {#eq-blockedL} + +(In the actual computational methods the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) + +The objective to be optimized is negative twice the profiled log-likelihood, + +$$ +-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y}) = +\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right] +$$ {#eq-objective} + +which is on the scale of the deviance (if we were able to define a deviance for these models). 
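As a concrete check of @eq-objective, the following sketch fits a small model by maximum likelihood and reassembles the objective from `logdet`, which returns $\log\left|\mathbf{L_{ZZ}}\right|^2$, and the penalized residual sum of squares $\ell^2_{\mathbf{yy}}$, assumed here to be available through the `pwrss` accessor (the `nobs` accessor and the `dyestuff` data are likewise used only for illustration):

```{julia}
using MixedModels
using MixedModelsDatasets: dataset

fm = fit(MixedModel, @formula(yield ~ 1 + (1 | batch)), dataset(:dyestuff); progress=false)
n = nobs(fm)  # for an unweighted ML fit the denominator in @eq-objective is n
logdet(fm) + n * (1 + log(2π * MixedModels.pwrss(fm) / n)) ≈ objective(fm)
```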
+ +As shown in @bates.maechler.etal:2015, sec 3.5 the gradient of the first summand in @eq-objective is + +$$ +\begin{aligned} +\nabla\log\left|\mathbf{L_ZZ}\right|^2 &= \nabla\log\left(\left|\mathbf{L_{ZZ}L_{ZZ}}^\top\right|\right)\\ +&=\nabla\log\left(\left|\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right|\right)\\ +&=\operatorname{tr}\left[\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right) +\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right)^{-1}\right]\\ +&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1} +\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right) +\mathbf{L_{ZZ}}^{-\top} +\right]\\ +&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1} +\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right) +\mathbf{L_{ZZ}}^{-\top} +\right] +\end{aligned} +$$ {#eq-delterm1} + +For the models that we wish to consider the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple in that they are block diagonal with a single non-zero diagonal block, which is an identity matrix. + +## Examples + +To aid in understanding the structure of these equations we consider the structure of the various matrices and their blocks in some simple examples. + +Load the packages to be used + +```{julia} +#| label: load_packages +#| warning: false +#| output: false +using FiniteDiff +using LinearAlgebra +using MixedModels +using MixedModelsDatasets: dataset +using TypedTables: Table +``` + +### Penicillin - two completely crossed scalar random-effects terms + +The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. + +```{julia} +#| label: penicillin_data +const penicillin = Table(dataset(:penicillin)) +``` + +We construct a `LinearMixedModel` struct with a single fixed-effect parameter, representing the average diameter in the balanced design, and random effects for each `plate` and each `sample`, + +```{julia} +#| label: m02 +#| output: false +#| warn: false +m02 = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +``` + +for which the concatenated matrix $\left[\mathbf{ZXy}\right]$ is + +```{julia} +#| label: m02ZXy +Int.(hcat(collect(first(m02.reterms)), collect(last(m02.reterms)), m02.X, m02.y)) +``` + +in which the first 24 columns are the indicators for `plate`, the next 6 columns are the indicators for `sample`, the second-to-last column is the single column of the fixed-effects model matrix, $\mathbf{X}$, and the last column is $\mathbf{y}$. 
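A quick sanity check of that layout, as a sketch reusing the objects just created: in this balanced design each `plate` indicator column should sum to 6 and each `sample` indicator column to 24.

```{julia}
ZXy = hcat(collect(first(m02.reterms)), collect(last(m02.reterms)), m02.X, m02.y)
size(ZXy) == (144, 32) &&
    all(==(6), sum(ZXy[:, 1:24]; dims=1)) &&   # plate indicator columns
    all(==(24), sum(ZXy[:, 25:30]; dims=1))    # sample indicator columns
```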
+ +The Cholesky factor, $\mathbf{L}$, at the initial value $\boldsymbol\theta=\left[1,1\right]^\top$, can be expressed as a lower-triangular sparse matrix as + +```{julia} +#| label: m02L +Lsparse = LowerTriangular(sparseL(updateL!(m02); full=true)) +``` + +In practice, the full $\mathbf{L}$ matrix is stored in a blocked form + +```{julia} +#| label: m02_blocks +BlockDescription(m02) +``` + +from which the profiled objective (negative twice the log-likelihood) can be evaluated as + +```{julia} +#| label: m02L_initial_objective +objective(m02) +``` + +#### Evaluating terms in the gradient + +For illustration of the gradient evaluation we create the lower-triangular sparse submatrix $\mathbf{L_{ZZ}}$ as + +```{julia} +#| label: m02LZZ +LZZsparse = LowerTriangular(sparseL(m02)) +``` + +from which $\log\left|\mathbf{L_{ZZ}}\right|^2$ can be evaluated as + +```{julia} +#| label: logdet_m02 +2. * sum(log, diag(LZZsparse)) +``` + +In practice we use the `logdet` function + +```{julia} +#| label: logdet__m02 +logdet(m02) +``` + +which evaluates this quantity from the blocked representation of $\mathbf{L}$. + +A finite-difference approximation to the gradient of the `logdet` at this value of $\boldsymbol{\theta}$ is + +```{julia} +ldfun(x::Vector{Float64}) = logdet(updateL!(setθ!(m02, x))) +FiniteDiff.finite_difference_gradient(ldfun, [1., 1.]) +``` + +The matrix $\mathbf{A_{ZZ}}=\mathbf{Z}^\top\mathbf{Z}$ for this model, as a dense matrix, is + +```{julia} +#| label: denseA +A = Int.(hvcat(2, first(m02.A), m02.A[2]', m02.A[2], m02.A[3])) +``` + +and the first face of $\nabla{\boldsymbol{\Lambda}}$ is + +```{julia} +#| label: nabla_Lambda +nabla1 = Int.(Diagonal(vcat(ones(Int, 24), zeros(Int, 6)))) +``` + +With $\boldsymbol{\Lambda(\theta)}$ being + +```{julia} +Λ(θ) = Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6))) +θ = ones(2) # initial parameter vector +Int.(Λ(θ)) # initial value of Λ +``` + +the first face of $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is + +```{julia} +#| label: symprod +symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 +Int.(symprod) +``` + +producing the matrix whose trace is desired as + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') # overwrites the value of symprod +``` + +yielding the trace as + +```{julia} +sum(diag(symprod)) +``` + +One point to notice here is that the $[1,1]$ block of this matrix is diagonal, with elements of + +```{julia} +((2 * first(θ)) .* first(m02.A).diag) ./ abs2.(first(m02.L).diag) +``` + +which can be used to simplify the evaluation of the first gradient term. +In particular, the gradient of a model with a single, scalar random-effects term is, unsurprisingly, straightforward. + +For the second element of the gradient we define + +```{julia} +nabla2 = Diagonal(vcat(zeros(24), ones(6))) +Int.(nabla2) +``` + +and + +```{julia} +symprod = nabla2 * A * Λ(ones(2)) + Λ(ones(2)) * A * nabla2 +``` + +The matrix whose trace is required is + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +``` + +producing the second element of the gradient of `ldfun` as + +```{julia} +sum(diag(symprod)) +``` + +Notice that the entire $[1,1]$ block of this matrix is zero and will not need to be evaluated explicitly. 
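Wrapping the two computations in a small helper (the name `ld_grad_term` is ours) makes the comparison with the finite-difference gradient of `ldfun` at $\boldsymbol{\theta}=[1,1]^\top$ explicit; this sketch reuses `A`, `Λ`, `nabla1`, `nabla2`, and `LZZsparse` as defined above.

```{julia}
function ld_grad_term(nabla, θvec)
    S = nabla * A * Λ(θvec) + Λ(θvec) * A * nabla  # one face of ∇Λ'Z'ZΛ + Λ'Z'Z∇Λ
    return sum(diag(rdiv!(ldiv!(LZZsparse, Matrix(S)), LZZsparse')))  # tr(L⁻¹ S L⁻ᵀ)
end
analytic = [ld_grad_term(nabla1, ones(2)), ld_grad_term(nabla2, ones(2))]
hcat(analytic, FiniteDiff.finite_difference_gradient(ldfun, ones(2)))  # columns should agree to finite-difference accuracy
```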
+ +We evaluate $\boldsymbol{\hat{\theta}}$ using a derivative-free optimizer as + +```{julia} +θ = refit!(m02).θ +``` + +after which the first face of `symprod` becomes + +```{julia} +symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 +``` + +`LZZsparse` becomes + +```{julia} +LZZsparse = LowerTriangular(sparseL(m02)) +``` + +and the matrix whose trace is required is + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +``` + +yielding the gradient term + +```{julia} +sum(diag(symprod)) +``` + +which can be compared to the finite difference value + +```{julia} +FiniteDiff.finite_difference_gradient(ldfun, θ) +``` + +(Note, this is the first element of the gradient of the `logdet` term only, not the gradient of the objective which is near zero + +```{julia} +FiniteDiff.finite_difference_gradient(objective!(m02), θ) +``` + +as it should be at the optimum.) + +For the second element of the gradient of `ldfun` we have + +```{julia} +symprod = nabla2 * A * Λ(θ) + Λ(θ) * A * nabla2 +``` + +After pre- and post-division by `LZZsparse`, this becomes + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +``` + +yielding the second element of the gradient of `ldfun` as + +```{julia} +sum(diag(symprod)) +``` + +#### Factoring the symmetric matrix + +The matrix $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is symmetric and has the same sparsity structure as $\mathbf{Z^\top Z}$, which is positive semi-definite. +However, it is not clear that the non-zero blocks in $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ will be positive semi-definite in the general case. +In the case of a single variance component it will be positive definite when $\theta_1>0$ because it is $2\theta_1\mathbf{A}$. + + +### References {.unnumbered} + +::: {#refs} +::: diff --git a/experiments/Gradient_based_optimization.qmd b/experiments/Gradient_based_optimization.qmd new file mode 100644 index 000000000..ae2d0a47b --- /dev/null +++ b/experiments/Gradient_based_optimization.qmd @@ -0,0 +1,197 @@ +--- +title: "Gradient-based Optimization of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. +format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +Before devoting too much effort to efficient evaluation of the gradient of the profiled log-likelihood, we should check if using gradient-based optimization requires sufficiently fewer evaluations of the objective, and the gradient, than does derivative-free optimization. 
+ +Here we fit a few models using automatic differentiation from [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) and the `ForwardDiff` extension to [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) to optimize the profiled log-likelihood with the `LD_LBFGS` optimizer from [NLopt.jl](https://github.com/jump-dev/NLopt.jl), instead of the default `LN_NEWUOA` which does not use gradients. + +The results are more-or-less a toss-up when using `ForwardDiff` to evaluate the gradient. +A more efficient evaluation of the gradient, taking advantage of the sparse-blocked structure of the Cholesky factorization to evaluate the profiled log-likelihood, may tip the balance in favor of gradient-based methods. + +## Preliminaries {#sec-prelim} + +Load the packages to be used + +```{julia} +#| label: load_packages +using BenchmarkTools +using ForwardDiff +using MixedModels +using MixedModelsDatasets: dataset +using NLopt +using Tables: table +using TypedTables: Table +``` + +## Examples {#sec-examples} + +### Penicillin data {#sec-penicillin} + +Load the `penicillin` dataset +```{julia} +penicillin = Table(dataset(:penicillin)) +``` + +and define a model + +```{julia} +#| label: const_defs +#| output: false +m = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +θ = copy(m.θ) +k = length(θ) +const fitlog = sizehint!(Vector{Float64}(undef, 0), 200) +``` + +with an NLopt-compatible objective function + +```{julia} +#| label: obj_def +function obj(θ::Vector{Float64}, grad::Vector{Float64}) + val = objective(updateL!(setθ!(m, θ))) + push!(fitlog, val) + append!(fitlog, θ) + if !isempty(grad) + copyto!(grad, ForwardDiff.gradient(m, θ)) + append!(fitlog, grad) + end + return val +end +``` + +A benchmark of evaluating the objective only + +```{julia} +#| label: benchmark_obj +@benchmark objective(updateL!(setθ!($m, $θ))) seconds=1 +``` + +compared to evaluating both the objective and its gradient + +```{julia} +#| label: benchmark_obj_grad +let gr = Vector{Float64}(undef, k) + @benchmark obj($θ, $gr) seconds=1 +end +``` + +Notice that evaluating both the objective and the gradient, using `ForwardDiff.jl`, results in considerably more allocation of storage than does evaluating the objective alone. +(In part this is a reflection of the fact that considerable work has gone into tuning the objective evaluation so that it doesn't allocate a lot of memory.) 
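One way to reduce that overhead, sketched below, is to construct the `ForwardDiff.GradientConfig` once and reuse it, together with a preallocated gradient vector, across evaluations; the sketch reuses the model `m` and parameter vector `θ` defined above, and `fd_deviance` is an internal (unexported) MixedModels helper used by the ForwardDiff extension, so it relies on package internals that may change.

```{julia}
dev = MixedModels.fd_deviance(m)          # internal helper wrapping the deviance as a function of θ
cfg = ForwardDiff.GradientConfig(dev, θ)  # build the dual-number workspace once
gr = similar(θ)
@benchmark ForwardDiff.gradient!($gr, $dev, $θ, $cfg) seconds=1
```

Reusing the configuration avoids rebuilding the dual-number work buffers on every call, which is typically where much of the extra allocation comes from.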
+ +Fitting the model using a derivative-free optimizer gives + +```{julia} +m.optsum.ftol_rel = 1.e-8 # for fair comparison with the gradient-based method +print(refit!(m)) +``` + +for which the optimization summary is + +```{julia} +m.optsum +``` + +The objective at the estimate is + +```{julia} +m.optsum.fmin +``` + +We set up and run the gradient-based optimizer L-BFGS as + +```{julia} +opt = NLopt.Opt(:LD_LBFGS, 2) +NLopt.ftol_rel!(opt, 1.e-8) +NLopt.min_objective!(opt, obj) +empty!(fitlog) +min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +``` + +where the fitlog is + +```{julia} +header = [:obj, :θ₁, :θ₂, :g₁, :g₂] +fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) +``` + +```{julia} +last(fltbl, 3) +``` + +### Sleepstudy {#sec-sleepstudy} + +```{julia} +sleepstudy = Table(dataset(:sleepstudy)) +``` + +```{julia} +m = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), sleepstudy) +θ = copy(m.θ) +@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 +``` + + +```{julia} +@benchmark obj($θ, $(similar(θ))) seconds=1 +``` + +```{julia} +print(refit!(m)) +``` + +```{julia} +m.optsum +``` + +```{julia} +opt = NLopt.Opt(:LD_LBFGS, 3) +NLopt.ftol_rel!(opt, 1.e-8) +NLopt.min_objective!(opt, obj) +empty!(fitlog) +min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +``` + + +```{julia} +header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] +fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) +``` + + +```{julia} +last(fltbl, 3) +``` \ No newline at end of file diff --git a/experiments/Project.toml b/experiments/Project.toml new file mode 100644 index 000000000..45e93db75 --- /dev/null +++ b/experiments/Project.toml @@ -0,0 +1,10 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" +MixedModelsDatasets = "7e9fb7ac-9f67-43bf-b2c8-96ba0796cbb6" +NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" diff --git a/experiments/bibliography.bib b/experiments/bibliography.bib new file mode 100644 index 000000000..2397e3095 --- /dev/null +++ b/experiments/bibliography.bib @@ -0,0 +1,36 @@ +@article{Zhou03042019, + author = {Hua Zhou and Liuyi Hu and Jin Zhou and Kenneth Lange}, + title = {MM Algorithms for Variance Components Models}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {28}, + number = {2}, + pages = {350--361}, + year = {2019}, + publisher = {ASA Website}, + doi = {10.1080/10618600.2018.1529601}, + note ={PMID: 31592195}, + URL = {https://doi.org/10.1080/10618600.2018.1529601}, + eprint = {https://doi.org/10.1080/10618600.2018.1529601} +} + +@Article{bates.maechler.etal:2015, + author = {Bates, Douglas and Maechler, Martin and Bolker, Benjamin M. 
and Walker, Steven}, + title = {Fitting Linear Mixed-Effects Models using lme4}, + doi = {10.18637/jss.v067.i01}, + number = {1}, + pages = {1--48}, + volume = {67}, + date-added = {2020-03-24}, + date-modified = {2016-02-12 06:52:06 +0000}, + file = {:2015/bates.maechler.etal_2015 - Fitting Linear Mixed.pdf:PDF}, + journal = {Journal of Statistical Software}, + year = {2015}, +} + +@article{bates2025mixed, + title={Mixed-model Log-likelihood Evaluation Via a Blocked Cholesky Factorization}, + author={Bates, Douglas and Alday, Phillip M and Kokandakar, Ajinkya H}, + journal={arXiv preprint arXiv:2505.11674}, + year={2025} +} + From b55d6222d71a9c70dca015867a24dd289b0140e1 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 28 Dec 2025 23:19:21 -0600 Subject: [PATCH 02/15] slight optimization of gradient computation --- ext/MixedModelsForwardDiffExt.jl | 21 ++++++++++++++++--- {experiments => gradients}/.gitignore | 0 .../GradientEvaluation.qmd | 0 .../Gradient_based_optimization.qmd | 9 +++++--- {experiments => gradients}/Project.toml | 3 +++ {experiments => gradients}/bibliography.bib | 0 6 files changed, 27 insertions(+), 6 deletions(-) rename {experiments => gradients}/.gitignore (100%) rename {experiments => gradients}/GradientEvaluation.qmd (100%) rename {experiments => gradients}/Gradient_based_optimization.qmd (94%) rename {experiments => gradients}/Project.toml (92%) rename {experiments => gradients}/bibliography.bib (100%) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index c3263e620..757c4b7d2 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -30,7 +30,7 @@ using LinearAlgebra: LinearAlgebra, using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file -using ForwardDiff: ForwardDiff +using ForwardDiff: ForwardDiff, DiffResult, GradientConfig, Chunk using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -68,9 +68,24 @@ values. 
$(FORWARDDIFF) """ function ForwardDiff.gradient( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, x) ) where {T} - return ForwardDiff.gradient(fd_deviance(model), θ) + return ForwardDiff.gradient(model, θ, cfg, check) +end + +# gradient!(::Union{AbstractArray, DiffResults.DiffResult}, ::F, ::Vector{T}, +# ::ForwardDiff.GradientConfig{T}, ::Val{CHK}) where {T, T, CHK, F<:LinearMixedModel{T}} + +function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, x) +) where {T} + return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg) +end + +function ForwardDiff.GradientConfig(model::LinearMixedModel, x::AbstractArray, chunk::Chunk = Chunk(x)) + return GradientConfig(fd_deviance(model), x, chunk) end """ diff --git a/experiments/.gitignore b/gradients/.gitignore similarity index 100% rename from experiments/.gitignore rename to gradients/.gitignore diff --git a/experiments/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd similarity index 100% rename from experiments/GradientEvaluation.qmd rename to gradients/GradientEvaluation.qmd diff --git a/experiments/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd similarity index 94% rename from experiments/Gradient_based_optimization.qmd rename to gradients/Gradient_based_optimization.qmd index ae2d0a47b..140ec6fbf 100644 --- a/experiments/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -51,6 +51,7 @@ Load the packages to be used using BenchmarkTools using ForwardDiff using MixedModels +using MixedModels: fd_deviance using MixedModelsDatasets: dataset using NLopt using Tables: table @@ -81,12 +82,13 @@ with an NLopt-compatible objective function ```{julia} #| label: obj_def +grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) function obj(θ::Vector{Float64}, grad::Vector{Float64}) val = objective(updateL!(setθ!(m, θ))) push!(fitlog, val) append!(fitlog, θ) if !isempty(grad) - copyto!(grad, ForwardDiff.gradient(m, θ)) + copyto!(grad, ForwardDiff.gradient!(grad, m, θ, grad_config)) append!(fitlog, grad) end return val @@ -161,6 +163,7 @@ sleepstudy = Table(dataset(:sleepstudy)) ```{julia} m = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), sleepstudy) θ = copy(m.θ) +grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) @benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 ``` @@ -178,7 +181,7 @@ m.optsum ``` ```{julia} -opt = NLopt.Opt(:LD_LBFGS, 3) +opt = NLopt.Opt(:LD_LBFGS, length(θ)) NLopt.ftol_rel!(opt, 1.e-8) NLopt.min_objective!(opt, obj) empty!(fitlog) @@ -194,4 +197,4 @@ fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) ```{julia} last(fltbl, 3) -``` \ No newline at end of file +``` diff --git a/experiments/Project.toml b/gradients/Project.toml similarity index 92% rename from experiments/Project.toml rename to gradients/Project.toml index 45e93db75..220557789 100644 --- a/experiments/Project.toml +++ b/gradients/Project.toml @@ -8,3 +8,6 @@ MixedModelsDatasets = "7e9fb7ac-9f67-43bf-b2c8-96ba0796cbb6" NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" + +[sources] +MixedModels = {path = ".."} diff --git a/experiments/bibliography.bib b/gradients/bibliography.bib similarity index 100% rename from 
experiments/bibliography.bib rename to gradients/bibliography.bib From 4aa750e26512da743c52cc1c9b58fed0dcf32e0b Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 28 Dec 2025 23:21:36 -0600 Subject: [PATCH 03/15] kb07 --- gradients/Gradient_based_optimization.qmd | 43 ++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 140ec6fbf..53a96eac2 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -88,7 +88,7 @@ function obj(θ::Vector{Float64}, grad::Vector{Float64}) push!(fitlog, val) append!(fitlog, θ) if !isempty(grad) - copyto!(grad, ForwardDiff.gradient!(grad, m, θ, grad_config)) + ForwardDiff.gradient!(grad, m, θ, grad_config) append!(fitlog, grad) end return val @@ -198,3 +198,44 @@ fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) ```{julia} last(fltbl, 3) ``` + +### Kronmueller-Barr 2007 {#sec-kb07} + +```{julia} +kb07 = Table(dataset(:kb07)) +``` + +```{julia} +# this model is very overparameterized, but it's a test example +m = LinearMixedModel(@formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), kb07) +θ = copy(m.θ) +grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) +@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 +``` + + +```{julia} +@benchmark obj($θ, $(similar(θ))) seconds=5 +``` + +```{julia} +print(refit!(m)) +``` + +```{julia} +m.optsum +``` + +```{julia} +opt = NLopt.Opt(:LD_LBFGS, length(θ)) +NLopt.ftol_rel!(opt, 1.e-8) +NLopt.min_objective!(opt, obj) +empty!(fitlog) +min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +``` + + +```{julia} +header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] +fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) +``` From 2765eef6cd698979ed03921959ca141e132045bf Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Tue, 30 Dec 2025 09:06:03 -0600 Subject: [PATCH 04/15] Spelling mistakes? --- ext/MixedModelsForwardDiffExt.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index 757c4b7d2..ef4d4421f 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -69,7 +69,7 @@ $(FORWARDDIFF) """ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, x) + cfg::GradientConfig=GradientConfig(model, θ) ) where {T} return ForwardDiff.gradient(model, θ, cfg, check) end @@ -79,7 +79,7 @@ end function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, x) + cfg::GradientConfig=GradientConfig(model, θ) ) where {T} return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg) end From 644027406527ade5db64ed2110f26fff521d7c63 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Tue, 30 Dec 2025 11:29:41 -0600 Subject: [PATCH 05/15] Still not passing tests. In write-up made method comparisons fairer. 
--- ext/MixedModelsForwardDiffExt.jl | 2 +- gradients/Gradient_based_optimization.qmd | 167 ++++++++-------------- 2 files changed, 57 insertions(+), 112 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index ef4d4421f..324186065 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -71,7 +71,7 @@ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ) ) where {T} - return ForwardDiff.gradient(model, θ, cfg, check) + return ForwardDiff.gradient(model, θ, cfg)#, check) end # gradient!(::Union{AbstractArray, DiffResults.DiffResult}, ::F, ::Vector{T}, diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 53a96eac2..06b27c2f9 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -48,7 +48,6 @@ Load the packages to be used ```{julia} #| label: load_packages -using BenchmarkTools using ForwardDiff using MixedModels using MixedModels: fd_deviance @@ -56,186 +55,132 @@ using MixedModelsDatasets: dataset using NLopt using Tables: table using TypedTables: Table -``` - -## Examples {#sec-examples} -### Penicillin data {#sec-penicillin} - -Load the `penicillin` dataset -```{julia} -penicillin = Table(dataset(:penicillin)) +const progress = false ``` -and define a model - -```{julia} -#| label: const_defs -#| output: false -m = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) -θ = copy(m.θ) -k = length(θ) -const fitlog = sizehint!(Vector{Float64}(undef, 0), 200) -``` +## Examples {#sec-examples} -with an NLopt-compatible objective function +We create a function to take a `LinearMixedModel` that has been fit and refit it using the `:LD_LBFGS` optimizer applied to an objective function that evaluates the gradient using `ForwardDiff`. 
```{julia} -#| label: obj_def -grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) -function obj(θ::Vector{Float64}, grad::Vector{Float64}) +addinds(ch::Char, n::Integer) = Symbol.(lpad.(string.(ch, Base.OneTo(n)), ndigits(n), '0')) +function gr_refit!(m::LinearMixedModel{T}) where {T} + θ = copy(m.optsum.initial) + k = length(θ) + fitlog = sizehint!(T[], 50 * k) + grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) + function obj(θ::Vector{Float64}, grad::Vector{Float64}) val = objective(updateL!(setθ!(m, θ))) push!(fitlog, val) append!(fitlog, θ) if !isempty(grad) - ForwardDiff.gradient!(grad, m, θ, grad_config) - append!(fitlog, grad) + ForwardDiff.gradient!(grad, m, θ, grad_config) + append!(fitlog, grad) + else + append!(fitlog, fill(NaN, k)) # never called with empty grad but just in case end return val + end + opt = NLopt.Opt(:LD_LBFGS, k) + NLopt.ftol_rel!(opt, 1.e-12) + NLopt.ftol_abs!(opt, 1.e-8) + NLopt.min_objective!(opt, obj) + min_f, min_x, ret = NLopt.optimize(opt, θ) + header = vcat([:obj], addinds('θ', k), addinds('g', k)) + return Table(table(transpose(reshape(fitlog, 2k + 1, :)); header)) end ``` -A benchmark of evaluating the objective only - -```{julia} -#| label: benchmark_obj -@benchmark objective(updateL!(setθ!($m, $θ))) seconds=1 -``` - -compared to evaluating both the objective and its gradient - -```{julia} -#| label: benchmark_obj_grad -let gr = Vector{Float64}(undef, k) - @benchmark obj($θ, $gr) seconds=1 -end -``` - -Notice that evaluating both the objective and the gradient, using `ForwardDiff.jl`, results in considerably more allocation of storage than does evaluating the objective alone. -(In part this is a reflection of the fact that considerable work has gone into tuning the objective evaluation so that it doesn't allocate a lot of memory.) +### Penicillin data {#sec-penicillin} -Fitting the model using a derivative-free optimizer gives +Define a model for the `penicillin` data ```{julia} -m.optsum.ftol_rel = 1.e-8 # for fair comparison with the gradient-based method -print(refit!(m)) +#| label: const_defs +#| output: false +m1 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), dataset(:penicillin); progress) +print(m1) ``` for which the optimization summary is ```{julia} -m.optsum +m1.optsum ``` -The objective at the estimate is +and refit the model using ForwardDiff gradient evaluations. 
```{julia} -m.optsum.fmin +fitlog = gr_refit!(m1) ``` -We set up and run the gradient-based optimizer L-BFGS as +The objective at convergence is ```{julia} -opt = NLopt.Opt(:LD_LBFGS, 2) -NLopt.ftol_rel!(opt, 1.e-8) -NLopt.min_objective!(opt, obj) -empty!(fitlog) -min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +last(fitlog.obj) ``` -where the fitlog is - -```{julia} -header = [:obj, :θ₁, :θ₂, :g₁, :g₂] -fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) -``` +and the last few evaluations are ```{julia} -last(fltbl, 3) +last(fitlog, 5) ``` ### Sleepstudy {#sec-sleepstudy} ```{julia} -sleepstudy = Table(dataset(:sleepstudy)) -``` - -```{julia} -m = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), sleepstudy) -θ = copy(m.θ) -grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) -@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 -``` - - -```{julia} -@benchmark obj($θ, $(similar(θ))) seconds=1 +m2 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) +print(m2) ``` ```{julia} -print(refit!(m)) +m2.optsum ``` ```{julia} -m.optsum +fitlog = gr_refit!(m2) ``` ```{julia} -opt = NLopt.Opt(:LD_LBFGS, length(θ)) -NLopt.ftol_rel!(opt, 1.e-8) -NLopt.min_objective!(opt, obj) -empty!(fitlog) -min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +last(fitlog.obj) ``` - ```{julia} -header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] -fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) -``` - - -```{julia} -last(fltbl, 3) +last(fitlog, 5) ``` ### Kronmueller-Barr 2007 {#sec-kb07} -```{julia} -kb07 = Table(dataset(:kb07)) -``` - ```{julia} # this model is very overparameterized, but it's a test example -m = LinearMixedModel(@formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), kb07) -θ = copy(m.θ) -grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) -@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 +m3 = fit( + MixedModel, + @formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), + dataset(:kb07); + progress, +) +print(m3) ``` - ```{julia} -@benchmark obj($θ, $(similar(θ))) seconds=5 +m3.optsum ``` +Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector + ```{julia} -print(refit!(m)) +findall(iszero, m3.θ) ``` ```{julia} -m.optsum +fitlog = gr_refit!(m3) ``` ```{julia} -opt = NLopt.Opt(:LD_LBFGS, length(θ)) -NLopt.ftol_rel!(opt, 1.e-8) -NLopt.min_objective!(opt, obj) -empty!(fitlog) -min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +last(fitlog.obj) ``` - ```{julia} -header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] -fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) -``` +last(fitlog, 5) +``` \ No newline at end of file From 16cbd8e6ce2dfabe79691159297ab9b8fd2320db Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 12:53:44 -0600 Subject: [PATCH 06/15] test fix --- ext/MixedModelsForwardDiffExt.jl | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index 324186065..a65f6c1df 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -30,7 +30,10 @@ using LinearAlgebra: LinearAlgebra, using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file -using 
ForwardDiff: ForwardDiff, DiffResult, GradientConfig, Chunk +using ForwardDiff: ForwardDiff, + Chunk, + DiffResult, + GradientConfig using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -59,6 +62,12 @@ const FORWARDDIFF = """ should be included is currently still being decided. """ +function ForwardDiff.GradientConfig( + model::LinearMixedModel, x::AbstractArray=model.θ, chunk::Chunk=Chunk(x) +) + return GradientConfig(fd_deviance(model), x, chunk) +end + """ ForwardDiff.gradient(model::LinearMixedModel) @@ -69,23 +78,18 @@ $(FORWARDDIFF) """ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, θ) -) where {T} - return ForwardDiff.gradient(model, θ, cfg)#, check) + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.gradient!(similar(model.θ), fd_deviance(model), θ, cfg, check) end -# gradient!(::Union{AbstractArray, DiffResults.DiffResult}, ::F, ::Vector{T}, -# ::ForwardDiff.GradientConfig{T}, ::Val{CHK}) where {T, T, CHK, F<:LinearMixedModel{T}} - function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, θ) -) where {T} - return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg) -end - -function ForwardDiff.GradientConfig(model::LinearMixedModel, x::AbstractArray, chunk::Chunk = Chunk(x)) - return GradientConfig(fd_deviance(model), x, chunk) + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) end """ From f184c8834f5e5b0fe352f752968b520466f13d9e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:04:24 -0600 Subject: [PATCH 07/15] methods for HessianConfig and hessian! --- ext/MixedModelsForwardDiffExt.jl | 42 +++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index a65f6c1df..de2d80c54 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -32,8 +32,8 @@ using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file using ForwardDiff: ForwardDiff, Chunk, - DiffResult, - GradientConfig + GradientConfig, + HessianConfig using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -62,9 +62,13 @@ const FORWARDDIFF = """ should be included is currently still being decided. 
""" +##### +##### Gradients +##### + function ForwardDiff.GradientConfig( - model::LinearMixedModel, x::AbstractArray=model.θ, chunk::Chunk=Chunk(x) -) + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) +) where {T} return GradientConfig(fd_deviance(model), x, chunk) end @@ -81,10 +85,10 @@ function ForwardDiff.gradient( cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), ) where {T, CHK} - return ForwardDiff.gradient!(similar(model.θ), fd_deviance(model), θ, cfg, check) + return ForwardDiff.gradient!(similar(model.θ), model, θ, cfg, check) end -function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, +function ForwardDiff.gradient!(result::AbstractArray, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), @@ -92,6 +96,16 @@ function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) end +##### +##### Hessians +##### + +function ForwardDiff.HessianConfig( + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) +) where {T} + return HessianConfig(fd_deviance(model), x, chunk) +end + """ ForwardDiff.hessian(model::LinearMixedModel) @@ -101,9 +115,19 @@ values. $(FORWARDDIFF) """ function ForwardDiff.hessian( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ -) where {T} - return ForwardDiff.hessian(fd_deviance(model), θ) + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.hessian!(similar(model.θ), model, θ, cfg, check) +end + +function ForwardDiff.hessian!(result::AbstractArray, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.hessian!(result, fd_deviance(model), θ, cfg, check) end ##### From 8e8ee8532ad850bd56f3e67f8e1397def5b665a0 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:04:52 -0600 Subject: [PATCH 08/15] format --- ext/MixedModelsForwardDiffExt.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index de2d80c54..15f850a86 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -84,7 +84,7 @@ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return ForwardDiff.gradient!(similar(model.θ), model, θ, cfg, check) end @@ -92,7 +92,7 @@ function ForwardDiff.gradient!(result::AbstractArray, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) end @@ -118,7 +118,7 @@ function ForwardDiff.hessian( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::HessianConfig=HessianConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return ForwardDiff.hessian!(similar(model.θ), model, θ, cfg, check) end @@ -126,7 +126,7 @@ function ForwardDiff.hessian!(result::AbstractArray, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::HessianConfig=HessianConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return 
ForwardDiff.hessian!(result, fd_deviance(model), θ, cfg, check) end From 072bd5d25de0c0d41d67d7e3506e25b7bb658e32 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:07:11 -0600 Subject: [PATCH 09/15] NEWS --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 0a63d124f..eb00af618 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +- Additional methods for pre-allocated result arrays and `*Config` instances have been added to the ForwardDiff extension. [#871]. + MixedModels v5.1.0 Release Notes ============================== - Nesting checks for the likelihoodratio test have been slightly tweaked to be more robust, at the cost of being slightly slower. In particular, the comparison of models with pre-centered variables with those with variables centered via StandardizedPredictors.jl was previously incorrectly rejected as non-nested, but should be correctly accepted as nested now. Additionally, some further logging messages are emitted when a nesting check fails. [#867] @@ -710,3 +712,4 @@ Package dependencies [#864]: https://github.com/JuliaStats/MixedModels.jl/issues/864 [#865]: https://github.com/JuliaStats/MixedModels.jl/issues/865 [#867]: https://github.com/JuliaStats/MixedModels.jl/issues/867 +[#871]: https://github.com/JuliaStats/MixedModels.jl/issues/871 From 03009d15ce84e73685ee40776e4bd7cf64dfc8c3 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:29:37 -0600 Subject: [PATCH 10/15] oops --- ext/MixedModelsForwardDiffExt.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index 15f850a86..149bfe355 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -119,7 +119,8 @@ function ForwardDiff.hessian( cfg::HessianConfig=HessianConfig(model, θ), check::Val{CHK}=Val(true), ) where {T,CHK} - return ForwardDiff.hessian!(similar(model.θ), model, θ, cfg, check) + n = length(θ) + return ForwardDiff.hessian!(Matrix{T}(undef, n, n), model, θ, cfg, check) end function ForwardDiff.hessian!(result::AbstractArray, From 0bfd467925118c03289fcff0cc5766ee4166da4e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:42:19 -0600 Subject: [PATCH 11/15] docs fix: AoG update --- docs/Project.toml | 1 + docs/src/ecosystem.md | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index c396b21bc..4857b499c 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -26,6 +26,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" [compat] +AlgebraOfGraphics = "0.12" BenchmarkTools = "1" DataFrames = "1" Documenter = "1.3" diff --git a/docs/src/ecosystem.md b/docs/src/ecosystem.md index 53d0ba028..82aa5fe45 100644 --- a/docs/src/ecosystem.md +++ b/docs/src/ecosystem.md @@ -8,7 +8,7 @@ Several packages extend the functionality of MixedModels.jl, both in ways specif ```@example Ecosystem using MixedModels -progress = false +progress = isinteractive() ``` ```@example Ecosystem @@ -182,16 +182,16 @@ Effects are particularly nice for visualizing the model fit and its predictions. 
using AlgebraOfGraphics # like ggplot2, but an algebra instead of a grammar using CairoMakie -plt1 = data(eff_logit) * - mapping(:age, :use; color=:anych) * - (visual(Lines) + mapping(; lower=:lower, upper=:upper) * visual(LinesFill)) +plt1 = data(eff_logit) * mapping(:age; color=:anych) * + (mapping(:use) * visual(Lines) + + mapping(:lower, :upper) * visual(Band; alpha=0.3)) draw(plt1) ``` ```@example Ecosystem -plt2 = data(eff_prob) * - mapping(:age, :use; color=:anych => "children") * - (visual(Lines) + mapping(; lower=:lower, upper=:upper) * visual(LinesFill)) +plt2 = data(eff_prob) * mapping(:age; color=:anych) * + (mapping(:use) * visual(Lines) + + mapping(:lower, :upper) * visual(Band; alpha=0.3)) draw(plt2) ``` From 463a7d66bfebd62aaa119bc26d5e0357c1a3842c Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Sat, 3 Jan 2026 12:32:21 -0600 Subject: [PATCH 12/15] Add information on gradient evaluation --- gradients/GradientEvaluation.qmd | 301 ++++++++++++++++++++-- gradients/Gradient_based_optimization.qmd | 10 +- gradients/Project.toml | 1 + gradients/bibliography.bib | 6 + 4 files changed, 296 insertions(+), 22 deletions(-) diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index 776221549..a3ee8ae03 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -1,5 +1,5 @@ --- -title: "Evaluation of the Gradient of the Profiled log-likelihood" +title: "Gradient of the Profiled log-likelihood" author: - name: Douglas Bates email: dmbates@gmail.com @@ -38,8 +38,8 @@ format: A comparison of algorithms for estimation of variance components given in the supplemental materials for @Zhou03042019 shows the Fisher scoring algorithm taking the fewest iterations to convergence compared to an EM algorithm and the minorization-maximization (MM) algorithm presented in that paper. The model being simulated in @Zhou03042019, sec 3.2 is relatively simple, with random effects for two factors and their interaction in a balanced crossed design. -The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. -The optimization itself is performed by a gradient-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. +The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and in [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. +The optimization itself is performed by a derivative-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. @@ -69,10 +69,11 @@ $$ {#eq-Sigma} for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. 
-In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, of +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship $$ -\boldsymbol{\Omega_\theta} = +\begin{aligned} +\boldsymbol{\Omega_\theta}&= \begin{bmatrix} \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}& \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} & @@ -83,30 +84,37 @@ $$ \mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & \mathbf{y^\top X} & \mathbf{y^\top y}\\ -\end{bmatrix} -$$ {#eq-blockedOmega} - -where $\mathbf{L}_\theta$ has a similar blocked structure - -$$ -\mathbf{L}_\boldsymbol{\theta} = +\end{bmatrix}\\ +&=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\ +&= \begin{bmatrix} \mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ \mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ \mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} \end{bmatrix} -$$ {#eq-blockedL} +\begin{bmatrix} +\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ +\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ +\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} +\end{bmatrix}^\top +\end{aligned} +$$ {#eq-blockedOmega} + +where the diagonal elements of $\mathbf{L}_\theta$ are positive. -(In the actual computational methods the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) +(In the `MixedModels.jl` implementation the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) -The objective to be optimized is negative twice the profiled log-likelihood, +The objective to be optimized, on the scale of the deviance, is negative twice the profiled log-likelihood, $$ --2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y}) = -\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right] +\begin{aligned} +-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})&= +\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right]\\ +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c +\end{aligned} $$ {#eq-objective} -which is on the scale of the deviance (if we were able to define a deviance for these models). +where $c$ is a constant. As shown in @bates.maechler.etal:2015, sec 3.5 the gradient of the first summand in @eq-objective is @@ -128,7 +136,41 @@ $$ \end{aligned} $$ {#eq-delterm1} -For the models that we wish to consider the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple in that they are block diagonal with a single non-zero diagonal block, which is an identity matrix. +For the models that we wish to consider the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple. 
+The partial derivatives are zeroes except for a single diagonal block, which is an identity matrix. + +### General expressions for differentiating a Cholesky factor + +@murray2016differentiation section 3.1 provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega. + +Repeating his derivation, with minor changes in notation, we express the relationship between the infinitesimals $d\boldsymbol{\Omega}$ and $d\mathbf{L}$ as + +$$ +d\boldsymbol{\Omega}=d\mathbf{L}\mathbf{L}^\top + \mathbf{L} d\mathbf{L}^\top +$$ {#eq-infinitesimal} + +Pre-multiplying @eq-infinitesimal by $\mathbf{L}^{-1}$ and post-multiplying by $\mathbf{L}^{-\top}$ gives + +$$ +\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}=\mathbf{L}^{-1}d\mathbf{L} + d\mathbf{L}^\top\mathbf{L}^{-\top} +$$ {#eq-LOmegaLT} + +The first addend on the right-hand side of @eq-LOmegaLT is lower triangular and the second addend is the transpose of the first. +We wish to isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, which we do with the $\Phi$ transformation applied to a symmetric matrix, which preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle. +Applied to the right-hand side of @eq-LOmegaLT, the $\Phi$ transformation isolates the first addend, providing + +$$ +\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right)=\mathbf{L}^{-1}d\mathbf{L} +$$ {#eq-Phi_dOmega} + +or + +$$ +d\mathbf{L}=\mathbf{L}\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right) +$$ {#eq-dL} + +As we shall see, because we only need the derivative of the logarithms of the diagonal elements of $d\mathbf{L}$ to obtain the gradient of the profiled log-likelihood, and because $\mathbf{L}$ is lower triangular, we can stop at @eq-LOmegaLT. +Furthermore, several of the blocks in $\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}$ are either zero or exactly equal to the corresponding block of $\boldsymbol{\Omega_\theta}$. ## Examples @@ -140,13 +182,234 @@ Load the packages to be used #| label: load_packages #| warning: false #| output: false +using CairoMakie using FiniteDiff using LinearAlgebra using MixedModels using MixedModelsDatasets: dataset using TypedTables: Table + +const progress = isinteractive() # to suppress progress bars in non-interactive sessions +CairoMakie.activate!(type="svg") # use svg graphics output +``` + +### Dyestuff - a single, scalar random-effects term + +The `dyestuff` data set provides the yield of dyestuff in each of 5 samples from each of 6 batches of an intermediate product in the process of producing a dye. + +```{julia} +#| label: dyestuff_data +dyestuff = Table(dataset(:dyestuff)) +``` + +A mixed-effects model for these data includes an overall "intercept" term (whose estimate will be the sample mean because of the balanced design) and random effects for each level of `batch`. + +```{julia} +#| label: dyestuff_model +m01 = fit(MixedModel, @formula(yield ~ 1 + (1|batch)), dyestuff; progress) +θ = m01.θ # retain a copy of the estimate of θ +print(m01) +``` + +#### The objective as a function of $\theta_1$ + +@fig-obj_graph shows the objective (negative twice the log-likelihood) of this model as a function of $\theta_1$, the relative covariance parameter. 
+ +```{julia} +#| label: fig-obj_graph +#| fig-cap: "Graph of the objective for model m01 as a function of θ₁" +#| code-fold: true +let f = Figure() + ax = Axis(f[1,1], xlabel="θ₁", ylabel="objective") + lines!(ax, -0.25..1.5, objective!(m01)) + f +end +``` + +Notice that the objective is well-defined for negative values of $\theta_1$ and that it is an even function, in the sense that $f(-\theta_1)=f(\theta_1)\,\forall\theta_1$. + +This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. + +The maximum likelihood estimate (i.e. the minimizer of the objective) of $\boldsymbol{\theta}$ for this model is + +```{julia} +#| label: dyestuff_theta +updateL!(setθ!(m01, θ)) # restore the estimate of θ and the Cholesky factor +θ +``` + +#### Evaluating the gradient terms + +Both $\mathbf{A}$ and $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern + +```{julia} +BlockDescription(m01) +``` + +In @eq-Sigma the matrix $\boldsymbol{\Lambda_\theta}$ is defined as a $6\times6$ diagonal matrix. +Here, for convenience, we extend it to an $8\times8$ matrix with a trailing diagonal block that is the identity, so that multiplication by $\boldsymbol{\Lambda_\theta}$ applies to the full matrix $\mathbf{A}$. + +```{julia} +Λ(θ) = Diagonal(vcat(fill(only(θ), 6), ones(eltype(θ), 2))) +Λ(θ) +``` + +The derivative of $\Lambda$ with respect to the first (and only) element of $\boldsymbol\theta$ is + +```{julia} +dΛdθ1 = Diagonal(vcat(ones(6), zeros(2))) +``` + +The matrix $\mathbf{A}$ from which $\boldsymbol{\Omega_\theta}$ is generated is stored in blocks. +We assemble these into a sparse matrix as + +```{julia} +A = sparse(hvcat(2, first(m01.A), m01.A[2]', m01.A[2], last(m01.A))) +``` + +which could be generated as the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of + +```{julia} +ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) +Int.(ZXy) # Int for more concise printing +``` + +::: {.callout-note collapse=true} +#### Printing Int. of a matrix with integer entries + +When we know that all the entries in a floating point matrix, `X`, will be integers, we convert it to integer elements as `Int.(X)` to save space in the output. +::: + +```{julia} +A == ZXy'ZXy +``` + +The matrix $\boldsymbol{\Omega_\theta}$ is + +```{julia} +Ω(θ) = Λ(θ) * A * Λ(θ)' + Diagonal(vcat(ones(6), zeros(2))) +Ω(θ) +``` + +The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix with + +```{julia} +L = LowerTriangular(sparseL(m01; full=true)) +``` + +We can check that it is indeed the lower Cholesky factor + +```{julia} +L * L' ≈ Ω(θ) +``` + +The derivative of $\boldsymbol{\Omega}$ with respect to $\theta$ is + +```{julia} +dΩdθ1(θ) = dΛdθ1 * A * Λ(θ)' + Λ(θ) * A * dΛdθ1 +dΩdθ1(θ) +``` + +Notice that this matrix, like $\mathbf{A}$ is symmetric and has the same block structure as $\mathbf{A}$. +In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. 
+ +Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to + +```{julia} +prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') +``` + +We note that @eq-delterm1 is the sum of the first 6 diagonal elements of `prePhi`, + +```{julia} +sum(prePhi[i,i] for i in 1:6) +``` + +which should equal + +```{julia} +ldfun(x::Float64) = logdet(updateL!(setθ!(m01, [x]))) +FiniteDiff.finite_difference_derivative(ldfun, only(θ)) +``` + +Similarly the gradient of the other non-constant term in @eq-logdet is + +```{julia} +size(m01.y, 1) * last(prePhi) ``` +compared to + +```{julia} +n_log_lyy_sq(x::Float64) = length(m01.y) * log(abs2(last(last(updateL!(setθ!(m01, [x])).L)))) +n_log_lyy_sq(only(θ)) +``` + +```{julia} +FiniteDiff.finite_difference_derivative(n_log_lyy_sq, only(θ)) +``` + +```{julia} +#| output: false +updateL!(setθ!(m01, θ)) # reset the value of θ in the model +``` + +If we wish to continue the evaluation of all of the elements of $d\mathbf{L}/d\theta_1$ we would use +the $\Phi$ transformation and premultiplication by $\mathbf{L}$ + +```{julia} +function Φ(S) + val = tril(S) # extract the lower triangle + for i in diagind(val) + val[i] *= 0.5 # halve the diagonal elements + end + return LowerTriangular(val) +end +dLdθ1 = L * Φ(prePhi) +``` + +We can check these results against results from finite difference methods. +First we check the $[1,1]$ entry of $\frac{d\mathbf{L}}{d\theta}$ and its derivative + +```{julia} +l11(θ::T) where {T<:Real} = first(first(updateL!(setθ!(m01, [θ])).L)) +l11(only(θ)) +``` + +and its finite-difference derivative + +```{julia} +FiniteDiff.finite_difference_derivative(l11, only(θ)) +``` + +which matches the leading diagonal elements of `dLdθ1`. + +Next we check the last diagonal element of $\mathbf{L}$, which is the square root of the penalized residual sum-of-squares. +```{julia} +rtprss(θ::T) where {T<:Real} = last(last(updateL!(setθ!(m01, [θ])).L)) +rtprss(only(θ)) +``` + +```{julia} +FiniteDiff.finite_difference_derivative(rtprss, only(θ)) +``` + +These calculations verify the values of diagonal elements of $\frac{d\mathbf{L}}{d\theta}$ that will be used to evaluate the gradient of the profiled log-likelihood. + +The derivative of the objective is more easily obtained by rearranging the terms in the objective to isolate constants. +First, because the determinant of a triangular matrix is the product of its diagonal elements, and hence the log of the determinant is the sum of the logs of its diagonal elements, + +$$ +\begin{aligned} +\frac{\partial\log\left|\mathbf{L_{ZZ}}\right|^2}{\partial\theta_i}&= +2\frac{\partial\sum_{j=1}^q \log\left(L_{jj}\right)}{\partial\theta_i}\\ +&=2\sum_{j=1}^q\frac{1}{L_{jj}}\frac{\partial{L_{jj}}}{\partial\theta_i} +\end{aligned} +$$ {#eq-logdet} + +Again, because $\mathbf{L}$ is lower triangular, the division by $L_{jj}$ in the summands of @eq-logdet exactly cancels the effect of multiplication on the left by $\mathbf{L}$ in @eq-dL in these diagonal terms. +Also the multiplier of $2$ in @eq-logdet cancels the halving of the diagonal in @eq-Phi_dOmega, leaving only @eq-delterm1. + ### Penicillin - two completely crossed scalar random-effects terms The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. 
diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 06b27c2f9..d3167307f 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -97,9 +97,13 @@ end Define a model for the `penicillin` data ```{julia} -#| label: const_defs -#| output: false -m1 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), dataset(:penicillin); progress) +#| label: m1 +m1 = fit( + MixedModel, + @formula(diameter ~ 1 + (1|plate) + (1|sample)), + dataset(:penicillin); + progress, +) print(m1) ``` diff --git a/gradients/Project.toml b/gradients/Project.toml index 220557789..a05f9e152 100644 --- a/gradients/Project.toml +++ b/gradients/Project.toml @@ -1,5 +1,6 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" diff --git a/gradients/bibliography.bib b/gradients/bibliography.bib index 2397e3095..25109d8e4 100644 --- a/gradients/bibliography.bib +++ b/gradients/bibliography.bib @@ -34,3 +34,9 @@ @article{bates2025mixed year={2025} } +@article{murray2016differentiation, + title={Differentiation of the Cholesky decomposition}, + author={Murray, Iain}, + journal={arXiv preprint arXiv:1602.07527}, + year={2016} +} From ab0a7cb9b865d731baffd71cb01edcac4d25376f Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Mon, 5 Jan 2026 18:54:25 -0600 Subject: [PATCH 13/15] Short-cut method of gradient evaluation --- gradients/GradientEvaluation.qmd | 383 +++++++++---------------------- gradients/bibliography.bib | 6 +- 2 files changed, 113 insertions(+), 276 deletions(-) diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index a3ee8ae03..ac652277d 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -43,18 +43,17 @@ The optimization itself is performed by a derivative-free optimizer, usually eit Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. -The purpose of this note is to explore whether these expressions can be implemented effectively, even if just for the variance components model, which, for our purposes, is a model in which all the random effects terms are simple, scalar terms. +The purpose of this note is to provide an alternative derivation for the gradient of the profiled log-likelihood and of the REML criterion for linear mixed-effects models, along with concise algorithms for evaluation of these gradients. 
-### Expressions for the gradient terms +### Model definition and evaluation of the objective and gradient The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as $$ \begin{aligned} + \mathbfcal{B}&\sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma})\\ (\mathbfcal{Y}|\mathbfcal{B}=\mathbf{b})& - \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right)\\ - \mathbfcal{B}& - \sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma}) + \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right) \end{aligned} $$ {#eq-dists} @@ -69,7 +68,7 @@ $$ {#eq-Sigma} for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. -In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower-triangular Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship $$ \begin{aligned} @@ -100,48 +99,43 @@ $$ \end{aligned} $$ {#eq-blockedOmega} -where the diagonal elements of $\mathbf{L}_\theta$ are positive. +where the diagonal elements of $\mathbf{L}_\theta$ are chosen to be positive. +(It is assumed that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.) -(In the `MixedModels.jl` implementation the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) +(In `MixedModels.jl` the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) -The objective to be optimized, on the scale of the deviance, is negative twice the profiled log-likelihood, +As shown in @bates2025mixed, the objective to be optimized, on the scale of the deviance, which is negative twice the profiled log-likelihood, can be expressed as $$ \begin{aligned} -2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})&= \log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right]\\ -&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c_\ell\\ +&=2\sum_{j=1}^q\log L_{j,j} + 2n \log L_{q+p+1,q+p+1} + c_\ell \end{aligned} $$ {#eq-objective} -where $c$ is a constant. +where $c_\ell$ is a constant. +That is, the objective is an affine function (a linear function plus a constant) of the logarithms of the diagonal elements of $\mathbf{L}_\boldsymbol{\theta}$. 
+It happens that the gradient of the objective, as a function of $\boldsymbol{\theta}$, expressed in this form is straightforward to evaluate, as shown in @sec-Cholesky_derivative. -As shown in @bates.maechler.etal:2015, sec 3.5 the gradient of the first summand in @eq-objective is +As shown in @bates.maechler.etal:2015, sec 3.4 the REML criterion, which some prefer for parameter estimation, can be written as $$ \begin{aligned} -\nabla\log\left|\mathbf{L_ZZ}\right|^2 &= \nabla\log\left(\left|\mathbf{L_{ZZ}L_{ZZ}}^\top\right|\right)\\ -&=\nabla\log\left(\left|\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right|\right)\\ -&=\operatorname{tr}\left[\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right) -\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right)^{-1}\right]\\ -&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1} -\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right) -\mathbf{L_{ZZ}}^{-\top} -\right]\\ -&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1} -\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right) -\mathbf{L_{ZZ}}^{-\top} -\right] +-2\mathcal{L}_R(\boldsymbol{\theta}|\mathbf{y})&= +\log\left(\left|\mathbf{L_{ZZ}}\right|^2\left|\mathbf{L_{XX}}\right|^2\right) + (n-p) \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n-p}\right)\right]\\ +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + \log\left|\mathbf{L_{XX}}\right|^2 + (n-p)\log\ell^2_{\mathbf{yy}} + c_r\\ +&=2\sum_{j=1}^{q+p}\log L_{j,j} + 2(n-p)\log L_{q+p+1,q+p+1} + c_r \end{aligned} -$$ {#eq-delterm1} +$$ {#eq-objective} -For the models that we wish to consider the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple. -The partial derivatives are zeroes except for a single diagonal block, which is an identity matrix. +where $c_r$ is likewise a constant. +This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_{ZZ}}$. -### General expressions for differentiating a Cholesky factor +### General expressions for differentiating a Cholesky factor {#sec-Cholesky_derivative} -@murray2016differentiation section 3.1 provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega. +@murray2016differentiation, section 3.1, provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega. Repeating his derivation, with minor changes in notation, we express the relationship between the infinitesimals $d\boldsymbol{\Omega}$ and $d\mathbf{L}$ as @@ -156,7 +150,12 @@ $$ $$ {#eq-LOmegaLT} The first addend on the right-hand side of @eq-LOmegaLT is lower triangular and the second addend is the transpose of the first. -We wish to isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, which we do with the $\Phi$ transformation applied to a symmetric matrix, which preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle. +Thus, the diagonal of the left-hand side is exactly the result we wish to evaluate, twice the infinitesimal of the logarithms of the diagonal elements of $\mathbf{L}$. 
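+Combining this observation with @eq-objective gives, writing $\theta_k$ for the $k$th component of $\boldsymbol{\theta}$, the expression that is evaluated in the examples below (the ML case is shown here; for the REML criterion the upper limit $q$ becomes $q+p$ and the weight $n$ becomes $n-p$)
+
+$$
+\frac{\partial\left[-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})\right]}{\partial\theta_k}=
+\sum_{j=1}^q\left[\mathbf{L}^{-1}\frac{\partial\boldsymbol{\Omega_\theta}}{\partial\theta_k}\mathbf{L}^{-\top}\right]_{j,j}+
+n\left[\mathbf{L}^{-1}\frac{\partial\boldsymbol{\Omega_\theta}}{\partial\theta_k}\mathbf{L}^{-\top}\right]_{q+p+1,q+p+1}
+$$ {#eq-MLgrad}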
+ +For completeness, we provide the conclusion of the derivation in @murray2016differentiation but we don't need the more general result of $d\mathbf{L}$ - we only need the particular result from the left-hand side of @eq-LOmegaLT. + +To evaluate $d\mathbf{L}$ we must isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, on the right-hand side of @eq-LOmegaLT, which we do with the $\Phi$ transformation applied to a symmetric matrix. +This transformation preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle. Applied to the right-hand side of @eq-LOmegaLT, the $\Phi$ transformation isolates the first addend, providing $$ @@ -169,9 +168,6 @@ $$ d\mathbf{L}=\mathbf{L}\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right) $$ {#eq-dL} -As we shall see, because we only need the derivative of the logarithms of the diagonal elements of $d\mathbf{L}$ to obtain the gradient of the profiled log-likelihood, and because $\mathbf{L}$ is lower triangular, we can stop at @eq-LOmegaLT. -Furthermore, several of the blocks in $\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}$ are either zero or exactly equal to the corresponding block of $\boldsymbol{\Omega_\theta}$. - ## Examples To aid in understanding the structure of these equations we consider the structure of the various matrices and their blocks in some simple examples. @@ -189,7 +185,7 @@ using MixedModels using MixedModelsDatasets: dataset using TypedTables: Table -const progress = isinteractive() # to suppress progress bars in non-interactive sessions +const progress = isinteractive() # suppress progress bars in non-interactive sessions CairoMakie.activate!(type="svg") # use svg graphics output ``` @@ -230,16 +226,35 @@ Notice that the objective is well-defined for negative values of $\theta_1$ and This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. -The maximum likelihood estimate (i.e. the minimizer of the objective) of $\boldsymbol{\theta}$ for this model is +At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$ ```{julia} #| label: dyestuff_theta -updateL!(setθ!(m01, θ)) # restore the estimate of θ and the Cholesky factor -θ +only(θ) +``` + +the derivative should also be zero (in practice, close to zero). + +We can see from @fig-obj_graph that the derivative at $\theta_1=1.$ will be positive. + +To show the evaluation of the gradient at $\boldsymbol{\theta}=[1.]$ we reset the parameter in the model object to $[1.]$ + +```{julia} +#| output: false +updateL!(setθ!(m01, ones(1))); ``` #### Evaluating the gradient terms +In `MixedModels.jl` the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of + +```{julia} +ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) +Int.(ZXy) # Int for more concise printing +``` + +is stored as the $\mathbf{A}$ property of the model object. 
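+The blocks of the `A` property can be examined individually; for example, the leading block, corresponding to $\mathbf{Z^\top Z}$, is held as a `Diagonal` matrix for this model
+
+```{julia}
+first(m01.A)
+```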
+ Both $\mathbf{A}$ and $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern ```{julia} @@ -251,7 +266,7 @@ Here, for convenience, we extend it to an $8\times8$ matrix with a trailing diag ```{julia} Λ(θ) = Diagonal(vcat(fill(only(θ), 6), ones(eltype(θ), 2))) -Λ(θ) +Λ([1.]) ``` The derivative of $\Lambda$ with respect to the first (and only) element of $\boldsymbol\theta$ is @@ -267,19 +282,7 @@ We assemble these into a sparse matrix as A = sparse(hvcat(2, first(m01.A), m01.A[2]', m01.A[2], last(m01.A))) ``` -which could be generated as the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of - -```{julia} -ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) -Int.(ZXy) # Int for more concise printing -``` - -::: {.callout-note collapse=true} -#### Printing Int. of a matrix with integer entries - -When we know that all the entries in a floating point matrix, `X`, will be integers, we convert it to integer elements as `Int.(X)` to save space in the output. -::: - +and check that these blocks are indeed the Gram matrix of the columns of $\mathbf{[ZXy]}$ ```{julia} A == ZXy'ZXy ``` @@ -288,10 +291,10 @@ The matrix $\boldsymbol{\Omega_\theta}$ is ```{julia} Ω(θ) = Λ(θ) * A * Λ(θ)' + Diagonal(vcat(ones(6), zeros(2))) -Ω(θ) +Ω([1.]) ``` -The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix with +The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix as ```{julia} L = LowerTriangular(sparseL(m01; full=true)) @@ -300,123 +303,58 @@ L = LowerTriangular(sparseL(m01; full=true)) We can check that it is indeed the lower Cholesky factor ```{julia} -L * L' ≈ Ω(θ) +L * L' ≈ Ω([1.]) ``` The derivative of $\boldsymbol{\Omega}$ with respect to $\theta$ is ```{julia} dΩdθ1(θ) = dΛdθ1 * A * Λ(θ)' + Λ(θ) * A * dΛdθ1 -dΩdθ1(θ) +dΩdθ1([1.]) ``` -Notice that this matrix, like $\mathbf{A}$ is symmetric and has the same block structure as $\mathbf{A}$. +Notice that this matrix, like $\mathbf{A}$, is symmetric and has the same block structure as $\mathbf{A}$. In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. 
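+The $[1,1]$ block, on the other hand, is $2\theta_1$ times the $[1,1]$ block of $\mathbf{A}$; at $\theta_1=1$ that is simply twice the block, which we can check directly (a quick sketch, using the $8\times8$ arrangement of `A` constructed above)
+
+```{julia}
+dΩdθ1([1.])[1:6, 1:6] == 2 .* A[1:6, 1:6]
+```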
Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to ```{julia} -prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') -``` - -We note that @eq-delterm1 is the sum of the first 6 diagonal elements of `prePhi`, - -```{julia} -sum(prePhi[i,i] for i in 1:6) +prePhi = rdiv!(ldiv!(L, dΩdθ1([1.])), L') ``` -which should equal +The derivative of the objective at $\theta_1=1$ is, therefore ```{julia} -ldfun(x::Float64) = logdet(updateL!(setθ!(m01, [x]))) -FiniteDiff.finite_difference_derivative(ldfun, only(θ)) +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) ``` -Similarly the gradient of the other non-constant term in @eq-logdet is +which we can compare to the finite-difference evaluation ```{julia} -size(m01.y, 1) * last(prePhi) -``` - -compared to - -```{julia} -n_log_lyy_sq(x::Float64) = length(m01.y) * log(abs2(last(last(updateL!(setθ!(m01, [x])).L)))) -n_log_lyy_sq(only(θ)) -``` - -```{julia} -FiniteDiff.finite_difference_derivative(n_log_lyy_sq, only(θ)) +FiniteDiff.finite_difference_gradient(objective!(m01), [1.]) ``` +If we repeat these steps at the parameter estimate we have ```{julia} #| output: false updateL!(setθ!(m01, θ)) # reset the value of θ in the model +L = LowerTriangular(sparseL(m01; full=true)) +prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') ``` -If we wish to continue the evaluation of all of the elements of $d\mathbf{L}/d\theta_1$ we would use -the $\Phi$ transformation and premultiplication by $\mathbf{L}$ - -```{julia} -function Φ(S) - val = tril(S) # extract the lower triangle - for i in diagind(val) - val[i] *= 0.5 # halve the diagonal elements - end - return LowerTriangular(val) -end -dLdθ1 = L * Φ(prePhi) -``` - -We can check these results against results from finite difference methods. -First we check the $[1,1]$ entry of $\frac{d\mathbf{L}}{d\theta}$ and its derivative - -```{julia} -l11(θ::T) where {T<:Real} = first(first(updateL!(setθ!(m01, [θ])).L)) -l11(only(θ)) -``` - -and its finite-difference derivative - -```{julia} -FiniteDiff.finite_difference_derivative(l11, only(θ)) -``` - -which matches the leading diagonal elements of `dLdθ1`. - -Next we check the last diagonal element of $\mathbf{L}$, which is the square root of the penalized residual sum-of-squares. -```{julia} -rtprss(θ::T) where {T<:Real} = last(last(updateL!(setθ!(m01, [θ])).L)) -rtprss(only(θ)) -``` +with the derivative being evaluated as ```{julia} -FiniteDiff.finite_difference_derivative(rtprss, only(θ)) +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) ``` -These calculations verify the values of diagonal elements of $\frac{d\mathbf{L}}{d\theta}$ that will be used to evaluate the gradient of the profiled log-likelihood. - -The derivative of the objective is more easily obtained by rearranging the terms in the objective to isolate constants. -First, because the determinant of a triangular matrix is the product of its diagonal elements, and hence the log of the determinant is the sum of the logs of its diagonal elements, - -$$ -\begin{aligned} -\frac{\partial\log\left|\mathbf{L_{ZZ}}\right|^2}{\partial\theta_i}&= -2\frac{\partial\sum_{j=1}^q \log\left(L_{jj}\right)}{\partial\theta_i}\\ -&=2\sum_{j=1}^q\frac{1}{L_{jj}}\frac{\partial{L_{jj}}}{\partial\theta_i} -\end{aligned} -$$ {#eq-logdet} - -Again, because $\mathbf{L}$ is lower triangular, the division by $L_{jj}$ in the summands of @eq-logdet exactly cancels the effect of multiplication on the left by $\mathbf{L}$ in @eq-dL in these diagonal terms. 
-Also the multiplier of $2$ in @eq-logdet cancels the halving of the diagonal in @eq-Phi_dOmega, leaving only @eq-delterm1. - ### Penicillin - two completely crossed scalar random-effects terms The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. ```{julia} #| label: penicillin_data -const penicillin = Table(dataset(:penicillin)) +penicillin = Table(dataset(:penicillin)) ``` We construct a `LinearMixedModel` struct with a single fixed-effect parameter, representing the average diameter in the balanced design, and random effects for each `plate` and each `sample`, @@ -425,7 +363,9 @@ We construct a `LinearMixedModel` struct with a single fixed-effect parameter, r #| label: m02 #| output: false #| warn: false -m02 = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +m02 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +θ = m02.θ +print(m02) ``` for which the concatenated matrix $\left[\mathbf{ZXy}\right]$ is @@ -441,10 +381,10 @@ The Cholesky factor, $\mathbf{L}$, at the initial value $\boldsymbol\theta=\left ```{julia} #| label: m02L -Lsparse = LowerTriangular(sparseL(updateL!(m02); full=true)) +Lsparse = LowerTriangular(sparseL(updateL!(setθ!(m02, ones(2))); full=true)) ``` -In practice, the full $\mathbf{L}$ matrix is stored in a blocked form +In practice, the $\mathbf{L}$ matrix is stored in a blocked form ```{julia} #| label: m02_blocks @@ -460,186 +400,81 @@ objective(m02) #### Evaluating terms in the gradient -For illustration of the gradient evaluation we create the lower-triangular sparse submatrix $\mathbf{L_{ZZ}}$ as +Again, we create the full Gram matrix $\mathbf{A}$ from the blocks stored in the `A` property of the model ```{julia} -#| label: m02LZZ -LZZsparse = LowerTriangular(sparseL(m02)) -``` - -from which $\log\left|\mathbf{L_{ZZ}}\right|^2$ can be evaluated as - -```{julia} -#| label: logdet_m02 -2. * sum(log, diag(LZZsparse)) -``` - -In practice we use the `logdet` function - -```{julia} -#| label: logdet__m02 -logdet(m02) -``` - -which evaluates this quantity from the blocked representation of $\mathbf{L}$. 
- -A finite-difference approximation to the gradient of the `logdet` at this value of $\boldsymbol{\theta}$ is - -```{julia} -ldfun(x::Vector{Float64}) = logdet(updateL!(setθ!(m02, x))) -FiniteDiff.finite_difference_gradient(ldfun, [1., 1.]) -``` - -The matrix $\mathbf{A_{ZZ}}=\mathbf{Z}^\top\mathbf{Z}$ for this model, as a dense matrix, is - -```{julia} -#| label: denseA -A = Int.(hvcat(2, first(m02.A), m02.A[2]', m02.A[2], m02.A[3])) -``` - -and the first face of $\nabla{\boldsymbol{\Lambda}}$ is - -```{julia} -#| label: nabla_Lambda -nabla1 = Int.(Diagonal(vcat(ones(Int, 24), zeros(Int, 6)))) -``` - -With $\boldsymbol{\Lambda(\theta)}$ being - -```{julia} -Λ(θ) = Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6))) -θ = ones(2) # initial parameter vector -Int.(Λ(θ)) # initial value of Λ -``` - -the first face of $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is - -```{julia} -#| label: symprod -symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 -Int.(symprod) -``` - -producing the matrix whose trace is desired as - -```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') # overwrites the value of symprod -``` - -yielding the trace as - -```{julia} -sum(diag(symprod)) -``` - -One point to notice here is that the $[1,1]$ block of this matrix is diagonal, with elements of - -```{julia} -((2 * first(θ)) .* first(m02.A).diag) ./ abs2.(first(m02.L).diag) -``` - -which can be used to simplify the evaluation of the first gradient term. -In particular, the gradient of a model with a single, scalar random-effects term is, unsurprisingly, straightforward. - -For the second element of the gradient we define - -```{julia} -nabla2 = Diagonal(vcat(zeros(24), ones(6))) -Int.(nabla2) -``` - -and - -```{julia} -symprod = nabla2 * A * Λ(ones(2)) + Λ(ones(2)) * A * nabla2 -``` - -The matrix whose trace is required is - -```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') -``` - -producing the second element of the gradient of `ldfun` as - -```{julia} -sum(diag(symprod)) +A2 = let blk = m02.A + hvcat(3, first(blk), blk[2]', blk[4]', blk[2], blk[3], blk[5]', blk[4], blk[5], blk[6]) +end ``` -Notice that the entire $[1,1]$ block of this matrix is zero and will not need to be evaluated explicitly. - -We evaluate $\boldsymbol{\hat{\theta}}$ using a derivative-free optimizer as +The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol(\theta))$ is ```{julia} -θ = refit!(m02).θ +function Λ2(θ::Vector{Float64}) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6), ones(2))) +end +Λ2([1.,1.]) ``` -after which the first face of `symprod` becomes +producing $\boldsymbol{\Omega}$ as ```{julia} -symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 +function Ω2(θ) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Λ2(θ) * A2 * Λ2(θ)' + Diagonal(vcat(ones(30), zeros(2))) +end +Ω2([1.,1.]) ``` -`LZZsparse` becomes +The partial derivatives of `Ω2` are constants ```{julia} -LZZsparse = LowerTriangular(sparseL(m02)) +#| output: false +dΛ2dθ1 = Diagonal(vcat(ones(24), zeros(8))) +dΛ2dθ2 = Diagonal(vcat(zeros(24), ones(6), zeros(2))) ``` -and the matrix whose trace is required is +For the ML objective (i.e. 
negative twice the log-likelihood) a finite difference gradient at the initial $\boldsymbol{\theta}=[1,1]^\top$ is ```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +FiniteDiff.finite_difference_gradient(objective!(m02), ones(2)) ``` -yielding the gradient term +To evaluate this quantity from the formula we create ```{julia} -sum(diag(symprod)) +dΩ2dθ1(θ) = dΛ2dθ1 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ1' +Int.(dΩ2dθ1([1., 1.])) # Int to save space when printing ``` -which can be compared to the finite difference value ```{julia} -FiniteDiff.finite_difference_gradient(ldfun, θ) +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ1([1.,1.])), Lsparse') ``` -(Note, this is the first element of the gradient of the `logdet` term only, not the gradient of the objective which is near zero +yielding the first component of the gradient as ```{julia} -FiniteDiff.finite_difference_gradient(objective!(m02), θ) +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) ``` -as it should be at the optimum.) - -For the second element of the gradient of `ldfun` we have +For the second component of the gradient ```{julia} -symprod = nabla2 * A * Λ(θ) + Λ(θ) * A * nabla2 +dΩ2dθ2(θ) = dΛ2dθ2 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ2' +Int.(dΩ2dθ2([1., 1.])) ``` -After pre- and post-division by `LZZsparse`, this becomes - ```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ2([1.,1.])), Lsparse') ``` -yielding the second element of the gradient of `ldfun` as - ```{julia} -sum(diag(symprod)) +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) ``` -#### Factoring the symmetric matrix - -The matrix $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is symmetric and has the same sparsity structure as $\mathbf{Z^\top Z}$, which is positive semi-definite. -However, it is not clear that the non-zero blocks in $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ will be positive semi-definite in the general case. -In the case of a single variance component it will be positive definite when $\theta_1>0$ because it is $2\theta_1\mathbf{A}$. 
- - ### References {.unnumbered} ::: {#refs} diff --git a/gradients/bibliography.bib b/gradients/bibliography.bib index 25109d8e4..decf33365 100644 --- a/gradients/bibliography.bib +++ b/gradients/bibliography.bib @@ -31,12 +31,14 @@ @article{bates2025mixed title={Mixed-model Log-likelihood Evaluation Via a Blocked Cholesky Factorization}, author={Bates, Douglas and Alday, Phillip M and Kokandakar, Ajinkya H}, journal={arXiv preprint arXiv:2505.11674}, - year={2025} + year={2025}, + url={https://arxiv.org/pdf/2505.11674} } @article{murray2016differentiation, title={Differentiation of the Cholesky decomposition}, author={Murray, Iain}, journal={arXiv preprint arXiv:1602.07527}, - year={2016} + year={2016}, + url={https://arxiv.org/pdf/1602.07527} } From 2e8ac3787d9ba2beff9de3880ca4e66c37b75ee0 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Fri, 9 Jan 2026 11:53:18 -0600 Subject: [PATCH 14/15] Partial gradient for vector-valued r.e.'s --- gradients/GradientEvaluation.qmd | 125 +++++++++++++++++++++++++++---- gradients/gradient.jl | 39 ++++++++++ 2 files changed, 148 insertions(+), 16 deletions(-) create mode 100644 gradients/gradient.jl diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index ac652277d..174c786c4 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -45,7 +45,7 @@ Expressions for the gradient of the profiled log-likelihood were given in sec. 3 The purpose of this note is to provide an alternative derivation for the gradient of the profiled log-likelihood and of the REML criterion for linear mixed-effects models, along with concise algorithms for evaluation of these gradients. -### Model definition and evaluation of the objective and gradient +### Model definition and evaluation of the objective The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as @@ -58,31 +58,27 @@ $$ $$ {#eq-dists} where $\mathbf{X}$ is an $n\times p$ model matrix for the fixed-effects parameter vector, $\boldsymbol{\beta}$, and $\mathbf{Z}$ is an $n\times q$ model matrix for the random effects, $\mathbf{b}$. -Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is positive semi-definite. +Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is symmetric and positive semi-definite. We express it as $$ -\boldsymbol{\Sigma} = \sigma^2 -\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} +\boldsymbol{\Sigma} = \sigma^2\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} $$ {#eq-Sigma} for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. 
In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower-triangular Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship + $$ \begin{aligned} \boldsymbol{\Omega_\theta}&= \begin{bmatrix} \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}& -\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} & +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X}& \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top y}\\ -\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & -\mathbf{X^\top X} & -\mathbf{X^\top y}\\ -\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & -\mathbf{y^\top X} & -\mathbf{y^\top y}\\ +\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & \mathbf{X^\top X} & \mathbf{X^\top y}\\ +\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & \mathbf{y^\top X} & \mathbf{y^\top y}\\ \end{bmatrix}\\ &=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\ &= @@ -100,9 +96,7 @@ $$ $$ {#eq-blockedOmega} where the diagonal elements of $\mathbf{L}_\theta$ are chosen to be positive. -(It is assumed that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.) - -(In `MixedModels.jl` the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) +(We assume that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.) As shown in @bates2025mixed, the objective to be optimized, on the scale of the deviance, which is negative twice the profiled log-likelihood, can be expressed as @@ -131,7 +125,49 @@ $$ $$ {#eq-objective} where $c_r$ is likewise a constant. -This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_{ZZ}}$. +This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_\boldsymbol{\theta}}$. + +### Reformulation for evaluation of derivatives + +When differentiating the ML or REML objective with respect to elements of $\boldsymbol{\theta}$, it is convenient to amalgamate the blocks of $\boldsymbol{\Omega_\theta}$ derived from $\mathbf{X}$ and $\mathbf{y}$ and to re-express @eq-blockedOmega as + +$$ +\begin{aligned} +\boldsymbol{\Omega_\theta}&= +\begin{bmatrix} +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}& +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top[Xy]}\\ +\mathbf{[Xy]^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{[Xy]^\top[Xy]} +\end{bmatrix} + +\begin{bmatrix} +\mathbf{I} & \mathbf{0}\\ +\mathbf{0} & \mathbf{0} +\end{bmatrix}\\ +&=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\ +&= +\begin{bmatrix} +\mathbf{L_{Z,Z}} & \mathbf{0}\\ +\mathbf{L_{Xy,Z}} & \mathbf{L_{Xy,Xy}} +\end{bmatrix} +\begin{bmatrix} +\mathbf{L_{Z,Z}} & \mathbf{0} \\ +\mathbf{L_{Xy,Z}} & \mathbf{L_{Xy,Xy}} +\end{bmatrix}^\top +\end{aligned} +$$ {#eq-blockedOmega_mod} + +where $\mathbf{[Xy]}$ represents the $n\times(p+1)$ matrix that is the horizontal concatenation of $\mathbf{X}$ and $\mathbf{y}$. 
+The matrices $\mathbf{A_{11}}=\mathbf{Z^\top Z}$, $\mathbf{A_{21}}=\mathbf{[Xy]^\top Z}$ and $\mathbf{A_{22}}=\mathbf{[Xy]^\top[Xy]}$, assembled as + +$$ +\mathbf{A}=\begin{bmatrix} +\mathbf{A_{11}} & \mathbf{A_{21}^\top}\\ +\mathbf{A_{21}} & \mathbf{A_{22}} +\end{bmatrix} , +$$ {#eq-Amat} + +are precomputed and stored as the `A` property in a `LinearMixedModel` object when random effects are associated with a single grouping factor. ### General expressions for differentiating a Cholesky factor {#sec-Cholesky_derivative} @@ -178,8 +214,10 @@ Load the packages to be used #| label: load_packages #| warning: false #| output: false +using BenchmarkTools using CairoMakie using FiniteDiff +using ForwardDiff using LinearAlgebra using MixedModels using MixedModelsDatasets: dataset @@ -408,7 +446,7 @@ A2 = let blk = m02.A end ``` -The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol(\theta))$ is +The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol\theta)$ is ```{julia} function Λ2(θ::Vector{Float64}) @@ -475,6 +513,61 @@ prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ2([1.,1.])), Lsparse') dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) ``` +### Sleepstudy - a single vector-valued random-effects term + +```{julia} +m03 = fit(MixedModel, @formula(reaction ~ 1 + days + (1+days|subj)), dataset(:sleepstudy); progress) +θ03 = m03.θ +print(m03) +``` + +For this model we create a function to evaluate the gradient components. +First, the gradient of the logarithm of the square of the determinant of $\mathbf{L_{1,1}}$ + +```{julia} +ldfun(θ::Vector{Float64}) = logdet(updateL!(setθ!(m03, θ))) +ldfun(θ03) +``` + +A finite-difference approximation to the gradient is + +```{julia} +FiniteDiff.finite_difference_gradient(ldfun, θ03) +``` + +with the corresponding analytic value + +```{julia} +include("gradient.jl") +grad_comp(updateL!(setθ!(m03, θ03))) # need to reset θ after the finite_difference operation +``` + + +#### Speed of evaluation + +First, the evaluation of the log-determinant of $\mathbf{L_{1,1}}$ + +```{julia} +@benchmark logdet(updateL!(setθ!($m03, $θ03))) seconds=1 +``` + +Then the subsequent evaluation of the gradient of the log-determinant + +```{julia} +@benchmark grad_comp($m03) seconds=1 +``` + +For comparison, the finite-difference evaluation +```{julia} +@benchmark FiniteDiff.finite_difference_gradient($ldfun, $θ03) seconds=1 +``` + +and the forward-difference version + +```{julia} +@benchmark ForwardDiff.gradient($m03, $θ03) seconds=1 +``` + ### References {.unnumbered} ::: {#refs} diff --git a/gradients/gradient.jl b/gradients/gradient.jl new file mode 100644 index 000000000..8b0a8ff79 --- /dev/null +++ b/gradients/gradient.jl @@ -0,0 +1,39 @@ +using LinearAlgebra +using MixedModels +using MixedModelsDatasets: dataset + +""" + grad_comp(m::LinearMixedModel) + +Returns the gradient of the log-determinant part of the objective for `m`, +which must have a single, vector-valued random-effects term. 
+""" +function grad_comp(m::LinearMixedModel{T}) where {T} + (; reterms, parmap, A, L) = m + A11 = first(A).data + L11 = first(L).data + λ = only(reterms).λ # checks that there is exactly one random-effects term + λdot = similar(λ) + face = similar(λ.data) + grad = zeros(T, length(parmap)) + for (p, pm) in enumerate(parmap) + fill!(λdot, zero(T)) + λdot[pm[2], pm[3]] = one(T) + for k in axes(A11, 3) # loop over faces of A[1].data + rmul!(lmul!(λ', copyto!(face, view(A11, :, :, k))), λdot) + for i in axes(face, 1) # symmetrize the face and double the diagonal + for j in 1:(i - 1) + ijsum = face[i, j] + face[j, i] + face[j, i] = face[i, j] = ijsum + end + face[i, i] *= 2 + end + Lface = LowerTriangular(view(L11, :, :, k)) + rdiv!(ldiv!(Lface, face), Lface') + for i in diagind(face) + grad[p] += face[i] + end + end + end + return grad +end \ No newline at end of file From cfba091042cc8f3f8ec4dd4af3e2e3b5883d46a1 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Tue, 13 Jan 2026 19:04:52 -0600 Subject: [PATCH 15/15] Expand docs, start src/gradient.jl --- gradients/GradientEvaluation.qmd | 41 ++++++-- gradients/Gradient_based_optimization.qmd | 123 ++++++++++++++++++---- src/MixedModels.jl | 1 + src/gradient.jl | 56 ++++++++++ test/grad.jl | 73 +++++++++++++ test/runtests.jl | 1 + 6 files changed, 269 insertions(+), 26 deletions(-) create mode 100644 src/gradient.jl create mode 100644 test/grad.jl diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index 174c786c4..9c11192ff 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -220,6 +220,7 @@ using FiniteDiff using ForwardDiff using LinearAlgebra using MixedModels +using MixedModels: Omega_dot_diag_block! using MixedModelsDatasets: dataset using TypedTables: Table @@ -251,11 +252,13 @@ print(m01) ```{julia} #| label: fig-obj_graph -#| fig-cap: "Graph of the objective for model m01 as a function of θ₁" +#| fig-cap: "Graph of the objective for model m01 as a function of θ₁. The light blue horizontal line is at the minimum of the objective. The vertical line is at the parameter estimate." #| code-fold: true let f = Figure() ax = Axis(f[1,1], xlabel="θ₁", ylabel="objective") lines!(ax, -0.25..1.5, objective!(m01)) + hlines!(objective(updateL!(setθ!(m01, θ))); alpha=0.4) + vlines!(only(θ); alpha=0.4) f end ``` @@ -264,7 +267,7 @@ Notice that the objective is well-defined for negative values of $\theta_1$ and This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. -At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$ +At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$, which is the ratio of the standard deviation of the random effects to the residual standard deviation, is ```{julia} #| label: dyestuff_theta @@ -352,12 +355,16 @@ dΩdθ1([1.]) ``` Notice that this matrix, like $\mathbf{A}$, is symmetric and has the same block structure as $\mathbf{A}$. -In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. +In fact, the $[2,1]$ block of this matrix is exactly the same as the $[2,1]$ block of $\mathbf{A}$. 
-Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to +We do the premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ in two stages, to check the form of the intermediate result ```{julia} -prePhi = rdiv!(ldiv!(L, dΩdθ1([1.])), L') +prePhi = ldiv!(L, dΩdθ1([1.])) +``` + +```{julia} +rdiv!(prePhi, L') ``` The derivative of the objective at $\theta_1=1$ is, therefore @@ -386,6 +393,25 @@ with the derivative being evaluated as dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) ``` +#### Using blocked factors + +It would not be practical to create the `sparseL` matrix and a dense copy of $\mathbf{A}$, as is done here, for general cases. +The whole point of using a blocked Cholesky factor for evaluating the objective is that the $[1,1]$ block is either diagonal or uniform-block-diagonal, which can result in considerable savings of object size and execution time when evaluating the objective. + +We would want to retain this time and memory savings when evaluating the gradient by creating a blocked computation for evaluation of the gradient. + +The blocked representations of both the $\mathbf{A}$ and $\mathbf{L}$ arrays store only the lower triangle. +It appears that it will be necessary to store the upper triangle in addition to the lower triangle to be able to evaluate the gradient. +There may be a clever way around this but right now I can't see one. + +To create the blocked representation of $\Omega$ we begin with the diagonal block in which the parameter component to be differentiated occurs. +It is assumed that the block has already been allocated. + +```{julia} +dΩdθ1_blk = Omega_dot_diag_block!(similar(first(m01.A)), m01, 1) +``` + + ### Penicillin - two completely crossed scalar random-effects terms The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. 
@@ -444,6 +470,7 @@ Again, we create the full Gram matrix $\mathbf{A}$ from the blocks stored in the A2 = let blk = m02.A hvcat(3, first(blk), blk[2]', blk[4]', blk[2], blk[3], blk[5]', blk[4], blk[5], blk[6]) end +Int.(A2) ``` The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol\theta)$ is @@ -453,7 +480,7 @@ function Λ2(θ::Vector{Float64}) length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) return Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6), ones(2))) end -Λ2([1.,1.]) +Int.(Λ2([1.,1.])) ``` producing $\boldsymbol{\Omega}$ as @@ -463,7 +490,7 @@ function Ω2(θ) length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) return Λ2(θ) * A2 * Λ2(θ)' + Diagonal(vcat(ones(30), zeros(2))) end -Ω2([1.,1.]) +Int.(Ω2([1.,1.])) ``` The partial derivatives of `Ω2` are constants diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index d3167307f..96ee4eb08 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -48,6 +48,8 @@ Load the packages to be used ```{julia} #| label: load_packages +#| output: false +using BenchmarkTools using ForwardDiff using MixedModels using MixedModels: fd_deviance @@ -56,7 +58,7 @@ using NLopt using Tables: table using TypedTables: Table -const progress = false +const progress = isinteractive() ``` ## Examples {#sec-examples} @@ -64,6 +66,7 @@ const progress = false We create a function to take a `LinearMixedModel` that has been fit and refit it using the `:LD_LBFGS` optimizer applied to an objective function that evaluates the gradient using `ForwardDiff`. ```{julia} +#| output: false addinds(ch::Char, n::Integer) = Symbol.(lpad.(string.(ch, Base.OneTo(n)), ndigits(n), '0')) function gr_refit!(m::LinearMixedModel{T}) where {T} θ = copy(m.optsum.initial) @@ -83,8 +86,9 @@ function gr_refit!(m::LinearMixedModel{T}) where {T} return val end opt = NLopt.Opt(:LD_LBFGS, k) - NLopt.ftol_rel!(opt, 1.e-12) + NLopt.ftol_rel!(opt, 1.e-10) NLopt.ftol_abs!(opt, 1.e-8) + NLopt.initial_step!(opt, fill(0.5, k)) NLopt.min_objective!(opt, obj) min_f, min_x, ret = NLopt.optimize(opt, θ) header = vcat([:obj], addinds('θ', k), addinds('g', k)) @@ -98,31 +102,32 @@ Define a model for the `penicillin` data ```{julia} #| label: m1 -m1 = fit( +pnm01 = fit( MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), dataset(:penicillin); progress, ) -print(m1) +pnm01_obj = objective(pnm01) +print(pnm01) ``` for which the optimization summary is ```{julia} -m1.optsum +pnm01.optsum ``` and refit the model using ForwardDiff gradient evaluations. 
```{julia} -fitlog = gr_refit!(m1) +fitlog = gr_refit!(pnm01) ``` -The objective at convergence is +The objective at convergence, compared to the derivative-free optimum ```{julia} -last(fitlog.obj) +pnm01_obj - last(fitlog.obj) ``` and the last few evaluations are @@ -131,23 +136,89 @@ and the last few evaluations are last(fitlog, 5) ``` +### Pastes {#sec-pastes} + +```{julia} +psm01 = fit(MixedModel, @formula(strength ~ 1 + (1|batch/cask)), dataset(:pastes); progress) +psm01_obj = objective(psm01) +print(psm01) +``` + +```{julia} +psm01.optsum +``` + +```{julia} +fitlog = gr_refit!(psm01) +``` + +```{julia} +psm01_obj - last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` + +### Insteval {#sec-insteval} + +```{julia} +insteval = dataset(:insteval) +contrasts = Dict(:service => EffectsCoding()) +iem01 = fit( + MixedModel, + @formula(y ~ 1 + service + (1|s) + (1|d) + (1|dept)), + insteval; + progress, contrasts, +) +iem01_obj = objective(iem01) +print(iem01) +``` + +```{julia} +iem01.optsum +``` + +```{julia} +fitlog = gr_refit!(iem01) +``` + +```{julia} +iem01_obj - last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` + +This is an example where the number of evaluations to convergence is lower when using the gradient but the time to fit the model is much greater - primarily because the ForwardDiff gradient allocates so much memory. + +```{julia} +@benchmark refit!($iem01; progress=false) seconds=10 +``` + +```{julia} +@benchmark gr_refit!($iem01) seconds=30 +``` + ### Sleepstudy {#sec-sleepstudy} ```{julia} -m2 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) -print(m2) +slm01 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) +slm01_obj = objective(slm01) +print(slm01) ``` ```{julia} -m2.optsum +slm01.optsum ``` ```{julia} -fitlog = gr_refit!(m2) +fitlog = gr_refit!(slm01) ``` ```{julia} -last(fitlog.obj) +slm01_obj - last(fitlog.obj) ``` ```{julia} @@ -158,33 +229,47 @@ last(fitlog, 5) ```{julia} # this model is very overparameterized, but it's a test example -m3 = fit( +kbm01 = fit( MixedModel, @formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), dataset(:kb07); progress, ) -print(m3) +print(kbm01) ``` ```{julia} -m3.optsum +kbm01.optsum ``` Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector ```{julia} -findall(iszero, m3.θ) +findall(iszero, kbm01.θ) ``` +Refitting with the gradient takes a very long time because ForwardDiff is poorly suited to optimization problems with many parameters. ```{julia} -fitlog = gr_refit!(m3) +#| eval: false +fitlog = gr_refit!(kbm01) ``` ```{julia} +#| eval: false last(fitlog.obj) ``` ```{julia} +#| eval: false last(fitlog, 5) -``` \ No newline at end of file +``` + +## Conclusions {#sec-conclusions} + +Generally the gradient-based optimizers converge in fewer evaluations than the derivative-free optimizers (`psm01` in @sec-pastes is an exception). +Although the `ftol_rel` criterion is looser for the gradient-based optimizer it usually achieves a lower optimum value, as shown by the differences like `pnm01_obj - last(fitlog.obj)` being positive. + +I think the most interesting result is for the `insteval` data where the three-parameter optimization takes 81 function evaluations for `LN_NEWUOA` but 34 evaluations for `LD_LBFGS`. 
+However, the ForwardDiff gradient evaluation takes much longer because it allocates so much memory (and it may be using a non-BLAS Cholesky factorization of $1128\times1128$ symmetric matrix). + +I think this is a case where an analytic gradient could be useful. diff --git a/src/MixedModels.jl b/src/MixedModels.jl index 61d184c84..f3400bc98 100644 --- a/src/MixedModels.jl +++ b/src/MixedModels.jl @@ -214,6 +214,7 @@ include("blockdescription.jl") include("grouping.jl") include("mimeshow.jl") include("serialization.jl") +include("gradient.jl") include("profile/profile.jl") include("MixedModelsNLoptExt.jl") using .MixedModelsNLoptExt diff --git a/src/gradient.jl b/src/gradient.jl new file mode 100644 index 000000000..b6c68bdbb --- /dev/null +++ b/src/gradient.jl @@ -0,0 +1,56 @@ +# Evaluate analytic gradient of the objective for ML or REML fitting of a LinearMixedModel + +""" + Omega_dot_diag_block!(blk, m::LinearMixedModel, p::Integer) + +Fill `blk` as the non-zero diagonal block of ∂Ω/∂θₚ for parameter number `p` of model `m`. + +For any `p` only one diagonal block of ∂Ω/∂θₚ will be non-zero. +""" +function Omega_dot_diag_block!( + blk::Diagonal{T, Vector{T}}, + m::LinearMixedModel{T}, + p::Integer, +) where {T} + (; parmap, A, reterms) = m + b, i, j = parmap[p] + isone(i) && isone(j) || throw(ArgumentError("parameter $b should be from a scalar r.e. term")) + blk_diag = blk.diag + A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal + length(blk_diag) == length(A_diag) || throw(DimensionMismatch("A_diag and blk_diag have different lengths")) + λ = only(reterms[b].λ) # will throw an error if reterms[b] is not of size (1,1) + for k in eachindex(blk_diag, A_diag) + blk_diag[k] = T(2) * λ * A_diag[k] + end + return blk +end + +function Omega_dot_diag_block!( + blk::UniformBlockDiagonal{T}, + m::LinearMixedModel{T}, + p::Integer, +) where {T} + (; parmap, A, reterms) = m + b, i, j = parmap[p] + Ablk = A[kp1choose2(b)] + if !isa(Ablk, UniformBlockDiagonal{T}) + throw(ArgumentError("parmap[p] = $(parmap[p]) but A[$(kp1choose2(b))] is not UniformBlockDiagonal")) + end + blk_dat = fill!(blk.data, zero(T)) + Ablk_dat = Ablk.data + λ = reterms[b].λ + for k in axes(blk_dat, 3) + # right multiply by λ-dot-transpose, which is zeros except for a single 1 at the i'th column and j'th row + # thus we copy the i'th column of the k'th face of Ablk_dat into the j'th column of the k'th face of blk_dat + copyto!(view(blk_dat, :, i, k), view(Ablk_dat, :, j, k)) + lmul!(λ, view(blk_dat, :, :, k)) # left-multiply by λ + for jj in axes(λ, 2) # symmetrize the face while multiplying the diagonal by 2 + for ii in 1:(jj - 1) + val = blk_dat[ii, jj, k] + blk_dat[jj, ii, k] + blk_dat[ii, jj, k] = blk_dat[jj, ii, k] = val + end + blk_dat[jj, jj, k] *= T(2) + end + end + return blk +end diff --git a/test/grad.jl b/test/grad.jl new file mode 100644 index 000000000..1e036be7d --- /dev/null +++ b/test/grad.jl @@ -0,0 +1,73 @@ +# using FiniteDiff +using LinearAlgebra +using MixedModels +using Test +using MixedModels: Omega_dot_diag_block! + +using MixedModelsDatasets: dataset + +include("modelcache.jl") + +@testset "gradient" begin + @testset "single_scalar" begin + fm1 = only(models(:dyestuff2)) + θ = fm1.θ + blk = Omega_dot_diag_block!(similar(first(fm1.A)), fm1, 1) + @test all(≈(10. 
* only(θ)), blk.diag) + Omega_dot_diag_block!(blk, updateL!(setθ!(fm1, ones(1))), 1) + @test all(==(10.), blk.diag) + updateL!(setθ!(fm1, θ)) # restore the estimated parameter values + + fm2 = first(models(:pastes)) + θ = fm2.θ + blk = Omega_dot_diag_block!(similar(first(fm2.A)), fm2, 1) + @test all(≈(4. * only(θ)), blk.diag) + Omega_dot_diag_block!(blk, updateL!(setθ!(fm2, ones(1))), 1) + @test all(==(4.), blk.diag) + updateL!(setθ!(fm2, θ)) + + fm3 = last(models(:pastes)) # first of two nested, scalar r.e. terms + θ = fm3.θ + blk = Omega_dot_diag_block!(similar(first(fm3.A)), fm3, 1) + @test all(≈(4. * first(θ)), blk.diag) + + fm4 = only(models(:penicillin)) + blk = Omega_dot_diag_block!(similar(first(fm4.A)), fm4, 1) + @test all(≈(12. * first(fm4.θ)), blk.diag) + + fm5 = first(models(:sleepstudy)) + blk = Omega_dot_diag_block!(similar(first(fm5.A)), fm5, 1) + @test all(≈(20. * only(fm5.θ)), blk.diag) + end + @testset "single_vector" begin + fm6 = last(models(:sleepstudy)) + λ = only(fm6.reterms).λ + θ = fm6.θ + blk = Omega_dot_diag_block!(UniformBlockDiagonal(similar(first(fm6.L).data)), fm6, 1) + blk_dat = blk.data + A11_dat = first(fm6.A).data + @test all(≈(20. * first(θ)), view(blk_dat, 1, 1, :)) + @test all(iszero, view(blk_dat, 2, 2, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk_dat, 2, 1, :)) + odiag = dot(view(λ, 2, :), view(A11_dat, :, 1, 1)) + @test all(≈(odiag), view(blk_dat, 1, 2, :)) + + Omega_dot_diag_block!(blk, fm6, 2) + @test all(iszero, view(blk_dat, 1, 1, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk, 2, 1, :)) # result should be symmetric + @test all(==(10. * first(θ)), view(blk_dat, 1, 2, :)) + diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 1, 1)) + @test all(≈(diag2), view(blk_dat, 2, 2, :)) + + Omega_dot_diag_block!(blk, fm6, 3) + @test all(iszero, view(blk_dat, 1, 1, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk, 2, 1, :)) # faces of result should be symmetric + @test all(≈(45. * first(θ)), view(blk_dat, 1, 2, :)) + diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 2, 1)) + @test all(≈(diag2), view(blk_dat, 2, 2, :)) + +# FiniteDiff.finite_difference_gradient(objective!(fm6), θ) +# ldfun(m::LinearMixedModel, θ::Vector{Float64}) = logdet(updateL!(setθ!(m, θ))) + + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 0631ca3a0..20cb6d4f5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -47,6 +47,7 @@ include("mime.jl") include("optsummary.jl") include("predict.jl") include("sigma.jl") +include("grad.jl") @testset "PRIMA" include("prima.jl") @testset "ForwardDiff" include("forwarddiff.jl")