diff --git a/NEWS.md b/NEWS.md index c46f64194..935584ffa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +- Additional methods for pre-allocated result arrays and `*Config` instances have been added to the ForwardDiff extension. [#871]. + MixedModels v5.2.0 Release Notes ============================== - The use of the `wts` keyword argument has been deprecated in favor of the keyword argument `weights`, in line with the deprecation in GLM.jl v1.9.1. The usage (and subsequent interpretation) remains otherwise unchanged. [#873] @@ -714,4 +716,5 @@ Package dependencies [#864]: https://github.com/JuliaStats/MixedModels.jl/issues/864 [#865]: https://github.com/JuliaStats/MixedModels.jl/issues/865 [#867]: https://github.com/JuliaStats/MixedModels.jl/issues/867 +[#871]: https://github.com/JuliaStats/MixedModels.jl/issues/871 [#873]: https://github.com/JuliaStats/MixedModels.jl/issues/873 diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index c3263e620..149bfe355 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -30,7 +30,10 @@ using LinearAlgebra: LinearAlgebra, using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file -using ForwardDiff: ForwardDiff +using ForwardDiff: ForwardDiff, + Chunk, + GradientConfig, + HessianConfig using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -59,6 +62,16 @@ const FORWARDDIFF = """ should be included is currently still being decided. """ +##### +##### Gradients +##### + +function ForwardDiff.GradientConfig( + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) +) where {T} + return GradientConfig(fd_deviance(model), x, chunk) +end + """ ForwardDiff.gradient(model::LinearMixedModel) @@ -68,9 +81,29 @@ values. $(FORWARDDIFF) """ function ForwardDiff.gradient( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T,CHK} + return ForwardDiff.gradient!(similar(model.θ), model, θ, cfg, check) +end + +function ForwardDiff.gradient!(result::AbstractArray, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T,CHK} + return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) +end + +##### +##### Hessians +##### + +function ForwardDiff.HessianConfig( + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) ) where {T} - return ForwardDiff.gradient(fd_deviance(model), θ) + return HessianConfig(fd_deviance(model), x, chunk) end """ @@ -82,9 +115,20 @@ values. 
$(FORWARDDIFF) """ function ForwardDiff.hessian( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ -) where {T} - return ForwardDiff.hessian(fd_deviance(model), θ) + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T,CHK} + n = length(θ) + return ForwardDiff.hessian!(Matrix{T}(undef, n, n), model, θ, cfg, check) +end + +function ForwardDiff.hessian!(result::AbstractArray, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T,CHK} + return ForwardDiff.hessian!(result, fd_deviance(model), θ, cfg, check) end ##### diff --git a/gradients/.gitignore b/gradients/.gitignore new file mode 100644 index 000000000..7d064f72a --- /dev/null +++ b/gradients/.gitignore @@ -0,0 +1,4 @@ +*.html +*\~ +*.swp + diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd new file mode 100644 index 000000000..ac652277d --- /dev/null +++ b/gradients/GradientEvaluation.qmd @@ -0,0 +1,481 @@ +--- +title: "Gradient of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. +format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +A comparison of algorithms for estimation of variance components given in the supplemental materials for @Zhou03042019 shows the Fisher scoring algorithm taking the fewest iterations to convergence compared to an EM algorithm and the minorization-maximization (MM) algorithm presented in that paper. +The model being simulated in @Zhou03042019, sec 3.2 is relatively simple, with random effects for two factors and their interaction in a balanced crossed design. + +The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and in [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. +The optimization itself is performed by a derivative-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. + +Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. + +The purpose of this note is to provide an alternative derivation for the gradient of the profiled log-likelihood and of the REML criterion for linear mixed-effects models, along with concise algorithms for evaluation of these gradients. 
+ +### Model definition and evaluation of the objective and gradient + +The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as + +$$ +\begin{aligned} + \mathbfcal{B}&\sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma})\\ + (\mathbfcal{Y}|\mathbfcal{B}=\mathbf{b})& + \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right) +\end{aligned} +$$ {#eq-dists} + +where $\mathbf{X}$ is an $n\times p$ model matrix for the fixed-effects parameter vector, $\boldsymbol{\beta}$, and $\mathbf{Z}$ is an $n\times q$ model matrix for the random effects, $\mathbf{b}$. +Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is positive semi-definite. +We express it as + +$$ +\boldsymbol{\Sigma} = \sigma^2 +\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} +$$ {#eq-Sigma} + +for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. + +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower-triangular Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship + +$$ +\begin{aligned} +\boldsymbol{\Omega_\theta}&= +\begin{bmatrix} +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}& +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} & +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top y}\\ +\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{X^\top X} & +\mathbf{X^\top y}\\ +\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{y^\top X} & +\mathbf{y^\top y}\\ +\end{bmatrix}\\ +&=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\ +&= +\begin{bmatrix} +\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ +\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ +\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} +\end{bmatrix} +\begin{bmatrix} +\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ +\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ +\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} +\end{bmatrix}^\top +\end{aligned} +$$ {#eq-blockedOmega} + +where the diagonal elements of $\mathbf{L}_\theta$ are chosen to be positive. +(It is assumed that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.) + +(In `MixedModels.jl` the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) + +As shown in @bates2025mixed, the objective to be optimized, on the scale of the deviance, which is negative twice the profiled log-likelihood, can be expressed as + +$$ +\begin{aligned} +-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})&= +\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right]\\ +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c_\ell\\ +&=2\sum_{j=1}^q\log L_{j,j} + 2n \log L_{q+p+1,q+p+1} + c_\ell +\end{aligned} +$$ {#eq-objective} + +where $c_\ell$ is a constant. 
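For a fitted model this identity is easy to check numerically; a minimal sketch (not evaluated here), assuming a `LinearMixedModel` `m` that has been fit by maximum likelihood, with `sparseL` extracting the full blocked Cholesky factor as in the examples below:

```{julia}
#| eval: false
using LinearAlgebra, MixedModels

# `m` is a placeholder for a previously fitted LinearMixedModel (ML fit, not REML)
d = diag(sparseL(m; full=true))    # diagonal of the blocked Cholesky factor L
n = nobs(m)                        # number of observations
q = length(d) - length(m.β) - 1    # number of columns of Z
c = n * (1 + log(2π / n))          # the constant c_ℓ from the first line above
2sum(log, d[1:q]) + 2n * log(last(d)) + c ≈ objective(m)
```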
+That is, the objective is an affine function (a linear function plus a constant) of the logarithms of the diagonal elements of $\mathbf{L}_\boldsymbol{\theta}$. +It happens that the gradient of the objective, as a function of $\boldsymbol{\theta}$, expressed in this form is straightforward to evaluate, as shown in @sec-Cholesky_derivative. + +As shown in @bates.maechler.etal:2015, sec 3.4 the REML criterion, which some prefer for parameter estimation, can be written as + +$$ +\begin{aligned} +-2\mathcal{L}_R(\boldsymbol{\theta}|\mathbf{y})&= +\log\left(\left|\mathbf{L_{ZZ}}\right|^2\left|\mathbf{L_{XX}}\right|^2\right) + (n-p) \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n-p}\right)\right]\\ +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + \log\left|\mathbf{L_{XX}}\right|^2 + (n-p)\log\ell^2_{\mathbf{yy}} + c_r\\ +&=2\sum_{j=1}^{q+p}\log L_{j,j} + 2(n-p)\log L_{q+p+1,q+p+1} + c_r +\end{aligned} +$$ {#eq-objective} + +where $c_r$ is likewise a constant. +This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_{ZZ}}$. + +### General expressions for differentiating a Cholesky factor {#sec-Cholesky_derivative} + +@murray2016differentiation, section 3.1, provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega. + +Repeating his derivation, with minor changes in notation, we express the relationship between the infinitesimals $d\boldsymbol{\Omega}$ and $d\mathbf{L}$ as + +$$ +d\boldsymbol{\Omega}=d\mathbf{L}\mathbf{L}^\top + \mathbf{L} d\mathbf{L}^\top +$$ {#eq-infinitesimal} + +Pre-multiplying @eq-infinitesimal by $\mathbf{L}^{-1}$ and post-multiplying by $\mathbf{L}^{-\top}$ gives + +$$ +\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}=\mathbf{L}^{-1}d\mathbf{L} + d\mathbf{L}^\top\mathbf{L}^{-\top} +$$ {#eq-LOmegaLT} + +The first addend on the right-hand side of @eq-LOmegaLT is lower triangular and the second addend is the transpose of the first. +Thus, the diagonal of the left-hand side is exactly the result we wish to evaluate, twice the infinitesimal of the logarithms of the diagonal elements of $\mathbf{L}$. + +For completeness, we provide the conclusion of the derivation in @murray2016differentiation but we don't need the more general result of $d\mathbf{L}$ - we only need the particular result from the left-hand side of @eq-LOmegaLT. + +To evaluate $d\mathbf{L}$ we must isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, on the right-hand side of @eq-LOmegaLT, which we do with the $\Phi$ transformation applied to a symmetric matrix. +This transformation preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle. +Applied to the right-hand side of @eq-LOmegaLT, the $\Phi$ transformation isolates the first addend, providing + +$$ +\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right)=\mathbf{L}^{-1}d\mathbf{L} +$$ {#eq-Phi_dOmega} + +or + +$$ +d\mathbf{L}=\mathbf{L}\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right) +$$ {#eq-dL} + +## Examples + +To aid in understanding the structure of these equations we consider the structure of the various matrices and their blocks in some simple examples. 
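Before turning to specific data sets, note that the $\Phi$ transformation and @eq-dL translate directly into code; a minimal sketch (the helper names `Phi` and `dL` are ours, not part of MixedModels.jl):

```{julia}
#| eval: false
using LinearAlgebra

# Illustrative helpers, not part of MixedModels.jl
# Φ: keep the strict lower triangle, halve the diagonal, zero the strict upper triangle
function Phi(M::AbstractMatrix)
    P = tril(Matrix(M))    # lower triangle, including the diagonal
    for i in axes(P, 1)
        P[i, i] /= 2       # halve the diagonal elements
    end
    return P
end

# dL = L Φ(L⁻¹ dΩ L⁻ᵀ) from @eq-dL; only diag(L⁻¹ dΩ L⁻ᵀ) is needed for the gradient
dL(L::LowerTriangular, dΩ::AbstractMatrix) = L * Phi(L \ (dΩ / L'))
```

In the gradient evaluations that follow only the diagonal of $\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}$ is used, so `dL` itself is never formed.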
+ +Load the packages to be used + +```{julia} +#| label: load_packages +#| warning: false +#| output: false +using CairoMakie +using FiniteDiff +using LinearAlgebra +using MixedModels +using MixedModelsDatasets: dataset +using TypedTables: Table + +const progress = isinteractive() # suppress progress bars in non-interactive sessions +CairoMakie.activate!(type="svg") # use svg graphics output +``` + +### Dyestuff - a single, scalar random-effects term + +The `dyestuff` data set provides the yield of dyestuff in each of 5 samples from each of 6 batches of an intermediate product in the process of producing a dye. + +```{julia} +#| label: dyestuff_data +dyestuff = Table(dataset(:dyestuff)) +``` + +A mixed-effects model for these data includes an overall "intercept" term (whose estimate will be the sample mean because of the balanced design) and random effects for each level of `batch`. + +```{julia} +#| label: dyestuff_model +m01 = fit(MixedModel, @formula(yield ~ 1 + (1|batch)), dyestuff; progress) +θ = m01.θ # retain a copy of the estimate of θ +print(m01) +``` + +#### The objective as a function of $\theta_1$ + +@fig-obj_graph shows the objective (negative twice the log-likelihood) of this model as a function of $\theta_1$, the relative covariance parameter. + +```{julia} +#| label: fig-obj_graph +#| fig-cap: "Graph of the objective for model m01 as a function of θ₁" +#| code-fold: true +let f = Figure() + ax = Axis(f[1,1], xlabel="θ₁", ylabel="objective") + lines!(ax, -0.25..1.5, objective!(m01)) + f +end +``` + +Notice that the objective is well-defined for negative values of $\theta_1$ and that it is an even function, in the sense that $f(-\theta_1)=f(\theta_1)\,\forall\theta_1$. + +This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. + +At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$ + +```{julia} +#| label: dyestuff_theta +only(θ) +``` + +the derivative should also be zero (in practice, close to zero). + +We can see from @fig-obj_graph that the derivative at $\theta_1=1.$ will be positive. + +To show the evaluation of the gradient at $\boldsymbol{\theta}=[1.]$ we reset the parameter in the model object to $[1.]$ + +```{julia} +#| output: false +updateL!(setθ!(m01, ones(1))); +``` + +#### Evaluating the gradient terms + +In `MixedModels.jl` the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of + +```{julia} +ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) +Int.(ZXy) # Int for more concise printing +``` + +is stored as the $\mathbf{A}$ property of the model object. + +Both $\mathbf{A}$ and $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern + +```{julia} +BlockDescription(m01) +``` + +In @eq-Sigma the matrix $\boldsymbol{\Lambda_\theta}$ is defined as a $6\times6$ diagonal matrix. +Here, for convenience, we extend it to an $8\times8$ matrix with a trailing diagonal block that is the identity, so that multiplication by $\boldsymbol{\Lambda_\theta}$ applies to the full matrix $\mathbf{A}$. + +```{julia} +Λ(θ) = Diagonal(vcat(fill(only(θ), 6), ones(eltype(θ), 2))) +Λ([1.]) +``` + +The derivative of $\Lambda$ with respect to the first (and only) element of $\boldsymbol\theta$ is + +```{julia} +dΛdθ1 = Diagonal(vcat(ones(6), zeros(2))) +``` + +The matrix $\mathbf{A}$ from which $\boldsymbol{\Omega_\theta}$ is generated is stored in blocks. 
+We assemble these into a sparse matrix as + +```{julia} +A = sparse(hvcat(2, first(m01.A), m01.A[2]', m01.A[2], last(m01.A))) +``` + +and check that these blocks are indeed the Gram matrix of the columns of $\mathbf{[ZXy]}$ +```{julia} +A == ZXy'ZXy +``` + +The matrix $\boldsymbol{\Omega_\theta}$ is + +```{julia} +Ω(θ) = Λ(θ) * A * Λ(θ)' + Diagonal(vcat(ones(6), zeros(2))) +Ω([1.]) +``` + +The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix as + +```{julia} +L = LowerTriangular(sparseL(m01; full=true)) +``` + +We can check that it is indeed the lower Cholesky factor + +```{julia} +L * L' ≈ Ω([1.]) +``` + +The derivative of $\boldsymbol{\Omega}$ with respect to $\theta$ is + +```{julia} +dΩdθ1(θ) = dΛdθ1 * A * Λ(θ)' + Λ(θ) * A * dΛdθ1 +dΩdθ1([1.]) +``` + +Notice that this matrix, like $\mathbf{A}$, is symmetric and has the same block structure as $\mathbf{A}$. +In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. + +Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to + +```{julia} +prePhi = rdiv!(ldiv!(L, dΩdθ1([1.])), L') +``` + +The derivative of the objective at $\theta_1=1$ is, therefore + +```{julia} +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) +``` + +which we can compare to the finite-difference evaluation + +```{julia} +FiniteDiff.finite_difference_gradient(objective!(m01), [1.]) +``` + +If we repeat these steps at the parameter estimate we have +```{julia} +#| output: false +updateL!(setθ!(m01, θ)) # reset the value of θ in the model +L = LowerTriangular(sparseL(m01; full=true)) +prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') +``` + +with the derivative being evaluated as + +```{julia} +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) +``` + +### Penicillin - two completely crossed scalar random-effects terms + +The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. + +```{julia} +#| label: penicillin_data +penicillin = Table(dataset(:penicillin)) +``` + +We construct a `LinearMixedModel` struct with a single fixed-effect parameter, representing the average diameter in the balanced design, and random effects for each `plate` and each `sample`, + +```{julia} +#| label: m02 +#| output: false +#| warn: false +m02 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +θ = m02.θ +print(m02) +``` + +for which the concatenated matrix $\left[\mathbf{ZXy}\right]$ is + +```{julia} +#| label: m02ZXy +Int.(hcat(collect(first(m02.reterms)), collect(last(m02.reterms)), m02.X, m02.y)) +``` + +in which the first 24 columns are the indicators for `plate`, the next 6 columns are the indicators for `sample`, the second-to-last column is the single column of the fixed-effects model matrix, $\mathbf{X}$, and the last column is $\mathbf{y}$. 
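The sequence used for the dyestuff model above (solve with $\mathbf{L}$, solve with $\mathbf{L}^\top$, take the diagonal, and form a weighted sum) is the same for every scalar parameter, so it can be collected into a small helper; a minimal sketch (the name `grad_component` and its argument list are ours, not part of MixedModels.jl):

```{julia}
#| eval: false
using LinearAlgebra

# Illustrative helper, not part of MixedModels.jl.
# One component of the gradient of the ML objective: L is the full lower Cholesky factor,
# dΩ the derivative of Ω with respect to one element of θ, q the number of columns of Z,
# p the number of columns of X, and n the number of observations.
function grad_component(L::LowerTriangular, dΩ::AbstractMatrix, q, p, n)
    d = diag(rdiv!(ldiv!(L, Matrix(dΩ)), L'))    # diagonal of L⁻¹ dΩ L⁻ᵀ
    return dot(d, vcat(ones(q), zeros(p), n))    # weights: 1 for Z rows, 0 for X rows, n for ℓ_yy
end
```

For model `m01`, `grad_component(L, dΩdθ1(θ), 6, 1, size(dyestuff, 1))` reproduces the dot product evaluated above, and the same weighted sum appears again, written out explicitly, for each of the two components below.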
+ +The Cholesky factor, $\mathbf{L}$, at the initial value $\boldsymbol\theta=\left[1,1\right]^\top$, can be expressed as a lower-triangular sparse matrix as + +```{julia} +#| label: m02L +Lsparse = LowerTriangular(sparseL(updateL!(setθ!(m02, ones(2))); full=true)) +``` + +In practice, the $\mathbf{L}$ matrix is stored in a blocked form + +```{julia} +#| label: m02_blocks +BlockDescription(m02) +``` + +from which the profiled objective (negative twice the log-likelihood) can be evaluated as + +```{julia} +#| label: m02L_initial_objective +objective(m02) +``` + +#### Evaluating terms in the gradient + +Again, we create the full Gram matrix $\mathbf{A}$ from the blocks stored in the `A` property of the model + +```{julia} +A2 = let blk = m02.A + hvcat(3, first(blk), blk[2]', blk[4]', blk[2], blk[3], blk[5]', blk[4], blk[5], blk[6]) +end +``` + +The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol(\theta))$ is + +```{julia} +function Λ2(θ::Vector{Float64}) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6), ones(2))) +end +Λ2([1.,1.]) +``` + +producing $\boldsymbol{\Omega}$ as + +```{julia} +function Ω2(θ) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Λ2(θ) * A2 * Λ2(θ)' + Diagonal(vcat(ones(30), zeros(2))) +end +Ω2([1.,1.]) +``` + +The partial derivatives of `Ω2` are constants + +```{julia} +#| output: false +dΛ2dθ1 = Diagonal(vcat(ones(24), zeros(8))) +dΛ2dθ2 = Diagonal(vcat(zeros(24), ones(6), zeros(2))) +``` + +For the ML objective (i.e. negative twice the log-likelihood) a finite difference gradient at the initial $\boldsymbol{\theta}=[1,1]^\top$ is + +```{julia} +FiniteDiff.finite_difference_gradient(objective!(m02), ones(2)) +``` + +To evaluate this quantity from the formula we create + +```{julia} +dΩ2dθ1(θ) = dΛ2dθ1 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ1' +Int.(dΩ2dθ1([1., 1.])) # Int to save space when printing +``` + + +```{julia} +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ1([1.,1.])), Lsparse') +``` + +yielding the first component of the gradient as + +```{julia} +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) +``` + +For the second component of the gradient + +```{julia} +dΩ2dθ2(θ) = dΛ2dθ2 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ2' +Int.(dΩ2dθ2([1., 1.])) +``` + +```{julia} +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ2([1.,1.])), Lsparse') +``` + +```{julia} +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) +``` + +### References {.unnumbered} + +::: {#refs} +::: diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd new file mode 100644 index 000000000..d3167307f --- /dev/null +++ b/gradients/Gradient_based_optimization.qmd @@ -0,0 +1,190 @@ +--- +title: "Gradient-based Optimization of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. 
+format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +Before devoting too much effort to efficient evaluation of the gradient of the profiled log-likelihood, we should check if using gradient-based optimization requires sufficiently fewer evaluations of the objective, and the gradient, than does derivative-free optimization. + +Here we fit a few models using automatic differentiation from [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) and the `ForwardDiff` extension to [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) to optimize the profiled log-likelihood with the `LD_LBFGS` optimizer from [NLopt.jl](https://github.com/jump-dev/NLopt.jl), instead of the default `LN_NEWUOA` which does not use gradients. + +The results are more-or-less a toss-up when using `ForwardDiff` to evaluate the gradient. +A more efficient evaluation of the gradient, taking advantage of the sparse-blocked structure of the Cholesky factorization to evaluate the profiled log-likelihood, may tip the balance in favor of gradient-based methods. + +## Preliminaries {#sec-prelim} + +Load the packages to be used + +```{julia} +#| label: load_packages +using ForwardDiff +using MixedModels +using MixedModels: fd_deviance +using MixedModelsDatasets: dataset +using NLopt +using Tables: table +using TypedTables: Table + +const progress = false +``` + +## Examples {#sec-examples} + +We create a function to take a `LinearMixedModel` that has been fit and refit it using the `:LD_LBFGS` optimizer applied to an objective function that evaluates the gradient using `ForwardDiff`. + +```{julia} +addinds(ch::Char, n::Integer) = Symbol.(lpad.(string.(ch, Base.OneTo(n)), ndigits(n), '0')) +function gr_refit!(m::LinearMixedModel{T}) where {T} + θ = copy(m.optsum.initial) + k = length(θ) + fitlog = sizehint!(T[], 50 * k) + grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) + function obj(θ::Vector{Float64}, grad::Vector{Float64}) + val = objective(updateL!(setθ!(m, θ))) + push!(fitlog, val) + append!(fitlog, θ) + if !isempty(grad) + ForwardDiff.gradient!(grad, m, θ, grad_config) + append!(fitlog, grad) + else + append!(fitlog, fill(NaN, k)) # never called with empty grad but just in case + end + return val + end + opt = NLopt.Opt(:LD_LBFGS, k) + NLopt.ftol_rel!(opt, 1.e-12) + NLopt.ftol_abs!(opt, 1.e-8) + NLopt.min_objective!(opt, obj) + min_f, min_x, ret = NLopt.optimize(opt, θ) + header = vcat([:obj], addinds('θ', k), addinds('g', k)) + return Table(table(transpose(reshape(fitlog, 2k + 1, :)); header)) +end +``` + +### Penicillin data {#sec-penicillin} + +Define a model for the `penicillin` data + +```{julia} +#| label: m1 +m1 = fit( + MixedModel, + @formula(diameter ~ 1 + (1|plate) + (1|sample)), + dataset(:penicillin); + progress, +) +print(m1) +``` + +for which the optimization summary is + +```{julia} +m1.optsum +``` + +and refit the model using ForwardDiff gradient evaluations. 
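The in-place methods that `gr_refit!` relies on can also be called directly; a minimal sketch (not evaluated here) using the `GradientConfig` and `ForwardDiff.gradient!` methods provided by the extension, applied to the model `m1` fit above:

```{julia}
#| eval: false
cfg = ForwardDiff.GradientConfig(m1, m1.θ)    # reusable configuration for this model and parameter length
g = similar(m1.θ)                             # pre-allocated result vector
ForwardDiff.gradient!(g, m1, m1.θ, cfg)       # gradient of the profiled deviance at the current θ
```

The refit with `gr_refit!` follows.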
+ +```{julia} +fitlog = gr_refit!(m1) +``` + +The objective at convergence is + +```{julia} +last(fitlog.obj) +``` + +and the last few evaluations are + +```{julia} +last(fitlog, 5) +``` + +### Sleepstudy {#sec-sleepstudy} + +```{julia} +m2 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) +print(m2) +``` + +```{julia} +m2.optsum +``` + +```{julia} +fitlog = gr_refit!(m2) +``` + +```{julia} +last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` + +### Kronmueller-Barr 2007 {#sec-kb07} + +```{julia} +# this model is very overparameterized, but it's a test example +m3 = fit( + MixedModel, + @formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), + dataset(:kb07); + progress, +) +print(m3) +``` + +```{julia} +m3.optsum +``` + +Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector + +```{julia} +findall(iszero, m3.θ) +``` + +```{julia} +fitlog = gr_refit!(m3) +``` + +```{julia} +last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` \ No newline at end of file diff --git a/gradients/Project.toml b/gradients/Project.toml new file mode 100644 index 000000000..a05f9e152 --- /dev/null +++ b/gradients/Project.toml @@ -0,0 +1,14 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" +MixedModelsDatasets = "7e9fb7ac-9f67-43bf-b2c8-96ba0796cbb6" +NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" + +[sources] +MixedModels = {path = ".."} diff --git a/gradients/bibliography.bib b/gradients/bibliography.bib new file mode 100644 index 000000000..decf33365 --- /dev/null +++ b/gradients/bibliography.bib @@ -0,0 +1,44 @@ +@article{Zhou03042019, + author = {Hua Zhou and Liuyi Hu and Jin Zhou and Kenneth Lange}, + title = {MM Algorithms for Variance Components Models}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {28}, + number = {2}, + pages = {350--361}, + year = {2019}, + publisher = {ASA Website}, + doi = {10.1080/10618600.2018.1529601}, + note ={PMID: 31592195}, + URL = {https://doi.org/10.1080/10618600.2018.1529601}, + eprint = {https://doi.org/10.1080/10618600.2018.1529601} +} + +@Article{bates.maechler.etal:2015, + author = {Bates, Douglas and Maechler, Martin and Bolker, Benjamin M. 
and Walker, Steven},
+  title = {Fitting Linear Mixed-Effects Models using lme4},
+  doi = {10.18637/jss.v067.i01},
+  number = {1},
+  pages = {1--48},
+  volume = {67},
+  journal = {Journal of Statistical Software},
+  year = {2015},
+}
+
+@article{bates2025mixed,
+  title = {Mixed-model Log-likelihood Evaluation via a Blocked Cholesky Factorization},
+  author = {Bates, Douglas and Alday, Phillip M. and Kokandakar, Ajinkya H.},
+  journal = {arXiv preprint arXiv:2505.11674},
+  year = {2025},
+  url = {https://arxiv.org/pdf/2505.11674}
+}
+
+@article{murray2016differentiation,
+  title = {Differentiation of the Cholesky decomposition},
+  author = {Murray, Iain},
+  journal = {arXiv preprint arXiv:1602.07527},
+  year = {2016},
+  url = {https://arxiv.org/pdf/1602.07527}
+}