8 changes: 4 additions & 4 deletions Project.toml
@@ -37,16 +37,16 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
[compat]
AlgebraOfGraphics = "0.6"
Arrow = "2"
CairoMakie = "0.8"
CSV = "0.10"
CairoMakie = "0.8,0.9"
CategoricalArrays = "0.10"
Chain = "0.5"
CSV = "0.10"
DataAPI = "1.9"
DataFrameMacros = "0.2"
DataFrameMacros = "0.3,0.4"
DataFrames = "1.3"
Effects = "0.1.7"
GLM = "1.7"
MKL = "0.5"
MKL = "0.5,0.6"
MixedModels = "4.6"
MixedModelsMakie = "0.3.13"
ProgressMeter = "1.7"
Binary file removed data/sizespeed.arrow
Binary file not shown.
22 changes: 0 additions & 22 deletions data/sizespeed.csv

This file was deleted.

Binary file added data/sizespeedrfp.arrow
Binary file not shown.
22 changes: 22 additions & 0 deletions data/sizespeedrfp.csv
@@ -0,0 +1,22 @@
mc,uc,nratings,nusers,nmvie,modelsz,L22sz,nv,fittime,evtime
1,5,27716748,266980,53883,13.528159,10.816126,20,11194.346,472.15826
1,10,27565774,243658,53852,13.499616,10.803683,23,12405.15,463.4751
1,20,26544038,174605,53781,13.3646345,10.775215,26,13327.93,469.11862
2,5,27706599,266980,43734,9.835946,7.125387,20,7190.8804,304.24103
2,10,27555656,243658,43734,9.819849,7.125387,23,8030.3833,305.5716
2,20,26533987,174605,43730,9.712043,7.1240835,28,8777.485,286.3375
5,5,27671580,266979,30824,6.2453322,3.539584,23,4049.6167,181.60448
5,10,27520713,243658,30824,6.229243,3.539584,25,3913.8855,136.16142
5,20,26499391,174605,30823,6.122543,3.5393543,18,2949.3943,142.71194
10,5,27624556,266979,23716,4.7962227,2.095373,23,2465.8628,95.06677
10,10,27473781,243658,23716,4.7801423,2.095373,27,2860.669,95.515144
10,20,26452968,174605,23716,4.6737213,2.095373,18,2043.9052,97.16636
15,5,27585626,266979,20400,4.2469144,1.5503929,24,2141.2239,83.87505
15,10,27434948,243658,20400,4.2308435,1.5503929,28,2532.9507,87.11419
15,20,26414555,174605,20400,4.124463,1.5503929,19,1571.836,72.48678
20,5,27551352,266978,18366,3.949749,1.2566459,19,1563.819,80.66754
20,10,27400762,243658,18366,3.9336874,1.2566459,33,2825.257,89.89166
20,20,26380766,174605,18366,3.8273454,1.2566459,21,1774.1753,69.60442
50,5,27394411,266974,13360,3.3426354,0.6649754,26,1665.6656,71.67412
50,10,27244313,243656,13360,3.3266208,0.6649754,30,1937.7285,65.354385
50,20,26226280,174604,13360,3.220469,0.6649754,20,1272.8384,59.05939
Binary file added data/sizespeedsquare.arrow
Binary file not shown.
22 changes: 22 additions & 0 deletions data/sizespeedsquare.csv
@@ -0,0 +1,22 @@
mc,uc,nratings,nusers,nmvie,modelsz,L22sz,nv,fittime,evtime
1,5,27716748,266980,53883,24.343884,21.63185,20,7117.5103,292.15567
1,10,27565774,243658,53852,24.302898,21.606966,23,7114.0396,287.0661
1,20,26544038,174605,53781,24.139448,21.55003,27,8357.902,287.1031
2,5,27706599,266980,43734,16.961006,14.250448,20,4757.1997,232.81114
2,10,27555656,243658,43734,16.94491,14.250448,23,5609.2354,194.78448
2,20,26533987,174605,43730,16.8358,14.247842,28,5492.3843,175.88655
5,5,27671580,266979,30824,9.784686,7.078938,23,2492.2966,100.09162
5,10,27520713,243658,30824,9.768599,7.078938,44,5480.9663,128.91615
5,20,26499391,174605,30823,9.661668,7.078479,18,2090.2693,103.75671
10,5,27624556,266979,23716,6.891419,4.190569,23,1836.2778,75.29963
10,10,27473781,243658,23716,6.875338,4.190569,23,1756.509,68.03782
10,20,26452968,174605,23716,6.768917,4.190569,18,1411.8765,72.54461
15,5,27585626,266979,20400,5.7971554,3.1006336,23,1518.7816,62.48056
15,10,27434948,243658,20400,5.781084,3.1006336,24,1634.6731,64.484474
15,20,26414555,174605,20400,5.674704,3.1006336,20,1299.0247,55.43926
20,5,27551352,266978,18366,5.2062583,2.513155,19,1177.3881,58.52489
20,10,27400762,243658,18366,5.1901965,2.513155,29,1877.9796,67.05502
20,20,26380766,174605,18366,5.083854,2.513155,21,1417.6132,66.36447
50,5,27394411,266974,13360,4.007511,1.3298512,26,1401.7651,53.847027
50,10,27244313,243656,13360,3.9914968,1.3298512,31,1684.2335,54.81318
50,20,26226280,174604,13360,3.8853447,1.3298512,20,1104.7524,54.073765
170 changes: 117 additions & 53 deletions largescaleobserved.qmd
@@ -16,23 +16,14 @@ using CairoMakie
using CategoricalArrays
using DataFrameMacros
using DataFrames
using GLM
if contains(first(Sys.cpu_info()).model, "Intel")
using MKL
end
using MixedModels
using MixedModelsMakie
using ProgressMeter
using SparseArrays
using Statistics
using TypedTables

datadir(paths::AbstractString...) =
joinpath(@__DIR__, "data", paths...)
optsumdir(paths::AbstractString...) =
joinpath(@__DIR__, "optsums", paths...)
datadir(paths::AbstractString...) = joinpath(@__DIR__, "data", paths...)
optsumdir(paths::AbstractString...) = joinpath(@__DIR__, "optsums", paths...)
CairoMakie.activate!(; type="svg");
ProgressMeter.ijulia_behavior(:clear);
```

In the previous chapter we explored and fit models to data from a large-scale designed experiment.
Expand Down Expand Up @@ -82,22 +73,31 @@ ratings = select!(DataFrame(ratings), Not(:timestamp));
Information on the movies is available as

```{julia}
movies = disallowmissing!(
leftjoin!(
combine(groupby(ratings, :movieId), :rating => mean => :mnrtng),
DataFrame(Arrow.Table(datadir("movies.arrow")));
on=:movieId,
);
error=false,
movies = sort!(
disallowmissing!(
leftjoin!(
combine(groupby(ratings, :movieId), :rating => mean => :mnrtng),
DataFrame(Arrow.Table(datadir("movies.arrow")));
on=:movieId,
);
error=false,
),
:nrtngs;
rev=true,
)
```

In contrast to data from a designed experiment, like the English Lexicon Project, the data from this observational study are extremely unbalanced with respect to the observational grouping factors, `userId` and `movieId`.
The `movies` table includes an `nrtngs` column that gives the number of ratings for each movie, which varies from 1 to nearly 100,000.
The `movies` table includes an `nrtngs` column that gives the number of ratings for each movie, which varies from nearly 100,000 down to a single rating.
We have sorted the movies by decreasing number of ratings to illustrate this highly skewed distribution and also because this ordering provides a slight speed boost when fitting a model.

```{julia}
extrema(movies.nrtngs)
```
:::{.callout-note collapse="true"}
### Why does this ordering give a speed boost?

The reasons that this ordering gives a speed boost are rather technical.
The most time-consuming part of evaluating the objective during optimization is "down-dating" a symmetric matrix, the `[2,2]` block of a blocked Cholesky factor `L`, from the `[2,1]` block, which is a sparse matrix.
The ordering of the movies determines both the row and column positions of each movie in the `[2,2]` block.
Having movies that are often rated by the same user in close proximity results in localization of memory access, which is an advantage on modern computer architectures.
:::
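
To make the down-dating step concrete, here is a toy sketch of the operation (a minimal illustration, not the `MixedModels.jl` internals; the sizes, sparsity, and block contents are made up): a dense symmetric block is down-dated from a sparse off-diagonal block and then factored in place.

```{julia}
#| eval: false
using LinearAlgebra, SparseArrays

nusers, nmovies = 1_000, 200                # toy sizes, far smaller than the real data
L21 = 0.1 * sprandn(nmovies, nusers, 0.01)  # stand-in for the sparse [2,1] block
A22 = Matrix(4.0I, nmovies, nmovies)        # stand-in for the symmetric [2,2] block
L22 = A22 - Matrix(L21 * L21')              # the "down-dating" step
cholesky!(Symmetric(L22, :L))               # factor the down-dated block in place
```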

The number of ratings per user is also highly skewed

Expand Down Expand Up @@ -221,7 +221,7 @@ Similarly, users who rate very few movies add little information, even about the
One way of dealing with the extreme imbalance in the number of observations per user or per movie is to set a threshold on the number of observations for a user or a movie to be included in the data used to fit the model.
For example, a companion data set on grouplens.org, available in the [ml-25m.zip](https://files.grouplens.org/datasets/movielens/ml-25m.zip) archive, included only users who had rated at least 20 movies.

To be able to select ratings on according to the number of ratings per user and the number of ratings per movie, we left-joined the `movies.nrtngs` and `users.urtngs` columns into the `ratings` data frame.
To be able to select ratings according to the number of ratings per user and the number of ratings per movie, we left-joined the `movies.nrtngs` and `users.urtngs` columns into the `ratings` data frame.
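
As an illustration of this kind of selection (a sketch that assumes only the joined `nrtngs` and `urtngs` columns; it is not necessarily the code used to produce the tables that follow):

```{julia}
#| eval: false
mc, uc = 20, 20   # movie cutoff and user cutoff
trimmed = filter([:nrtngs, :urtngs] => (n, u) -> n ≥ mc && u ≥ uc, ratings)
```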

```{julia}
describe(ratings)
@@ -243,7 +243,7 @@ The results are summarized in the following table

```{julia}
#| code-fold: true
sizespeed = Table(Arrow.Table("./data/sizespeed.arrow"))
sizespeed = Table(Arrow.Table("./data/sizespeedsquare.arrow"))
```

In this table, `mc` is the "movie cutoff" (i.e. the threshold on the number of ratings per movie); `uc` is the user cutoff (threshold on the number of ratings per user); `nratings`, `nusers` and `nmvie` are the number of ratings, users and movies in the resulting trimmed data set; `modelsz` is the size (in GiB) of the model fit; `L22sz` is the size of the [2,2] block of the `L` matrix in that model; `nv` is the number of function evaluations until convergence; `fittime` is the time (in seconds) required to fit the model; and `evtime` is the time (in seconds) per function evaluation.
Expand Down Expand Up @@ -345,9 +345,11 @@ mvm20u20 = ratingsoptsum(20, 20)
println(mvm20u20)
```

Creating the model representation and restoring the optimal parameter values can take a couple of minutes because the objective is evaluated twice --- at the initial parameter values and at the final parameter values --- during the call to `restoreoptsum!`.
Creating the model representation and restoring the optimal parameter values can take a few minutes because the objective is evaluated twice --- at the initial parameter values and at the final parameter values --- during the call to `restoreoptsum!`.


Each evaluation of the objective, which requires setting the value of the parameter $\bbtheta$ in the numerical representation of the model, updating the blocked Cholesky factor, $\mathbf{L}$, and evaluating the scalar objective value from this factor, takes a little over a minute (71 seconds) on a server node and probably longer on a laptop.
As described in @sec-lmmtheory, the profiled log-likelihood of a linear mixed-effects model is evaluated in the `MixedModels.jl` package by setting the value of the parameter $\bbtheta$ in the numerical representation of the model, updating the blocked Cholesky factor, $\mathbf{L}$, and evaluating the scalar objective value from this factor.
For this model these steps take a little over a minute (66 seconds) on a server node and probably longer on a laptop.
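
As a sketch of what a single evaluation amounts to, expressed with what I believe are the package's current internal helpers `setθ!` and `updateL!` (not part of the documented API, and subject to change) together with the exported `objective`:

```{julia}
#| eval: false
θ = mvm20u20.θ                          # covariance parameters restored by restoreoptsum!
obj = MixedModels.objective(            # the objective (-2 profiled log-likelihood) from L
  MixedModels.updateL!(                 # update the blocked Cholesky factor L
    MixedModels.setθ!(mvm20u20, θ),     # install θ in the model representation
  ),
)
```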

The lower triangular `L` factor is large but sparse.
It is stored in six blocks of dimensions and types as shown in
@@ -433,12 +435,9 @@ end
@fig-memoryfootprint shows that when all the movies are included in the data to which the model is fit (i.e. `mc == 1`) the total memory footprint is over 20 GiB, and nearly 90% of that memory is that required for the `[2,2]` block of `L`.
Even when requiring a minimum of 50 ratings per movie, the `[2,2]` block of `L` is over 30% of the memory footprint.
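
These fractions can be checked directly against the `uc == 5` rows of the square-storage table:

```{julia}
#| eval: false
21.63185 / 24.343884   # L22sz / modelsz for mc == 1: ≈ 0.89
1.3298512 / 4.007511   # L22sz / modelsz for mc == 50: ≈ 0.33
```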

In a sense this is good news because the amount of storage required for the `[2,2]` block can be nearly cut in half by taking advantage of the fact that it is a triangular matrix.
The [rectangular full packed format](https://netlib.org/lapack/lawnspdf/lawn199.pdf) looks especially promising for this purpose.

In general, for models with scalar random effects for two incompletely crossed grouping factors, the memory footprint depends strongly on the smaller of the number of levels of the grouping factors, less strongly on the larger number, and almost not at all on the number of observations.

### Speed of log-likelihood evaluation
### Speed of log-likelihood evaluation {#sec-largeobslleval}

The time required to fit a model to large data sets is dominated by the time required to evaluate the log-likelihood during the optimization of the parameter estimates.
The time for one evaluation is given in the `evtime` column of `sizespeed`.
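
Because the evaluations dominate, the total time to fit is roughly the number of evaluations times the time per evaluation. For example, for the `mc == 20`, `uc == 20` row of the square-storage table:

```{julia}
#| eval: false
21 * 66.36447   # nv * evtime ≈ 1394 s, close to the recorded fittime of about 1418 s
```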
@@ -480,38 +479,103 @@ end
@fig-timetoconverge shows that the average evaluation time for the log-likelihood function depends strongly on the number of movies and less strongly on the number of users.

However, the middle panel shows that the number of iterations to convergence is highly variable.
Most of these models required between 20 and 25 evaluations but some required almost 50 evaluations.
Most of these models required between 20 and 25 evaluations but some required nearly 40 evaluations.

## Rectangular storage of L[2,2] {#sec-RFPstorage}

The derivation of the log-likelihood for linear mixed-effects models is given in @sec-lmmtheory, which provides a rather remarkable result:
the profiled log-likelihood for a linear mixed-effects model can be evaluated from the Cholesky factor of a blocked, positive-definite symmetric matrix.
@fig-memoryfootprint shows that the amount of storage required for `L[2,2]` is a substantial fraction of the total storage required for the model.
In a sense this is good news because the amount of storage required for `L[2,2]` can be cut roughly in half by taking advantage of the fact that it is a triangular matrix.
When the matrix is stored as a square two-dimensional array, storage for $n^2$ floating point numbers is allocated, even though the matrix is determined by only $n(n+1)/2$ values.

There are two blocked matrices, `A` and `L`, stored in the model representation and, for large models such as we are considering these are the largest fields in
## Model size and speed for different thresholds
Switching to the [rectangular full packed format](https://netlib.org/lapack/lawnspdf/lawn199.pdf) for `L[2,2]` saves nearly 50% of the storage size at the cost of a modest increase in the evaluation time for the objective.
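
A back-of-the-envelope check of those storage sizes for the `mc == 1` case, in which there are 53,883 movies (the results agree with the `L22sz` columns of the two tables):

```{julia}
#| eval: false
n = 53_883                                 # number of movies when mc == 1
square_GiB = 8 * n^2 / 2^30                # full square array of Float64: ≈ 21.6 GiB
packed_GiB = 8 * (n * (n + 1) ÷ 2) / 2^30  # lower triangle only (RFP): ≈ 10.8 GiB
```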

```{julia}
#| code-fold: true
#| fig-cap: "Amount of storage for L[2,2] versus amount of storage for the entire model. Colors are determined by the minimum number of ratings per user in the data to which the model was fit."
#| label: fig-L22vsmodelsz
draw(
data(sizespeed) *
mapping(
:L22sz => "Size (GiB) of 2,2 block of L",
:modelsz => "Size (GiB) of model representation";
color = :uc,
) *
visual(Scatter)
)
sizespeedrfp = Table(Arrow.Table("./data/sizespeedrfp.arrow"))
```

A linear regression of `modelsz` on the number of users and `L22sz`

```{julia}
sizemod = lm(@formula(Float64(modelsz) ~ 1 + nusers + L22sz), sizespeed)
coeftable(sizemod)
#| label: augment-sizespeed-df
#| echo: false
#| output: false
sizespeedrfp = let
ord(x) = categorical(x; ordered=true)
transform!(
DataFrame(sizespeedrfp),
:mc => ord => :mc,
:uc => ord => :uc,
[:L22sz, :modelsz] => ((x, y) -> x ./ y) => :L22prop,
)
end
```

provides an $r^2$ value very close to one, indicating an almost perfect fit.
We reproduce @fig-memoryfootprint and @fig-timetoconverge for the RFP storage scheme as @fig-memoryfootprintrfp and @fig-timetoconvergerfp.

```{julia}
r²(sizemod)
#| fig-cap: "Bar plot of memory footprint of the model representation using RFP, by minimum number of ratings per user and per movie."
#| code-fold: true
#| label: fig-memoryfootprintrfp
let
f = Figure(resolution=(800, 1000))
xlabel = "Minimum number of ratings per movie"
mc = refarray(sizespeedrfp.mc)
xticks = (1:7, string.(refpool(sizespeedrfp.mc)))
uc = refarray(sizespeedrfp.uc)
Legend(
f[1,1], lelements, llabels, ltitle;
orientation=:horizontal, tellwidth=false, tellheight=true,
)
barplot!(
Axis(f[2,1]; xticks, ylabel="Memory requirement (GiB)",),
mc, sizespeedrfp.modelsz; xticks, dodge=uc, color=uc,
)
barplot!(
Axis(f[3,1]; xticks, ylabel="Size of L[2,2] (GiB)",),
mc, sizespeedrfp.L22sz; xticks, dodge=uc, color=uc,
)
barplot!(
Axis(f[4,1]; xlabel, xticks, ylabel="L[2,2] memory fraction",),
mc, sizespeedrfp.L22prop; xticks, dodge=uc, color=uc,
)
f
end
```

```{julia}
#| fig-cap: "Bar plot of function log-likelihood evaluation time using RFP, by minimum number of ratings per user and per movie using RFP storage"
#| code-fold: true
#| label: fig-timetoconvergerfp
let
f = Figure(resolution=(800, 1000))
xlabel = "Minimum number of ratings per movie"
mc = refarray(sizespeedrfp.mc)
xticks = (1:7, string.(refpool(sizespeedrfp.mc)))
uc = refarray(sizespeedrfp.uc)
Legend(
f[1,1], lelements, llabels, ltitle;
orientation=:horizontal, tellwidth=false, tellheight=true,
)
barplot!(
Axis(f[2,1]; xticks, ylabel="Log-likelihood evaluation time (m)",),
mc, sizespeedrfp.evtime ./ 60; xticks, dodge=uc, color=uc,
)
barplot!(
Axis(f[3,1]; xticks, ylabel="Number of evaluations to convergence",),
mc, sizespeedrfp.nv; xticks, dodge=uc, color=uc,
)
barplot!(
Axis(f[4,1]; xlabel, xticks, ylabel="Time (hr) to fit the model",),
mc, sizespeedrfp.fittime ./ 3600; xticks, dodge=uc, color=uc,
)
f
end
```

The patterns in @fig-memoryfootprintrfp are similar to those in @fig-memoryfootprint but the vertical axes are quite different.
In the most extreme case of `mc == 1` the square storage of `L[2,2]` requires over 20 GiB but the rectangular storage requires just over 10 GiB.
This can be the difference between being able to fit the model and not being able to fit it.

The time to evaluate the log-likelihood once goes from about 5 minutes for the square storage to nearly 8 minutes for the rectangular storage on the `mc == 1` models.
Because the number of evaluations to convergence stays essentially the same on this set of models, the time to fit the model goes from roughly 2 hours (square storage) to between 3 and 4 hours (rectangular storage).

As the threshold on the number of ratings for inclusion of a movie increases, the relative change in storage sizes decreases, as does the relative change in time to fit the model.
Member comment: Is this because the [2,1] block is a smaller fraction of the total memory use?