8 changes: 4 additions & 4 deletions Project.toml
@@ -37,16 +37,16 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
[compat]
AlgebraOfGraphics = "0.6"
Arrow = "2"
CairoMakie = "0.8"
CSV = "0.10"
CairoMakie = "0.8,0.9"
CategoricalArrays = "0.10"
Chain = "0.5"
CSV = "0.10"
DataAPI = "1.9"
DataFrameMacros = "0.2"
DataFrameMacros = "0.3,0.4"
DataFrames = "1.3"
Effects = "0.1.7"
GLM = "1.7"
MKL = "0.5"
MKL = "0.5,0.6"
MixedModels = "4.6"
MixedModelsMakie = "0.3.13"
ProgressMeter = "1.7"
Binary file removed data/sizespeed.arrow
Binary file not shown.
22 changes: 0 additions & 22 deletions data/sizespeed.csv

This file was deleted.

Binary file added data/sizespeedrfp.arrow
Binary file not shown.
22 changes: 22 additions & 0 deletions data/sizespeedrfp.csv
@@ -0,0 +1,22 @@
mc,uc,nratings,nusers,nmvie,modelsz,L22sz,nv,fittime,evtime
1,5,27716748,266980,53883,13.528159,10.816126,20,11194.346,472.15826
1,10,27565774,243658,53852,13.499616,10.803683,23,12405.15,463.4751
1,20,26544038,174605,53781,13.3646345,10.775215,26,13327.93,469.11862
2,5,27706599,266980,43734,9.835946,7.125387,20,7190.8804,304.24103
2,10,27555656,243658,43734,9.819849,7.125387,23,8030.3833,305.5716
2,20,26533987,174605,43730,9.712043,7.1240835,28,8777.485,286.3375
5,5,27671580,266979,30824,6.2453322,3.539584,23,4049.6167,181.60448
5,10,27520713,243658,30824,6.229243,3.539584,25,3913.8855,136.16142
5,20,26499391,174605,30823,6.122543,3.5393543,18,2949.3943,142.71194
10,5,27624556,266979,23716,4.7962227,2.095373,23,2465.8628,95.06677
10,10,27473781,243658,23716,4.7801423,2.095373,27,2860.669,95.515144
10,20,26452968,174605,23716,4.6737213,2.095373,18,2043.9052,97.16636
15,5,27585626,266979,20400,4.2469144,1.5503929,24,2141.2239,83.87505
15,10,27434948,243658,20400,4.2308435,1.5503929,28,2532.9507,87.11419
15,20,26414555,174605,20400,4.124463,1.5503929,19,1571.836,72.48678
20,5,27551352,266978,18366,3.949749,1.2566459,19,1563.819,80.66754
20,10,27400762,243658,18366,3.9336874,1.2566459,33,2825.257,89.89166
20,20,26380766,174605,18366,3.8273454,1.2566459,21,1774.1753,69.60442
50,5,27394411,266974,13360,3.3426354,0.6649754,26,1665.6656,71.67412
50,10,27244313,243656,13360,3.3266208,0.6649754,30,1937.7285,65.354385
50,20,26226280,174604,13360,3.220469,0.6649754,20,1272.8384,59.05939
Binary file added data/sizespeedsquare.arrow
Binary file not shown.
22 changes: 22 additions & 0 deletions data/sizespeedsquare.csv
@@ -0,0 +1,22 @@
mc,uc,nratings,nusers,nmvie,modelsz,L22sz,nv,fittime,evtime
1,5,27716748,266980,53883,24.343884,21.63185,20,7117.5103,292.15567
1,10,27565774,243658,53852,24.302898,21.606966,23,7114.0396,287.0661
1,20,26544038,174605,53781,24.139448,21.55003,27,8357.902,287.1031
2,5,27706599,266980,43734,16.961006,14.250448,20,4757.1997,232.81114
2,10,27555656,243658,43734,16.94491,14.250448,23,5609.2354,194.78448
2,20,26533987,174605,43730,16.8358,14.247842,28,5492.3843,175.88655
5,5,27671580,266979,30824,9.784686,7.078938,23,2492.2966,100.09162
5,10,27520713,243658,30824,9.768599,7.078938,44,5480.9663,128.91615
5,20,26499391,174605,30823,9.661668,7.078479,18,2090.2693,103.75671
10,5,27624556,266979,23716,6.891419,4.190569,23,1836.2778,75.29963
10,10,27473781,243658,23716,6.875338,4.190569,23,1756.509,68.03782
10,20,26452968,174605,23716,6.768917,4.190569,18,1411.8765,72.54461
15,5,27585626,266979,20400,5.7971554,3.1006336,23,1518.7816,62.48056
15,10,27434948,243658,20400,5.781084,3.1006336,24,1634.6731,64.484474
15,20,26414555,174605,20400,5.674704,3.1006336,20,1299.0247,55.43926
20,5,27551352,266978,18366,5.2062583,2.513155,19,1177.3881,58.52489
20,10,27400762,243658,18366,5.1901965,2.513155,29,1877.9796,67.05502
20,20,26380766,174605,18366,5.083854,2.513155,21,1417.6132,66.36447
50,5,27394411,266974,13360,4.007511,1.3298512,26,1401.7651,53.847027
50,10,27244313,243656,13360,3.9914968,1.3298512,31,1684.2335,54.81318
50,20,26226280,174604,13360,3.8853447,1.3298512,20,1104.7524,54.073765
170 changes: 117 additions & 53 deletions largescaleobserved.qmd
@@ -16,23 +16,14 @@ using CairoMakie
using CategoricalArrays
using DataFrameMacros
using DataFrames
using GLM
if contains(first(Sys.cpu_info()).model, "Intel")
using MKL
end
using MixedModels
using MixedModelsMakie
using ProgressMeter
using SparseArrays
using Statistics
using TypedTables

datadir(paths::AbstractString...) =
joinpath(@__DIR__, "data", paths...)
optsumdir(paths::AbstractString...) =
joinpath(@__DIR__, "optsums", paths...)
datadir(paths::AbstractString...) = joinpath(@__DIR__, "data", paths...)
optsumdir(paths::AbstractString...) = joinpath(@__DIR__, "optsums", paths...)
CairoMakie.activate!(; type="svg");
ProgressMeter.ijulia_behavior(:clear);
```

In the previous chapter we explored and fit models to data from a large-scale designed experiment.
Expand Down Expand Up @@ -82,22 +73,31 @@ ratings = select!(DataFrame(ratings), Not(:timestamp));
Information on the movies is available as

```{julia}
movies = disallowmissing!(
leftjoin!(
combine(groupby(ratings, :movieId), :rating => mean => :mnrtng),
DataFrame(Arrow.Table(datadir("movies.arrow")));
on=:movieId,
);
error=false,
movies = sort!(
disallowmissing!(
leftjoin!(
combine(groupby(ratings, :movieId), :rating => mean => :mnrtng),
DataFrame(Arrow.Table(datadir("movies.arrow")));
on=:movieId,
);
error=false,
),
:nrtngs;
rev=true,
)
```

In contrast to data from a designed experiment, like the English Lexicon Project, the data from this observational study are extremely unbalanced with respect to the observational grouping factors, `userId` and `movieId`.
The `movies` table includes an `nrtngs` column that gives the number of ratings for each movie, which varies from 1 to nearly 100,000.
The `movies` table includes an `nrtngs` column that gives the number of ratings for each movie, which varies from nearly 100,000 down to a single rating.
We have sorted the movies by decreasing number of ratings to illustrate this highly skewed distribution and also because this ordering provides a slight speed boost when fitting a model.

```{julia}
extrema(movies.nrtngs)
```
:::{.callout-note collapse="true"}
### Why does this ordering give a speed boost?

The reasons that this ordering gives a speed boost are rather technical.
The most time-consuming part of evaluating the objective during optimization is "down-dating" a symmetric matrix, the `[2,2]` block of a blocked Cholesky factor `L`, from the `[2,1]` block, which is a sparse matrix.
The ordering of the movies determines both the row and column positions of each movie in the `[2,2]` block.
Having movies that are often rated by the same user in close proximity results in localization of memory access, which is an advantage on modern computer architectures.
:::
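
To make the down-dating step concrete, here is a toy sketch of the operation (a minimal illustration, not the `MixedModels.jl` internals; the sizes, sparsity, and block contents are made up): a dense symmetric block is down-dated from a sparse off-diagonal block and then factored in place.

```{julia}
#| eval: false
using LinearAlgebra, SparseArrays

nusers, nmovies = 1_000, 200                # toy sizes, far smaller than the real data
L21 = 0.1 * sprandn(nmovies, nusers, 0.01)  # stand-in for the sparse [2,1] block
A22 = Matrix(4.0I, nmovies, nmovies)        # stand-in for the symmetric [2,2] block
L22 = A22 - Matrix(L21 * L21')              # the "down-dating" step
cholesky!(Symmetric(L22, :L))               # factor the down-dated block in place
```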

The number of ratings per user is also highly skewed

Expand Down Expand Up @@ -221,7 +221,7 @@ Similarly, users who rate very few movies add little information, even about the
One way of dealing with the extreme imbalance in the number of observations per user or per movie is to set a threshold on the number of observations for a user or a movie to be included in the data used to fit the model.
For example, a companion data set on grouplens.org, available in the [ml-25m.zip](https://files.grouplens.org/datasets/movielens/ml-25m.zip) archive, included only users who had rated at least 20 movies.

To be able to select ratings on according to the number of ratings per user and the number of ratings per movie, we left-joined the `movies.nrtngs` and `users.urtngs` columns into the `ratings` data frame.
To be able to select ratings according to the number of ratings per user and the number of ratings per movie, we left-joined the `movies.nrtngs` and `users.urtngs` columns into the `ratings` data frame.
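
As an illustration of this kind of selection (a sketch that assumes only the joined `nrtngs` and `urtngs` columns; it is not necessarily the code used to produce the tables that follow):

```{julia}
#| eval: false
mc, uc = 20, 20   # movie cutoff and user cutoff
trimmed = filter([:nrtngs, :urtngs] => (n, u) -> n ≥ mc && u ≥ uc, ratings)
```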

```{julia}
describe(ratings)
@@ -243,7 +243,7 @@ The results are summarized in the following table

```{julia}
#| code-fold: true
sizespeed = Table(Arrow.Table("./data/sizespeed.arrow"))
sizespeed = Table(Arrow.Table("./data/sizespeedsquare.arrow"))
```

In this table, `mc` is the "movie cutoff" (i.e. the threshold on the number of ratings per movie); `uc` is the user cutoff (threshold on the number of ratings per user); `nratings`, `nusers` and `nmvie` are the number of ratings, users and movies in the resulting trimmed data set; `modelsz` is the size (in GiB) of the model fit; `L22sz` is the size of the [2,2] block of the `L` matrix in that model; `nv` is the number of function evaluations until convergence; `fittime` is the time (in seconds) required to fit the model; and `evtime` is the time (in seconds) per function evaluation.
Expand Down Expand Up @@ -345,9 +345,11 @@ mvm20u20 = ratingsoptsum(20, 20)
println(mvm20u20)
```

Creating the model representation and restoring the optimal parameter values can take a couple of minutes because the objective is evaluated twice --- at the initial parameter values and at the final parameter values --- during the call to `restoreoptsum!`.
Creating the model representation and restoring the optimal parameter values can take a few minutes because the objective is evaluated twice --- at the initial parameter values and at the final parameter values --- during the call to `restoreoptsum!`.


Each evaluation of the objective, which requires setting the value of the parameter $\bbtheta$ in the numerical representation of the model, updating the blocked Cholesky factor, $\mathbf{L}$, and evaluating the scalar objective value from this factor, takes a little over a minute (71 seconds) on a server node and probably longer on a laptop.
As described in @sec-lmmtheory, the profiled log-likelihood of a linear mixed-effects model is evaluated in the `MixedModels.jl` package by setting the value of the parameter $\bbtheta$ in the numerical representation of the model, updating the blocked Cholesky factor, $\mathbf{L}$, and evaluating the scalar objective value from this factor.
For this model these steps take a little over a minute (66 seconds) on a server node and probably longer on a laptop.
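
As a sketch of what a single evaluation amounts to, expressed with what I believe are the package's current internal helpers `setθ!` and `updateL!` (not part of the documented API, and subject to change) together with the exported `objective`:

```{julia}
#| eval: false
θ = mvm20u20.θ                          # covariance parameters restored by restoreoptsum!
obj = MixedModels.objective(            # the objective (-2 profiled log-likelihood) from L
  MixedModels.updateL!(                 # update the blocked Cholesky factor L
    MixedModels.setθ!(mvm20u20, θ),     # install θ in the model representation
  ),
)
```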

The lower triangular `L` factor is large but sparse.
It is stored in six blocks of dimensions and types as shown in
@@ -433,12 +435,9 @@ end
@fig-memoryfootprint shows that when all the movies are included in the data to which the model is fit (i.e. `mc == 1`) the total memory footprint is over 20 GiB, and nearly 90% of that memory is that required for the `[2,2]` block of `L`.
Even when requiring a minimum of 50 ratings per movie, the `[2,2]` block of `L` is over 30% of the memory footprint.
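
These fractions can be checked directly against the `uc == 5` rows of the square-storage table:

```{julia}
#| eval: false
21.63185 / 24.343884   # L22sz / modelsz for mc == 1: ≈ 0.89
1.3298512 / 4.007511   # L22sz / modelsz for mc == 50: ≈ 0.33
```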

In a sense this is good news because the amount of storage required for the `[2,2]` block can be nearly cut in half by taking advantage of the fact that it is a triangular matrix.
The [rectangular full packed format](https://netlib.org/lapack/lawnspdf/lawn199.pdf) looks especially promising for this purpose.

In general, for models with scalar random effects for two incompletely crossed grouping factors, the memory footprint depends strongly on the smaller of the number of levels of the grouping factors, less strongly on the larger number, and almost not at all on the number of observations.

### Speed of log-likelihood evaluation
### Speed of log-likelihood evaluation {#sec-largeobslleval}

The time required to fit a model to large data sets is dominated by the time required to evaluate the log-likelihood during the optimization of the parameter estimates.
The time for one evaluation is given in the `evtime` column of `sizespeed`.
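
Because the evaluations dominate, the total time to fit is roughly the number of evaluations times the time per evaluation. For example, for the `mc == 20`, `uc == 20` row of the square-storage table:

```{julia}
#| eval: false
21 * 66.36447   # nv * evtime ≈ 1394 s, close to the recorded fittime of about 1418 s
```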
@@ -480,38 +479,103 @@ end
@fig-timetoconverge shows that the average evaluation time for the log-likelihood function depends strongly on the number of movies and less strongly on the number of users.

However, the middle panel shows that the number of iterations to convergence is highly variable.
Most of these models required between 20 and 25 evaluations but some required almost 50 evaluations.
Most of these models required between 20 and 25 evaluations but some required nearly 40 evaluations.

## Rectangular storage of L[2,2] {#sec-RFPstorage}

The derivation of the log-likelihood for linear mixed-effects models is given in @sec-lmmtheory, which provides a rather remarkable result:
the profiled log-likelihood for a linear mixed-effects model can be evaluated from the Cholesky factor of a blocked, positive-definite symmetric matrix.
@fig-memoryfootprint shows that the amount of storage required for `L[2,2]` is a substantial fraction of the total storage required for the model.
In a sense this is good news because the amount of storage required for `L[2,2]` can be cut roughly in half by taking advantage of the fact that it is a triangular matrix.
When the matrix is stored as a square two-dimensional array, storage for $n^2$ floating point numbers is allocated, even though the matrix is determined by only $n(n+1)/2$ values.

There are two blocked matrices, `A` and `L`, stored in the model representation and, for large models such as we are considering these are the largest fields in
## Model size and speed for different thresholds
Switching to the [rectangular full packed format](https://netlib.org/lapack/lawnspdf/lawn199.pdf) for `L[2,2]` saves nearly 50% of the storage size at the cost of a modest increase in the evaluation time for the objective.
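
A back-of-the-envelope check of those storage sizes for the `mc == 1` case, in which there are 53,883 movies (the results agree with the `L22sz` columns of the two tables):

```{julia}
#| eval: false
n = 53_883                                 # number of movies when mc == 1
square_GiB = 8 * n^2 / 2^30                # full square array of Float64: ≈ 21.6 GiB
packed_GiB = 8 * (n * (n + 1) ÷ 2) / 2^30  # lower triangle only (RFP): ≈ 10.8 GiB
```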

```{julia}
#| code-fold: true
#| fig-cap: "Amount of storage for L[2,2] versus amount of storage for the entire model. Colors are determined by the minimum number of ratings per user in the data to which the model was fit."
#| label: fig-L22vsmodelsz
draw(
data(sizespeed) *
mapping(
:L22sz => "Size (GiB) of 2,2 block of L",
:modelsz => "Size (GiB) of model representation";
color = :uc,
) *
visual(Scatter)
)
sizespeedrfp = Table(Arrow.Table("./data/sizespeedrfp.arrow"))
```

A linear regression of `modelsz` on the number of users and `L22sz`

```{julia}
sizemod = lm(@formula(Float64(modelsz) ~ 1 + nusers + L22sz), sizespeed)
coeftable(sizemod)
#| label: augment-sizespeed-df
#| echo: false
#| output: false
sizespeedrfp = let
ord(x) = categorical(x; ordered=true)
transform!(
DataFrame(sizespeedrfp),
:mc => ord => :mc,
:uc => ord => :uc,
[:L22sz, :modelsz] => ((x, y) -> x ./ y) => :L22prop,
)
end
```

provides an $r^2$ value very close to one, indicating an almost perfect fit.
We reproduce @fig-memoryfootprint and @fig-timetoconverge for the RFP storage scheme as @fig-memoryfootprintrfp and @fig-timetoconvergerfp.

```{julia}
r²(sizemod)
#| fig-cap: "Bar plot of memory footprint of the model representation using RFP, by minimum number of ratings per user and per movie."
#| code-fold: true
#| label: fig-memoryfootprintrfp
let
f = Figure(resolution=(800, 1000))
xlabel = "Minimum number of ratings per movie"
mc = refarray(sizespeedrfp.mc)
xticks = (1:7, string.(refpool(sizespeedrfp.mc)))
uc = refarray(sizespeedrfp.uc)
Legend(
f[1,1], lelements, llabels, ltitle;
orientation=:horizontal, tellwidth=false, tellheight=true,
)
barplot!(
Axis(f[2,1]; xticks, ylabel="Memory requirement (GiB)",),
mc, sizespeedrfp.modelsz; xticks, dodge=uc, color=uc,
)
barplot!(
Axis(f[3,1]; xticks, ylabel="Size of L[2,2] (GiB)",),
mc, sizespeedrfp.L22sz; xticks, dodge=uc, color=uc,
)
barplot!(
Axis(f[4,1]; xlabel, xticks, ylabel="L[2,2] memory fraction",),
mc, sizespeedrfp.L22prop; xticks, dodge=uc, color=uc,
)
f
end
```

```{julia}
#| fig-cap: "Bar plot of function log-likelihood evaluation time using RFP, by minimum number of ratings per user and per movie using RFP storage"
#| code-fold: true
#| label: fig-timetoconvergerfp
let
f = Figure(resolution=(800, 1000))
xlabel = "Minimum number of ratings per movie"
mc = refarray(sizespeedrfp.mc)
xticks = (1:7, string.(refpool(sizespeedrfp.mc)))
uc = refarray(sizespeedrfp.uc)
Legend(
f[1,1], lelements, llabels, ltitle;
orientation=:horizontal, tellwidth=false, tellheight=true,
)
barplot!(
Axis(f[2,1]; xticks, ylabel="Log-likelihood evaluation time (m)",),
mc, sizespeedrfp.evtime ./ 60; xticks, dodge=uc, color=uc,
)
barplot!(
Axis(f[3,1]; xticks, ylabel="Number of evaluations to convergence",),
mc, sizespeedrfp.nv; xticks, dodge=uc, color=uc,
)
barplot!(
Axis(f[4,1]; xlabel, xticks, ylabel="Time (hr) to fit the model",),
mc, sizespeedrfp.fittime ./ 3600; xticks, dodge=uc, color=uc,
)
f
end
```

The patterns in @fig-memoryfootprintrfp are similar to those in @fig-memoryfootprint but the vertical axes are quite different.
In the most extreme case of `mc == 1` the square storage of `L[2,2]` requires over 20 GiB but the rectangular storage requires just over 10 GiB.
This can be the difference between being able to fit the model and not being able to fit it.

The time to evaluate the log-likelihood once goes from about 5 minutes for the square storage to nearly 8 minutes for the rectangular storage on the `mc == 1` models.
Because the number of evaluations to convergence stays essentially the same on this set of models, the time to fit the model goes from roughly 2 hours (square storage) to between 3 and 4 hours (rectangular storage).

As the threshold on the number of ratings for inclusion of a movie increases, the relative change in storage sizes decreases, as does the relative change in time to fit the model.
Member comment: Is this because the [2,1] block is a smaller fraction of the total memory use?