diff --git a/Project.toml b/Project.toml index 5546fc9..626645f 100644 --- a/Project.toml +++ b/Project.toml @@ -37,7 +37,6 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -TidierPlots = "337ecbd1-5042-4e2a-ae6f-ca776f97570a" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" @@ -69,7 +68,6 @@ StatsAPI = "1" StatsBase = "0.33, 0.34" StatsModels = "0.7" Tables = "1" -TidierPlots = "0.11.1" TypedTables = "1" ZipFile = "0.10" julia = "1.10" diff --git a/largescaleobserved.qmd b/largescaleobserved.qmd index b030cf1..078f465 100644 --- a/largescaleobserved.qmd +++ b/largescaleobserved.qmd @@ -4,6 +4,8 @@ fig-height: 3 fig-dpi: 192 fig-format: png engine: julia +execute: + cache: true julia: exeflags: - --project @@ -28,7 +30,6 @@ using MixedModels using MixedModelsMakie using SparseArrays # for the nnz function using Statistics # for the mean function -using TidierPlots using TypedTables ``` @@ -269,10 +270,18 @@ As shown in @fig-nratingsbycutoff, the number of ratings varies from a little ov #| code-fold: true #| label: fig-nratingsbycutoff #| fig-cap: "Number of ratings in reduced table by movie cutoff value" -ggplot(sizespeed, aes(; x=:mc, y=:nratings, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(x="Minimum number of ratings per movie", y="Number of ratings") + +sizespeed.ucbak = sizespeed.uc +sizespeed.uc = nonnumeric.(sizespeed.uc) +draw( + data(sizespeed) * + mapping( + :mc => "Minimum number of ratings per movie (mc)", + :nratings => "Total number of ratings", + color=:uc, + ) * visual(ScatterLines); + figure=(; size=(600, 350)) +) ``` For this range of choices of cutoffs, the user cutoff has more impact on the number of ratings in the reduced dataset than does the movie cutoff. 
@@ -285,10 +294,16 @@ A glance at the table shows that the number of users, `nusers`, is essentially a #| code-fold: true #| label: fig-nusersbycutoff #| fig-cap: "Number of movies in reduced table by movie cutoff value" -ggplot(sizespeed, aes(; x=:mc, y=:nmvie, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(x="Minimum number of ratings per movie", y="Number of movies in table") + +draw( + data(sizespeed) * + mapping( + :mc => "Minimum number of ratings per movie (mc)", + :nmvie => "Number of movies in table", + color=:uc) * + visual(ScatterLines); + figure=(; size=(600, 350)) +) ``` ```{julia} @@ -400,10 +415,16 @@ The memory footprint of the model representation depends strongly on the number #| code-fold: true #| fig-cap: "Memory footprint of the model representation by minimum number of ratings per user and per movie." #| label: fig-memoryfootprint -ggplot(sizespeed, aes(x=:mc, y=:modelsz, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(x="Minimum number of ratings per movie", y="Size of model (GiB)") + +draw( + data(sizespeed) * + mapping( + :mc => "Minimum number of ratings per movie (mc)", + :modelsz => "Size of model object (GiB)", + color=:uc) * + visual(ScatterLines); + figure=(; size=(600, 350)) +) ``` @fig-memoryvsl22 shows the dominance of the `[2, 2]` block of `L` in the overall memory footprint of the model @@ -412,10 +433,18 @@ ggplot(sizespeed, aes(x=:mc, y=:modelsz, color=:ucutoff)) + #| code-fold: true #| fig-cap: Memory footprint of the model representation (GiB) versus the size of the [2, 2] block of L (GiB) #| label: fig-memoryvsl22 -ggplot(sizespeed, aes(; x=:L22sz, y=:modelsz, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(y="Size of model representation (GiB)", x="Size of [2,2] block of L (GiB)") + +draw( + data(transform!(sizespeed, [:L22sz, :modelsz] => ((x, y) -> x ./ y) => :L22prop)) * + mapping( + :modelsz => "Size of model object (GiB)", + [:L22sz => "Size of [2,2] block of L (GiB)", :L22prop => 
"Proportion of memory footprint in L[2,2]"], + col = dims(1) => renamer(["Size of L[2,2] block", "Memory Proportion of L[2,2] block"]), + color=:uc) * + visual(ScatterLines); + legend = (; position = :bottom, titleposition=:left), + figure=(; size=(600, 350)) +) ``` @fig-memoryfootprint shows that when all the movies are included in the data to which the model is fit (i.e. `mc == 1`) the total memory footprint is over 20 GiB, and nearly 90% of that memory is that required for the `[2,2]` block of `L`. @@ -441,10 +470,16 @@ As shown in @fig-evtimevsl22 the evaluation time for the objective is predominan #| code-fold: true #| fig-cap: "Evaluation time for the objective (s) versus size of the [2, 2] block of L (GiB)" #| label: fig-evtimevsl22 -ggplot(sizespeed, aes(x=:L22sz, y=:evtime, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(x="Size of [2,2] block of L (GiB)", y="Time for one evaluation of objective (s)") + +draw( + data(sizespeed) * + mapping( + :L22sz => "Size of [2,2] block of L (GiB)", + :evtime => "Time for one evaluation of objective (s)", + color=:uc) * + visual(ScatterLines); + figure=(; size=(600, 350)) +) ``` However the middle panel shows that the number of iterations to convergence is highly variable.