36 changes: 24 additions & 12 deletions LDT_accuracy.qmd
@@ -27,19 +27,23 @@ and define some constants
```{julia}
#| code-fold: true
#| output: false
@isdefined(contrasts) || const contrasts = Dict{Symbol, Any}()
@isdefined(contrasts) || const contrasts = Dict{Symbol,Any}()
@isdefined(progress) || const progress = false
```

## Create the dataset


```{julia}
#| output: false
trials = innerjoin(
DataFrame(dataset(:ELP_ldt_trial)),
select(DataFrame(dataset(:ELP_ldt_item)), :item, :isword, :wrdlen),
on = :item
DataFrame(dataset(:ELP_ldt_trial)),
select(
DataFrame(dataset(:ELP_ldt_item)),
:item,
:isword,
:wrdlen,
),
on=:item,
)
```

@@ -54,20 +58,28 @@ This takes about ten to fifteen minutes on a recent laptop
```{julia}
contrasts[:isword] = EffectsCoding()
contrasts[:wrdlen] = Center(8)
@time gm1 = let f = @formula(acc ~ 1 + isword * wrdlen + (1|item) + (1|subj))
fit(MixedModel, f, trials, Bernoulli(); contrasts, progress, init_from_lmm=(:β, :θ))
end
@time gm1 =
let f =
@formula(acc ~ 1 + isword * wrdlen + (1 | item) + (1 | subj))
fit(
MixedModel,
f,
trials,
Bernoulli();
contrasts,
progress,
init_from_lmm=(:β, :θ),
)
end
```


```{julia}
print(gm1)
```


```{julia}
#| fig-cap: Conditional modes and 95% prediction intervals on random effects for subject in model gm1
#| label: fig-gm1condmodesubj
#| code-fold: true
qqcaterpillar!(Figure(; size=(800,800)), gm1, :subj)
```
qqcaterpillar!(Figure(; size=(800, 800)), gm1, :subj)
```
3 changes: 2 additions & 1 deletion Project.toml
@@ -6,11 +6,11 @@ version = "0.1.0"
[deps]
AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"
Chairmarks = "0ca39b1e-fe0b-4e98-acfc-b1656634c4de"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
DataFrameMacros = "75880514-38bc-4a95-a458-c2aea5a3a702"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
@@ -71,6 +71,7 @@ StatsAPI = "1"
StatsBase = "0.33, 0.34"
StatsModels = "0.7"
Tables = "1"
TidierPlots = "0.7,0.8"
TypedTables = "1"
ZipFile = "0.10"
julia = "1.8"
4 changes: 1 addition & 3 deletions _quarto.yml
@@ -70,9 +70,7 @@ format:
engine: julia

julia:
exeflags:
- --project
- -tauto
exeflags: ["-tauto", "--project"]

execute:
freeze: auto
53 changes: 31 additions & 22 deletions aGHQ.qmd
@@ -5,7 +5,7 @@ fig-dpi: 150
fig-format: png
engine: julia
julia:
exeflags: ["--project"]
exeflags: ["-tauto", "--project"]
---

# GLMM log-likelihood {#sec-GLMMdeviance}
@@ -136,8 +136,9 @@ Load the packages to be used
#| output: false
#| label: packagesA03
using AlgebraOfGraphics
using BenchmarkTools
using CairoMakie
using Chairmarks # a more modern BenchmarkTools
using DataFrames
using EmbraceUncertainty: dataset
using FreqTables
using LinearAlgebra
@@ -146,6 +147,7 @@ using MixedModelsMakie
using NLopt
using PooledArrays
using StatsAPI
using TidierPlots
```

and define some constants
@@ -156,6 +158,8 @@ and define some constants
#| label: constantsA03
@isdefined(contrasts) || const contrasts = Dict{Symbol,Any}()
@isdefined(progress) || const progress = false
TidierPlots_set("plot_show", false)
TidierPlots_set("plot_log", false)
```

## Generalized linear models for binary data {#sec-BernoulliGLM}
Expand Down Expand Up @@ -402,8 +406,7 @@ Each evaluation of the deviance is fast, requiring only a fraction of a millisec

```{julia}
βopt = copy(com05fe.β)
@benchmark deviance(setβ!(m, β)) seconds = 1 setup =
(m = com05fe; β = βopt)
@be deviance(setβ!($com05fe, $βopt))
```

but the already large number of evaluations for these six coefficients would not scale well as this dimension increases.
Expand Down Expand Up @@ -635,7 +638,7 @@ The IRLS algorithm has converged in 4 iterations to essentially the same devianc
Each iteration of the IRLS algorithm takes more time than a deviance evaluation, but still only a fraction of a millisecond on a laptop computer.

```{julia}
@benchmark deviance(updateβ!(m)) seconds = 1 setup = (m = com05fe)
@be deviance(updateβ!($com05fe))
```

## GLMMs and the PIRLS algorithm {#sec-PIRLS}
Expand Down Expand Up @@ -866,7 +869,7 @@ pirls!(m; verbose=true);
As with IRLS, PIRLS is a fast and stable algorithm for determining the mode of the conditional distribution $(\mcU|\mcY=\bby)$ with $\bbtheta$ and $\bbbeta$ held fixed.

```{julia}
@benchmark pirls!(mm) seconds = 1 setup = (mm = m)
@be pirls!($m)
```

The time taken for the four iterations to determine the conditional mode of $\bbu$ is comparable to the time taken for a single call to `updateβ!`.
Expand Down Expand Up @@ -1127,28 +1130,34 @@ The weights and positions for the 9th order rule are shown in @fig-ghnine.
#| code-fold: true
#| fig-cap: Weights and positions for the 9th order normalized Gauss-Hermite quadrature rule
#| label: fig-ghnine
#| warning: false
draw(
data(gausshermitenorm(9)) *
mapping(:abscissae => "Positions", :weights);
figure=(; size=(600, 450)),
)
df9 = DataFrame(gausshermitenorm(9))
ggplot(df9, aes(; x=:abscissae, y=:weights)) +
geom_point() +
labs(; x="Positions", y="Weights")
# draw(
# data(gausshermitenorm(9)) *
# mapping(:abscissae => "Positions", :weights);
# figure=(; size=(600,450)),
# )
```

Notice that the magnitudes of the weights drop quite dramatically away from zero, even on a logarithmic scale (@fig-ghninelog)
Notice that the magnitudes of the weights drop quite dramatically as the position moves away from zero, even on a logarithmic scale (@fig-ghninelog)

```{julia}
#| code-fold: true
#| fig-cap: Weights (logarithm base 2) and positions for the 9th order normalized Gauss-Hermite quadrature rule
#| fig-cap: Weights (logarithm base 10) and positions for the 9th order normalized Gauss-Hermite quadrature rule
#| label: fig-ghninelog
#| warning: false
draw(
data(gausshermitenorm(9)) * mapping(
:abscissae => "Positions",
:weights => log2 => "log₂(weight)",
);
figure=(; size=(600, 450)),
)
ggplot(df9, aes(; x=:abscissae, y=:weights)) +
geom_point() +
labs(; x="Positions", y="Weights") +
scale_y_log10()
# draw(
# data(gausshermitenorm(9)) * mapping(
# :abscissae => "Positions",
# :weights => log2 => "log₂(weight)",
# );
# figure=(; size=(600,450)),
# )
```

The definition of `MixedModels.GHnorm` is similar to the `gausshermitenorm` function with some extra provisions for ensuring symmetry of the abscissae and of the weights and for caching values once they have been calculated.
61 changes: 56 additions & 5 deletions datatables.qmd
@@ -11,8 +11,8 @@ Load the packages to be used
using DataFrames
using EmbraceUncertainty: dataset
using MixedModels
using Tables
using TypedTables
using Tables: Tables, columnnames, table
using TypedTables: Table, getproperties
```

A call to fit a mixed-effects model using the `MixedModels` package follows the *formula/data* specification that is common to many statistical model-fitting packages, especially those in [R](https://r-project.org).
@@ -27,7 +27,7 @@ These include the *data.frame* type in [R](https://R-Project.org) and the [data.

An increasingly popular representation of data frames is as [Arrow](https://arrow.apache.org) tables.
[Polars](https://pola.rs) uses the Arrow format internally as does the [DuckDB](https://duckdb.org) database.
Recently it was announced that the 2.0 release of [pandas](https://pandas.pydata.org) will allow for Arrow tables.
Also, the 2.0 and later releases of [pandas](https://pandas.pydata.org) allow for Arrow tables.

Arrow specifies a language-independent tabular memory format that provides many desirable properties, such as provision for missing data and compact representation of categorical data vectors, for data science.
The memory format also defines a file format for storing and exchanging data tables.
@@ -55,9 +55,10 @@ Before going into detail about the properties and use of the `Table` type, let u
An important characteristic of any system for working with data tables is whether the table is stored in memory column-wise or row-wise.

As described above, most implementations of data frames for data science store the data column-wise.
That is, elements of a column are stored in adjacent memory locations, allowing for fast traversals of columns.

In *relational database management systems* (RDBMS), such as [PostgreSQL](https://postgresql.org), [SQLite](https://sqlite.org), and a multitude of commercial systems, a data table, called a *relation*, is typically stored row-wise.
Such systems typically use [SQL](https://en.wikipedia.org/wiki/SQL), the *structured query language*, to define and access the data in tables, which is why that acronym appears in many of the names.
Such systems generally use [SQL](https://en.wikipedia.org/wiki/SQL), the *structured query language*, to define and access the data in tables, which is why that acronym appears in many of the names.
There are exceptions to the row-wise rule, such as [DuckDB](https://duckdb.org), an SQL-based RDBMS, that, as mentioned above, represents relations as Arrow tables.

Many external representations of data tables, such as in comma-separated-value (CSV) files, are row-oriented.
@@ -82,11 +83,61 @@ Tables.RowTable
The actual implementation of a row-table or column-table type may be different from these prototypes but it must provide access methods as if it were one of these types.
`Tables.jl` provides the "glue" to treat a particular data table type as if it were row-oriented, by calling `Tables.rows` or `Tables.rowtable` on it, or column-oriented, by calling `Tables.columntable` on it.

### MatrixTable as a concrete table type

[Tables.jl](https://github.com/JuliaData/Tables.jl) does provide one concrete table type, which is a `MatrixTable`.
As the name implies, this is a column-table view of a `Matrix`.
Because the data values are stored in a matrix, the columns must be homogeneous; that is, all of the columns must have the same element type.

We will use this type of table in @sec-GLMMdeviance to evaluate and store several related properties of each observation in a generalized linear model.
In this case all of the properties are expressed as floating-point numbers.
We construct the table as

```{julia}
ytbl =
let header = (:y, :offset, :η, :μ, :dev, :rtwwt, :wwresp)

tb = table(
zeros(Float64, length(contra.use), length(header));
header,
)
tb.y .= contra.use .≠ "N"
tb
end
last(Table(ytbl), 8)
```

The columns of the table are accessed as *properties*, e.g. `ytbl.η`.

A common idiom when passing a column table to a Julia function is to *destructure* the table into individual columns, which is a notation for binding several names to properties of the table in one statement.
That is, instead of beginning a function with several statements like

```julia
function updateμ!(ytbl)
y = ytbl.y
η = ytbl.η
offset = ytbl.offset
μ = ytbl.μ
...
return ytbl
end
```

one can write

```julia
function updateμ!(ytbl)
(; y, η, offset, μ) = ytbl
...
return ytbl
end
```
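For instance (a toy table, with a hypothetical logistic-mean update standing in for the body of the book's actual `updateμ!`):

```julia
# Toy column table as a NamedTuple of vectors; the names follow the text above
ytbl = (y = [1.0, 0.0], η = [0.1, -0.2], offset = zeros(2), μ = zeros(2))

# Destructure: bind y, η, offset, and μ to the table's columns in one statement
(; y, η, offset, μ) = ytbl

# μ aliases ytbl.μ, so an in-place update is visible through the table
@. μ = 1 / (1 + exp(-(η + offset)))  # hypothetical inverse-link update
```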

## The Table type from TypedTables

[TypedTables.jl](https://github.com/JuliaData/TypedTables.jl) is a lightweight package (about 1500 lines of source code) that provides a concrete implementation of column-tables, called simply `Table`, as a `NamedTuple` of vectors.

A `Table` that is constructed from another type of column-table, such as an `Arrow.Table` or a `DataFrame` or an explicit `NamedTuple` of vectors, is simply a wrapper around the original table's contents.
A `Table` that is constructed from another type of column-table, such as an `Arrow.Table` or a `DataFrame` or an explicit `NamedTuple` of vectors, is just a wrapper around the original table's contents.
On the other hand, constructing a `Table` from a row table first creates a `ColumnTable`, then wraps it.

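A minimal sketch (toy columns, not a dataset from the book) of this wrapping behavior:

```julia
using TypedTables: Table

# A NamedTuple of vectors is already a column table; Table wraps it without copying
nt = (a = [1, 2, 3], b = [1.5, 2.5, 3.5])
tbl = Table(nt)

tbl.a === nt.a  # the Table shares the original column vectors
```

Because the columns are shared rather than copied, constructing the wrapper is cheap even for large tables.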
14 changes: 8 additions & 6 deletions glmmbernoulli.qmd
@@ -149,11 +149,12 @@ contrasts[:urban] = HelmertCoding()
com01 =
let d = contra,
ds = Bernoulli(),
f = @formula(use ~
1 + livch + (age + abs2(age)) * urban + (1 | dist))
f = @formula(
use ~ 1 + livch + (age + abs2(age)) * urban + (1 | dist)
)

fit(MixedModel, f, d, ds; contrasts, nAGQ, progress)
end
fit(MixedModel, f, d, ds; contrasts, nAGQ, progress)
end
```

Notice that in the formula language defined by the [StatsModels](https://github.com/JuliaStats/StatsModels.jl) package, an interaction term is written with the `&` operator.
@@ -284,8 +285,9 @@ A series of such model fits led to a model with random effects for the combinati

```{julia}
com05 =
let f = @formula(use ~
1 + urban + ch * age + abs2(age) + (1 | dist & urban)),
let f = @formula(
use ~ 1 + urban + ch * age + abs2(age) + (1 | dist & urban)
),
d = contra,
ds = Bernoulli()
