
Commit f08a30c

Merge branch 'main' of github.com:mlr-org/mlr3book
2 parents fff1d68 + 1331789

File tree

8 files changed: +78 -20 lines changed


DESCRIPTION

Lines changed: 5 additions & 0 deletions

@@ -28,6 +28,7 @@ Imports:
     mlr3filters,
     mlr3fselect,
     mlr3hyperband,
+    mlr3inferr,
     mlr3learners,
     mlr3oml,
     mlr3mbo,
@@ -46,6 +47,10 @@ Imports:
     stringi
 Remotes:
     mlr-org/mlr3extralearners,
+    mlr-org/mlr3batchmark,
+    mlr-org/mlr3proba,
+    mlr-org/mlr3fairness,
+    mlr-org/mlr3inferr
     mlr-org/mlr3proba
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)

R/zzz.R

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ NULL
 
 db = new.env()
 db$index = c("base", "utils", "datasets", "data.table", "stats", "batchtools")
-db$hosted = c("paradox", "mlr3misc", "mlr3", "mlr3data", "mlr3db", "mlr3proba", "mlr3pipelines", "mlr3learners", "mlr3filters", "bbotk", "mlr3tuning", "mlr3viz", "mlr3fselect", "mlr3cluster", "mlr3spatiotempcv", "mlr3spatial", "mlr3extralearners", "mlr3tuningspaces", "mlr3hyperband", "mlr3mbo", "mlr3verse", "mlr3benchmark", "mlr3oml", "mlr3batchmark", "mlr3fairness")
+db$hosted = c("paradox", "mlr3misc", "mlr3", "mlr3data", "mlr3db", "mlr3proba", "mlr3pipelines", "mlr3learners", "mlr3filters", "bbotk", "mlr3tuning", "mlr3viz", "mlr3fselect", "mlr3cluster", "mlr3spatiotempcv", "mlr3spatial", "mlr3extralearners", "mlr3tuningspaces", "mlr3hyperband", "mlr3mbo", "mlr3verse", "mlr3benchmark", "mlr3oml", "mlr3batchmark", "mlr3fairness", "mlr3inferr")
 
 lgr = NULL

book/book.bib

Lines changed: 29 additions & 0 deletions

@@ -1436,6 +1436,25 @@ @book{hutter2019automated
   publisher = {Springer},
   keywords = {}
 }
+
+@misc{kuempelfischer2024ciforge,
+  title={Constructing Confidence Intervals for 'the' Generalization Error -- a Comprehensive Benchmark Study},
+  author={Hannah Schulz-Kümpel and Sebastian Fischer and Thomas Nagler and Anne-Laure Boulesteix and Bernd Bischl and Roman Hornung},
+  year={2024},
+  eprint={2409.18836},
+  archivePrefix={arXiv},
+  primaryClass={stat.ML},
+  url={https://arxiv.org/abs/2409.18836},
+}
+
+@article{bayle2020cross,
+  title={Cross-validation confidence intervals for test error},
+  author={Bayle, Pierre and Bayle, Alexandre and Janson, Lucas and Mackey, Lester},
+  journal={Advances in Neural Information Processing Systems},
+  volume={33},
+  pages={16339--16350},
+  year={2020}
+}
 @article{yu_quantile_2003,
   author = {Yu, Keming and Lu, Zudi and Stander, Julian},
   doi = {10.1111/1467-9884.00363},
@@ -1447,3 +1466,13 @@ @article{yu_quantile_2003
   volume = {52},
   year = {2003},
 }
+@book{koenker_quantile_2005,
+  address = {Cambridge},
+  series = {Econometric Society Monographs},
+  title = {Quantile Regression},
+  isbn = {978-0-521-84573-1},
+  publisher = {Cambridge University Press},
+  author = {Koenker, Roger},
+  year = {2005},
+  doi = {10.1017/CBO9780511754098},
+}

book/chapters/appendices/errata.qmd

Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@ This appendix lists changes to the online version of this book to chapters inclu
 ## 3. Evaluation and Benchmarking
 
 * Use `$encapsulate()` method instead of the `$encapsulate` and `$fallback` fields.
+* A section on the `mlr3inferr` package was added.
 
 ## 4. Hyperparameter Optimization
book/chapters/chapter13/beyond_regression_and_classification.qmd

Lines changed: 1 addition & 1 deletion

@@ -1031,7 +1031,7 @@ ggplot(plot_data, aes(x = x, y = loss, color = tau)) +
 But note: While many ML models based on empirical risk minimization use the pinball loss for estimating quantiles, some model classes might work differently.
 However, since the underlying training procedure of a model is external to `mlr3`, we are more concerned with resampling and evaluating quantile regression models.
 This works in exactly the same manner as for other tasks.
-Because we provide only a brief overview of quantile regression, we recommend @yu_quantile_2003 if you are interested in a methodological introduction to the topic.
+Because we provide only a brief overview of quantile regression, we recommend @yu_quantile_2003 if you are interested in a methodological introduction to the topic and @koenker_quantile_2005 for a more expansive treatment of quantile regression.
 
 ### Synthetic data set generation {#sec-data-generation}

book/chapters/chapter3/evaluation_and_benchmarking.qmd

Lines changed: 40 additions & 0 deletions

@@ -285,6 +285,31 @@ print(plt2)
 ```
 
 
+### Confidence Intervals (+) {#sec-resampling-ci}
+
+Instead of relying solely on point estimates, CIs offer a measure of uncertainty of this estimate, allowing us to understand the reliability of our performance measurement.
+While constructing CIs for the generalization error is challenging due to the complex nature of the inference problem, some methods have been shown to work well in practice [@kuempelfischer2024ciforge].
+When employing such methods, it is important to be aware that they can fail in some cases -- e.g. in the presence of outliers or unstable learning procedures -- and that the resulting CIs can be either too conservative or too liberal.
+
+The `r ref_pkg("mlr3inferr")` package extends the `mlr3` ecosystem with both inference methods and new resampling strategies.
+The inference methods are implemented as `r ref("Measure")` objects that take another measure for which to compute the CI.
+Below, we demonstrate how to use the inference method suggested by @bayle2020cross to compute a CI for the cross-validation result from the previous section.
+Unlike other mlr3 measures, the result is not a scalar value but a vector containing the point estimate as well as the lower and upper bounds of the CI for the specified alpha level.
+
+```{r}
+library(mlr3inferr)
+# alpha = 0.05 is also the default
+msr_ci = msr("ci.wald_cv", msr("classif.acc"), alpha = 0.05)
+rr$aggregate(msr_ci)
+```
+
+We can also use `msr("ci")`, which automatically selects the appropriate inference measure for the given resampling strategy.
+A list of available inference methods can be found on the package website: `r link("https://mlr3inferr.mlr-org.com/")`.
+
+```{r}
+rr$aggregate(msr("ci", msr("classif.acc")))
+```
+
 ### ResampleResult Objects {#sec-resampling-inspect}
 
 As well as being useful for estimating the generalization performance, the `r ref("ResampleResult")` object can also be used for model inspection.
@@ -576,6 +601,21 @@ plt = plt + ggplot2::scale_fill_manual(values = c("grey30", "grey50", "grey70"))
 print(plt)
 ```
 
+It is also possible to plot confidence intervals by setting the type of plot to `"ci"`.
+
+```{r}
+#| fig-height: 3
+#| fig-width: 6
+#| label: fig-benchmark-ci
+#| fig-cap: 'Confidence intervals for accuracy scores for each learner across resampling iterations and the three tasks. The random forest (`lrn("classif.ranger")`) consistently outperforms the other learners.'
+#| fig-alt: Nine confidence intervals, one corresponding to each task/learner combination. In all cases the random forest performs best and the featureless baseline the worst.
+#| echo: false
+#| warning: false
+#| message: false
+autoplot(bmr, "ci", measure = msr("ci", msr("classif.acc")))
+```
+
+
 ## Evaluation of Binary Classifiers {#sec-roc}
 
 In @sec-basics-classif-learner we touched on the concept of a confusion matrix and how it can be used to break down classification errors in more detail.
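The new section's code chunks rely on a `ResampleResult` object `rr` created earlier in the chapter. A minimal, self-contained sketch of the full workflow follows; the task and learner chosen here are illustrative stand-ins, not the ones the book uses, and the chunk assumes `mlr3` and `mlr3inferr` are installed:

```r
library(mlr3)
library(mlr3inferr)

# Resample a decision tree with 10-fold CV on a built-in example task
# (illustrative choices; the chapter uses its own task and learner).
task = tsk("sonar")
rr = resample(task, lrn("classif.rpart"), rsmp("cv", folds = 10))

# Wald-type CI for CV accuracy [@bayle2020cross]; alpha = 0.05 is the default.
rr$aggregate(msr("ci.wald_cv", msr("classif.acc"), alpha = 0.05))

# Or let msr("ci") pick the inference method matching the resampling strategy.
rr$aggregate(msr("ci", msr("classif.acc")))
```

Both calls return a named vector with the point estimate and the lower and upper CI bounds rather than a single scalar score.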

book/chapters/chapter9/preprocessing.qmd

Lines changed: 0 additions & 17 deletions

@@ -236,17 +236,9 @@ invisible(dev.off())
 magick::image_trim(fig)
 ```
 
-::: {.callout-warning}
-
-Currently, there is a bug in the mlr3pipelines package that causes the following code chunk to fail.
-See https://github.com/mlr-org/mlr3pipelines/issues/894 for more details.
-
-:::
-
 Using this pipeline we can now run experiments with `lrn("regr.ranger")`, which cannot handle missing data; we also compare a simpler pipeline that only uses OOR imputation to demonstrate performance differences resulting from different strategies.
 
 ```{r preprocessing-015}
-#| eval: false
 glrn_rf_impute_hist = as_learner(impute_hist %>>% lrn("regr.ranger"))
 glrn_rf_impute_hist$id = "RF_imp_Hist"
 
@@ -450,19 +442,10 @@ tsk_ames_ext$data(1,
   c("energy_means", "energy_mins", "energy_maxs", "energy_vars"))
 ```
 
-::: {.callout-warning}
-
-This code chunk does not work due to the bug in the `mlr3pipelines` package.
-See the warning message above for more details.
-
-:::
-
-
 These outputs look sensible compared to @fig-energy so we can now run our final benchmark experiment using feature extraction.
 We do not need to add the `PipeOp` to each learner as we can apply it once (as above) before any model training by applying it to all available data.
 
 ```{r preprocessing-026, warning=FALSE, R.options = list(datatable.print.nrows = 13, datatable.print.class = FALSE, datatable.print.keys = FALSE, datatable.print.trunc.cols = TRUE)}
-#| eval: false
 learners = list(lrn_baseline, lrn("regr.rpart"), glrn_xgb_impact,
   glrn_rf_impute_oor, glrn_lm_robust, glrn_log_lm_robust)
 

book/common/chap_auths.csv

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 Chapter Number,Title,Authors
 1,Introduction and Overview,"Lars Kotthoff, Raphael Sonabend, Natalie Foss, Bernd Bischl"
 2,Data and Basic Modeling,"Natalie Foss, Lars Kotthoff"
-3,Evaluation and Benchmarking,"Giuseppe Casalicchio, Lukas Burk"
+3,Evaluation and Benchmarking,"Giuseppe Casalicchio, Lukas Burk, Sebastian Fischer"
 4,Hyperparameter Optimization,"Marc Becker, Lennart Schneider, Sebastian Fischer"
 5,Advanced Tuning Methods and Black Box Optimization,"Lennart Schneider, Marc Becker"
 6,Feature Selection,Marvin N. Wright
