Commit f94eb7b

be-marc and berndbischl (Bernd Bischl) authored
feat: add simple and law of total variance se methods to ranger (#347)
* feat: add simple and law of total variance se methods to ranger
* ...
* fix off by one error
* ...
---------
Co-authored-by: Bernd Bischl <[email protected]>
Co-authored-by: Bernd Bischl <[email protected]>
1 parent f6b81f1 commit f94eb7b

21 files changed (+774, -56 lines)

.Rbuildignore

Lines changed: 1 addition & 0 deletions
@@ -19,3 +19,4 @@ vignettes/learners/
 ^revdep$
 ^cran-comments\.md$
 ^CRAN-SUBMISSION$
+.clangd

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -181,3 +181,4 @@ revdep/
 
 # misc
 Meta/
+.clangd

.lintr

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@ linters: linters_with_defaults(
   object_name_linter = object_name_linter(c("snake_case", "CamelCase")), # only allow snake case and camel case object names
   cyclocomp_linter = NULL, # do not check function complexity
   commented_code_linter = NULL, # allow code in comments
-  line_length_linter = line_length_linter(120L)
+  line_length_linter = line_length_linter(300L)
 )

DESCRIPTION

Lines changed: 2 additions & 1 deletion
@@ -43,6 +43,7 @@ Suggests:
     knitr,
     lgr,
     MASS,
+    mirai,
     nnet,
     pracma,
     ranger,
@@ -54,7 +55,7 @@ Remotes:
     mlr-org/mlr3
 Config/testthat/edition: 3
 Encoding: UTF-8
-NeedsCompilation: no
+NeedsCompilation: yes
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.3
 Collate:

NAMESPACE

Lines changed: 2 additions & 0 deletions
@@ -41,3 +41,5 @@ importFrom(stats,predict)
 importFrom(stats,reformulate)
 importFrom(utils,bibentry)
 importFrom(utils,packageVersion)
+useDynLib(mlr3learners,c_ranger_mu_sigma)
+useDynLib(mlr3learners,c_ranger_var)

NEWS.md

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,7 @@
 # mlr3learners (development version)
 
+* feat: Add new uncertainty estimation methods `ensemble_standard_deviation` and `law_of_total_variance` to the `regr.ranger` learner.
+
 # mlr3learners 0.12.0
 
 * feat: Add `classif.kknn` and `regr.kknn` learners.

R/LearnerClassifRanger.R

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ LearnerClassifRanger = R6Class("LearnerClassifRanger",
     #'
     #' @return `character()`.
     selected_features = function() {
-      ranger_selected_features(self)
+      ranger_selected_features(self$model, self$state$feature_names)
     }
   ),
 
R/LearnerRegrRanger.R

Lines changed: 58 additions & 30 deletions
@@ -4,7 +4,18 @@
 #'
 #' @description
 #' Random regression forest.
-#' Calls [ranger::ranger()] from package \CRANpkg{ranger}.
+#' Calls `ranger()` from package \CRANpkg{ranger}.
+#'
+#' @details
+#' In addition to the uncertainty estimation methods provided by the ranger package, the learner offers ensemble standard deviation and law of total variance uncertainty estimation.
+#' Both methods compute the empirical mean and variance of the training data points that fall into the predicted leaf nodes.
+#' The ensemble standard deviation method takes the standard deviation of the per-tree leaf means.
+#' The law of total variance method adds the mean of the per-tree leaf variances to the variance of the per-tree leaf means.
+#' Formulas for both methods are given in Hutter et al. (2015).
+#'
+#' For these two methods, the parameter `sigma2.threshold` sets a lower bound on the leaf node variance:
+#' if the variance of a leaf node falls below this threshold, it is set to the threshold (as described in the paper).
+#' The default is 1e-2.
 #'
 #' @inheritSection mlr_learners_classif.ranger Custom mlr3 parameters
 #' @inheritSection mlr_learners_classif.ranger Initial parameter values
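
For orientation, here is a minimal sketch (not package code) of the two estimators described in the details above, written for a single test point given its per-tree leaf statistics. The variable names are illustrative, and whether sample or population variance is used follows Hutter et al. (2015) in the actual implementation.

```r
# Hypothetical per-tree leaf statistics for one test point:
# mean and variance of the training responses in the leaf the point falls into.
leaf_means = c(2.1, 1.8, 2.4)
leaf_vars = c(0.30, 0.25, 0.40)

# Floor the leaf variances at sigma2.threshold (default 1e-2).
leaf_vars = pmax(leaf_vars, 1e-2)

# Ensemble standard deviation: standard deviation of the per-tree leaf means.
se_ensemble_sd = sd(leaf_means)

# Law of total variance: mean of the leaf variances plus variance of the leaf means.
se_lotv = sqrt(mean(leaf_vars) + var(leaf_means))
```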
@@ -13,7 +24,7 @@
 #' @template learner
 #'
 #' @references
-#' `r format_bib("wright_2017", "breiman_2001")`
+#' `r format_bib("wright_2017", "breiman_2001", "hutter_2015")`
 #'
 #' @export
 #' @template seealso_learner
@@ -50,15 +61,16 @@ LearnerRegrRanger = R6Class("LearnerRegrRanger",
         sample.fraction = p_dbl(0L, 1L, tags = "train"),
         save.memory = p_lgl(default = FALSE, tags = "train"),
         scale.permutation.importance = p_lgl(default = FALSE, tags = "train", depends = quote(importance == "permutation")),
-        se.method = p_fct(c("jack", "infjack"), default = "infjack", tags = "predict"), # FIXME: only works if predict_type == "se". How to set dependency?
+        se.method = p_fct(c("jack", "infjack", "ensemble_standard_deviation", "law_of_total_variance"), default = "infjack", tags = "predict"),
+        sigma2.threshold = p_dbl(default = 1e-2, tags = "train"),
         seed = p_int(default = NULL, special_vals = list(NULL), tags = c("train", "predict")),
         split.select.weights = p_uty(default = NULL, tags = "train"),
         splitrule = p_fct(c("variance", "extratrees", "maxstat", "beta", "poisson"), default = "variance", tags = "train"),
         verbose = p_lgl(default = TRUE, tags = c("train", "predict")),
         write.forest = p_lgl(default = TRUE, tags = "train")
       )
 
-      ps$set_values(num.threads = 1L)
+      ps$set_values(num.threads = 1L, sigma2.threshold = 1e-2)
 
       super$initialize(
         id = "regr.ranger",
@@ -79,14 +91,14 @@ LearnerRegrRanger = R6Class("LearnerRegrRanger",
     #'
     #' @return Named `numeric()`.
     importance = function() {
-      if (is.null(self$model)) {
+      if (is.null(self$model$model)) {
         stopf("No model stored")
       }
-      if (self$model$importance.mode == "none") {
+      if (self$model$model$importance.mode == "none") {
         stopf("No importance stored")
       }
 
-      sort(self$model$variable.importance, decreasing = TRUE)
+      sort(self$model$model$variable.importance, decreasing = TRUE)
     },
 
     #' @description
@@ -98,8 +110,8 @@ LearnerRegrRanger = R6Class("LearnerRegrRanger",
         return(self$state$oob_error)
       }
 
-      if (!is.null(self$model)) {
-        return(self$model$prediction.error)
+      if (!is.null(self$model$model)) {
+        return(self$model$model$prediction.error)
       }
 
       stopf("No model stored")
@@ -110,14 +122,17 @@ LearnerRegrRanger = R6Class("LearnerRegrRanger",
     #'
     #' @return `character()`.
     selected_features = function() {
-      ranger_selected_features(self)
+      ranger_selected_features(self$model$model, self$state$feature_names)
     }
   ),
 
   private = list(
     .train = function(task) {
       pv = self$param_set$get_values(tags = "train")
       pv = convert_ratio(pv, "mtry", "mtry.ratio", length(task$feature_names))
+      pv$se.method = NULL
+      sigma2_threshold = pv$sigma2.threshold
+      pv$sigma2.threshold = NULL
       pv$case.weights = get_weights(task, private)
 
       if (self$predict_type == "se") {
@@ -127,43 +142,56 @@ LearnerRegrRanger = R6Class("LearnerRegrRanger",
       if (self$predict_type == "quantiles") {
         pv$quantreg = TRUE # nolint
       }
-
-      invoke(ranger::ranger,
+      data = task$data()
+      model = invoke(ranger::ranger,
         dependent.variable.name = task$target_names,
-        data = task$data(),
+        data = data,
         .args = pv
       )
+
+      if (isTRUE(self$param_set$values$se.method %in% c("ensemble_standard_deviation", "law_of_total_variance"))) {
+        # num.threads is the only thing from the param set we want to pass here and not set manually
+        prediction_nodes = mlr3misc::invoke(predict, model, data = data, type = "terminalNodes", predict.all = TRUE, num.threads = pv$num.threads)
+        storage.mode(prediction_nodes$predictions) = "integer"
+        mu_sigma = .Call("c_ranger_mu_sigma", prediction_nodes$predictions, task$truth(), sigma2_threshold)
+        list(model = model, mu_sigma = mu_sigma)
+      } else {
+        list(model = model)
+      }
     },
 
     .predict = function(task) {
       pv = self$param_set$get_values(tags = "predict")
       newdata = ordered_features(task, self)
 
-      prediction = invoke(predict, self$model,
-        data = newdata,
-        type = self$predict_type,
-        quantiles = private$.quantiles,
-        .args = pv)
-
-      if (self$predict_type == "quantiles") {
-        assert_quantiles(self, quantile_response = TRUE)
-        quantiles = prediction$predictions
-        setattr(quantiles, "probs", private$.quantiles)
-        setattr(quantiles, "response", private$.quantile_response)
-        return(list(quantiles = quantiles))
+      if (isTRUE(pv$se.method %in% c("ensemble_standard_deviation", "law_of_total_variance"))) {
+        prediction_nodes = mlr3misc::invoke(predict, self$model$model, data = newdata, type = "terminalNodes", .args = pv[setdiff(names(pv), "se.method")], predict.all = TRUE)
+        storage.mode(prediction_nodes$predictions) = "integer"
+        method = if (pv$se.method == "ensemble_standard_deviation") 0 else 1
+        .Call("c_ranger_var", prediction_nodes$predictions, self$model$mu_sigma, method)
+      } else {
+        prediction = mlr3misc::invoke(predict, self$model$model, data = newdata, type = self$predict_type, quantiles = private$.quantiles, .args = pv)
+
+        if (self$predict_type == "quantiles") {
+          assert_quantiles(self, quantile_response = TRUE)
+          quantiles = prediction$predictions
+          setattr(quantiles, "probs", private$.quantiles)
+          setattr(quantiles, "response", private$.quantile_response)
+          return(list(quantiles = quantiles))
+        }
+
+        list(response = prediction$predictions, se = prediction$se)
       }
-
-      list(response = prediction$predictions, se = prediction$se)
     },
 
     .hotstart = function(task) {
-      model = self$models
+      model = self$model$model
       model$num.trees = self$param_set$values$num.trees
-      model
+      list(model = model)
     },
 
     .extract_oob_error = function() {
-      self$model$prediction.error
+      self$model$model$prediction.error
     }
   )
 )
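
The two routines called via `.Call()` above are implemented in C and registered through the `useDynLib()` entries in NAMESPACE. The pure-R sketch below is only a rough illustration of the computation described in the roxygen details; the function names here are made up, and the exact variance conventions and return structure of the C code are assumptions (in particular, that the response is the mean of the leaf means).

```r
# nodes: integer matrix [n_obs x n_trees] of terminal node ids from
#        predict(..., type = "terminalNodes", predict.all = TRUE)
# y:     numeric vector of training responses

# Training step (roughly what c_ranger_mu_sigma does): per tree, the mean and
# thresholded variance of y in every leaf.
leaf_stats = function(nodes, y, sigma2_threshold = 1e-2) {
  lapply(seq_len(ncol(nodes)), function(tree) {
    mu = c(tapply(y, nodes[, tree], mean))
    sigma2 = pmax(c(tapply(y, nodes[, tree], var)), sigma2_threshold, na.rm = TRUE)
    list(mu = mu, sigma2 = sigma2)
  })
}

# Prediction step (roughly what c_ranger_var does): look up each test point's
# leaf in every tree, then aggregate the leaf means and variances.
predict_se = function(nodes, stats, method = "law_of_total_variance") {
  res = vapply(seq_len(nrow(nodes)), function(i) {
    mu = vapply(seq_along(stats), function(t) stats[[t]]$mu[[as.character(nodes[i, t])]], numeric(1))
    sigma2 = vapply(seq_along(stats), function(t) stats[[t]]$sigma2[[as.character(nodes[i, t])]], numeric(1))
    se = if (method == "ensemble_standard_deviation") sd(mu) else sqrt(mean(sigma2) + var(mu))
    c(mean(mu), se)
  }, numeric(2))
  list(response = res[1, ], se = res[2, ])
}
```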

R/bibentries.R

Lines changed: 12 additions & 1 deletion
@@ -108,5 +108,16 @@ bibentries = c( # nolint start
     number = "1",
     pages = "1--17",
     doi = "10.18637/jss.v077.i01"
-  )
+  ),
+  hutter_2015 = bibentry("inproceedings",
+    title = "Algorithm runtime prediction: methods and evaluation",
+    author = "Hutter, Frank and Xu, Lin and Hoos, Holger H. and Leyton-Brown, Kevin",
+    year = "2015",
+    publisher = "AAAI Press",
+    booktitle = "Proceedings of the 24th International Conference on Artificial Intelligence",
+    pages = "4197--4201",
+    series = "IJCAI'15",
+    doi = "10.5555/2832747.2832840"
+  )
+
 ) # nolint end

R/helpers_ranger.R

Lines changed: 6 additions & 13 deletions
@@ -35,25 +35,18 @@ convert_ratio = function(pv, target, ratio, n) {
   )
 }
 
-
-
-
-ranger_selected_features = function(self) {
-  if (is.null(self$model)) {
+ranger_selected_features = function(model, feature_names) {
+  if (is.null(model)) {
     stopf("No model stored")
   }
 
-  splitvars = ranger::treeInfo(object = self$model, tree = 1)$splitvarName
+  splitvars = ranger::treeInfo(object = model, tree = 1)$splitvarName
   i = 2
-  while (i <= self$model$num.trees &&
-    !all(self$state$feature_names %in% splitvars)) {
-    sv = ranger::treeInfo(object = self$model, tree = i)$splitvarName
+  while (i <= model$num.trees && !all(feature_names %in% splitvars)) {
+    sv = ranger::treeInfo(object = model, tree = i)$splitvarName
     splitvars = union(splitvars, sv)
     i = i + 1
   }
 
-  # order the names of the selected features in the same order as in the task
-  self$state$feature_names[self$state$feature_names %in% splitvars]
+  splitvars[!is.na(splitvars)]
 }
-
-

0 commit comments