use an inner split when training calibrators

simonpcouch · simonpcouch · commit 2000f2d07c1f · 2024-05-22T13:48:12.000-05:00
diff --git a/R/fit.R b/R/fit.R
@@ -59,16 +59,34 @@ fit.workflow <- function(object, data, ..., control = control_workflow()) {
     abort("`data` must be provided to fit a workflow.")
   }
 
+  # If `potato` is not overwritten in the following `if` statement, then
+  # the postprocessor doesn't actually require training and the dataset
+  # passed to `.fit_post()` will have no effect.
+  potato <- data
   if (should_inner_split(object)) {
-    # todo: make an inner_split here
-    TRUE
+    validate_rsample_available()
+
+    mocked_split <-
+      rsample::make_splits(
+        list(analysis = seq_len(nrow(data)), assessment = integer()),
+        data = data,
+        class = object$post$actions$tailor$method %||% "mc_split"
+      )
+
+    # Parenthesize the default: `%||%` binds tighter than `/`, so a bare
+    # `prop %||% 2/3` would divide a user-supplied `prop` by 3.
+    inner_prop <- object$post$actions$tailor$prop %||% (2 / 3)
+    inner_split <- rsample::inner_split(mocked_split, list(prop = inner_prop))
+
+    data <- rsample::analysis(inner_split)
+    potato <- rsample::assessment(inner_split)
   }
 
   workflow <- object
   workflow <- .fit_pre(workflow, data)
   workflow <- .fit_model(workflow, control)
   if (has_postprocessor(workflow)) {
-    workflow <- .fit_post(workflow, data)
+    workflow <- .fit_post(workflow, potato)
   }
   workflow <- .fit_finalize(workflow)
 
diff --git a/R/post-action-tailor.R b/R/post-action-tailor.R
@@ -17,6 +17,70 @@
 #'   should not have been trained already with [tailor::fit()]; workflows
 #'   will handle training internally.
 #'
+#' @param prop The proportion of the data in [fit.workflow()] that should be
+#' used to train the preprocessor and model; the remainder is held back for
+#' estimating the postprocessor. Only relevant for postprocessors that require
+#' estimation---see section Data Usage below to learn more. Defaults to 2/3.
+#'
+#' @param method The method with which to split the data in [fit.workflow()],
+#' as a character vector. Only relevant for postprocessors that
+#' require estimation and not required when resampling the workflow with
+#' tune. If `fit.workflow(data)` arose as `training(split_object)`, this argument can
+#' usually be supplied as `class(split_object)`. Defaults to `"mc_split"`, which
+#' randomly samples `fit.workflow(data)` into two sets, similarly to
+#' [rsample::initial_split()]. See section Data Usage below to learn more.
+#'
+#' @section Data Usage:
+#'
+#' While preprocessors and models are trained on data in the usual sense,
+#' postprocessors are trained on _predictions_ on data. When a workflow
+#' is fitted, the user supplies training data with the `data` argument.
+#' When workflows don't contain a postprocessor that requires training,
+#' they can use all of the supplied `data` to train the preprocessor and model.
+#' However, in the case where a postprocessor must be trained as well,
+#' training the preprocessor and model on all of `data` would leave no data
+#' left to train the postprocessor with---if that were the case, workflows
+#' would need to `predict()` from the preprocessor and model on the same `data`
+#' that they were trained on, with the postprocessor then training on those
+#' predictions. Predictions on data that a model was trained on likely follow
+#' different distributions than predictions on unseen data; thus, workflows must
+#' split up the supplied `data` into two training sets, where the first is used to
+#' train the preprocessor and model and the second is passed to that trained
+#' preprocessor and model to generate predictions, which then form the training
+#' data for the postprocessor.
+#'
+#' The arguments `prop` and `method` parameterize how that data is split up.
+#' `prop` determines the proportion of rows in `fit.workflow(data)` that are
+#' allotted to training the preprocessor and model, while the rest are used to
+#' train the postprocessor. `method` determines how that split occurs; since
+#' `fit.workflow()` just takes in a data frame, the function doesn't have
+#' any information on how that dataset came to be. For example, `data` could
+#' have been created as:
+#'
+#' ```
+#' split <- rsample::initial_split(some_other_data)
+#' data <- rsample::training(split)
+#' ```
+#'
+#' ...in which case it's okay to randomly allot some rows of `data` to train the
+#' preprocessor and model and the rest to train the postprocessor. However,
+#' `data` could also have arisen as:
+#'
+#' ```
+#' boots <- rsample::bootstraps(some_other_data)
+#' split <- rsample::get_rsplit(boots, 1)
+#' data <- rsample::analysis(split)
+#' ```
+#'
+#' In this case, some of the rows in `data` will be duplicated. Thus, randomly
+#' allotting some of them to train the preprocessor and model and others to train
+#' the postprocessor would likely result in the same rows appearing in both
+#' datasets, resulting in the preprocessor and model generating predictions on
+#' rows they've seen before. Similarly problematic situations could arise in the
+#' context of other resampling situations, like time-based splits.
+#' The `method` argument ensures that data is allotted properly (and is
+#' internally handled by the tune package when resampling workflows).
+#'
 #' @param ... Not used.
 #'
 #' @return
@@ -38,10 +102,10 @@
 #' remove_tailor(workflow)
 #'
 #' update_tailor(workflow, adjust_probability_threshold(tailor, .2))
-add_tailor <- function(x, tailor, ...) {
+add_tailor <- function(x, tailor, prop = NULL, method = NULL, ...) {
   check_dots_empty()
   validate_tailor_available()
-  action <- new_action_tailor(tailor)
+  action <- new_action_tailor(tailor, prop = prop, method = method)
   res <- add_action(x, action, "tailor")
   if (should_inner_split(res)) {
     validate_rsample_available()
@@ -130,7 +194,7 @@ check_conflicts.action_tailor <- function(action, x, ..., call = caller_env()) {
 
 # ------------------------------------------------------------------------------
 
-new_action_tailor <- function(tailor, ..., call = caller_env()) {
+new_action_tailor <- function(tailor, prop, method, ..., call = caller_env()) {
   check_dots_empty()
 
   if (!is_tailor(tailor)) {
@@ -142,8 +206,17 @@ new_action_tailor <- function(tailor, ..., call = caller_env()) {
     abort("Can't add a trained tailor to a workflow.", call = call)
   }
 
+  # `finite = TRUE` rejects NA/NaN, which would otherwise crash the `if`.
+  if (!is.null(prop) &&
+      !(rlang::is_double(prop, n = 1, finite = TRUE) && prop > 0 && prop < 1)) {
+    abort("`prop` must be a numeric on (0, 1).", call = call)
+  }
+  # todo: test method
+
   new_action_post(
     tailor = tailor,
+    prop = prop,
+    method = method,
     subclass = "action_tailor"
   )
 }
diff --git a/man/add_tailor.Rd b/man/add_tailor.Rd
diff --git a/tests/testthat/test-post-action-tailor.R b/tests/testthat/test-post-action-tailor.R
@@ -109,6 +109,7 @@ test_that("postprocessor fit aligns with manually fitted version (no calibration
 
 test_that("postprocessor fit aligns with manually fitted version (with calibration)", {
   skip_if_not_installed("modeldata")
+  skip_if_not_installed("rsample")
 
   # create example data
   y <- seq(0, 7, .1)
@@ -122,15 +123,31 @@ test_that("postprocessor fit aligns with manually fitted version (with calibrati
   wflow_post <- add_tailor(wflow_simple, post)
 
   # train workflow
-  wf_simple_fit <- fit(wflow_simple, dat)
+
+  # first, separate out the same data that workflows ought to hold back
+  # internally when training with a postprocessor that needs estimation
+  mocked_split <-
+    rsample::make_splits(
+      list(analysis = seq_len(nrow(dat)), assessment = integer()),
+      data = dat,
+      class = "mc_split"
+    )
+  set.seed(1)
+  inner_split <- rsample::inner_split(mocked_split, list(prop = 2/3))
+
+  wf_simple_fit <- fit(wflow_simple, rsample::analysis(inner_split))
+
+  # the following fit will do all of this internally
+  set.seed(1)
   wf_post_fit <- fit(wflow_post, dat)
 
-  # ...verify predictions are the same as training the post-proc separately
-  wflow_simple_preds <- augment(wf_simple_fit, dat)
+  # ...verify predictions are the same as training the post-proc separately.
+  # note that this test naughtily re-predicts on the potato set.
+  wflow_simple_preds <- augment(wf_simple_fit, rsample::assessment(inner_split))
   post_trained <- fit(post, wflow_simple_preds, y, .pred)
   wflow_manual_preds <- predict(post_trained, wflow_simple_preds)
 
-  wflow_post_preds <- predict(wf_post_fit, dat)
+  wflow_post_preds <- predict(wf_post_fit, rsample::assessment(inner_split))
 
   expect_equal(wflow_manual_preds[".pred"], wflow_post_preds)
   expect_false(all(wflow_simple_preds[".pred"] == wflow_manual_preds[".pred"]))