TLC for loop_over_all_stages() (#1142)

hfrick · web-flow · commit 6677c46f1903 · 2026-03-09T07:25:58.000-04:00
* first version of loop doc

* align pattern for grid to pattern for workflow

have `current_*` be the object that gets updated across the stages

* anchor naming more in context

* don't use `all_` prefix unless it's across all iterations

* add note to point to `loop.qmd`
diff --git a/R/loop_over_all_stages-helpers.R b/R/loop_over_all_stages-helpers.R
@@ -338,7 +338,7 @@ process_prediction_data <- function(wflow_fit, static) {
 # ------------------------------------------------------------------------------
 # Misc functions
 
-rebind_grid <- function(...) {
+extend_grid <- function(...) {
   list(...) |> purrr::map(remove_stage) |> purrr::list_cbind()
 }
 
diff --git a/R/loop_over_all_stages.R b/R/loop_over_all_stages.R
@@ -1,3 +1,4 @@
+# Notes for easier reading are in `inst/loop.qmd`.
 # Notes on debugging:
 # 1. You can set `options(future.debug = TRUE)` to help
 # 2. If you are debugging .loop_over_all_stages, use the control option
@@ -52,7 +53,8 @@
 
   for (iter_pre in seq_len(num_iterations_pre)) {
     current_sched_pre <- sched[iter_pre, ]
-    0
+    current_grid <- remove_stage(current_sched_pre)
+
     location <- glue::glue("preprocessor {iter_pre}/{num_iterations_pre}")
 
     # Note: finalize_fit_pre() will process the data used for modeling. We'll
@@ -98,12 +100,15 @@
     # values currently are tune()
     wflow_with_fitted_pre <- current_wflow
 
+    grid_with_pre <- current_grid
+
     if (is_failure(pred_data)) {
       next
     }
 
     for (iter_model in seq_len(num_iterations_model)) {
       current_sched_model <- current_sched_pre$model_stage[[1]][iter_model, ]
+      current_grid <- extend_grid(grid_with_pre, current_sched_model)
 
       # Splice in any parameters marked for tuning and fit the model
       location <- glue::glue(
@@ -122,8 +127,6 @@
         next
       }
 
-      current_grid <- rebind_grid(current_sched_pre, current_sched_model)
-
       has_submodel <- has_sub_param(current_sched_model$predict_stage[[1]])
       num_iterations_pred <- max(
         nrow(current_sched_model$predict_stage[[1]]),
@@ -136,25 +139,37 @@
 
       if (has_submodel) {
         # Collect all submodel values and predict once
-        all_sub_sched <- current_sched_model$predict_stage[[1]]
-        sub_nm <- get_sub_param(all_sub_sched)
-        all_sub_grid <- all_sub_sched[, sub_nm, drop = FALSE]
+        sched_pred_all_submodels <- current_sched_model$predict_stage[[1]]
+        sub_nm <- get_sub_param(sched_pred_all_submodels)
+        grid_pred_all_submodels <- sched_pred_all_submodels[,
+          sub_nm,
+          drop = FALSE
+        ]
+
+        # Submodel parameters will be added in the predict stage
+        grid_with_pre_model <- current_grid |>
+          dplyr::select(-dplyr::all_of(sub_nm))
 
         location <- glue::glue(
           "preprocessor {iter_pre}/{num_iterations_pre}, model {iter_model}/{num_iterations_model} (predictions)"
         )
-        all_submodel_pred <- .catch_and_log(
-          predict_all_types(current_wflow, pred_data, static, all_sub_grid),
+        pred_all_submodels <- .catch_and_log(
+          predict_all_types(
+            current_wflow,
+            pred_data,
+            static,
+            grid_pred_all_submodels
+          ),
           control = static$control,
           split_labels = split_labs,
           location = location,
           notes = notes
         )
 
-        if (is_failure(all_submodel_pred)) {
+        if (is_failure(pred_all_submodels)) {
           next
         }
-        all_submodel_pred <- remove_log_notes(all_submodel_pred)
+        pred_all_submodels <- remove_log_notes(pred_all_submodels)
       }
 
       for (iter_pred in seq_len(num_iterations_pred)) {
@@ -166,14 +181,11 @@
           sub_nm <- get_sub_param(current_sched_pred)
           sub_val <- current_sched_pred[[sub_nm]]
 
-          # The assigned submodel parameter (from min_grid()) is in the
-          # current grid. Remove that and add the one that we are predicting on
-          current_grid <- current_grid |>
-            dplyr::select(-dplyr::all_of(sub_nm)) |>
-            rebind_grid(current_sched_pred)
+          # Add submodel param to grid
+          current_grid <- extend_grid(grid_with_pre_model, current_sched_pred)
 
           # Filter to this submodel's predictions (already computed above)
-          current_pred <- all_submodel_pred |>
+          current_pred <- pred_all_submodels |>
             dplyr::filter(.data[[sub_nm]] == sub_val) |>
             dplyr::select(-dplyr::all_of(sub_nm))
         } else {
@@ -205,16 +217,15 @@
         # values currently are tune()
         wflow_with_fitted_pre_and_model <- current_wflow
 
-        current_predict_grid <- current_grid
+        grid_with_pre_model_pred <- current_grid
 
         for (iter_post in seq_len(num_iterations_post)) {
           if (has_post) {
             current_sched_post <-
               current_sched_pred$post_stage[[1]][iter_post, ]
-            post_grid <- current_sched_post
 
-            current_post_grid <- rebind_grid(
-              current_predict_grid,
+            current_grid <- extend_grid(
+              grid_with_pre_model_pred,
               current_sched_post
             )
 
@@ -236,7 +247,7 @@
               finalize_fit_post(
                 wflow_with_fitted_pre_and_model,
                 data_calibration = tailor_train_data,
-                grid = post_grid
+                grid = current_sched_post
               ),
               control = static$control,
               split_labels = split_labs,
@@ -262,13 +273,10 @@
               next
             }
 
-            final_pred <- dplyr::bind_cols(post_pred, current_post_grid)
-            current_extract_grid <- current_post_grid
-            # end submodels
+            final_pred <- dplyr::bind_cols(post_pred, current_grid)
           } else {
             # No postprocessor so just use what we have
-            final_pred <- dplyr::bind_cols(current_pred, current_predict_grid)
-            current_extract_grid <- current_predict_grid
+            final_pred <- dplyr::bind_cols(current_pred, current_grid)
           }
 
           current_wflow <- workflows::.fit_finalize(current_wflow)
@@ -298,7 +306,7 @@
               extracts <- tibble::tibble(.extracts = list(1))
               if (nrow(static$param_info) > 0) {
                 extracts <- tibble::add_column(
-                  current_extract_grid,
+                  current_grid,
                   .extracts = list(1)
                 )
               }
@@ -309,7 +317,7 @@
               extracts <- tibble::add_row(
                 extracts,
                 tibble::add_column(
-                  current_extract_grid,
+                  current_grid,
                   .extracts = list(elt_extract)
                 )
               )
diff --git a/inst/.gitignore b/inst/.gitignore
@@ -0,0 +1,2 @@
+/.quarto/
+**/*.quarto_ipynb
diff --git a/inst/loop.qmd b/inst/loop.qmd
@@ -0,0 +1,189 @@
+---
+title: Loop over all stages aka loopy
+---
+
+We have a big ol loop at the heart of `tune_grid()` (and thus the rest of the tuning/resampling functions). That can be a lot to hold in your head at once, so here's a memory aid.
+
+## Overview
+
+We need to work our way through a whole lot of resamples and tuning parameter combinations. We could take the whole grid of tuning parameters, splice each combination into the workflow, and fit all those workflows on all resamples. However, that would repeat potentially costly calculations (e.g., refitting the same preprocessor for every model parameter combination), so we have carefully crafted the loop to avoid redundant computation.
+
+The loop runs over a single resample. The basic structure is:
+
+- For each preproc parameter combination
+	- Fit preprocessor
+	- Apply it to analysis set
+	- Apply it to the assessment set
+	- For each (non-sub) model parameter combination
+		- Fit the model
+		- Predict for all submodel parameters at once (via `multi_predict()`)
+		- For each submodel parameter value
+			- Filter to this submodel's predictions
+			- For each post parameter combination
+				- Fit the postprocessor
+				- Apply the postprocessor to the predictions (on the assessment set)
+				- Combine the (post-processed) predictions with the grid to form `final_pred`
+				- Save `final_pred` by appending it to `pred_reserve`
+				- Do the extracts
+- Compute the metrics
+
+## Inputs
+
+### The schedule
+
+The schedule is a nested tibble created by `schedule_grid()` that organizes tuning parameters by stage. Each stage contains the next stage in a list-column:
+
+```
+sched
+├── [preproc params]
+└── model_stage (list-col)
+    └── tibble
+        ├── [model params]
+        └── predict_stage (list-col)
+            └── tibble
+                ├── [submodel params]
+                └── post_stage (list-col)
+                    └── tibble
+                        └── [post params]
+```
+
+The four stages are:
+
+- **pre**: preprocessing via recipes
+- **model**: the model fit via parsnip (submodel parameters are collapsed via `min_grid()`)
+- **predict**: prediction (submodel parameters are expanded)
+- **post**: postprocessing via tailor
+
+To access the next stage, extract with `[[1]]`:
+
+- `current_sched_pre$model_stage[[1]]` → tibble of model param combinations
+- `current_sched_model$predict_stage[[1]]` → tibble of submodel param values
+- `current_sched_pred$post_stage[[1]]` → tibble of post param combinations
+
+### The `static` object
+
+The `static` list contains everything that stays constant throughout the loop:
+
+- `wflow` - the original workflow (template for finalization)
+- `param_info` - parameter set info from `tune_args()`
+- `configs` - tibble mapping parameter values to `.config` labels
+- `metrics` - the metric set
+- `pred_types` - prediction types needed (e.g., "class", "prob", "numeric")
+- `eval_time` - evaluation times for survival models
+- `control` - control options
+- `data` - list with `fit`, `pred`, and `cal` data partitions (added after setup)
+- `y_name` - outcome column name(s)
+
+### Data partitions
+
+The `static$data` list contains three partitions (set up once per resample):
+
+- `fit` - training data for preprocessor and model
+- `pred` - assessment data for predictions (used to compute metrics)
+- `cal` - calibration data for postprocessors that need fitting (e.g., probability calibration)
+
+When there's no postprocessor requiring calibration, `cal` is `NULL` and `fit` uses the full analysis set. When calibration is needed, the analysis set is further split into `fit` and `cal`.
+
+## Reading the code
+
+### Naming conventions
+
+**Loop variables:**
+
+- `iter_{stage}` - iteration counter (e.g., `iter_pre`, `iter_model`, `iter_pred`, `iter_post`)
+- `num_iterations_{stage}` - total iterations for that stage
+
+**Schedule objects:**
+
+- `sched` - the full nested schedule tibble
+- `current_sched_{stage}` - current row of the schedule at each stage
+
+**Grid objects:**
+
+- `current_grid` - progressively accumulates tuning params as we descend into loops
+- `grid_with_pre` - snapshot with pre params (before model loop)
+- `grid_with_pre_model` - snapshot with pre + model params, without submodel col (before pred loop)
+- `grid_with_pre_model_pred` - snapshot with pre + model + pred params (before post loop)
+
+**Workflow snapshots** (saved to allow re-finalization in inner loops):
+
+- `current_wflow` - the workflow being modified
+- `wflow_with_fitted_pre` - snapshot after fitting preprocessor
+- `wflow_with_fitted_pre_and_model` - snapshot after fitting model
+
+**Prediction objects:**
+
+- `pred_data` - processed prediction data (features + outcomes)
+- `pred_all_submodels` - batched predictions for all submodel values (source for filtering)
+- `current_pred` - predictions for current pred iteration (filtered from `pred_all_submodels`)
+- `final_pred` - predictions after postprocessing (ready to save)
+- `pred_reserve` - accumulator for all final predictions
+
+**General conventions:**
+
+- `static` - things that don't change during the loop
+- `current_*` - value for the current iteration
+- `*_all_submodels` - batched values for all submodel params (e.g., `pred_all_submodels`)
+- Stage suffixes: `_pre`, `_model`, `_pred`, `_post`
+
+### Key helper functions
+
+**Finalization** (splice tuning params into workflow and fit):
+
+- `finalize_fit_pre()` - finalize recipe params, fit preprocessor
+- `finalize_fit_model()` - finalize model params, fit model
+- `finalize_fit_post()` - finalize tailor params, fit postprocessor
+
+**Grid helpers:**
+
+- `extend_grid()` - extend a grid with params from a schedule row (strips `*_stage` columns)
+- `remove_stage()` - remove nested stage columns from a schedule row
+
+**Prediction:**
+
+- `process_prediction_data()` - apply fitted preprocessor to assessment data
+- `predict_all_types()` - generate all needed prediction types
+
+### Error handling
+
+The loop uses a consistent error handling pattern:
+
+```r
+result <- .catch_and_log(
+  some_operation(),
+  control = static$control,
+  split_labels = split_labs,
+  location = location,
+  notes = notes
+)
+
+if (is_failure(result)) {
+  next
+}
+result <- remove_log_notes(result)
+```
+
+- `.catch_and_log()` wraps operations to capture errors/warnings without stopping
+- `is_failure()` checks if the operation failed
+- `next` skips to the next iteration (the failed config won't have results)
+- `remove_log_notes()` strips logging metadata from successful results
+
+## Efficiency
+
+The nested structure avoids redundant computation:
+
+| What | Computed | Reused for |
+|------|----------|------------|
+| Preprocessor fit | Once per preproc param combo | All model params below it |
+| Processed prediction data | Once per preproc param combo | All predictions below it |
+| Model fit | Once per model param combo | All submodel predictions |
+| Submodel predictions | Once per model (batched) | All submodel × post combos |
+| Postprocessor fit | Once per post param combo | That specific config |
+
+Submodel parameters (like `penalty` in glmnet) are predicted all at once using `multi_predict()`, which is much faster than predicting one at a time.
+
+## Debugging
+
+Set `control = control_grid(allow_par = FALSE)` to run sequentially with `lapply()` so you can see output and use `browser()`.
+
+For parallel debugging, `options(future.debug = TRUE)` can help.

Original file line number	Diff line number	Diff line change
`@@ -338,7 +338,7 @@ process_prediction_data <- function(wflow_fit, static) {`
`338`	`338`	`# ------------------------------------------------------------------------------`
`339`	`339`	`# Misc functions`
`340`	`340`
`341`		`-rebind_grid <- function(...) {`
	`341`	`+extend_grid <- function(...) {`
`342`	`342`	`list(...) \|> purrr::map(remove_stage) \|> purrr::list_cbind()`
`343`	`343`	`}`
`344`	`344`