diff --git a/NEWS.md b/NEWS.md index 09833b69..8d9b5b8a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # tune (development version) +* The warning threshold when checking the size of a workflow is now a parameter to the control functions and has a new default of 100MB. (#914) + # tune 2.0.1 * Fixed a bug where `int_pctl()` wouldn't work on `last_fit()` outcomes when future parallelism was enabled. (#1099) diff --git a/R/control.R b/R/control.R index 388ed88f..264f92d5 100644 --- a/R/control.R +++ b/R/control.R @@ -37,7 +37,8 @@ control_grid <- function( save_workflow = FALSE, event_level = "first", parallel_over = NULL, - backend_options = NULL + backend_options = NULL, + workflow_size = 100.0 ) { # Any added arguments should also be added in superset control functions # in other packages @@ -50,6 +51,7 @@ control_grid <- function( check_string(event_level) check_character(pkgs, allow_null = TRUE) check_function(extract, allow_null = TRUE) + check_number_decimal(workflow_size) val_parallel_over(parallel_over, "control_grid()") @@ -62,7 +64,8 @@ control_grid <- function( save_workflow = save_workflow, event_level = event_level, parallel_over = parallel_over, - backend_options = backend_options + backend_options = backend_options, + workflow_size = workflow_size ) class(res) <- c("control_grid", "control_resamples") @@ -200,6 +203,9 @@ print.control_last_fit <- function(x, ...) { #' backend. Defaults to `NULL` for default backend options. #' @param allow_par A logical to allow parallel processing (if a parallel #' backend is registered). +#' @param workflow_size A non-negative number (in MB) that is used as a +#' threshold for a warning regarding the size of the workflow. Only used when +#' `save_workflow = TRUE`. 
#' #' @inheritSection collect_predictions Hyperparameters and extracted objects #' @@ -240,7 +246,8 @@ control_bayes <- event_level = "first", parallel_over = NULL, backend_options = NULL, - allow_par = TRUE + allow_par = TRUE, + workflow_size = 100.0 ) { # Any added arguments should also be added in superset control functions # in other packages @@ -257,6 +264,7 @@ control_bayes <- check_number_whole(no_improve, min = 0, allow_infinite = TRUE) check_number_whole(uncertain, min = 0, allow_infinite = TRUE) check_number_whole(seed) + check_number_decimal(workflow_size) check_time_limit_arg(time_limit) @@ -285,7 +293,8 @@ control_bayes <- save_gp_scoring = save_gp_scoring, event_level = event_level, parallel_over = parallel_over, - backend_options = backend_options + backend_options = backend_options, + workflow_size = workflow_size ) class(res) <- "control_bayes" diff --git a/R/tune_grid.R b/R/tune_grid.R index 5080c5a9..4e6f49f9 100644 --- a/R/tune_grid.R +++ b/R/tune_grid.R @@ -436,16 +436,19 @@ pull_rset_attributes <- function(x) { set_workflow <- function(workflow, control) { if (control$save_workflow) { - if (!is.null(workflow$pre$actions$recipe)) { - w_size <- utils::object.size(workflow$pre$actions$recipe) - # make 5MB cutoff - if (w_size / 1024^2 > 5) { + if (!is.null(workflow)) { + w_size <- utils::object.size(workflow) + if (w_size / 1024^2 > control$workflow_size) { + rounded <- round(w_size / 1024^2, 1) msg <- - paste0( - "The workflow being saved contains a recipe, which is ", - format(w_size, units = "Mb", digits = 2), - " in memory. If this was not intentional, please set the control ", - "setting `save_workflow = FALSE`." + cli::format_inline( + "The workflow being saved is large ({rounded} MB). 
If this + was not intentional, please set the control setting + {.arg save_workflow} to be {.code FALSE} or change the threshold + for this warning (currently {control$workflow_size} MB) with the + {.arg workflow_size} argument.", + keep_whitespace = FALSE, + collapse = FALSE ) cols <- get_tune_colors() msg <- strwrap( diff --git a/man/control_bayes.Rd b/man/control_bayes.Rd index 9fdce80d..66f006a7 100644 --- a/man/control_bayes.Rd +++ b/man/control_bayes.Rd @@ -19,7 +19,8 @@ control_bayes( event_level = "first", parallel_over = NULL, backend_options = NULL, - allow_par = TRUE + allow_par = TRUE, + workflow_size = 100 ) } \arguments{ @@ -115,6 +116,10 @@ backend. Defaults to \code{NULL} for default backend options.} \item{allow_par}{A logical to allow parallel processing (if a parallel backend is registered).} + +\item{workflow_size}{A non-negative number (in MB) that is used as a +threshold for a warning regarding the size of the workflow. Only used when +\code{save_workflow = TRUE}.} } \description{ Control aspects of the Bayesian search process diff --git a/man/control_grid.Rd b/man/control_grid.Rd index c800898a..b32538a0 100644 --- a/man/control_grid.Rd +++ b/man/control_grid.Rd @@ -15,7 +15,8 @@ control_grid( save_workflow = FALSE, event_level = "first", parallel_over = NULL, - backend_options = NULL + backend_options = NULL, + workflow_size = 100 ) control_resamples( @@ -27,7 +28,8 @@ control_resamples( save_workflow = FALSE, event_level = "first", parallel_over = NULL, - backend_options = NULL + backend_options = NULL, + workflow_size = 100 ) new_backend_options(..., class = character()) @@ -88,6 +90,10 @@ reproducible between runs.} \item{backend_options}{An object of class \code{"tune_backend_options"} as created by \code{tune::new_backend_options()}, used to pass arguments to specific tuning backend. 
Defaults to \code{NULL} for default backend options.} + +\item{workflow_size}{A non-negative number (in MB) that is used as a +threshold for a warning regarding the size of the workflow. Only used when +\code{save_workflow = TRUE}.} } \description{ Control aspects of the grid search process diff --git a/tests/testthat/_snaps/control.md b/tests/testthat/_snaps/control.md new file mode 100644 index 00000000..e747970c --- /dev/null +++ b/tests/testthat/_snaps/control.md @@ -0,0 +1,10 @@ +# workflow size warning + + Code + set.seed(1) + warns <- fit_resamples(lm_wflow, resamples = vfold_cv(MTCARS), control = control_resamples(save_workflow = TRUE, workflow_size = 2)) + Message + i The workflow being saved is large (2.7 MB). If this was not intentional, + please set the control setting `save_workflow` to be `FALSE` or change the + threshold for this warning (currently 2 MB) with the `workflow_size` argument. + diff --git a/tests/testthat/test-control.R b/tests/testthat/test-control.R new file mode 100644 index 00000000..8f7f48f3 --- /dev/null +++ b/tests/testthat/test-control.R @@ -0,0 +1,40 @@ +test_that("workflow size warning", { + withr::local_options(width = 500) + + # A larger data set to trip the warning + MTCARS <- mtcars[rep(1:32, each = 1000), ] + + lm_rec <- recipe(mpg ~ ., data = MTCARS) + lm_wflow <- workflow(lm_rec, linear_reg() |> set_engine("lm", x = TRUE)) + # About 2.7MB when fit + + expect_silent({ + set.seed(1) + no_warning <- + lm_wflow |> + fit_resamples( + resamples = vfold_cv(MTCARS), + control = control_resamples(save_workflow = TRUE, workflow_size = Inf) + ) + }) + + expect_snapshot({ + set.seed(1) + warns <- + lm_wflow |> + fit_resamples( + resamples = vfold_cv(MTCARS), + control = control_resamples(save_workflow = TRUE, workflow_size = 2) + ) + }) + + expect_silent({ + set.seed(1) + no_save <- + lm_wflow |> + fit_resamples( + resamples = vfold_cv(MTCARS), + control = control_resamples(save_workflow = FALSE, workflow_size = 2) + ) + }) +}) diff 
--git a/tests/testthat/test-grid.R b/tests/testthat/test-grid.R index 6df7fe06..19f64495 100644 --- a/tests/testthat/test-grid.R +++ b/tests/testthat/test-grid.R @@ -786,20 +786,4 @@ test_that("retain extra attributes", { ) expect_null(attr(res, "workflow")) expect_true(inherits(attr(res2, "workflow"), "workflow")) - - wflow2 <- workflow() |> - add_recipe(recipes::recipe(mpg ~ ., mtcars[rep(1:32, 3000), ])) |> - add_model(helper_objects$svm_mod) - pset2 <- extract_parameter_set_dials(wflow2) - grid2 <- dials::grid_regular(pset2, levels = 3) - - expect_message( - tune_grid( - wflow2, - resamples = folds, - grid = grid2, - control = control_grid(save_workflow = TRUE) - ), - "being saved contains a recipe, which is" - ) }) diff --git a/tests/testthat/test-resample.R b/tests/testthat/test-resample.R index 231f0fe5..6ebd3e74 100644 --- a/tests/testthat/test-resample.R +++ b/tests/testthat/test-resample.R @@ -484,15 +484,6 @@ test_that("retain extra attributes", { ) expect_null(attr(res, "workflow")) expect_true(inherits(attr(res2, "workflow"), "workflow")) - - expect_snapshot( - fit_resamples( - lin_mod, - recipes::recipe(mpg ~ ., mtcars[rep(1:32, 3000), ]), - folds, - control = control_resamples(save_workflow = TRUE) - ) - ) })