Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 50 additions & 29 deletions R/grid_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -318,50 +318,71 @@ compute_grid_info <- function(workflow, grid) {

res <- min_grid(extract_spec_parsnip(workflow), grid)

syms_pre <- rlang::syms(parameters_preprocessor$id)
syms_mod <- rlang::syms(parameters_model$id)

# ----------------------------------------------------------------------------
# Create an order of execution to train the preprocessor (if any). This will
# define a loop over any preprocessing tuning parameter combinations.
if (any_parameters_preprocessor) {
res$.iter_preprocessor <- seq_len(nrow(res))
pp_df <-
dplyr::distinct(res, !!!syms_pre) %>%
dplyr::arrange(!!!syms_pre) %>%
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will make the ordering of the preprocessors predictable. Previously, it would order them as-is. It's no big deal, but someone might wonder why deg_free = 10 is executed before deg_free=2.

This is why there are several sort() calls in the unit tests.

dplyr::mutate(
.iter_preprocessor = dplyr::row_number(),
.lab_pre = paste0("Preprocessor", 1:dplyr::n())
)
res <-
dplyr::full_join(res, pp_df, by = parameters_preprocessor$id) %>%
dplyr::arrange(.iter_preprocessor)
} else {
res$.iter_preprocessor <- 1L
res$.lab_pre <- "Preprocessor1"
}

# Make the label shown in the grid and in loggining
res$.msg_preprocessor <-
new_msgs_preprocessor(
seq_len(max(res$.iter_preprocessor)),
res$.iter_preprocessor,
max(res$.iter_preprocessor)
)

if (nrow(res) != nrow(grid) ||
(any_parameters_model && !any_parameters_preprocessor)) {
res$.iter_model <- seq_len(dplyr::n_distinct(res[parameters_model$id]))
} else {
res$.iter_model <- 1L
}

res$.iter_config <- list(list())
for (row in seq_len(nrow(res))) {
res$.iter_config[row] <- list(iter_config(res[row, ]))
}
# ----------------------------------------------------------------------------
# Now make a similar iterator across models. Conditioning on each unique
# preprocessing candidate set, make an iterator for the model candidate sets
# (if any)

res <-
res %>%
dplyr::group_nest(.iter_preprocessor, keep = TRUE) %>%
dplyr::mutate(
.iter_config = purrr::map(data, make_iter_config),
.model = purrr::map(data, ~ tibble::tibble(.iter_model = seq_len(nrow(.x)))),
.num_models = purrr::map_int(.model, nrow)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is needed for res$.msg_model (and is removed later)

) %>%
dplyr::select(-.iter_preprocessor) %>%
tidyr::unnest(cols = c(data, .model, .iter_config)) %>%
dplyr::select(-.lab_pre) %>%
dplyr::relocate(dplyr::starts_with(".iter"))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This (and another relocate below) should help with column ordering stability in case we refactor again.


res$.msg_model <-
new_msgs_model(i = res$.iter_model, n = max(res$.iter_model), res$.msg_preprocessor)
new_msgs_model(i = res$.iter_model,
n = res$.num_models,
res$.msg_preprocessor)

res
res %>%
dplyr::select(-.num_models) %>%
dplyr::relocate(dplyr::starts_with(".msg"))
}

iter_config <- function(res_row) {
submodels <- res_row$.submodels[[1]]
if (identical(submodels, list())) {
models <- res_row$.iter_model
} else {
models <- seq_len(length(submodels[[1]]) + 1)
}

paste0(
"Preprocessor",
res_row$.iter_preprocessor,
"_Model",
format_with_padding(models)
)
make_iter_config <- function(dat) {
# Compute labels for the models *within* each preprocessing loop.
num_submodels <- purrr::map_int(dat$.submodels, ~ length(unlist(.x)))
num_models <- sum(num_submodels + 1) # +1 for the model being trained
.mod_label <- paste0("Model", 1:num_models)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
.mod_label <- paste0("Model", 1:num_models)
.mod_label <- paste0("Model", seq_len(num_models))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be curious if we could just add a format_with_padding() here, but I think that actually needs to happen outside of the loop since a different preprocessor might be paired with models that enum up to the tens / hundreds place.

Copy link
Member Author

@topepo topepo Nov 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that this will work in make_iter_config(); all the models within the preprocessor combination are present at that time.

Copy link
Member Author

@topepo topepo Nov 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tidymodels/extratests#227 looks goo after the last commit.

.iter_config <- paste(dat$.lab_pre[1], .mod_label, sep = "_")
.iter_config <- vctrs::vec_chop(.iter_config, sizes = num_submodels + 1)
tibble::tibble(.iter_config = .iter_config)
}

# This generates a "dummy" grid_info object that has the same
Expand Down
34 changes: 25 additions & 9 deletions tests/testthat/test-grid_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ test_that("compute_grid_info - recipe only", {

expect_equal(res$.iter_preprocessor, 1:5)
expect_equal(res$.msg_preprocessor, paste0("preprocessor ", 1:5, "/5"))
expect_equal(res$deg_free, grid$deg_free)
expect_equal(sort(res$deg_free), sort(grid$deg_free))
expect_equal(res$.iter_model, rep(1, 5))
expect_equal(res$.iter_config, as.list(paste0("Preprocessor", 1:5, "_Model1")))
expect_equal(res$.msg_model, paste0("preprocessor ", 1:5, "/5, model 1/1"))
Expand All @@ -27,6 +27,7 @@ test_that("compute_grid_info - recipe only", {
ignore.order = TRUE
)
expect_equal(nrow(res), 5)
expect_equal(vctrs::vec_unique_count(res$.iter_config), nrow(grid))
})

test_that("compute_grid_info - model only (no submodels)", {
Expand Down Expand Up @@ -57,6 +58,7 @@ test_that("compute_grid_info - model only (no submodels)", {
ignore.order = TRUE
)
expect_equal(nrow(res), 5)
expect_equal(vctrs::vec_unique_count(res$.iter_config), nrow(grid))
})

test_that("compute_grid_info - model only (with submodels)", {
Expand Down Expand Up @@ -107,8 +109,8 @@ test_that("compute_grid_info - recipe and model (no submodels)", {

expect_equal(res$.iter_preprocessor, 1:5)
expect_equal(res$.msg_preprocessor, paste0("preprocessor ", 1:5, "/5"))
expect_equal(res$learn_rate, grid$learn_rate)
expect_equal(res$deg_free, grid$deg_free)
expect_equal(sort(res$learn_rate), sort(grid$learn_rate))
expect_equal(sort(res$deg_free), sort(grid$deg_free))
expect_equal(res$.iter_model, rep(1, 5))
expect_equal(res$.iter_config, as.list(paste0("Preprocessor", 1:5, "_Model1")))
expect_equal(res$.msg_model, paste0("preprocessor ", 1:5, "/5, model 1/1"))
Expand All @@ -120,6 +122,7 @@ test_that("compute_grid_info - recipe and model (no submodels)", {
ignore.order = TRUE
)
expect_equal(nrow(res), 5)
expect_equal(vctrs::vec_unique_count(res$.iter_config), nrow(grid))
})

test_that("compute_grid_info - recipe and model (with submodels)", {
Expand Down Expand Up @@ -169,6 +172,7 @@ test_that("compute_grid_info - recipe and model (with submodels)", {
)
expect_equal(nrow(res), 3)
})

test_that("compute_grid_info - recipe and model (with and without submodels)", {
library(workflows)
library(parsnip)
Expand All @@ -185,25 +189,30 @@ test_that("compute_grid_info - recipe and model (with and without submodels)", {
# use grid_regular to (partially) trigger submodel trick
set.seed(1)
param_set <- extract_parameter_set_dials(wflow)
grid <- bind_rows(grid_regular(param_set), grid_space_filling(param_set))
grid <-
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was going to add more tests for unbalanced grids but this one covers it well.

bind_rows(grid_regular(param_set), grid_space_filling(param_set)) %>%
arrange(deg_free, loss_reduction, trees)
res <- compute_grid_info(wflow, grid)

expect_equal(length(unique(res$.iter_preprocessor)), 5)
expect_equal(
unique(res$.msg_preprocessor),
paste0("preprocessor ", 1:5, "/5")
)
expect_equal(res$trees, c(rep(max(grid$trees), 10), 1))
expect_equal(sort(res$trees), sort(c(rep(max(grid$trees), 10), 1)))
expect_equal(unique(res$.iter_model), 1:3)
expect_equal(
res$.iter_config[1:3],
res$.iter_config[res$.iter_preprocessor == 1],
list(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your new enumeration is correct (and fixes the case where I had broken previously), but it does look like we're missing a format_with_padding still. When I run these tests with CRAN compute_grid_info(), I see:

── Failure (test-grid_helpers.R:204:3): compute_grid_info - recipe and model (with and without submodels) ──
res$.iter_config[res$.iter_preprocessor == 1] (`actual`) not equal to list(...) (`expected`).

    actual[[1]]             | expected[[1]]             
[1] "Preprocessor1_Model01" - "Preprocessor1_Model1" [1]
[2] "Preprocessor1_Model02" - "Preprocessor1_Model2" [2]
[3] "Preprocessor1_Model03" - "Preprocessor1_Model3" [3]
[4] "Preprocessor1_Model04" - "Preprocessor1_Model4" [4]

`actual[[2]]`:   "Preprocessor1_Model05" "Preprocessor1_Model06" "Preprocessor1_Model07"
`expected[[2]]`: "Preprocessor1_Model5"  "Preprocessor1_Model6"  "Preprocessor1_Model7" 

`actual[[3]]`:   "Preprocessor1_Model08" "Preprocessor1_Model09" "Preprocessor1_Model10"
`expected[[3]]`: "Preprocessor1_Model8"  "Preprocessor1_Model9"  "Preprocessor1_Model10"

c("Preprocessor1_Model1", "Preprocessor1_Model2", "Preprocessor1_Model3", "Preprocessor1_Model4"),
c("Preprocessor2_Model1", "Preprocessor2_Model2", "Preprocessor2_Model3"),
c("Preprocessor3_Model1", "Preprocessor3_Model2", "Preprocessor3_Model3")
c("Preprocessor1_Model5", "Preprocessor1_Model6", "Preprocessor1_Model7"),
c("Preprocessor1_Model8", "Preprocessor1_Model9", "Preprocessor1_Model10")
)
)
expect_equal(res$.msg_model[1:3], paste0("preprocessor ", 1:3, "/5, model 1/3"))
expect_equal(
res$.msg_model[res$.iter_preprocessor == 1],
paste0("preprocessor 1/5, model ", 1:3, "/3")
)
expect_equal(
res$.submodels[1:3],
list(
Expand All @@ -212,6 +221,12 @@ test_that("compute_grid_info - recipe and model (with and without submodels)", {
list(trees = c(1L, 1000L))
)
)
expect_equal(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't be too paranoid about this.

res %>%
mutate(num_models = purrr::map_int(.iter_config, length)) %>%
summarize(n = sum(num_models), .by = c(deg_free)),
grid %>% count(deg_free)
)
expect_named(
res,
c(".iter_preprocessor", ".msg_preprocessor", "deg_free", "trees",
Expand Down Expand Up @@ -325,4 +340,5 @@ test_that("compute_grid_info - recipe and model (no submodels but has inner grid
ignore.order = TRUE
)
expect_equal(nrow(res), 9)
expect_equal(vctrs::vec_unique_count(res$.iter_config), nrow(grid))
})
Loading