Merge pull request #135 from ModelOriented/fix-multi-column-key-separator

mayer79 · web-flow · commit a0bf4836adf7 · 2025-05-06T22:31:45.000+02:00
Replace multi-column separator
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,10 @@
 
 - Changed Tidymodels example to probabilistic multiclass, see discussion in [#129](https://github.com/ModelOriented/hstats/issues/129).
 
+## Internals
+
+- Use "\r" instead of "_:_" as separator to paste values in multi-column grids (similar to `merge()`). [#133](https://github.com/ModelOriented/hstats/issues/133).
+
 # hstats 1.2.1
 
 ## Usability
diff --git a/R/pd_raw.R b/R/pd_raw.R
@@ -1,18 +1,18 @@
 #' Barebone Partial Dependence Function
-#' 
+#'
 #' Workhorse of the package, thus optimized for speed.
-#' 
+#'
 #' @noRd
 #' @keywords internal
-#' 
+#'
 #' @inheritParams partial_dep
 #' @param grid A vector, data.frame or matrix of grid values consistent with `v` and `X`.
 #' @param compress_X If `X` has a single non-`v` column: should duplicates be removed
 #'   and compensated via case weights? Default is `TRUE`.
-#' @param compress_grid Should duplicates in `grid` be removed and PDs mapped back to 
+#' @param compress_grid Should duplicates in `grid` be removed and PDs mapped back to
 #'   the original grid index? Default is `TRUE`.
-#' @returns 
-#'   A matrix of partial dependence values (one column per prediction dimension, 
+#' @returns
+#'   A matrix of partial dependence values (one column per prediction dimension,
 #'   one row per grid row, in the same order as `grid`).
 pd_raw <- function(
     object,
@@ -23,8 +23,7 @@ pd_raw <- function(
     w = NULL,
     compress_X = TRUE,
     compress_grid = TRUE,
-    ...
-  ) {
+    ...) {
   # Try different compressions
   if (compress_X && length(v) == ncol(X) - 1L) {
     # Removes duplicates in X[, not_v] and compensates via w
@@ -37,13 +36,19 @@ pd_raw <- function(
     cmp_grid <- .compress_grid(grid = grid)
     grid <- cmp_grid[["grid"]]
   }
-  
+
   # Now, the real work
   pred <- ice_raw(
-    object, v = v, X = X, grid = grid, pred_fun = pred_fun, pred_only = TRUE, ...
+    object,
+    v = v,
+    X = X,
+    grid = grid,
+    pred_fun = pred_fun,
+    pred_only = TRUE,
+    ...
   )
   pd <- wrowmean(pred, ngroups = NROW(grid), w = w)
-  
+
   # Map back to grid order
   if (compress_grid && !is.null(reindex <- cmp_grid[["reindex"]])) {
     return(pd[reindex, , drop = FALSE])
@@ -52,44 +57,49 @@ pd_raw <- function(
 }
 
 #' Barebone ICE Function
-#' 
+#'
 #' Part of the workhorse function `pd_raw()`, thus optimized for speed.
-#' 
+#'
 #' @noRd
 #' @keywords internal
-#' 
+#'
 #' @inheritParams pd_raw
 #' @param pred_only Logical flag determining the output mode. If `TRUE`, just
 #'   predictions. Otherwise, a list with two elements: `pred` (predictions)
-#'   and `grid_pred` (the corresponding grid values in the same mode as the input, 
+#'   and `grid_pred` (the corresponding grid values in the same mode as the input,
 #'   but replicated over `X`).
-#' @returns 
+#' @returns
 #'   Either a vector/matrix of predictions or a list with predictions and grid.
 ice_raw <- function(
-    object, v, X, grid, pred_fun = stats::predict, pred_only = TRUE, ...
-  ) {
+    object,
+    v,
+    X,
+    grid,
+    pred_fun = stats::predict,
+    pred_only = TRUE,
+    ...) {
   D1 <- length(v) == 1L
   n <- nrow(X)
   n_grid <- NROW(grid)
-  
+
   # Explode everything to n * n_grid rows
   X_pred <- rep_rows(X, rep.int(seq_len(n), n_grid))
   if (D1) {
     grid_pred <- rep(grid, each = n)
   } else {
     grid_pred <- rep_rows(grid, rep_each(n_grid, n))
   }
-  
+
   # Vary v
   if (D1 && is.data.frame(X_pred)) {
-    X_pred[[v]] <- grid_pred  #  [, v] <- slower if df
+    X_pred[[v]] <- grid_pred #  [, v] <- slower if df
   } else {
     X_pred[, v] <- grid_pred
   }
-  
+
   # Calculate matrix/vector of predictions
   pred <- prepare_pred(pred_fun(object, X_pred, ...))
-  
+
   if (pred_only) {
     return(pred)
   }
@@ -99,31 +109,31 @@ ice_raw <- function(
 # Helper functions used only within pd_raw()
 
 #' Compresses X
-#' 
+#'
 #' @description
-#' Internal function to remove duplicated rows in `X` based on columns not in `v`. 
-#' Compensation is done by summing corresponding case weights `w`. 
+#' Internal function to remove duplicated rows in `X` based on columns not in `v`.
+#' Compensation is done by summing corresponding case weights `w`.
 #' Currently implemented only for the case when there is a single non-`v` column in `X`.
-#' Can later be generalized to multiple columns via [paste()]. 
-#' 
+#' Can later be generalized to multiple columns via [paste()].
+#'
 #' Notes:
 #' - This function is important for interaction calculations.
 #' - The initial check for having a single non-`v` column is very cheap.
-#' 
+#'
 #' @noRd
 #' @keywords internal
-#' 
+#'
 #' @inheritParams pd_raw
 #' @returns A list with `X` and `w`, potentially compressed.
 .compress_X <- function(X, v, w = NULL) {
   not_v <- setdiff(colnames(X), v)
   if (length(not_v) != 1L) {
-    return(list(X = X, w = w))  # No optimization implemented for this case
+    return(list(X = X, w = w)) # No optimization implemented for this case
   }
   x_not_v <- if (is.data.frame(X)) X[[not_v]] else X[, not_v]
   X_dup <- duplicated(x_not_v)
   if (!any(X_dup)) {
-    return(list(X = X, w = w))  # No optimization done
+    return(list(X = X, w = w)) # No optimization done
   }
 
   # Compensate via w
@@ -135,22 +145,22 @@ ice_raw <- function(
     x_not_v <- match(x_not_v, x_not_v[!X_dup])
   }
   list(
-    X = X[!X_dup, , drop = FALSE], 
+    X = X[!X_dup, , drop = FALSE],
     w = c(rowsum(w, group = x_not_v, reorder = FALSE))
   )
 }
 
 #' Compresses Grid
-#' 
-#' Internal function used to remove duplicated grid rows. Re-indexing to original grid 
+#'
+#' Internal function used to remove duplicated grid rows. Re-indexing to original grid
 #' rows needs to be later, but this function provides the re-index vector to do so.
 #' Further note that checking for uniqueness can be costly for higher-dimensional grids.
-#' 
+#'
 #' @noRd
 #' @keywords internal
-#' 
+#'
 #' @inheritParams pd_raw
-#' @returns 
+#' @returns
 #'   A list with `grid` (possibly compressed) and the optional `reindex` vector
 #'   used to map compressed grid values back to the original grid rows. The original
 #'   grid equals the compressed grid at indices `reindex`.
@@ -161,14 +171,14 @@ ice_raw <- function(
     return(list(grid = grid, reindex = NULL))
   }
   out <- list(grid = ugrid)
-  if (NCOL(grid) >= 2L) {  # Non-vector case
-    grid <- do.call(paste, c(as.data.frame(grid), sep = "_:_"))
-    ugrid <- do.call(paste, c(as.data.frame(ugrid), sep = "_:_"))
+  if (NCOL(grid) >= 2L) { # Non-vector case (see merge())
+    # can we drop the as.data.frame()? I think yes
+    grid <- do.call(paste, c(as.data.frame(grid), sep = "\r"))
+    ugrid <- do.call(paste, c(as.data.frame(ugrid), sep = "\r"))
     if (anyDuplicated(ugrid)) {
-      stop("String '_:_' found in grid values at unlucky position.")
+      stop("Carriage return found in grid values at unlucky position.")
     }
   }
   out[["reindex"]] <- match(grid, ugrid)
   out
 }
-
diff --git a/tests/testthat/test_partial_dep.R b/tests/testthat/test_partial_dep.R
@@ -594,6 +594,6 @@ test_that(".compress_grid() leaves grid unchanged if unique", {
 })
 
 test_that(".compress_grid() can fail with very strange values", {
-  g <- data.frame(X = c("", "", "_:_"), Y = c("_:_", "_:_", ""))
+  g <- data.frame(X = c("", "", "\r"), Y = c("\r", "\r", ""))
   expect_error(.compress_grid(g))
 })