chore: merged multirow-support into main

kapsner · kapsner · commit bb470090176d · 2023-04-05T21:55:38.000+02:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: survex
 Title: Explainable Machine Learning in Survival Analysis
-Version: 1.0.0.9000
+Version: 1.0.0.9001
 Authors@R: 
     c(
         person("Mikołaj", "Spytek", email = "mikolajspytek@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7111-2286")),
@@ -27,7 +27,8 @@ Imports:
     treeshap,
     pec,
     survival,
-    patchwork
+    patchwork,
+    data.table
 Suggests:
     censored,
     covr,
diff --git a/NAMESPACE b/NAMESPACE
@@ -68,6 +68,7 @@ export(survival_to_cumulative_hazard)
 export(theme_default_survex)
 export(theme_vertical_default_survex)
 export(transform_to_stepfunction)
+import(data.table)
 import(ggplot2)
 import(patchwork)
 import(survival)
diff --git a/R/surv_shap.R b/R/surv_shap.R
@@ -24,16 +24,16 @@ surv_shap <- function(explainer,
                       B = 25,
                       exact = FALSE
 ) {
-    # if providing y_true, it must be exactly one single new observation,
-    # otherwise the indexing of y_true doesn't make any sense
-    stopifnot(
-        ifelse(!is.null(y_true), nrow(new_observation) == 1, TRUE),
-        nrow(new_observation) == 1 # produces nonesense, if more than on new observation
-    )
+    # make this code work for multiple observations
+    stopifnot(ifelse(!is.null(y_true),
+                     ifelse(is.matrix(y_true),
+                            nrow(new_observation) == nrow(y_true),
+                            is.null(dim(y_true)) && length(y_true) == 2L),
+                     TRUE))
 
     test_explainer(explainer, "surv_shap", has_data = TRUE, has_y = TRUE, has_survival = TRUE)
 
-    # make that this also works for 1-row matrix
+    # make this code also work for 1-row matrix
     col_index <- which(colnames(new_observation) %in% colnames(explainer$data))
     if (is.matrix(new_observation) && nrow(new_observation) == 1) {
         new_observation <- as.matrix(t(new_observation[, col_index]))
@@ -45,8 +45,11 @@ surv_shap <- function(explainer,
 
     if (!is.null(y_true)) {
         if (is.matrix(y_true)) {
-            y_true_ind <- y_true[1, 2]
-            y_true_time <- y_true[1, 1]
+            # above, we have already checked that nrows of observations are
+            # identical to nrows of y_true; thus we do not need to index
+            # the first row here
+            y_true_ind <- y_true[, 2]
+            y_true_time <- y_true[, 1]
         } else {
             y_true_ind <- y_true[2]
             y_true_time <- y_true[1]
@@ -66,6 +69,7 @@ surv_shap <- function(explainer,
 
     res <- list()
     res$eval_times <- explainer$times
+    # to display final object correctly, when is.matrix(new_observation) == TRUE
     res$variable_values <- as.data.frame(new_observation)
 
     res$result <- switch(calculation_method,
@@ -175,15 +179,37 @@ aggregate_surv_shap <- function(survshap, method) {
 use_kernelshap <- function(explainer, new_observation, ...){
+    # Kernel SHAP per row of `new_observation` (background: `explainer$data`),
+    # combined via aggregate_shap_multiple_observations(); returns a
+    # data.frame with rows "t=<time>" and one column per feature.
 
     predfun <- function(model, newdata){
-        explainer$predict_survival_function(model, newdata, times = explainer$times)
+        explainer$predict_survival_function(
+            model, newdata, times = explainer$times
+        )
     }
 
-    tmp_res <- kernelshap::kernelshap(explainer$model, new_observation, bg_X = explainer$data,
-               pred_fun = predfun, verbose = FALSE)
+    tmp_res_list <- sapply(
+        X = as.character(seq_len(nrow(new_observation))),
+        FUN = function(i) {
+            tmp_res <- kernelshap::kernelshap(
+                object = explainer$model,
+                # drop = FALSE: keep row i as a 1-row matrix/data.frame, not a vector
+                X = new_observation[as.integer(i), , drop = FALSE],
+                bg_X = explainer$data,
+                pred_fun = predfun,
+                verbose = FALSE
+            )
+            tmp_shap_values <- data.frame(t(sapply(tmp_res$S, cbind)))
+            colnames(tmp_shap_values) <- colnames(tmp_res$X)
+            rownames(tmp_shap_values) <- paste("t=", explainer$times, sep = "")
+            data.table::as.data.table(tmp_shap_values, keep.rownames = TRUE)
+        },
+        USE.NAMES = TRUE, simplify = FALSE
+    )
+    shap_values <- aggregate_shap_multiple_observations(
+        shap_res_list = tmp_res_list, feature_names = colnames(new_observation)
+    )
 
-    shap_values <- data.frame(t(sapply(tmp_res$S, cbind)))
-    colnames(shap_values) <- colnames(tmp_res$X)
-    rownames(shap_values) <- paste("t=", explainer$times, sep = "")
     return(shap_values)
 }
 
@@ -200,21 +226,63 @@ use_treeshap <- function(explainer, new_observation, ...){
         data = explainer$data
     )
 
-    tmp_res <- do.call(
-        rbind,
-        lapply(
-            tmp_unified,
-            function(m) {
-              treeshap::treeshap(
-                unified_model = m,
-                x = new_observation
-              )$shaps
-            }
-        )
+    tmp_res_list <- sapply(
+        X = as.character(seq_len(nrow(new_observation))),
+        FUN = function(i) {
+            tmp_res <- do.call(
+                rbind,
+                lapply(
+                    tmp_unified,
+                    function(m) {
+                    treeshap::treeshap(
+                        unified_model = m,
+                        x = new_observation
+                    )$shaps
+                    }
+                )
+            )
+
+            tmp_shap_values <- data.frame(tmp_res)
+            colnames(tmp_shap_values) <- colnames(tmp_res)
+            rownames(tmp_shap_values) <- paste("t=", explainer$times, sep = "")
+            data.table::as.data.table(tmp_shap_values, keep.rownames = TRUE)
+        },
+        USE.NAMES = TRUE,
+        simplify = FALSE
+    )
+
+    shap_values <- aggregate_shap_multiple_observations(
+        shap_res_list = tmp_res_list,
+        feature_names = colnames(new_observation)
     )
 
-    shap_values <- data.frame(tmp_res)
-    colnames(shap_values) <- colnames(tmp_res)
-    rownames(shap_values) <- paste("t=", explainer$times, sep = "")
+    return(shap_values)
+}
+
+
+# Combine per-observation SHAP tables into one data.frame.
+# `shap_res_list`: non-empty list of data.tables, each with a row-label
+#   column "rn" and one column per feature.
+# `feature_names`: columns to average when several tables are given.
+# Returns a data.frame (row names = "rn" labels); with more than one
+# table, each value is the arithmetic mean across tables per "rn" label.
+aggregate_shap_multiple_observations <- function(shap_res_list, feature_names) {
+    stopifnot(length(shap_res_list) >= 1L)
+    if (length(shap_res_list) > 1) {
+        full_survshap_results <- data.table::rbindlist(
+            l = shap_res_list, use.names = TRUE, idcol = TRUE
+        )
+        tmp_res <- full_survshap_results[
+            , lapply(.SD, mean), by = "rn", .SDcols = feature_names
+        ]
+    } else {
+        # no aggregation required
+        tmp_res <- shap_res_list[[1]]
+    }
+    shap_values <- tmp_res[, .SD, .SDcols = setdiff(colnames(tmp_res), "rn")]
+    # as.data.frame() keeps the feature names verbatim, whereas
+    # data.frame() would rewrite non-syntactic ones via check.names
+    shap_values <- as.data.frame(shap_values)
+    rownames(shap_values) <- tmp_res$rn
+    return(shap_values)
+}
diff --git a/R/zzz.R b/R/zzz.R
@@ -0,0 +1,2 @@
+#' @import data.table
+NULL
diff --git a/tests/testthat/test-predict_parts.R b/tests/testthat/test-predict_parts.R
@@ -62,6 +62,26 @@ test_that("survshap explanations work", {
 
 })
 
+test_that("global survshap explanations with kernelshap work for ranger", {
+    # small random survival forest on `veteran`, wrapped in an explainer
+    vet <- survival::veteran
+    forest <- ranger::ranger(
+        survival::Surv(time, status) ~ ., data = vet, num.trees = 100,
+        mtry = 3, max.depth = 5, respect.unordered.factors = TRUE
+    )
+    forest_exp <- explain(forest, data = vet[, -c(3, 4)], y = Surv(vet$time, vet$status), verbose = FALSE)
+    # explain the first 40 rows jointly; predictors exclude the outcome columns
+    first_rows <- 1:40
+    shap_parts <- predict_parts(
+        forest_exp, vet[first_rows, !colnames(vet) %in% c("time", "status")],
+        y_true = Surv(vet$time[first_rows], vet$status[first_rows]),
+        aggregation_method = "mean_absolute", calculation_method = "kernelshap"
+    )
+    expect_s3_class(shap_parts, c("predict_parts_survival", "surv_shap"))
+    expect_equal(nrow(shap_parts$result), length(forest_exp$times))
+    expect_true(all(colnames(shap_parts$result) == colnames(forest_exp$data)))
+})
+
 
 test_that("survlime explanations work", {