Merge pull request #86 from ModelOriented/fixes

krzyzinskim · web-flow · commit 0f6bf31df2ed · 2023-10-22T14:07:13.000+02:00
hot fixes for predict parts functions
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: survex
 Title: Explainable Machine Learning in Survival Analysis
-Version: 1.1.3.9000
+Version: 1.1.3.9002
 Authors@R: 
     c(
         person("Mikołaj", "Spytek", email = "mikolajspytek@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7111-2286")),
diff --git a/R/plot_surv_shap.R b/R/plot_surv_shap.R
@@ -293,7 +293,7 @@ plot_shap_global_beeswarm <- function(x,
                                       max_vars = 7,
                                       colors = NULL) {
     df <- as.data.frame(do.call(rbind, x$aggregate))
-    cols <- names(sort(colMeans(abs(df))))[1:min(max_vars, length(df))]
+    cols <- names(sort(colMeans(abs(df)), decreasing = TRUE))[1:min(max_vars, length(df))]
     df <- df[, cols]
     df <- stack(df)
     colnames(df) <- c("shap_value", "variable")
diff --git a/R/predict_parts.R b/R/predict_parts.R
@@ -5,9 +5,9 @@
 #' @param explainer an explainer object - model preprocessed by the `explain()` function
 #' @param new_observation a new observation for which prediction need to be explained
 #' @param ... other parameters which are passed to `iBreakDown::break_down` if `output_type=="risk"`, or if `output_type=="survival"` to `surv_shap()` or `surv_lime()` functions depending on the selected type
-#' @param N the maximum number of observations used for calculation of attributions. If `NULL` (default) all observations will be used.
+#' @param N the number of observations used for calculation of attributions. If `NULL` (default) all explainer data will be used for SurvSHAP(t) and 100 neighbours for SurvLIME.
 #' @param type if `output_type == "survival"` must be either `"survshap"` or `"survlime"`, otherwise refer to the `DALEX::predict_parts`
-#' @param output_type either `"survival"` or `"risk"` the type of survival model output that should be considered for explanations. If `"survival"` the explanations are based on the survival function. Otherwise the scalar risk predictions are used by the `DALEX::predict_parts` function.
+#' @param output_type either `"survival"`, `"chf"` or `"risk"` - the type of survival model output that should be considered for explanations. If `"survival"` the explanations are based on the survival function. If `"chf"` the explanations are based on the cumulative hazard function. Otherwise the scalar risk predictions are used by the `DALEX::predict_parts` function.
 #' @param explanation_label a label that can overwrite explainer label (useful for multiple explanations for the same explainer/model)
 #'
 #' @return An object of class `"predict_parts_survival"` and additional classes depending on the type of explanations. It is a list with the element `result` containing the results of the calculation.
@@ -27,7 +27,6 @@
 #'     * `categorical_variables` -  character vector, names of variables that should be treated as categories (factors are included by default)
 #'     * `k` -  a small positive number > 1, added to chf before taking log, so that weigths aren't negative
 #' * for `survshap`
-#'     * `timestamps` -  a numeric vector, time points at which the survival function will be evaluated
 #'     * `y_true` -  a two element numeric vector or matrix of one row and two columns, the first element being the true observed time and the second the status of the observation, used for plotting
 #'     * `calculation_method` -  a character, either `"kernelshap"` for use of `kernelshap` library (providing faster Kernel SHAP with refinements) or `"exact_kernel"` for exact Kernel SHAP estimation
 #'     * `aggregation_method` -  a character, either `"mean_absolute"` or `"integral"`, `"max_absolute"`, `"sum_of_squares"`
@@ -75,8 +74,8 @@ predict_parts.surv_explainer <- function(explainer, new_observation, ..., N = NU
         ))
     } else {
         res <- switch(type,
-            "survshap" = surv_shap(explainer, new_observation, output_type, ...),
-            "survlime" = surv_lime(explainer, new_observation, ...),
+            "survshap" = surv_shap(explainer, new_observation, output_type, ..., N = N),
+            "survlime" = surv_lime(explainer, new_observation, ..., N = N),
             stop("Only `survshap` and `survlime` methods are implemented for now")
         )
     }
diff --git a/R/surv_lime.R b/R/surv_lime.R
@@ -33,6 +33,7 @@ surv_lime <- function(explainer, new_observation,
     test_explainer(explainer, "surv_lime", has_data = TRUE, has_y = TRUE, has_chf = TRUE)
     new_observation <- new_observation[, colnames(new_observation) %in% colnames(explainer$data)]
     if (ncol(explainer$data) != ncol(new_observation)) stop("New observation and data have different number of columns (variables)")
+    if (is.null(N)) N <- 100
 
     predicted_sf <- explainer$predict_survival_function(explainer$model, new_observation, explainer$times)
 
@@ -57,12 +58,11 @@ surv_lime <- function(explainer, new_observation,
 
     distances <- apply(scaled_data, 1, dist, scaled_data[1, ])
 
-    if (is.null(kernel_width)) kernel_width <- sqrt(ncol(scaled_data) * 0.75)
+    if (is.null(kernel_width)) kernel_width <- sqrt(ncol(scaled_data)) * 0.75
 
     weights <- sqrt(exp(-(distances^2) / (kernel_width^2)))
     na_est <- survival::basehaz(survival::coxph(explainer$y ~ 1))
 
-
     model_chfs <- explainer$predict_cumulative_hazard_function(explainer$model, neighbourhood$inverse, na_est$time) + k
     log_chfs <- log(model_chfs)
     weights_v <- model_chfs / log_chfs
@@ -175,10 +175,13 @@ generate_neighbourhood <- function(data_org,
     data <- data[, colnames(data_row)]
 
     if (length(categorical_variables) > 0) {
+        inverse_as_factor <- inverse
+        inverse_as_factor[additional_categorical_variables] <-
+            lapply(inverse_as_factor[additional_categorical_variables], as.factor)
         expr <- paste0("~", paste(categorical_variables, collapse = "+"))
-        categorical_matrix <- model.matrix(as.formula(expr), data = inverse)[, -1]
+        categorical_matrix <- model.matrix(as.formula(expr), data = inverse_as_factor)[, -1]
         inverse_ohe <- cbind(inverse, categorical_matrix)
-        inverse_ohe[, factor_variables] <- NULL
+        inverse_ohe[, categorical_variables] <- NULL
     } else {
         inverse_ohe <- inverse
     }
diff --git a/R/surv_shap.R b/R/surv_shap.R
@@ -4,6 +4,7 @@
 #' @param new_observation new observations for which predictions need to be explained
 #' @param output_type a character, either `"survival"` or `"chf"`. Determines which type of prediction should be used for explanations.
 #' @param ... additional parameters, passed to internal functions
+#' @param N a positive integer, number of observations used as the background data
 #' @param y_true a two element numeric vector or matrix of one row and two columns, the first element being the true observed time and the second the status of the observation, used for plotting
 #' @param calculation_method a character, either `"kernelshap"` for use of `kernelshap` library (providing faster Kernel SHAP with refinements), `"exact_kernel"` for exact Kernel SHAP estimation, or `"treeshap"` for use of `treeshap` library (efficient implementation to compute SHAP values for tree-based models).
 #' @param aggregation_method a character, either `"integral"`, `"integral_absolute"`, `"mean_absolute"`, `"max_absolute"`, or `"sum_of_squares"`
@@ -18,6 +19,7 @@ surv_shap <- function(explainer,
                       new_observation,
                       output_type,
                       ...,
+                      N = NULL,
                       y_true = NULL,
                       calculation_method = c("kernelshap", "exact_kernel", "treeshap"),
                       aggregation_method = c("integral", "mean_absolute", "max_absolute", "sum_of_squares")
@@ -62,7 +64,6 @@ surv_shap <- function(explainer,
     }
 
     test_explainer(explainer, "surv_shap", has_data = TRUE, has_y = TRUE, has_survival = TRUE)
-
     # make this code also work for 1-row matrix
     col_index <- which(colnames(new_observation) %in% colnames(explainer$data))
     if (is.matrix(new_observation) && nrow(new_observation) == 1) {
@@ -128,11 +129,12 @@ surv_shap <- function(explainer,
     return(res)
 }
 
-use_exact_shap <- function(explainer, new_observation, output_type, ...) {
+
+use_exact_shap <- function(explainer, new_observation, output_type, N, ...) {
     shap_values <- sapply(
         X = as.character(seq_len(nrow(new_observation))),
         FUN = function(i) {
-            as.data.frame(shap_kernel(explainer, new_observation[as.integer(i), ], output_type, ...))
+            as.data.frame(shap_kernel(explainer, new_observation[as.integer(i), ], output_type, N, ...))
         },
         USE.NAMES = TRUE,
         simplify = FALSE
@@ -142,24 +144,24 @@ use_exact_shap <- function(explainer, new_observation, output_type, ...) {
 }
 
 
-shap_kernel <- function(explainer, new_observation, output_type, ...) {
+shap_kernel <- function(explainer, new_observation, output_type, N, ...) {
     timestamps <- explainer$times
     p <- ncol(explainer$data)
-
+    if (is.null(N)) N <- nrow(explainer$data)
+    background_data <- explainer$data[sample(1:nrow(explainer$data), N),]
 
     target_sf <- predict(explainer, new_observation, times = timestamps, output_type = output_type)
-    sfs <- predict(explainer, explainer$data, times = timestamps, output_type = output_type)
+    sfs <- predict(explainer, background_data, times = timestamps, output_type = output_type)
     baseline_sf <- apply(sfs, 2, mean)
 
-
     permutations <- expand.grid(rep(list(0:1), p))
     kernel_weights <- generate_shap_kernel_weights(permutations, p)
 
     shap_values <- calculate_shap_values(
         explainer,
         explainer$model,
         baseline_sf,
-        as.data.frame(explainer$data),
+        as.data.frame(background_data),
         permutations, kernel_weights,
         as.data.frame(new_observation),
         timestamps
@@ -227,7 +229,7 @@ aggregate_surv_shap <- function(survshap, times, method, ...) {
 }
 
 
-use_kernelshap <- function(explainer, new_observation, output_type, observation_aggregation_method,  ...) {
+use_kernelshap <- function(explainer, new_observation, output_type, N, ...) {
     predfun <- function(model, newdata) {
 
         if (output_type == "survival"){
@@ -257,6 +259,9 @@ use_kernelshap <- function(explainer, new_observation, output_type, observation_
         explainer_data <- data.frame(explainer_data)
     }
 
+    if (is.null(N)) N <- nrow(explainer$data)
+    background_data <- explainer$data[sample(1:nrow(explainer$data), N),]
+
     shap_values <- sapply(
         X = as.character(seq_len(nrow(new_observation))),
         FUN = function(i) {
diff --git a/README.md b/README.md
@@ -102,6 +102,9 @@ If you use `survex`, please cite [our preprint](https://arxiv.org/abs/2308.16113
 - W. Chen, B. Zhou, C. Y. Jeon, F. Xie, Y-C. Lin, R. K. Butler, Y. Zhou, T. Q. Luong, E. Lustigova, J. R. Pisegna, B. U. Wu. [Machine learning versus regression for prediction of sporadic pancreatic cancer](https://doi.org/10.1016/j.pan.2023.04.009). *Pancreatology*, 2023.
 - M. Nachit, Y. Horsmans, R. M. Summers, I. A. Leclercq, P. J. Pickhardt. [AI-based CT Body Composition Identifies Myosteatosis as Key Mortality Predictor in Asymptomatic Adults](https://doi.org/10.1148/radiol.222008). *Radiology*, 2023.
 - R. Passera, S. Zompi, J. Gill, A. Busca. [Explainable Machine Learning (XAI) for Survival in Bone Marrow Transplantation Trials: A Technical Report](https://doi.org/10.3390/biomedinformatics3030048). *BioMedInformatics*, 2023.
+- P. Donizy, M. Spytek, M. Krzyziński, K. Kotowski, A. Markiewicz, B. Romanowska-Dixon, P. Biecek, M. P. Hoang. [Ki67 is a better marker than PRAME in risk stratification of BAP1-positive and BAP1-loss uveal melanomas](https://doi.org/10.1136/bjo-2023-323816). *British Journal of Ophthalmology*, 2023.
+- X. Qi, Y. Ge, A. Yang, Y. Liu, Q. Wang & G. Wu. [Potential value of mitochondrial regulatory pathways in the clinical application of clear cell renal cell carcinoma: a machine learning-based study](https://doi.org/10.1007/s00432-023-05393-8). *Journal of Cancer Research and Clinical Oncology*, 2023.
+
 - Share it with us!
 
 ## Related work
diff --git a/man/predict_parts.surv_explainer.Rd b/man/predict_parts.surv_explainer.Rd
diff --git a/man/surv_shap.Rd b/man/surv_shap.Rd