leekgroup · asondhi · Oct 29, 2021
diff --git a/.DS_Store b/.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -6,18 +6,12 @@ Description: postpi corrects statistical inferences performed directly on predic
 License: GPL-2
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.1.1
-Imports:
-    gam,
-    caret,
-    broom,
-    matrixStats,
-    dplyr
-Suggests: 
-    knitr,
-    rmarkdown,
-    testthat
+RoxygenNote: 7.1.2
+Imports: gam, caret, broom, matrixStats, dplyr, survival, glmnet, mice
+Suggests: knitr, rmarkdown, testthat
 VignetteBuilder: knitr
-Depends: 
-    magrittr,
-    R (>= 2.10)
+Depends: R (>= 3.5.0), magrittr
+NeedsCompilation: no
+Packaged: 2021-10-28 20:20:09 UTC; arjun.sondhi
+Author: Siruo Wang [cre, aut]
+Maintainer: Siruo Wang <[email protected]>
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,9 +1,16 @@
 # Generated by roxygen2: do not edit by hand
 
+export(bootstrap_prob)
 export(postpi)
+export(postpi_covariate_calibrate)
+export(postpi_covariate_impute)
 export(postpi_der)
 export(postpi_relate)
 import(broom)
 import(caret)
 import(gam)
+import(glmnet)
 import(matrixStats)
+import(mice)
+import(survival)
+import(tidyverse)
diff --git a/R/bootstrap_prob.R b/R/bootstrap_prob.R
@@ -0,0 +1,76 @@
+#'
+#' bootstrap_prob function fits a Cox proportional hazards analysis model given predicted probabilities for a binary covariate.
+#'
+#' This function takes in an analysis dataset containing a column of predicted probabilities.
+#' The specified Cox model is fit over B nonparametric bootstrap iterations.
+#' For each bootstrapped dataset, a binary variable vector is drawn from the predicted probabilities.
+#'
+#' @import tidyverse
+#' @import survival
+#'
+#' @param analysis_data dataframe that contains the predicted probability variable, and the other relevant variables for analysis
+#' @param inf_formula formula describing Cox model of interest; passed to \code{coxph()}, should contain \code{pred_var_col} on the right side
+#' @param pred_prob_var character string; column name of the predicted probability variable in \code{analysis_data} for the binary variable of interest
+#' @param pred_var_col character string; column name assigned to the binary variable of interest used in analysis
+#' @param B integer; number of bootstrap iterations, defaults to 500
+#' @param alpha numeric; significance level for inference, defaults to 0.05
+#'
+#' @return tibble of inferential results; point estimates and 100(1 - alpha)% confidence intervals on the hazard ratio scale
+#'
+#'
+#' @export
+#'
+#' @examples
+#'
+#' data(EHRdata, package="postpi")
+#' library(tidyverse)
+#' library(survival)
+#'
+#' labeled_data <- EHRdata[[1]]
+#' analysis_data <- EHRdata[[2]]
+#' postpi_calib <- postpi_covariate_calibrate(calib_data = labeled_data,
+#'                                            true_var = "met_true",
+#'                                            pred_prob_var = "met_prob",
+#'                                            covariates = c("age_dx", "sex"),
+#'                                            outcome = "time * status")
+#'
+#' xtrain_analysis <- analysis_data %>%
+#'  select(met_prob, age_dx, sex, time, status) %>%
+#'  mutate(`time:status` = time * status) %>%
+#'  as.matrix
+#'
+#' analysis_data[["met_calib_prob"]] <- as.numeric(predict(postpi_calib$model,
+#'                                                         newx = xtrain_analysis,
+#'                                                         type = "response",
+#'                                                         s = postpi_calib$lambda,
+#'                                                         alpha = 0))
+#' inf_formula <- as.formula("Surv(time, status) ~ met_calib + age_dx + sex")
+#' res_calib <- bootstrap_prob(analysis_data, inf_formula, "met_calib_prob", "met_calib")
+#'
+
+bootstrap_prob <- function(analysis_data,
+                           inf_formula,
+                           pred_prob_col,
+                           pred_var_col,
+                           B = 500,
+                           alpha = 0.05) {
+
+  boot_dist <- NULL
+  for (i in 1:B) {
+    fit_ind <- sample(1:nrow(analysis_data), size = nrow(analysis_data), replace = TRUE)
+    data_fit <- analysis_data[fit_ind,]
+
+    data_fit[[pred_var_col]] <- rbinom(nrow(data_fit), 1, data_fit[[pred_prob_col]])
+
+    fit_i <- coxph(inf_formula, data = data_fit, control = coxph.control(iter.max = 100))
+    boot_dist <- rbind(boot_dist, coef(fit_i))
+  }
+
+  out <- data.frame(variable = colnames(boot_dist)) %>%
+    mutate(estimate = apply(boot_dist, 2, function(x) exp(median(x))),
+           ci_lower = apply(boot_dist, 2, function(x) exp(quantile(x, alpha / 2))),
+           ci_upper = apply(boot_dist, 2, function(x) exp(quantile(x, 1 - (alpha / 2))))) %>%
+    as_tibble()
+
+  return(out)
+}
diff --git a/R/data.R b/R/data.R
@@ -31,3 +31,24 @@
 #' @source \url{https://academic.oup.com/nar/article/46/9/e54/4920847}
 "TISSUEdata"
 
+
+#' Simulated EHR-derived data for covariate post-pi in survival analysis
+#'
+#' A dataset containing data for 2,000 patients with fully observed age at diagnosis,
+#' sex, survival time, and event indicator. Also contains predicted probability of
+#' metastatic disease, with true metastatic status available for first 1,000 patients.
+#'
+#' @format A list of two data frames (labeled and unlabeled) each with 1,000 rows and 6 or 7 variables:
+#' \describe{
+#'   \item{patientid}{Patient ID, integer}
+#'   \item{age_dx}{Age at diagnosis, numeric}
+#'   \item{sex}{Sex, binary}
+#'   \item{met_true}{True metastatic status, available for labeled data only}
+#'   \item{met_prob}{ML-predicted probability of metastatic status}
+#'   \item{time}{Observed event time, numeric}
+#'   \item{status}{Event indicator, 1 if event observed, 0 if censored}
+#' }
+#' 
+"EHRdata"
+
+
diff --git a/R/postpi_covariate_calib.R b/R/postpi_covariate_calib.R
@@ -0,0 +1,63 @@
+#'
+#' postpi_covariate_calibrate function fits a model relating a true binary variable to its predicted probability, and other variables to be used in a downstream analysis.
+#'
+#' This function takes in a labeled dataset for calibration containing:
+#' (1) true measurements of a binary variable that is being predicted using ML
+#' (2) predicted probabilities for this binary variable to be calibrated
+#' (3) additional covariates to be used in a downstream analytic model
+#' (4) the outcome of interest in the downstream analytic model
+#' These variable names are specified as character or character vector inputs to the function, which are then combined into a model specification using standard R forumla syntax.
+#' Note that in the case of a survival outcome, we recommend accounting for both the observed time and censoring indictor by specifying \code{time * status}.
+#'
+#' The calibration model is fit as an l2-penalized logistic regression model, using the glmnet package.
+#' The penalty parameter is set using cross-validation to minimize logistic deviance.
+#'
+#' @import tidyverse
+#' @import survival
+#' @import glmnet
+#'
+#' @param calib_data dataframe that contains the below variables needed for post-prediction inference calibration
+#' @param true_var character string; column name of the true binary variable of interest in \code{calib_data}
+#' @param pred_prob_var character string; column name of the predicted probability variable for the binary variable of interest in \\code{calib_data}
+#' @param covariates character vector; column names of the covariates for the analysis of interest in \code{calib_data}
+#' @param outcome character string; column name (or names in formula syntax) of the outcome variable for the analysis of interest in \code{calib_data}
+#'
+#' @return list containing glmnet object, optimal lambda parameter, and calibration formula.
+#'
+#'
+#' @export
+#'
+#' @examples
+#'
+#' data(EHRdata, package="postpi")
+#' library(tidyverse)
+#' library(survival)
+#'
+#' labeled_data <- EHRdata[[1]]
+#' postpi_calib <- postpi_covariate_calibrate(calib_data = labeled_data,
+#'                                            true_var = "met_true",
+#'                                            pred_prob_var = "met_prob",
+#'                                            covariates = c("age_dx", "sex"),
+#'                                            outcome = "time * status")
+#'
+
+postpi_covariate_calibrate <- function(calib_data,
+                                       true_var,
+                                       pred_prob_var,
+                                       covariates,
+                                       outcome) {
+
+  calib_features <- c(pred_prob_var, covariates, outcome)
+  calib_formula <- as.formula(paste0(true_var, " ~ ", paste(calib_features, collapse = " + ")))
+
+  xtrain <- model.matrix(calib_formula, data = calib_data)[,-1]
+  calib_m <- glmnet(x = xtrain, y = calib_data[[true_var]], family = "binomial", alpha = 0)
+  lambda_cv <- cv.glmnet(x = xtrain, y = calib_data[[true_var]], family = "binomial", alpha = 0)$lambda.min
+
+  postpi_calib <- list("model" = calib_m,
+                       "lambda" = lambda_cv,
+                       "formula" = calib_formula)
+  return(postpi_calib)
+}
+
+
diff --git a/R/postpi_covariate_impute.R b/R/postpi_covariate_impute.R
@@ -0,0 +1,66 @@
+#'
+#' postpi_covariate_impute function imputes a binary variable given its predicted probability and other variables to be used in a downstream analysis.
+#' The analysis is then implemented on multiply imputed datasets with the results combined using Rubin's rules.
+#'
+#' This function takes in a labeled dataset for training the imputation model containing:
+#' (1) true measurements of a binary variable that is being predicted using ML
+#' (2) predicted probabilities for this binary variable to be calibrated
+#' (3) additional covariates to be used in a downstream analytic model
+#' (4) the outcome of interest in the downstream analytic model
+#'
+#' This function also takes in an unlabeled dataset containing all of the same columns, with the exception of the true measurements of the binary variable
+#' The imputation model is fit as a logistic regression model, using the mice package.
+#'
+#' @import tidyverse
+#' @import survival
+#' @import mice
+#'
+#' @param labeled_data dataframe that contains labeled dataset for training imputation model
+#' @param true_var character string; column name of the true binary variable of interest in \code{labeled_data}
+#' @param unlabeled_data dataframe that contains all the columns in \code{labeled_data} except for \code{true_var}
+#' @param inf_formula formula describing Cox model of interest; passed to \code{coxph()}, should contain \code{true_var} on the right side
+#'
+#' @return tibble containing analysis results; point estimates and standard errors on log hazard ratio scale
+#'
+#'
+#' @export
+#'
+#' @examples
+#'
+#' data(EHRdata, package="postpi")
+#' library(tidyverse)
+#' library(survival)
+#' library(mice)
+#'
+#' labeled_data <- EHRdata[[1]]
+#' unlabeled_data <- EHRdata[[2]]
+#' inf_formula <- as.formula("Surv(time, status) ~ met_true + age_dx + sex")
+#'
+#' res_impute <- postpi_covariate_impute(labeled_data = labeled_data,
+#'                                       true_var = "met_true",
+#'                                       unlabeled_data = unlabeled_data,
+#'                                       inf_formula = inf_formula)
+#'
+#'
+
+postpi_covariate_impute <- function(labeled_data,
+                                    true_var,
+                                    unlabeled_data,
+                                    inf_formula) {
+
+  labeled_data[[true_var]] <- as.factor(labeled_data[[true_var]])
+  unlabeled_data[[true_var]] <- NA
+  mice_data <- bind_rows(labeled_data, unlabeled_data)
+
+  imp_results <- mice(mice_data, method = "logreg", m = 100, maxit = 1, printFlag = FALSE)
+
+  pooled_fit <- complete(imp_results, "all") %>%
+    lapply(function(dd) coxph(inf_formula, data = dd)) %>%
+    pool() %>%
+    summary() %>%
+    as_tibble()
+
+  return(pooled_fit)
+}
+
+
diff --git a/build/vignette.rds b/build/vignette.rds
diff --git a/data/EHRdata.rda b/data/EHRdata.rda
diff --git a/man/EHRdata.Rd b/man/EHRdata.Rd
diff --git a/man/bootstrap_prob.Rd b/man/bootstrap_prob.Rd