Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
22 changes: 8 additions & 14 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,12 @@ Description: postpi corrects statistical inferences performed directly on predic
License: GPL-2
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
Imports:
gam,
caret,
broom,
matrixStats,
dplyr
Suggests:
knitr,
rmarkdown,
testthat
RoxygenNote: 7.1.2
Imports: gam, caret, broom, matrixStats, dplyr, survival, glmnet, mice
Suggests: knitr, rmarkdown, testthat
VignetteBuilder: knitr
Depends:
magrittr,
R (>= 2.10)
Depends: R (>= 3.5.0), magrittr
NeedsCompilation: no
Packaged: 2021-10-28 20:20:09 UTC; arjun.sondhi
Author: Siruo Wang [cre, aut]
Maintainer: Siruo Wang <[email protected]>
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
# Generated by roxygen2: do not edit by hand

export(bootstrap_prob)
export(postpi)
export(postpi_covariate_calibrate)
export(postpi_covariate_impute)
export(postpi_der)
export(postpi_relate)
import(broom)
import(caret)
import(gam)
import(glmnet)
import(matrixStats)
import(mice)
import(survival)
import(tidyverse)
76 changes: 76 additions & 0 deletions R/bootstrap_prob.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#'
#' bootstrap_prob function fits a Cox proportional hazards analysis model given predicted probabilities for a binary covariate.
#'
#' This function takes in an analysis dataset containing a column of predicted probabilities.
#' The specified Cox model is fit over B nonparametric bootstrap iterations.
#' For each bootstrapped dataset, a binary variable vector is drawn from the predicted probabilities.
#'
#' @import tidyverse
#' @import survival
#'
#' @param analysis_data dataframe that contains the predicted probability variable, and the other relevant variables for analysis
#' @param inf_formula formula describing Cox model of interest; passed to \code{coxph()}, should contain \code{pred_var_col} on the right side
#' @param pred_prob_var character string; column name of the predicted probability variable in \code{analysis_data} for the binary variable of interest
#' @param pred_var_col character string; column name assigned to the binary variable of interest used in analysis
#' @param B integer; number of bootstrap iterations, defaults to 500
#' @param alpha numeric; significance level for inference, defaults to 0.05
#'
#' @return tibble of inferential results; point estimates and 100(1 - alpha)% confidence intervals on the hazard ratio scale
#'
#'
#' @export
#'
#' @examples
#'
#' data(EHRdata, package="postpi")
#' library(tidyverse)
#' library(survival)
#'
#' labeled_data <- EHRdata[[1]]
#' analysis_data <- EHRdata[[2]]
#' postpi_calib <- postpi_covariate_calibrate(calib_data = labeled_data,
#' true_var = "met_true",
#' pred_prob_var = "met_prob",
#' covariates = c("age_dx", "sex"),
#' outcome = "time * status")
#'
#' xtrain_analysis <- analysis_data %>%
#' select(met_prob, age_dx, sex, time, status) %>%
#' mutate(`time:status` = time * status) %>%
#' as.matrix
#'
#' analysis_data[["met_calib_prob"]] <- as.numeric(predict(postpi_calib$model,
#' newx = xtrain_analysis,
#' type = "response",
#' s = postpi_calib$lambda,
#' alpha = 0))
#' inf_formula <- as.formula("Surv(time, status) ~ met_calib + age_dx + sex")
#' res_calib <- bootstrap_prob(analysis_data, inf_formula, "met_calib_prob", "met_calib")
#'

bootstrap_prob <- function(analysis_data,
inf_formula,
pred_prob_col,
pred_var_col,
B = 500,
alpha = 0.05) {

boot_dist <- NULL
for (i in 1:B) {
fit_ind <- sample(1:nrow(analysis_data), size = nrow(analysis_data), replace = TRUE)
data_fit <- analysis_data[fit_ind,]

data_fit[[pred_var_col]] <- rbinom(nrow(data_fit), 1, data_fit[[pred_prob_col]])

fit_i <- coxph(inf_formula, data = data_fit, control = coxph.control(iter.max = 100))
boot_dist <- rbind(boot_dist, coef(fit_i))
}

out <- data.frame(variable = colnames(boot_dist)) %>%
mutate(estimate = apply(boot_dist, 2, function(x) exp(median(x))),
ci_lower = apply(boot_dist, 2, function(x) exp(quantile(x, alpha / 2))),
ci_upper = apply(boot_dist, 2, function(x) exp(quantile(x, 1 - (alpha / 2))))) %>%
as_tibble()

return(out)
}
21 changes: 21 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,24 @@
#' @source \url{https://academic.oup.com/nar/article/46/9/e54/4920847}
"TISSUEdata"


#' Simulated EHR-derived data for covariate post-pi in survival analysis
#'
#' A dataset containing data for 2,000 patients with fully observed age at diagnosis,
#' sex, survival time, and event indicator. Also contains predicted probability of
#' metastatic disease, with true metastatic status available for first 1,000 patients.
#'
#' @format A list of two data frames (labeled and unlabeled) each with 1,000 rows and 6 or 7 variables:
#' \describe{
#' \item{patientid}{Patient ID, integer}
#' \item{age_dx}{Age at diagnosis, numeric}
#' \item{sex}{Sex, binary}
#' \item{met_true}{True metastatic status, available for labeled data only}
#' \item{met_prob}{ML-predicted probability of metastatic status}
#' \item{time}{Observed event time, numeric}
#' \item{status}{Event indicator, 1 if event observed, 0 if censored}
#' }
#'
"EHRdata"


63 changes: 63 additions & 0 deletions R/postpi_covariate_calib.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#'
#' postpi_covariate_calibrate function fits a model relating a true binary variable to its predicted probability, and other variables to be used in a downstream analysis.
#'
#' This function takes in a labeled dataset for calibration containing:
#' (1) true measurements of a binary variable that is being predicted using ML
#' (2) predicted probabilities for this binary variable to be calibrated
#' (3) additional covariates to be used in a downstream analytic model
#' (4) the outcome of interest in the downstream analytic model
#' These variable names are specified as character or character vector inputs to the function, which are then combined into a model specification using standard R forumla syntax.
#' Note that in the case of a survival outcome, we recommend accounting for both the observed time and censoring indictor by specifying \code{time * status}.
#'
#' The calibration model is fit as an l2-penalized logistic regression model, using the glmnet package.
#' The penalty parameter is set using cross-validation to minimize logistic deviance.
#'
#' @import tidyverse
#' @import survival
#' @import glmnet
#'
#' @param calib_data dataframe that contains the below variables needed for post-prediction inference calibration
#' @param true_var character string; column name of the true binary variable of interest in \code{calib_data}
#' @param pred_prob_var character string; column name of the predicted probability variable for the binary variable of interest in \\code{calib_data}
#' @param covariates character vector; column names of the covariates for the analysis of interest in \code{calib_data}
#' @param outcome character string; column name (or names in formula syntax) of the outcome variable for the analysis of interest in \code{calib_data}
#'
#' @return list containing glmnet object, optimal lambda parameter, and calibration formula.
#'
#'
#' @export
#'
#' @examples
#'
#' data(EHRdata, package="postpi")
#' library(tidyverse)
#' library(survival)
#'
#' labeled_data <- EHRdata[[1]]
#' postpi_calib <- postpi_covariate_calibrate(calib_data = labeled_data,
#' true_var = "met_true",
#' pred_prob_var = "met_prob",
#' covariates = c("age_dx", "sex"),
#' outcome = "time * status")
#'

postpi_covariate_calibrate <- function(calib_data,
true_var,
pred_prob_var,
covariates,
outcome) {

calib_features <- c(pred_prob_var, covariates, outcome)
calib_formula <- as.formula(paste0(true_var, " ~ ", paste(calib_features, collapse = " + ")))

xtrain <- model.matrix(calib_formula, data = calib_data)[,-1]
calib_m <- glmnet(x = xtrain, y = calib_data[[true_var]], family = "binomial", alpha = 0)
lambda_cv <- cv.glmnet(x = xtrain, y = calib_data[[true_var]], family = "binomial", alpha = 0)$lambda.min

postpi_calib <- list("model" = calib_m,
"lambda" = lambda_cv,
"formula" = calib_formula)
return(postpi_calib)
}


66 changes: 66 additions & 0 deletions R/postpi_covariate_impute.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#'
#' postpi_covariate_impute function imputes a binary variable given its predicted probability and other variables to be used in a downstream analysis.
#' The analysis is then implemented on multiply imputed datasets with the results combined using Rubin's rules.
#'
#' This function takes in a labeled dataset for training the imputation model containing:
#' (1) true measurements of a binary variable that is being predicted using ML
#' (2) predicted probabilities for this binary variable to be calibrated
#' (3) additional covariates to be used in a downstream analytic model
#' (4) the outcome of interest in the downstream analytic model
#'
#' This function also takes in an unlabeled dataset containing all of the same columns, with the exception of the true measurements of the binary variable
#' The imputation model is fit as a logistic regression model, using the mice package.
#'
#' @import tidyverse
#' @import survival
#' @import mice
#'
#' @param labeled_data dataframe that contains labeled dataset for training imputation model
#' @param true_var character string; column name of the true binary variable of interest in \code{labeled_data}
#' @param unlabeled_data dataframe that contains all the columns in \code{labeled_data} except for \code{true_var}
#' @param inf_formula formula describing Cox model of interest; passed to \code{coxph()}, should contain \code{true_var} on the right side
#'
#' @return tibble containing analysis results; point estimates and standard errors on log hazard ratio scale
#'
#'
#' @export
#'
#' @examples
#'
#' data(EHRdata, package="postpi")
#' library(tidyverse)
#' library(survival)
#' library(mice)
#'
#' labeled_data <- EHRdata[[1]]
#' unlabeled_data <- EHRdata[[2]]
#' inf_formula <- as.formula("Surv(time, status) ~ met_true + age_dx + sex")
#'
#' res_impute <- postpi_covariate_impute(labeled_data = labeled_data,
#' true_var = "met_true",
#' unlabeled_data = unlabeled_data,
#' inf_formula = inf_formula)
#'
#'

postpi_covariate_impute <- function(labeled_data,
true_var,
unlabeled_data,
inf_formula) {

labeled_data[[true_var]] <- as.factor(labeled_data[[true_var]])
unlabeled_data[[true_var]] <- NA
mice_data <- bind_rows(labeled_data, unlabeled_data)

imp_results <- mice(mice_data, method = "logreg", m = 100, maxit = 1, printFlag = FALSE)

pooled_fit <- complete(imp_results, "all") %>%
lapply(function(dd) coxph(inf_formula, data = dd)) %>%
pool() %>%
summary() %>%
as_tibble()

return(pooled_fit)
}


Binary file added build/vignette.rds
Binary file not shown.
Binary file added data/EHRdata.rda
Binary file not shown.
27 changes: 27 additions & 0 deletions man/EHRdata.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 64 additions & 0 deletions man/bootstrap_prob.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading