|
| 1 | +#!/usr/bin/env Rscript |
| 2 | + |
| 3 | +# This script is used to run `AUCell` on a single SCE object for a set of marker gene sets |
| 4 | +# gene sets used are custom gene sets and a set of Ewing specific gene sets from MsigDB |
| 5 | +# the results are exported as a single TSV file with the following columns: |
| 6 | +# `gene_set`, `barcodes`, `auc`, and `auc_threshold` |
| 7 | + |
| 8 | + |
| 9 | +library(optparse) |
| 10 | + |
| 11 | +option_list <- list( |
| 12 | + make_option( |
| 13 | + opt_str = c("--sce_file"), |
| 14 | + type = "character", |
| 15 | + help = "Path to RDS file containing a processed SingleCellExperiment object to use with AUCell." |
| 16 | + ), |
| 17 | + make_option( |
| 18 | + opt_str = c("--custom_geneset_files"), |
| 19 | + type = "character", |
| 20 | + default = NULL, |
| 21 | + help = "Optional comma separated list of files where each file contains a custom gene set to use with AUCell. |
| 22 | + All TSV files must contain the `ensembl_gene_id` column. |
| 23 | + File names will be used as the name of the gene set." |
| 24 | + ), |
| 25 | + make_option( |
| 26 | + opt_str = c("--msigdb_genesets"), |
| 27 | + type = "character", |
| 28 | + help = "Path to TSV file containing all gene sets from MSigDB to use with AUCell. |
| 29 | + Must contain columns with `name`, `geneset`, `category`, and `subcategory`." |
| 30 | + ), |
| 31 | + make_option( |
| 32 | + opt_str = c("--max_rank_threshold"), |
| 33 | + type = "integer", |
| 34 | + default = 425, # 1% of all detected genes in merged object for SCPCP000015 |
| 35 | + help = "Number of genes detected to set as the `aucMaxRank`." |
| 36 | + ), |
| 37 | + make_option( |
| 38 | + opt_str = c("--output_file"), |
| 39 | + type = "character", |
| 40 | + help = "Path to file where results will be saved" |
| 41 | + ), |
| 42 | + make_option( |
| 43 | + opt_str = c("-t", "--threads"), |
| 44 | + type = "integer", |
| 45 | + default = 4, |
| 46 | + help = "Number of multiprocessing threads to use." |
| 47 | + ), |
| 48 | + make_option( |
| 49 | + opt_str = c("--seed"), |
| 50 | + type = "integer", |
| 51 | + default = 2025, |
| 52 | + help = "A random seed for reproducibility." |
| 53 | + ) |
| 54 | +) |
| 55 | + |
| 56 | +# Parse options |
| 57 | +opt <- parse_args(OptionParser(option_list = option_list)) |
| 58 | + |
| 59 | +# Set up ----------------------------------------------------------------------- |
| 60 | + |
| 61 | +# make sure input files exist |
| 62 | +stopifnot( |
| 63 | + "sce file must be specified using `--sce_file`" = !is.null(opt$sce_file) |
| 64 | +) |
| 65 | + |
| 66 | +stopifnot( |
| 67 | + "sce file does not exist" = file.exists(opt$sce_file), |
| 68 | + "MSigDB gene set file does not exist" = file.exists(opt$msigdb_genesets), |
| 69 | + "max_rank_threshold must be an integer" = is.integer(opt$max_rank_threshold) |
| 70 | +) |
| 71 | + |
| 72 | +# check that custom gene set files exist if provided |
| 73 | +use_custom_genesets <- !is.null(opt$custom_geneset_files) |
| 74 | +if (use_custom_genesets) { |
| 75 | + # first separate the files |
| 76 | + custom_geneset_files <- stringr::str_split_1(opt$custom_geneset_files, ",") |
| 77 | + |
| 78 | + stopifnot( |
| 79 | + "Custom gene set files do not exist" = all(file.exists(custom_geneset_files)) |
| 80 | + ) |
| 81 | +} |
| 82 | + |
| 83 | +# load SCE |
| 84 | +suppressPackageStartupMessages({ |
| 85 | + library(SingleCellExperiment) |
| 86 | +}) |
| 87 | + |
| 88 | + |
| 89 | +# set up multiprocessing params |
| 90 | +if (opt$threads > 1) { |
| 91 | + bp_param <- BiocParallel::MulticoreParam(opt$threads) |
| 92 | +} else { |
| 93 | + bp_param <- BiocParallel::SerialParam() |
| 94 | +} |
| 95 | + |
| 96 | +# make sure directory exists for writing output |
| 97 | +output_dir <- dirname(opt$output_file) |
| 98 | +fs::dir_create(output_dir) |
| 99 | + |
| 100 | +# read in SCE |
| 101 | +sce <- readr::read_rds(opt$sce_file) |
| 102 | + |
| 103 | +# remove genes that are not detected from SCE object |
| 104 | +genes_to_keep <- rowData(sce)$detected > 0 |
| 105 | +filtered_sce <- sce[genes_to_keep, ] |
| 106 | + |
| 107 | +# read in gene sets to use with msigdb |
| 108 | +msig_genesets_df <- readr::read_tsv(opt$msigdb_genesets) |
| 109 | + |
| 110 | +# Prep gene sets --------------------------------------------------------------- |
| 111 | + |
| 112 | +# get list of categories that we need to grab from msigdb |
| 113 | +category_list <- msig_genesets_df |> |
| 114 | + dplyr::select(category, subcategory) |> |
| 115 | + unique() |> |
| 116 | + purrr::transpose() |
| 117 | + |
| 118 | +# list of genesets and names |
| 119 | +geneset_list <- msig_genesets_df$geneset |> |
| 120 | + purrr::set_names(msig_genesets_df$name) |
| 121 | + |
| 122 | +# pull gene sets from msigbdr |
| 123 | +# first pull out info for each category and then pull out specific genes for geneset |
| 124 | +msig_genes_df <- category_list |> |
| 125 | + purrr::map(\(category_list){ |
| 126 | + # replace subcategory with default NULL |
| 127 | + # can't use NULL in tsv since it gets read in as a character |
| 128 | + if (is.na(category_list$subcategory)) { |
| 129 | + subcategory <- NULL |
| 130 | + } else { |
| 131 | + subcategory <- category_list$subcategory |
| 132 | + } |
| 133 | + |
| 134 | + msigdbr::msigdbr( |
| 135 | + species = "Homo sapiens", |
| 136 | + category = category_list$category, |
| 137 | + subcategory = subcategory |
| 138 | + ) |
| 139 | + }) |> |
| 140 | + dplyr::bind_rows() |> |
| 141 | + # only keep relevant gene sets |
| 142 | + dplyr::filter(gs_name %in% geneset_list) |
| 143 | + |
| 144 | +# create named list of genes in each gene set |
| 145 | +genes_list <- geneset_list |> |
| 146 | + purrr::map(\(name){ |
| 147 | + genes <- msig_genes_df |> |
| 148 | + dplyr::filter(gs_name == name) |> |
| 149 | + dplyr::pull(ensembl_gene) |> |
| 150 | + unique() |
| 151 | + }) |
| 152 | + |
| 153 | +# if custom gene sets are used add those to the list of gene sets |
| 154 | +if (use_custom_genesets) { |
| 155 | + # get names of gene sets using name of the files |
| 156 | + custom_geneset_names <- stringr::str_replace(basename(custom_geneset_files), ".tsv", "") |
| 157 | + |
| 158 | + # read in custom gene sets |
| 159 | + custom_genes_list <- custom_geneset_files |> |
| 160 | + purrr::set_names(custom_geneset_names) |> |
| 161 | + purrr::map(\(file) { |
| 162 | + gene_ids <- readr::read_tsv(file) |> |
| 163 | + dplyr::pull(ensembl_gene_id) |> |
| 164 | + unique() |
| 165 | + }) |
| 166 | + |
| 167 | + # combine custom and msig |
| 168 | + genes_list <- c(genes_list, custom_genes_list) |
| 169 | +} |
| 170 | + |
| 171 | +# build GeneSetCollection for AUCell |
| 172 | +collection <- genes_list |> |
| 173 | + purrr::imap(\(genes, name) GSEABase::GeneSet(genes, setName = name)) |> |
| 174 | + GSEABase::GeneSetCollection() |
| 175 | + |
| 176 | +# Run AUCell ------------------------------------------------------------------- |
| 177 | + |
| 178 | +# extract counts matrix |
| 179 | +counts_mtx <- counts(filtered_sce) |
| 180 | + |
| 181 | +# check intersection with gene sets |
| 182 | +overlap_pct <- genes_list |> |
| 183 | + purrr::map_dbl(\(list){ |
| 184 | + num_genes <- length(list) |
| 185 | + intersect(rownames(counts_mtx), list) |> |
| 186 | + length() / num_genes |
| 187 | + }) |
| 188 | + |
| 189 | +# if any gene sets don't have enough overlap (cutoff is 20%) |
| 190 | +# print a message and quit |
| 191 | +if (any(overlap_pct <= 0.20)) { |
| 192 | + message("Gene sets do not have at least 20% of genes present in SCE. |
| 193 | + AUCell will not be run.") |
| 194 | + # make empty data frame and save to output file |
| 195 | + data.frame( |
| 196 | + barcodes = colnames(sce), |
| 197 | + gene_set = NA, |
| 198 | + auc = NA, |
| 199 | + auc_thresholds = NA |
| 200 | + ) |> |
| 201 | + readr::write_tsv(opt$output_file) |
| 202 | + |
| 203 | + # don't run the rest |
| 204 | + quit(save = "no") |
| 205 | +} |
| 206 | + |
| 207 | +# run aucell |
| 208 | +auc_results <- AUCell::AUCell_run( |
| 209 | + counts_mtx, |
| 210 | + collection, |
| 211 | + aucMaxRank = opt$max_rank_threshold, |
| 212 | + BPPARAM = bp_param |
| 213 | +) |
| 214 | + |
| 215 | +# Get threshold ---------------------------------------------------------------- |
| 216 | + |
| 217 | +# get auc threshold for each geneset |
| 218 | +auc_thresholds <- AUCell::AUCell_exploreThresholds( |
| 219 | + auc_results, |
| 220 | + assign = TRUE, |
| 221 | + plotHist = FALSE |
| 222 | +) |> |
| 223 | + # extract select auc threshold |
| 224 | + purrr::map_dbl(\(results){ |
| 225 | + results$aucThr$selected |
| 226 | + }) |
| 227 | + |
| 228 | +# put into a data frame for easy joining with all auc values |
| 229 | +threshold_df <- data.frame( |
| 230 | + gene_set = names(auc_thresholds), |
| 231 | + auc_threshold = auc_thresholds |
| 232 | +) |
| 233 | + |
| 234 | +# Combine and export results --------------------------------------------------- |
| 235 | + |
| 236 | +# create data frame with auc for each cell and each geneset |
| 237 | +auc_df <- auc_results@assays@data$AUC |> |
| 238 | + as.data.frame() |> |
| 239 | + tibble::rownames_to_column("gene_set") |> |
| 240 | + tidyr::pivot_longer(!"gene_set", |
| 241 | + names_to = "barcodes", |
| 242 | + values_to = "auc" |
| 243 | + ) |> |
| 244 | + # add in threshold column |
| 245 | + dplyr::left_join(threshold_df, by = "gene_set") |> |
| 246 | + dplyr:::relocate(gene_set, .after = barcodes) |
| 247 | + |
| 248 | +# export results as table |
| 249 | +readr::write_tsv(auc_df, opt$output_file) |
0 commit comments