Merge pull request #131 from AlexsLemonade/allyhawkins/ewing-cell-types-part1

allyhawkins · web-flow · commit 2ed53d9ed302 · 2025-02-28T13:24:15.000-06:00
Initiate module for cell-type-ewings
diff --git a/_typos.toml b/_typos.toml
@@ -1,5 +1,8 @@
-[type.tf]
-extend-glob = ["*.tf"]
-
 [type.tf.extend-words]
 kms = "kms"
+
+[type.r]
+extend-glob = ["*.r"]
+
+[type.r.extend-identifiers]
+aucThr = "aucThr"
diff --git a/config/containers.config b/config/containers.config
@@ -20,4 +20,8 @@ params{
 
   // cell-type-consensus module
   consensus_cell_type_container = 'public.ecr.aws/openscpca/cell-type-consensus:v0.2.2'
+
+  // cell-type-ewings module
+  cell_type_ewing_container = 'public.ecr.aws/openscpca/cell-type-ewings:v0.2.2'
+
 }
diff --git a/config/module_params.config b/config/module_params.config
@@ -13,4 +13,9 @@ params{
   cell_type_consensus_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv'
   cell_type_consensus_marker_gene_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-consensus/references/validation-markers.tsv'
 
+  // cell type ewings
+  cell_type_ewings_auc_max_rank = 425 // 1% of the total detected genes in the merged object, equivalent to the value used to obtain assignments in OpenScPCA-analysis
+  cell_type_ewings_msigdb_list = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-ewings/references/msigdb-gene-sets.tsv'
+  cell_type_ewings_ews_high_list = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/aynaud-ews-targets.tsv'
+  cell_type_ewings_ews_low_list = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/wrenn-nt5e-genes.tsv'
 }
diff --git a/main.nf b/main.nf
@@ -7,6 +7,7 @@ include { merge_sce } from './modules/merge-sce'
 include { detect_doublets } from './modules/doublet-detection'
 include { seurat_conversion } from './modules/seurat-conversion'
 include { cell_type_consensus } from './modules/cell-type-consensus'
+include { cell_type_ewings } from './modules/cell-type-ewings'
 
 // **** Parameter checks ****
 include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema'
@@ -62,4 +63,7 @@ workflow {
 
   // Run the consensus cell type workflow
   cell_type_consensus(sample_ch)
+
+  // Run the cell type ewings workflow
+  cell_type_ewings(sample_ch)
 }
diff --git a/modules/cell-type-ewings/README.md b/modules/cell-type-ewings/README.md
@@ -0,0 +1,13 @@
+This module assigns cell types to all Ewing sarcoma samples in `SCPCP000015`.
+
+Scripts are derived from the `cell-type-ewings` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
+
+Links to specific original scripts used in this module:
+
+- `01-aucell.R`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/v0.2.2/analyses/cell-type-ewings/scripts/aucell-ews-signatures/01-aucell.R>
+
+This module also uses the following reference files found in the `OpenScPCA-analysis` repository:
+
+- `aynaud-ews-targets.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/aynaud-ews-targets.tsv>
+- `wrenn-nt5e-genes.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/wrenn-nt5e-genes.tsv>
+- `msigdb-gene-sets.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/v0.2.2/analyses/cell-type-ewings/references/msigdb-gene-sets.tsv>
diff --git a/modules/cell-type-ewings/main.nf b/modules/cell-type-ewings/main.nf
@@ -0,0 +1,80 @@
+#!/usr/bin/env nextflow
+
+// Workflow to assign cell types to Ewing sarcoma samples
+
+// Run `aucell.R` on every processed SCE library of a single sample.
+//
+// Inputs:
+//   - [sample_id, project_id, library_files]: the libraries to score
+//   - auc_max_rank: value passed to `--max_rank_threshold`
+//   - msigdb_list: TSV passed to `--msigdb_genesets`
+//   - ews_high_list, ews_low_list: custom gene set TSVs, passed together to `--custom_geneset_files`
+// Output:
+//   - [sample_id, project_id, aucell_output_files]: one `_ewing-aucell-results.tsv.gz` file per library
+process ewing_aucell {
+  container params.cell_type_ewing_container
+  // tag by sample so individual tasks can be told apart in the logs
+  // (every task handled by this module shares the same project id)
+  tag "${sample_id}"
+  label 'mem_8'
+  publishDir "${params.results_bucket}/${params.release_prefix}/cell-type-ewings/${project_id}/${sample_id}", mode: 'copy'
+  input:
+    tuple val(sample_id),
+          val(project_id),
+          path(library_files)
+    val auc_max_rank
+    path msigdb_list
+    path ews_high_list
+    path ews_low_list
+  output:
+    tuple val(sample_id),
+          val(project_id),
+          path(aucell_output_files)
+  script:
+    // expected output names: swap the (case-insensitive) `.rds` extension for the results suffix
+    // the dot is escaped so that only a literal `.rds` extension is replaced
+    aucell_output_files = library_files
+      .collect{
+        it.name.replaceAll(/(?i)\.rds$/, "_ewing-aucell-results.tsv.gz")
+      }
+    // combine the custom gene sets into a single input
+    custom_geneset_files = [ews_high_list, ews_low_list].join(",")
+    // the shell strips the extension case-insensitively as well,
+    // so the files written always match the names declared above
+    """
+    for file in ${library_files}; do
+      aucell.R \
+        --sce_file \$file \
+        --custom_geneset_files ${custom_geneset_files} \
+        --msigdb_genesets ${msigdb_list} \
+        --max_rank_threshold ${auc_max_rank} \
+        --output_file \$(basename \${file%.[rR][dD][sS]}_ewing-aucell-results.tsv.gz) \
+        --threads ${task.cpus} \
+        --seed 2025
+    done
+    """
+
+  stub:
+    // same output naming as the `script` block
+    aucell_output_files = library_files
+      .collect{
+        it.name.replaceAll(/(?i)\.rds$/, "_ewing-aucell-results.tsv.gz")
+      }
+    """
+    for file in ${library_files}; do
+      touch \$(basename \${file%.[rR][dD][sS]}_ewing-aucell-results.tsv.gz)
+    done
+    """
+}
+
+
+
+// Run the Ewing sarcoma cell typing steps on all samples from SCPCP000015.
+// Currently this runs AUCell with the Ewing-specific gene sets on each processed library.
+workflow cell_type_ewings {
+  take:
+    sample_ch  // [sample_id, project_id, sample_path]
+  main:
+    // create [sample_id, project_id, [list of processed files]]
+    libraries_ch = sample_ch
+      // only run on SCPCP000015 with Ewing sarcoma samples
+      // filter first so library files are only looked up for samples that will be processed
+      .filter{ it[1] == "SCPCP000015" }
+      .map{sample_id, project_id, sample_path ->
+        def library_files = Utils.getLibraryFiles(sample_path, format: "sce", process_level: "processed")
+        return [sample_id, project_id, library_files]
+      }
+
+    // run aucell on ewing gene sets
+    ewing_aucell(
+      libraries_ch,
+      params.cell_type_ewings_auc_max_rank,
+      file(params.cell_type_ewings_msigdb_list),
+      file(params.cell_type_ewings_ews_high_list),
+      file(params.cell_type_ewings_ews_low_list)
+    )
+
+  emit:
+    aucell = ewing_aucell.out // [sample_id, project_id, [list of aucell_output_files]]
+}
diff --git a/modules/cell-type-ewings/resources/usr/bin/aucell.R b/modules/cell-type-ewings/resources/usr/bin/aucell.R
@@ -0,0 +1,249 @@
+#!/usr/bin/env Rscript
+
+# This script is used to run `AUCell` on a single SCE object for a set of marker gene sets
+# gene sets used are custom gene sets and a set of Ewing-specific gene sets from MSigDB
+# the results are exported as a single TSV file with the following columns:
+# `barcodes`, `gene_set`, `auc`, and `auc_threshold`
+
+
+library(optparse)
+
+# Command line options accepted by this script
+option_list <- list(
+  # input SCE object
+  make_option(
+    "--sce_file",
+    type = "character",
+    help = "Path to RDS file containing a processed SingleCellExperiment object to use with AUCell."
+  ),
+  # optional custom gene sets, one file per gene set
+  make_option(
+    "--custom_geneset_files",
+    type = "character",
+    help = "Optional comma separated list of files where each file contains a custom gene set to use with AUCell.
+      All TSV files must contain the `ensembl_gene_id` column.
+      File names will be used as the name of the gene set."
+  ),
+  # table listing which MSigDB gene sets to use
+  make_option(
+    "--msigdb_genesets",
+    type = "character",
+    help = "Path to TSV file containing all gene sets from MSigDB to use with AUCell.
+      Must contain columns with `name`, `geneset`, `category`, and `subcategory`."
+  ),
+  # value used for `aucMaxRank`
+  # the default is 1% of all detected genes in merged object for SCPCP000015
+  make_option(
+    "--max_rank_threshold",
+    type = "integer",
+    default = 425,
+    help = "Number of genes detected to set as the `aucMaxRank`."
+  ),
+  # output TSV
+  make_option(
+    "--output_file",
+    type = "character",
+    help = "Path to file where results will be saved"
+  ),
+  # parallelization
+  make_option(
+    c("-t", "--threads"),
+    type = "integer",
+    default = 4,
+    help = "Number of multiprocessing threads to use."
+  ),
+  # reproducibility
+  make_option(
+    "--seed",
+    type = "integer",
+    default = 2025,
+    help = "A random seed for reproducibility."
+  )
+)
+
+# Build the parser and parse the provided command line arguments
+opt_parser <- OptionParser(option_list = option_list)
+opt <- parse_args(opt_parser)
+
+# Set up -----------------------------------------------------------------------
+
+# make sure all required arguments were provided
+# `file.exists(NULL)` returns `logical(0)`, which `stopifnot()` treats as passing,
+# so missing arguments have to be checked for explicitly
+stopifnot(
+  "sce file must be specified using `--sce_file`" = !is.null(opt$sce_file),
+  "MSigDB gene set file must be specified using `--msigdb_genesets`" = !is.null(opt$msigdb_genesets),
+  "output file must be specified using `--output_file`" = !is.null(opt$output_file)
+)
+
+# make sure input files exist
+stopifnot(
+  "sce file does not exist" = file.exists(opt$sce_file),
+  "MSigDB gene set file does not exist" = file.exists(opt$msigdb_genesets),
+  "max_rank_threshold must be an integer" = is.integer(opt$max_rank_threshold)
+)
+
+# check that custom gene set files exist if provided
+use_custom_genesets <- !is.null(opt$custom_geneset_files)
+if (use_custom_genesets) {
+  # first separate the files
+  custom_geneset_files <- stringr::str_split_1(opt$custom_geneset_files, ",")
+
+  stopifnot(
+    "Custom gene set files do not exist" = all(file.exists(custom_geneset_files))
+  )
+}
+
+# load SingleCellExperiment package, needed for the SCE accessors used below
+suppressPackageStartupMessages({
+  library(SingleCellExperiment)
+})
+
+# set the seed so that results are reproducible
+# previously `--seed` was accepted but never applied
+set.seed(opt$seed)
+
+# set up multiprocessing params
+if (opt$threads > 1) {
+  bp_param <- BiocParallel::MulticoreParam(opt$threads)
+} else {
+  bp_param <- BiocParallel::SerialParam()
+}
+
+# make sure directory exists for writing output
+output_dir <- dirname(opt$output_file)
+fs::dir_create(output_dir)
+
+# read in SCE
+sce <- readr::read_rds(opt$sce_file)
+
+# remove genes that are not detected from SCE object
+genes_to_keep <- rowData(sce)$detected > 0
+filtered_sce <- sce[genes_to_keep, ]
+
+# read in table of gene sets to pull from MSigDB
+msig_genesets_df <- readr::read_tsv(opt$msigdb_genesets)
+
+# Prep gene sets ---------------------------------------------------------------
+
+# get list of categories that we need to grab from msigdb
+# each element is a list with a `category` and a `subcategory` entry
+category_list <- msig_genesets_df |>
+  dplyr::select(category, subcategory) |>
+  unique() |>
+  purrr::transpose()
+
+# list of genesets and names
+# values are the MSigDB gene set names and names are the labels used in the output
+geneset_list <- msig_genesets_df$geneset |>
+  purrr::set_names(msig_genesets_df$name)
+
+# pull gene sets from msigdbr
+# first pull out info for each category and then pull out specific genes for geneset
+msig_genes_df <- category_list |>
+  purrr::map(\(category_info){
+    # replace subcategory with default NULL
+    # can't use NULL in tsv since it gets read in as a character
+    if (is.na(category_info$subcategory)) {
+      subcategory <- NULL
+    } else {
+      subcategory <- category_info$subcategory
+    }
+
+    msigdbr::msigdbr(
+      species = "Homo sapiens",
+      category = category_info$category,
+      subcategory = subcategory
+    )
+  }) |>
+  dplyr::bind_rows() |>
+  # only keep relevant gene sets
+  dplyr::filter(gs_name %in% geneset_list)
+
+# create named list of genes in each gene set
+genes_list <- geneset_list |>
+  purrr::map(\(name){
+    msig_genes_df |>
+      dplyr::filter(gs_name == name) |>
+      dplyr::pull(ensembl_gene) |>
+      unique()
+  })
+
+# if custom gene sets are used add those to the list of gene sets
+if (use_custom_genesets) {
+  # get names of gene sets using name of the files
+  # the pattern is a regular expression, so escape the dot and anchor it
+  # so that only a trailing `.tsv` extension is removed
+  custom_geneset_names <- stringr::str_replace(basename(custom_geneset_files), "\\.tsv$", "")
+
+  # read in custom gene sets
+  custom_genes_list <- custom_geneset_files |>
+    purrr::set_names(custom_geneset_names) |>
+    purrr::map(\(file) {
+      readr::read_tsv(file) |>
+        dplyr::pull(ensembl_gene_id) |>
+        unique()
+    })
+
+  # combine custom and msig
+  genes_list <- c(genes_list, custom_genes_list)
+}
+
+# build GeneSetCollection for AUCell
+collection <- genes_list |>
+  purrr::imap(\(genes, name) GSEABase::GeneSet(genes, setName = name)) |>
+  GSEABase::GeneSetCollection()
+
+# Run AUCell -------------------------------------------------------------------
+
+# extract counts matrix
+counts_mtx <- counts(filtered_sce)
+
+# check intersection with gene sets
+# for each gene set, calculate the fraction of its genes that are present in the SCE
+overlap_pct <- genes_list |>
+  purrr::map_dbl(\(list){
+    num_genes <- length(list)
+    intersect(rownames(counts_mtx), list) |>
+      length() / num_genes
+  })
+
+# an empty gene set gives 0/0 = NaN, which would make the `if()` below error
+# treat any missing overlap the same as insufficient overlap
+insufficient_overlap <- is.na(overlap_pct) | overlap_pct <= 0.20
+
+# if any gene sets don't have enough overlap (cutoff is 20%)
+# print a message and quit
+if (any(insufficient_overlap)) {
+  message("Gene sets do not have at least 20% of genes present in SCE.
+          AUCell will not be run.")
+  # make empty data frame and save to output file
+  # column names match the table that is exported when AUCell is run
+  data.frame(
+    barcodes = colnames(sce),
+    gene_set = NA,
+    auc = NA,
+    auc_threshold = NA
+  ) |>
+    readr::write_tsv(opt$output_file)
+
+  # don't run the rest
+  quit(save = "no")
+}
+
+# run aucell
+auc_results <- AUCell::AUCell_run(
+  counts_mtx,
+  collection,
+  aucMaxRank = opt$max_rank_threshold,
+  BPPARAM = bp_param
+)
+
+# Get threshold ----------------------------------------------------------------
+
+# explore thresholds for every gene set, without plotting histograms
+threshold_results <- AUCell::AUCell_exploreThresholds(
+  auc_results,
+  assign = TRUE,
+  plotHist = FALSE
+)
+
+# pull out the selected auc threshold for each gene set
+auc_thresholds <- purrr::map_dbl(
+  threshold_results,
+  \(geneset_result) geneset_result$aucThr$selected
+)
+
+# one row per gene set so thresholds can be joined to the per-cell auc values
+threshold_df <- data.frame(
+  gene_set = names(auc_thresholds),
+  auc_threshold = auc_thresholds
+)
+
+# Combine and export results ---------------------------------------------------
+
+# create data frame with auc for each cell and each geneset
+# use the AUCell accessor instead of reaching into the object's slots directly
+auc_df <- AUCell::getAUC(auc_results) |>
+  as.data.frame() |>
+  # gene sets are stored as row names, move them to a column
+  tibble::rownames_to_column("gene_set") |>
+  # one row per gene set/cell combination
+  tidyr::pivot_longer(!"gene_set",
+    names_to = "barcodes",
+    values_to = "auc"
+  ) |>
+  # add in threshold column
+  dplyr::left_join(threshold_df, by = "gene_set") |>
+  # `relocate()` is exported by dplyr, so use `::` rather than `:::`
+  dplyr::relocate(gene_set, .after = barcodes)
+
+# export results as table
+readr::write_tsv(auc_df, opt$output_file)
diff --git a/nextflow_schema.json b/nextflow_schema.json

Original file line number	Diff line number	Diff line change
`@@ -20,4 +20,8 @@ params{`
`20`	`20`
`21`	`21`	`// cell-type-consensus module`
`22`	`22`	`consensus_cell_type_container = 'public.ecr.aws/openscpca/cell-type-consensus:v0.2.2'`
	`23`	`+`
	`24`	`+ // cell-type-ewings module`
	`25`	`+ cell_type_ewing_container = 'public.ecr.aws/openscpca/cell-type-ewings:v0.2.2'`
	`26`	`+`
`23`	`27`	`}`
Original file line number	Diff line number	Diff line change
`@@ -13,4 +13,9 @@ params{`
`13`	`13`	`cell_type_consensus_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv'`
`14`	`14`	`cell_type_consensus_marker_gene_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-consensus/references/validation-markers.tsv'`
`15`	`15`
	`16`	`+ // cell type ewings`
	`17`	`+ cell_type_ewings_auc_max_rank = 425 // 1% of the total detected genes in the merged object, equivalent to the value used to obtain assignments in OpenScPCA-analysis`
	`18`	`+ cell_type_ewings_msigdb_list = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-ewings/references/msigdb-gene-sets.tsv'`
	`19`	`+ cell_type_ewings_ews_high_list = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/aynaud-ews-targets.tsv'`
	`20`	`+ cell_type_ewings_ews_low_list = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/wrenn-nt5e-genes.tsv'`
`16`	`21`	`}`