Skip to content

Commit 2ed53d9

Browse files
authored
Merge pull request #131 from AlexsLemonade/allyhawkins/ewing-cell-types-part1
Initiate module for cell-type-ewings
2 parents f6dcd0d + 46006c1 commit 2ed53d9

File tree

8 files changed

+394
-3
lines changed

8 files changed

+394
-3
lines changed

_typos.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1-
[type.tf]
2-
extend-glob = ["*.tf"]
3-
41
[type.tf.extend-words]
52
kms = "kms"
3+
4+
[type.r]
5+
extend-glob = ["*.r"]
6+
7+
[type.r.extend-identifiers]
8+
aucThr = "aucThr"

config/containers.config

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,8 @@ params{
2020

2121
// cell-type-consensus module
2222
consensus_cell_type_container = 'public.ecr.aws/openscpca/cell-type-consensus:v0.2.2'
23+
24+
// cell-type-ewings module
25+
cell_type_ewing_container = 'public.ecr.aws/openscpca/cell-type-ewings:v0.2.2'
26+
2327
}

config/module_params.config

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,9 @@ params{
1313
cell_type_consensus_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv'
1414
cell_type_consensus_marker_gene_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-consensus/references/validation-markers.tsv'
1515

16+
// cell type ewings
17+
cell_type_ewings_auc_max_rank = 425 // 1% of the total detected genes in the merged object, equivalent to the value used to obtain assignments in OpenScPCA-analysis
18+
cell_type_ewings_msigdb_list = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-ewings/references/msigdb-gene-sets.tsv'
19+
cell_type_ewings_ews_high_list = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/aynaud-ews-targets.tsv'
20+
cell_type_ewings_ews_low_list = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/tags/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/wrenn-nt5e-genes.tsv'
1621
}

main.nf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ include { merge_sce } from './modules/merge-sce'
77
include { detect_doublets } from './modules/doublet-detection'
88
include { seurat_conversion } from './modules/seurat-conversion'
99
include { cell_type_consensus } from './modules/cell-type-consensus'
10+
include { cell_type_ewings } from './modules/cell-type-ewings'
1011

1112
// **** Parameter checks ****
1213
include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema'
@@ -62,4 +63,7 @@ workflow {
6263

6364
// Run the consensus cell type workflow
6465
cell_type_consensus(sample_ch)
66+
67+
// Run the cell type ewings workflow
68+
cell_type_ewings(sample_ch)
6569
}

modules/cell-type-ewings/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
This module assigns cell types to all Ewing sarcoma samples in `SCPCP000015`.
2+
3+
Scripts are derived from the the `cell-type-ewings` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
4+
5+
Links to specific original scripts used in this module:
6+
7+
- `01-aucell.R`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/v0.2.2/analyses/cell-type-ewings/scripts/aucell-ews-signatures/01-aucell.R>
8+
9+
This module also uses the following reference files found in the `OpenScPCA-analysis` repository:
10+
11+
- `aynaud-ews-targets.tsv` : <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/aynaud-ews-targets.tsv>
12+
- `wrenn-nt5e-genes.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/v0.2.2/analyses/cell-type-ewings/references/gene_signatures/wrenn-nt5e-genes.tsv>
13+
- `msigdb-gene-sets.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/v0.2.2/analyses/cell-type-ewings/references/msigdb-gene-sets.tsv>

modules/cell-type-ewings/main.nf

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#!/usr/bin/env nextflow
2+
3+
// Workflow to assign consensus cell type labels
4+
5+
process ewing_aucell {
6+
container params.cell_type_ewing_container
7+
tag "${project_id}"
8+
label 'mem_8'
9+
publishDir "${params.results_bucket}/${params.release_prefix}/cell-type-ewings/${project_id}/${sample_id}", mode: 'copy'
10+
input:
11+
tuple val(sample_id),
12+
val(project_id),
13+
path(library_files)
14+
val auc_max_rank
15+
path msigdb_list
16+
path ews_high_list
17+
path ews_low_list
18+
output:
19+
tuple val(sample_id),
20+
val(project_id),
21+
path(aucell_output_files)
22+
script:
23+
aucell_output_files = library_files
24+
.collect{
25+
it.name.replaceAll(/(?i).rds$/, "_ewing-aucell-results.tsv.gz")
26+
}
27+
// combine the custom gene sets into a single input
28+
custom_geneset_files = [ews_high_list, ews_low_list].join(",")
29+
"""
30+
for file in ${library_files}; do
31+
aucell.R \
32+
--sce_file \$file \
33+
--custom_geneset_files ${custom_geneset_files} \
34+
--msigdb_genesets ${msigdb_list} \
35+
--max_rank_threshold ${auc_max_rank} \
36+
--output_file \$(basename \${file%.rds}_ewing-aucell-results.tsv.gz) \
37+
--threads ${task.cpus} \
38+
--seed 2025
39+
done
40+
"""
41+
42+
stub:
43+
aucell_output_files = library_files
44+
.collect{
45+
it.name.replaceAll(/(?i).rds$/, "_ewing-aucell-results.tsv.gz")
46+
}
47+
"""
48+
for file in ${library_files}; do
49+
touch \$(basename \${file%.rds}_ewing-aucell-results.tsv.gz)
50+
done
51+
"""
52+
}
53+
54+
55+
56+
workflow cell_type_ewings {
57+
take:
58+
sample_ch // [sample_id, project_id, sample_path]
59+
main:
60+
// create [sample_id, project_id, [list of processed files]]
61+
libraries_ch = sample_ch
62+
.map{sample_id, project_id, sample_path ->
63+
def library_files = Utils.getLibraryFiles(sample_path, format: "sce", process_level: "processed")
64+
return [sample_id, project_id, library_files]
65+
}
66+
// only run on SCPCP000015 with Ewing sarcoma samples
67+
.filter{ it[1] == "SCPCP000015" }
68+
69+
// run aucell on ewing gene sets
70+
ewing_aucell(
71+
libraries_ch,
72+
params.cell_type_ewings_auc_max_rank,
73+
file(params.cell_type_ewings_msigdb_list),
74+
file(params.cell_type_ewings_ews_high_list),
75+
file(params.cell_type_ewings_ews_low_list)
76+
)
77+
78+
emit:
79+
aucell = ewing_aucell.out // [sample_id, project_id, [list of aucell_output_files]]
80+
}
Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
#!/usr/bin/env Rscript
2+
3+
# This script is used to run `AUCell` on a single SCE object for a set of marker gene sets
4+
# gene sets used are custom gene sets and a set of Ewing specific gene sets from MsigDB
5+
# the results are exported as a single TSV file with the following columns:
6+
# `gene_set`, `barcodes`, `auc`, and `auc_threshold`
7+
8+
9+
library(optparse)
10+
11+
option_list <- list(
12+
make_option(
13+
opt_str = c("--sce_file"),
14+
type = "character",
15+
help = "Path to RDS file containing a processed SingleCellExperiment object to use with AUCell."
16+
),
17+
make_option(
18+
opt_str = c("--custom_geneset_files"),
19+
type = "character",
20+
default = NULL,
21+
help = "Optional comma separated list of files where each file contains a custom gene set to use with AUCell.
22+
All TSV files must contain the `ensembl_gene_id` column.
23+
File names will be used as the name of the gene set."
24+
),
25+
make_option(
26+
opt_str = c("--msigdb_genesets"),
27+
type = "character",
28+
help = "Path to TSV file containing all gene sets from MSigDB to use with AUCell.
29+
Must contain columns with `name`, `geneset`, `category`, and `subcategory`."
30+
),
31+
make_option(
32+
opt_str = c("--max_rank_threshold"),
33+
type = "integer",
34+
default = 425, # 1% of all detected genes in merged object for SCPCP000015
35+
help = "Number of genes detected to set as the `aucMaxRank`."
36+
),
37+
make_option(
38+
opt_str = c("--output_file"),
39+
type = "character",
40+
help = "Path to file where results will be saved"
41+
),
42+
make_option(
43+
opt_str = c("-t", "--threads"),
44+
type = "integer",
45+
default = 4,
46+
help = "Number of multiprocessing threads to use."
47+
),
48+
make_option(
49+
opt_str = c("--seed"),
50+
type = "integer",
51+
default = 2025,
52+
help = "A random seed for reproducibility."
53+
)
54+
)
55+
56+
# Parse options
57+
opt <- parse_args(OptionParser(option_list = option_list))
58+
59+
# Set up -----------------------------------------------------------------------
60+
61+
# make sure input files exist
62+
stopifnot(
63+
"sce file must be specified using `--sce_file`" = !is.null(opt$sce_file)
64+
)
65+
66+
stopifnot(
67+
"sce file does not exist" = file.exists(opt$sce_file),
68+
"MSigDB gene set file does not exist" = file.exists(opt$msigdb_genesets),
69+
"max_rank_threshold must be an integer" = is.integer(opt$max_rank_threshold)
70+
)
71+
72+
# check that custom gene set files exist if provided
73+
use_custom_genesets <- !is.null(opt$custom_geneset_files)
74+
if (use_custom_genesets) {
75+
# first separate the files
76+
custom_geneset_files <- stringr::str_split_1(opt$custom_geneset_files, ",")
77+
78+
stopifnot(
79+
"Custom gene set files do not exist" = all(file.exists(custom_geneset_files))
80+
)
81+
}
82+
83+
# load SCE
84+
suppressPackageStartupMessages({
85+
library(SingleCellExperiment)
86+
})
87+
88+
89+
# set up multiprocessing params
90+
if (opt$threads > 1) {
91+
bp_param <- BiocParallel::MulticoreParam(opt$threads)
92+
} else {
93+
bp_param <- BiocParallel::SerialParam()
94+
}
95+
96+
# make sure directory exists for writing output
97+
output_dir <- dirname(opt$output_file)
98+
fs::dir_create(output_dir)
99+
100+
# read in SCE
101+
sce <- readr::read_rds(opt$sce_file)
102+
103+
# remove genes that are not detected from SCE object
104+
genes_to_keep <- rowData(sce)$detected > 0
105+
filtered_sce <- sce[genes_to_keep, ]
106+
107+
# read in gene sets to use with msigdb
108+
msig_genesets_df <- readr::read_tsv(opt$msigdb_genesets)
109+
110+
# Prep gene sets ---------------------------------------------------------------
111+
112+
# get list of categories that we need to grab from msigdb
113+
category_list <- msig_genesets_df |>
114+
dplyr::select(category, subcategory) |>
115+
unique() |>
116+
purrr::transpose()
117+
118+
# list of genesets and names
119+
geneset_list <- msig_genesets_df$geneset |>
120+
purrr::set_names(msig_genesets_df$name)
121+
122+
# pull gene sets from msigbdr
123+
# first pull out info for each category and then pull out specific genes for geneset
124+
msig_genes_df <- category_list |>
125+
purrr::map(\(category_list){
126+
# replace subcategory with default NULL
127+
# can't use NULL in tsv since it gets read in as a character
128+
if (is.na(category_list$subcategory)) {
129+
subcategory <- NULL
130+
} else {
131+
subcategory <- category_list$subcategory
132+
}
133+
134+
msigdbr::msigdbr(
135+
species = "Homo sapiens",
136+
category = category_list$category,
137+
subcategory = subcategory
138+
)
139+
}) |>
140+
dplyr::bind_rows() |>
141+
# only keep relevant gene sets
142+
dplyr::filter(gs_name %in% geneset_list)
143+
144+
# create named list of genes in each gene set
145+
genes_list <- geneset_list |>
146+
purrr::map(\(name){
147+
genes <- msig_genes_df |>
148+
dplyr::filter(gs_name == name) |>
149+
dplyr::pull(ensembl_gene) |>
150+
unique()
151+
})
152+
153+
# if custom gene sets are used add those to the list of gene sets
154+
if (use_custom_genesets) {
155+
# get names of gene sets using name of the files
156+
custom_geneset_names <- stringr::str_replace(basename(custom_geneset_files), ".tsv", "")
157+
158+
# read in custom gene sets
159+
custom_genes_list <- custom_geneset_files |>
160+
purrr::set_names(custom_geneset_names) |>
161+
purrr::map(\(file) {
162+
gene_ids <- readr::read_tsv(file) |>
163+
dplyr::pull(ensembl_gene_id) |>
164+
unique()
165+
})
166+
167+
# combine custom and msig
168+
genes_list <- c(genes_list, custom_genes_list)
169+
}
170+
171+
# build GeneSetCollection for AUCell
172+
collection <- genes_list |>
173+
purrr::imap(\(genes, name) GSEABase::GeneSet(genes, setName = name)) |>
174+
GSEABase::GeneSetCollection()
175+
176+
# Run AUCell -------------------------------------------------------------------
177+
178+
# extract counts matrix
179+
counts_mtx <- counts(filtered_sce)
180+
181+
# check intersection with gene sets
182+
overlap_pct <- genes_list |>
183+
purrr::map_dbl(\(list){
184+
num_genes <- length(list)
185+
intersect(rownames(counts_mtx), list) |>
186+
length() / num_genes
187+
})
188+
189+
# if any gene sets don't have enough overlap (cutoff is 20%)
190+
# print a message and quit
191+
if (any(overlap_pct <= 0.20)) {
192+
message("Gene sets do not have at least 20% of genes present in SCE.
193+
AUCell will not be run.")
194+
# make empty data frame and save to output file
195+
data.frame(
196+
barcodes = colnames(sce),
197+
gene_set = NA,
198+
auc = NA,
199+
auc_thresholds = NA
200+
) |>
201+
readr::write_tsv(opt$output_file)
202+
203+
# don't run the rest
204+
quit(save = "no")
205+
}
206+
207+
# run aucell
208+
auc_results <- AUCell::AUCell_run(
209+
counts_mtx,
210+
collection,
211+
aucMaxRank = opt$max_rank_threshold,
212+
BPPARAM = bp_param
213+
)
214+
215+
# Get threshold ----------------------------------------------------------------
216+
217+
# get auc threshold for each geneset
218+
auc_thresholds <- AUCell::AUCell_exploreThresholds(
219+
auc_results,
220+
assign = TRUE,
221+
plotHist = FALSE
222+
) |>
223+
# extract select auc threshold
224+
purrr::map_dbl(\(results){
225+
results$aucThr$selected
226+
})
227+
228+
# put into a data frame for easy joining with all auc values
229+
threshold_df <- data.frame(
230+
gene_set = names(auc_thresholds),
231+
auc_threshold = auc_thresholds
232+
)
233+
234+
# Combine and export results ---------------------------------------------------
235+
236+
# create data frame with auc for each cell and each geneset
237+
auc_df <- auc_results@assays@data$AUC |>
238+
as.data.frame() |>
239+
tibble::rownames_to_column("gene_set") |>
240+
tidyr::pivot_longer(!"gene_set",
241+
names_to = "barcodes",
242+
values_to = "auc"
243+
) |>
244+
# add in threshold column
245+
dplyr::left_join(threshold_df, by = "gene_set") |>
246+
dplyr:::relocate(gene_set, .after = barcodes)
247+
248+
# export results as table
249+
readr::write_tsv(auc_df, opt$output_file)

0 commit comments

Comments
 (0)