Skip to content

Commit c232c5d

Browse files
authored
Merge pull request #172 from AlexsLemonade/allyhawkins/export-openscpca-annotations
Module for exporting openscpca annotations
2 parents 5f8d269 + e8c94e8 commit c232c5d

File tree

7 files changed

+197
-1
lines changed

7 files changed

+197
-1
lines changed

main.nf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ include { cell_type_ewings } from './modules/cell-type-ewings'
1111
include { cell_type_neuroblastoma_04 } from './modules/cell-type-neuroblastoma-04'
1212
include { infercnv_gene_order } from './modules/infercnv-gene-order'
1313
include { cell_type_scimilarity } from './modules/cell-type-scimilarity'
14+
include { export_annotations } from './modules/export-annotations'
1415

1516
// **** Parameter checks ****
1617
include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema'
@@ -84,4 +85,11 @@ workflow {
8485
// only runs on SCPCP000004
8586
cell_type_neuroblastoma_04(sample_ch.filter{ it[1] == "SCPCP000004" })
8687

88+
// format and export json files with openscpca annotations
89+
// input expected to be sample id, project id, tsv files, annotation meta
90+
// annotation meta should be a groovy map (dictionary) containing at least `module_name:` and `annotation_column:` keys.
91+
// The optional key `ontology_column:` will also be used if provided.
92+
// mix outputs from all cell type modules first
93+
export_ch = cell_type_ewings.out.celltypes
94+
export_annotations(export_ch)
8795
}

modules/cell-type-ewings/main.nf

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,20 @@ workflow cell_type_ewings {
147147
// assign cell types
148148
ewing_assign_celltypes(assign_ch, file(params.cell_type_ewings_auc_thresholds_file))
149149

150+
// add ewing specific metadata to output tuple
151+
celltype_output_ch = ewing_assign_celltypes.out
152+
.map{ sample_id, project_id, assignment_files -> tuple(
153+
sample_id,
154+
project_id,
155+
assignment_files,
156+
[ // annotation metadata
157+
module_name: "cell-type-ewings",
158+
annotation_column: "ewing_annotation",
159+
ontology_column: "ewing_ontology"
160+
]
161+
)}
162+
150163
emit:
151164
aucell = ewing_aucell.out // [sample_id, project_id, [aucell output files], [mean gene expression files]]
152-
celltypes = ewing_assign_celltypes.out // [sample_id, project_id, [cell type assignment files]]
165+
celltypes = celltype_output_ch // [sample_id, project_id, [cell type assignment files], annotation_metadata]
153166
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
This module exports annotations from cell type modules in a uniform format to a public s3 bucket for use in other applications.
2+
Annotations can be found in `s3://openscpca-celltype-annotations-public-access`.
3+
4+
For each library, a JSON file is exported with the following information:
5+
6+
| | |
7+
| -- | -- |
8+
| `barcodes` | An array of unique cell barcodes |
9+
| `openscpca_celltype_annotation` | An array of cell type annotations assigned in `OpenScPCA-nf` |
10+
| `openscpca_celltype_ontology` | An array of Cell Ontology identifiers associated with the cell type annotation. If no Cell Ontology identifiers are assigned, this will be `NA` |
11+
| `module_name` | Name of the original analysis module used to assign cell type annotations in `OpenScPCA-analysis` |
12+
| `openscpca_nf_version` | Version of `OpenScPCA-nf` |
13+
| `release_date` | Release date of input ScPCA data |

modules/export-annotations/main.nf

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env nextflow
2+
3+
// Workflow to format and export openscpca annotations
4+
5+
process format_annotations {
6+
container params.scpcatools_slim_container
7+
tag "${sample_id}"
8+
label 'mem_8'
9+
publishDir "${params.annotations_bucket}/${params.release_prefix}/${project_id}/${sample_id}", mode: 'copy'
10+
input:
11+
tuple val(sample_id),
12+
val(project_id),
13+
path(annotations_tsv_files),
14+
val(annotation_metadata)
15+
output:
16+
tuple val(sample_id),
17+
val(project_id),
18+
path("*_openscpca-annotations.json")
19+
script:
20+
library_ids = annotations_tsv_files.collect{(it.name =~ /SCPCL\d{6}/)[0]}
21+
"""
22+
for library_id in ${library_ids.join(" ")};do
23+
# get the input files for the library id
24+
annotations_file=\$(ls ${annotations_tsv_files} | grep "\${library_id}")
25+
26+
export-celltype-json.R \
27+
--annotations_tsv_file \$annotations_file \
28+
--annotation_column "${annotation_metadata.annotation_column}" \
29+
${annotation_metadata.ontology_column ? "--ontology_column '${annotation_metadata.ontology_column}'" : ''} \
30+
--module_name ${annotation_metadata.module_name} \
31+
--release_date ${params.release_prefix} \
32+
--openscpca_nf_version ${workflow.manifest.version} \
33+
--output_json_file \${library_id}_openscpca-annotations.json
34+
done
35+
"""
36+
37+
stub:
38+
library_ids = annotations_tsv_files.collect{(it.name =~ /SCPCL\d{6}/)[0]}
39+
"""
40+
for library_id in ${library_ids.join(" ")};do
41+
touch \${library_id}_openscpca-annotations.json
42+
done
43+
"""
44+
}
45+
46+
workflow export_annotations {
47+
take:
48+
celltype_ch // [sample_id, project_id, [cell type assignment files], annotation metadata]
49+
main:
50+
// export json
51+
format_annotations(celltype_ch)
52+
53+
emit:
54+
format_annotations.out // [sample id, project id, annotations json]
55+
}
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#!/usr/bin/env Rscript
2+
3+
# This script is used to create a JSON file of annotations for a single library
4+
# JSON file will include barcodes, annotation column, ontology column (if provided),
5+
# openscpca-nf version, data release date, and module name
6+
7+
library(optparse)
8+
9+
option_list <- list(
10+
make_option(
11+
opt_str = c("--annotations_tsv_file"),
12+
type = "character",
13+
help = "Path to TSV file with cell type annotations"
14+
),
15+
make_option(
16+
opt_str = c("--annotation_column"),
17+
type = "character",
18+
help = "Name of the column containing the cell type annotations to use for openscpca_celltype_annotation"
19+
),
20+
make_option(
21+
opt_str = c("--ontology_column"),
22+
default = "",
23+
type = "character",
24+
help = "Name of the column containing the cell type ontology IDs to use for openscpca_celltype_ontology"
25+
),
26+
make_option(
27+
opt_str = c("--module_name"),
28+
type = "character",
29+
help = "Name of original module in OpenScPCA-analysis"
30+
),
31+
make_option(
32+
opt_str = c("--release_date"),
33+
type = "character",
34+
help = "Release date of data used when generating annotations"
35+
),
36+
make_option(
37+
opt_str = c("--openscpca_nf_version"),
38+
type = "character",
39+
help = "Version of OpenScPCA-nf workflow"
40+
),
41+
make_option(
42+
opt_str = "--output_json_file",
43+
type = "character",
44+
help = "Path to JSON file to save cell type annotations"
45+
)
46+
)
47+
48+
# Parse options
49+
opt <- parse_args(OptionParser(option_list = option_list))
50+
51+
# Set up -----------------------------------------------------------------------
52+
53+
# make sure input/output exist
54+
stopifnot(
55+
"annotations TSV file does not exist" = file.exists(opt$annotations_tsv_file),
56+
"annotation column must be provided" = !is.null(opt$annotation_column),
57+
"module name must be provided" = !is.null(opt$module_name),
58+
"release date must be provided" = !is.null(opt$release_date),
59+
"openscpca-nf version must be provided" = !is.null(opt$openscpca_nf_version),
60+
"output json file must end in .json" = stringr::str_ends(opt$output_json_file, "\\.json")
61+
)
62+
63+
# read in annotations
64+
annotations_df <- readr::read_tsv(opt$annotations_tsv_file)
65+
66+
# check that barcodes and annotation column exist
67+
stopifnot(
68+
"barcodes column must be present in provided TSV file" = "barcodes" %in% colnames(annotations_df),
69+
"annotation column is not present in provided TSV file" = opt$annotation_column %in% colnames(annotations_df)
70+
)
71+
72+
# check for ontology ids if provided
73+
if (!is.null(opt$ontology_column) && nzchar(opt$ontology_column)) { # option defaults to "" (never NULL), so test for a non-empty column name
74+
stopifnot(
75+
"ontology column is not present in provided TSV file" = opt$ontology_column %in% colnames(annotations_df)
76+
)
77+
ontology_ids <- annotations_df[[opt$ontology_column]]
78+
} else {
79+
ontology_ids <- NA
80+
}
81+
82+
# build json contents
83+
json_contents <- list(
84+
module_name = opt$module_name,
85+
openscpca_nf_version = opt$openscpca_nf_version,
86+
release_date = opt$release_date,
87+
barcodes = annotations_df$barcodes,
88+
openscpca_celltype_annotation = annotations_df[[opt$annotation_column]],
89+
openscpca_celltype_ontology = ontology_ids
90+
)
91+
92+
# export json file
93+
jsonlite::write_json(
94+
json_contents,
95+
path = opt$output_json_file,
96+
simplifyVector = TRUE,
97+
auto_unbox = TRUE,
98+
pretty = TRUE
99+
)

nextflow.config

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ params {
4444
release_bucket = "s3://openscpca-data-release"
4545
results_bucket = "s3://openscpca-nf-workflow-results-staging"
4646
sim_bucket = "s3://openscpca-test-data-release-staging"
47+
annotations_bucket = "s3://openscpca-celltype-annotations-public-access"
4748
project = "all"
4849

4950
// URIs to reference files
@@ -111,6 +112,7 @@ profiles {
111112
release_bucket = "s3://openscpca-test-data-release-public-access" // test bucket
112113
results_bucket = "test/stub/results" // no output
113114
sim_bucket = "test/stub/simulated" // local output
115+
annotations_bucket = "test/stub/annotations" // local output
114116
project = "SCPCP000012" // a small project
115117

116118
// override large reference files for stub testing

nextflow_schema.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@
3636
"description": "Base URI for simulated data output",
3737
"help_text": "Standard configurations will use an S3 bucket, but local paths can also be used."
3838
},
39+
"annotations_bucket": {
40+
"type": "string",
41+
"default": "s3://openscpca-celltype-annotations-public-access",
42+
"description": "Base URI for saving cell type annotations output",
43+
"help_text": "Standard configurations will use an S3 bucket, but local paths can also be used."
44+
},
3945
"project": {
4046
"type": "string",
4147
"default": "all",

0 commit comments

Comments
 (0)