Merge pull request #172 from AlexsLemonade/allyhawkins/export-openscpca-annotations

allyhawkins · web-flow · commit c232c5d34325 · 2025-09-02T10:52:25.000-05:00
Module for exporting openscpca annotations
diff --git a/main.nf b/main.nf
@@ -11,6 +11,7 @@ include { cell_type_ewings } from './modules/cell-type-ewings'
 include { cell_type_neuroblastoma_04 } from './modules/cell-type-neuroblastoma-04'
 include { infercnv_gene_order } from './modules/infercnv-gene-order'
 include { cell_type_scimilarity } from './modules/cell-type-scimilarity'
+include { export_annotations } from './modules/export-annotations'
 
 // **** Parameter checks ****
 include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema'
@@ -84,4 +85,11 @@ workflow {
   // only runs on SCPCP000004
   cell_type_neuroblastoma_04(sample_ch.filter{ it[1] == "SCPCP000004" })
 
+  // format and export json files with openscpca annotations
+  // input expected to be sample id, project id, tsv files, annotation meta
+  // annotation meta should be a Groovy map (dictionary) containing at least the `module_name:` and `annotation_column:` keys.
+  // The optional key `ontology_column:` will also be used if provided.
+  // mix outputs from all cell type modules first
+  export_ch = cell_type_ewings.out.celltypes
+  export_annotations(export_ch)
 }
diff --git a/modules/cell-type-ewings/main.nf b/modules/cell-type-ewings/main.nf
@@ -147,7 +147,20 @@ workflow cell_type_ewings {
     // assign cell types
     ewing_assign_celltypes(assign_ch, file(params.cell_type_ewings_auc_thresholds_file))
 
+    // add ewing specific metadata to output tuple
+    celltype_output_ch = ewing_assign_celltypes.out
+      .map{ sample_id, project_id, assignment_files -> tuple(
+        sample_id,
+        project_id,
+        assignment_files,
+        [ // annotation metadata
+          module_name: "cell-type-ewings",
+          annotation_column: "ewing_annotation",
+          ontology_column: "ewing_ontology"
+        ]
+      )}
+
   emit:
     aucell = ewing_aucell.out // [sample_id, project_id, [aucell output files], [mean gene expression files]]
-    celltypes = ewing_assign_celltypes.out // [sample_id, project_id, [cell type assignment files]]
+    celltypes = celltype_output_ch // [sample_id, project_id, [cell type assignment files], annotation_metadata]
 }
diff --git a/modules/export-annotations/README.md b/modules/export-annotations/README.md
@@ -0,0 +1,13 @@
+This module exports annotations from cell type modules in a uniform format to a public s3 bucket for use in other applications.
+Annotations can be found in `s3://openscpca-celltype-annotations-public-access`.
+
+For each library, a JSON file is exported with the following information:
+
+| | |
+| -- | -- |
+| `barcodes` | An array of unique cell barcodes |
+| `openscpca_celltype_annotation` | An array of cell type annotations assigned in `OpenScPCA-nf` |
+| `openscpca_celltype_ontology` | An array of Cell Ontology identifiers associated with the cell type annotation. If no Cell Ontology identifiers are assigned, this will be `NA` |
+| `module_name` | Name of the original analysis module used to assign cell type annotations in `OpenScPCA-analysis` |
+| `openscpca_nf_version` | Version of `OpenScPCA-nf` |
+| `release_date` | Release date of input ScPCA data |
diff --git a/modules/export-annotations/main.nf b/modules/export-annotations/main.nf
@@ -0,0 +1,55 @@
+#!/usr/bin/env nextflow
+
+// Workflow to format and export openscpca annotations
+
+process format_annotations {
+  container params.scpcatools_slim_container
+  tag "${sample_id}"
+  label 'mem_8'
+  publishDir "${params.annotations_bucket}/${params.release_prefix}/${project_id}/${sample_id}", mode: 'copy'
+  input:
+    tuple val(sample_id),
+          val(project_id),
+          path(annotations_tsv_files),
+          val(annotation_metadata)
+  output:
+    tuple val(sample_id),
+          val(project_id),
+          path("*_openscpca-annotations.json")
+  script:
+    library_ids = annotations_tsv_files.collect{(it.name =~ /SCPCL\d{6}/)[0]}
+    """
+    for library_id in ${library_ids.join(" ")};do
+      # get the input files for the library id
+      annotations_file=\$(ls ${annotations_tsv_files} | grep "\${library_id}")
+
+      export-celltype-json.R \
+        --annotations_tsv_file \$annotations_file \
+        --annotation_column "${annotation_metadata.annotation_column}" \
+        ${annotation_metadata.ontology_column ? "--ontology_column  '${annotation_metadata.ontology_column}'" : ''} \
+        --module_name ${annotation_metadata.module_name} \
+        --release_date ${params.release_prefix} \
+        --openscpca_nf_version ${workflow.manifest.version} \
+        --output_json_file \${library_id}_openscpca-annotations.json
+    done
+    """
+
+  stub:
+    library_ids = annotations_tsv_files.collect{(it.name =~ /SCPCL\d{6}/)[0]}
+    """
+    for library_id in ${library_ids.join(" ")};do
+      touch \${library_id}_openscpca-annotations.json
+    done
+    """
+}
+
+workflow export_annotations {
+  take:
+    celltype_ch  // [sample_id, project_id, [cell type assignment files], annotation metadata]
+  main:
+    // export json
+    format_annotations(celltype_ch)
+
+  emit:
+    format_annotations.out // [sample id, project id, annotations json]
+}
diff --git a/modules/export-annotations/resources/usr/bin/export-celltype-json.R b/modules/export-annotations/resources/usr/bin/export-celltype-json.R
@@ -0,0 +1,99 @@
+#!/usr/bin/env Rscript
+
+# This script creates a JSON file of annotations for a single library.
+# The JSON file will include barcodes, annotation column, ontology column (if provided),
+# openscpca-nf version, data release date, and module name
+
+library(optparse)
+
+option_list <- list(
+  make_option(
+    opt_str = c("--annotations_tsv_file"),
+    type = "character",
+    help = "Path to TSV file with cell type annotations"
+  ),
+  make_option(
+    opt_str = c("--annotation_column"),
+    type = "character",
+    help = "Name of the column containing the cell type annotations to use for openscpca_celltype_annotation"
+  ),
+  make_option(
+    opt_str = c("--ontology_column"),
+    default = "",
+    type = "character",
+    help = "Name of the column containing the cell type ontology IDs to use for openscpca_celltype_ontology"
+  ),
+  make_option(
+    opt_str = c("--module_name"),
+    type = "character",
+    help = "Name of original module in OpenScPCA-analysis"
+  ),
+  make_option(
+    opt_str = c("--release_date"),
+    type = "character",
+    help = "Release date of data used when generating annotations"
+  ),
+  make_option(
+    opt_str = c("--openscpca_nf_version"),
+    type = "character",
+    help = "Version of OpenScPCA-nf workflow"
+  ),
+  make_option(
+    opt_str = "--output_json_file",
+    type = "character",
+    help = "Path to JSON file to save cell type annotations"
+  )
+)
+
+# Parse options
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# Set up -----------------------------------------------------------------------
+
+# make sure required arguments were provided (a NULL path would otherwise yield a zero-length check that stopifnot() passes)
+stopifnot(
+  "annotations TSV file does not exist" = !is.null(opt$annotations_tsv_file) && file.exists(opt$annotations_tsv_file),
+  "annotation column must be provided" = !is.null(opt$annotation_column),
+  "module name must be provided" = !is.null(opt$module_name),
+  "release date must be provided" = !is.null(opt$release_date),
+  "openscpca-nf version must be provided" = !is.null(opt$openscpca_nf_version),
+  "output json file must end in .json" = !is.null(opt$output_json_file) && stringr::str_ends(opt$output_json_file, "\\.json")
+)
+
+# read in annotations
+annotations_df <- readr::read_tsv(opt$annotations_tsv_file)
+
+# check that barcodes and annotation column exist
+stopifnot(
+  "barcodes column must be present in provided TSV file" = "barcodes" %in% colnames(annotations_df),
+  "annotation column is not present in provided TSV file" = opt$annotation_column %in% colnames(annotations_df)
+)
+
+# use ontology ids only if a column name was given; --ontology_column defaults to "", so NULL alone is not a sufficient test
+if (!is.null(opt$ontology_column) && nzchar(opt$ontology_column)) {
+  stopifnot(
+    "ontology column is not present in provided TSV file" = opt$ontology_column %in% colnames(annotations_df)
+  )
+  ontology_ids <- annotations_df[[opt$ontology_column]]
+} else {
+  ontology_ids <- NA
+}
+
+# build json contents
+json_contents <- list(
+  module_name = opt$module_name,
+  openscpca_nf_version = opt$openscpca_nf_version,
+  release_date = opt$release_date,
+  barcodes = annotations_df$barcodes,
+  openscpca_celltype_annotation = annotations_df[[opt$annotation_column]],
+  openscpca_celltype_ontology = ontology_ids
+)
+
+# export json file
+jsonlite::write_json(
+  json_contents,
+  path = opt$output_json_file,
+  simplifyVector = TRUE,
+  auto_unbox = TRUE,
+  pretty = TRUE
+)
diff --git a/nextflow.config b/nextflow.config
@@ -44,6 +44,7 @@ params {
   release_bucket = "s3://openscpca-data-release"
   results_bucket = "s3://openscpca-nf-workflow-results-staging"
   sim_bucket = "s3://openscpca-test-data-release-staging"
+  annotations_bucket = "s3://openscpca-celltype-annotations-public-access"
   project = "all"
 
   // URIs to reference files
@@ -111,6 +112,7 @@ profiles {
       release_bucket = "s3://openscpca-test-data-release-public-access" // test bucket
       results_bucket = "test/stub/results" // no output
       sim_bucket = "test/stub/simulated" // local output
+      annotations_bucket = "test/stub/annotations" // local output
       project = "SCPCP000012" // a small project
 
       // override large reference files for stub testing
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -36,6 +36,12 @@
           "description": "Base URI for simulated data output",
           "help_text": "Standard configurations will use an S3 bucket, but local paths can also be used."
         },
+        "annotations_bucket": {
+          "type": "string",
+          "default": "s3://openscpca-celltype-annotations-public-access",
+          "description": "Base URI for saving cell type annotations output",
+          "help_text": "Standard configurations will use an S3 bucket, but local paths can also be used."
+        },
         "project": {
           "type": "string",
           "default": "all",