AlexsLemonade · allyhawkins · Aug 28, 2025 · Aug 26, 2025 · Aug 26, 2025 · Aug 26, 2025
diff --git a/config/containers.config b/config/containers.config
@@ -29,4 +29,7 @@ params{
   cell_type_nb_04_container = 'public.ecr.aws/openscpca/cell-type-neuroblastoma-04:latest'
 
 
+  // cell-type-scimilarity module
+  cell_type_scimilarity_container = 'public.ecr.aws/openscpca/cell-type-scimilarity:latest'
+
 }
diff --git a/config/module_params.config b/config/module_params.config
@@ -29,4 +29,9 @@ params{
   cell_type_nb_04_label_map_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-neuroblastoma-04/references/nbatlas-label-map.tsv'
   cell_type_nb_04_ontology_map_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-neuroblastoma-04/references/nbatlas-ontology-ids.tsv'
 
+  // cell type scimilarity
+  // TODO: Replace with tagged links once we have a new OpenScPCA-analysis tag
+  cell_type_scimilarity_model = 's3://scpca-references/celltype/scimilarity_references/model_v1.1'
+  cell_type_scimilarity_ontology_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-scimilarity/references/scimilarity-mapped-ontologies.tsv'
+
 }
diff --git a/main.nf b/main.nf
@@ -10,6 +10,7 @@ include { cell_type_consensus } from './modules/cell-type-consensus'
 include { cell_type_ewings } from './modules/cell-type-ewings'
 include { cell_type_neuroblastoma_04 } from './modules/cell-type-neuroblastoma-04'
 include { infercnv_gene_order } from './modules/infercnv-gene-order'
+include { cell_type_scimilarity } from './modules/cell-type-scimilarity'
 
 // **** Parameter checks ****
 include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema'
@@ -76,4 +77,7 @@ workflow {
   // Run the cell type neuroblastoma 04 workflow
   // only runs on SCPCP000004
   cell_type_neuroblastoma_04(sample_ch.filter{ it[1] == "SCPCP000004" })
+
+  // run the scimilarity cell type workflow
+  cell_type_scimilarity(sample_ch)
 }
diff --git a/modules/cell-type-scimilarity/README.md b/modules/cell-type-scimilarity/README.md
@@ -0,0 +1,12 @@
+This module performs cell type annotation using [`SCimilarity`](https://genentech.github.io/scimilarity/)
+Scripts are derived from the the `cell-type-scimilarity` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
+
+Links to specific original scripts used in this module:
+
+- `02-run-scimilarity.py`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-scimilarity/scripts/02-run-scimilarity.py>
+
+This module uses the [`SCimilarity` foundational model](https://zenodo.org/records/10685499) which can be found at `s3://scpca-references/celltype/scimilarity_references/model_v1.1`.
+
+This module also uses the following reference files found in the `OpenScPCA-analysis` repository:
+
+- `scimilarity-mapped-ontologies.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-scimilarity/references/scimilarity-mapped-ontologies.tsv>
diff --git a/modules/cell-type-scimilarity/main.nf b/modules/cell-type-scimilarity/main.nf
@@ -0,0 +1,64 @@
+#!/usr/bin/env nextflow
+
+// Workflow to assign cell types using SCimilarity
+
+process assign_scimilarity {
+  container params.cell_type_scimilarity_container
+  tag "${sample_id}"
+  label 'mem_32'
+  publishDir "${params.results_bucket}/${params.release_prefix}/cell-type-scimilarity/${project_id}/${sample_id}", mode: 'copy'
+  input:
+    tuple val(sample_id),
+          val(project_id),
+          path(library_files)
+    path scimilarity_model
+    path ontology_map_file
+  output:
+    tuple val(sample_id),
+          val(project_id),
+          path("*_scimilarity-celltype-assignments.tsv.gz")
+  script:
+    """
+    for file in ${library_files}; do
+      run-scimilarity.py \
+        --model_dir ${scimilarity_model} \
+        --processed_h5ad_file \$file \
+        --ontology_map_file ${ontology_map_file} \
+        --predictions_tsv \$(basename \${file%_rna.h5ad}_scimilarity-celltype-assignments.tsv.gz) \
+        --seed 2025
+    done
+    """
+
+  stub:
+    """
+    for file in ${library_files}; do
+      touch \$(basename \${file%_rna.h5ad}_scimilarity-celltype-assignments.tsv.gz)
+    done
+    """
+}
+
+
+
+workflow cell_type_scimilarity {
+  take:
+    sample_ch  // [sample_id, project_id, sample_path]
+  main:
+    // create [sample_id, project_id, [list of processed files]]
+    libraries_ch = sample_ch
+      .map{sample_id, project_id, sample_path ->
+        def library_files = Utils.getLibraryFiles(sample_path, format: "anndata", process_level: "processed")
+        // filter to only include _rna.h5ad files and remove any _adt.h5ad files
+        library_files = library_files.findAll{ it.name =~ /(?i)_rna.h5ad$/ }
+        return [sample_id, project_id, library_files]
+      }
+
+    // assign cell types using scimilarity
+    assign_scimilarity(
+      libraries_ch,
+      file(params.cell_type_scimilarity_model, type: 'dir'),
+      file(params.cell_type_scimilarity_ontology_ref_file)
+    )
+
+  emit:
+    assign_scimilarity.out // [sample_id, project_id, [list of scimilarity output files]]
+}
diff --git a/modules/cell-type-scimilarity/resources/test/README.md b/modules/cell-type-scimilarity/resources/test/README.md
@@ -0,0 +1,3 @@
+This directory contains files used in testing when running with the `stub` profile.
+
+* `scimilarity_model` is an empty directory to use in `stub` runs instead of downloading the full `SCimilarity` model
diff --git a/modules/cell-type-scimilarity/resources/test/scimilarity_model/.gitkeep b/modules/cell-type-scimilarity/resources/test/scimilarity_model/.gitkeep
diff --git a/modules/cell-type-scimilarity/resources/usr/bin/run-scimilarity.py b/modules/cell-type-scimilarity/resources/usr/bin/run-scimilarity.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+
+# Script to annotate processed ScPCA objects with SCimilarity
+# Follows this tutorial: https://genentech.github.io/scimilarity/notebooks/cell_annotation_tutorial.html
+
+# Adapted from: https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-scimilarity/scripts/02-run-scimilarity.py
+
+import argparse
+import sys
+from pathlib import Path
+import random
+
+import anndata
+import pandas
+from scipy.sparse import csr_matrix
+from scimilarity import CellAnnotation
+from scimilarity.utils import align_dataset, lognorm_counts
+
+
+def format_scimilarity(adata: anndata.AnnData) -> anndata.AnnData:
+    """
+    Creates a new AnnData object formatted for running SCimilarity:
+
+    - Gene symbols, taken from the 'gene_symbol' column in 'adata.var' are used as var_names
+    If any duplicate gene symbols are found, they are collapsed by summing counts.
+    - The summed counts matrix is then stored in the 'counts' layer of the new AnnData object.
+
+    Parameters
+    ----------
+    adata : AnnData
+        Input AnnData object, with "gene_symbol" column in `adata.var`.
+
+    Returns
+    -------
+    AnnData
+        New AnnData object with gene symbols as var_names and counts stored in the 'counts' layer.
+    """
+
+    # Check that gene symbol column exists
+    if not "gene_symbol" in adata.var.columns:
+        raise ValueError(
+            "The input AnnData object must have a 'gene_symbol' column in `adata.var`."
+        )
+
+    # set gene symbols as var_names and make sure X has the raw counts
+    adata.var_names = adata.var["gene_symbol"].astype(str)
+    adata.X = adata.raw.X
+
+    # create a DataFrame with raw counts, dropping anything that doesn't have a gene symbol
+    counts_df = adata.to_df().drop(columns=["nan"])
+    # Collapse duplicates by summing and make sparse
+    collapsed_df = counts_df.T.groupby(level=0).sum().T
+
+    # Build new AnnData with collapsed counts stored as layers
+    # this is expected by SCimilarity
+    adata_collapsed = anndata.AnnData(
+        obs=pandas.DataFrame(index=collapsed_df.index),  # original cell barcodes
+        var=pandas.DataFrame(
+            index=collapsed_df.columns
+        ),  # gene symbols after collapsing
+        layers={"counts": csr_matrix(collapsed_df)},
+    )
+
+    return adata_collapsed
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Annotate ScPCA samples using SCimilarity",
+    )
+    parser.add_argument(
+        "--model_dir",
+        type=Path,
+        required=True,
+        help="Path to the directory containing the SCimilarity foundational model",
+    )
+    parser.add_argument(
+        "--processed_h5ad_file",
+        type=Path,
+        required=True,
+        help="Path to the processed AnnData object stored as an h5ad file",
+    )
+    parser.add_argument(
+        "--ontology_map_file",
+        type=Path,
+        required=True,
+        help="Path to TSV file containing Cell Ontology identifiers for annotation terms",
+    )
+    parser.add_argument(
+        "--predictions_tsv",
+        type=Path,
+        required=True,
+        help="Path to the output TSV file with cell type annotations",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=2025,
+        help="Random seed to ensure reproducibility",
+    )
+    arg = parser.parse_args()
+
+    # Set the seed
+    random.seed(arg.seed)
+
+    ################################################
+    ########### Input argument checks ##############
+    ################################################
+    arg_error = False
+
+    # Check that input files exist
+    if not arg.model_dir.is_dir():
+        print(
+            f"The provided reference SCimilarity model could not be found at: {arg.model_dir}.",
+            file=sys.stderr,
+        )
+        arg_error = True
+    if not arg.processed_h5ad_file.is_file():
+        print(
+            f"The provided input H5AD file could not be found at: {arg.processed_h5ad_file}.",
+            file=sys.stderr,
+        )
+        arg_error = True
+    if not arg.ontology_map_file.is_file():
+        print(
+            f"The ontology map file could not be found at: {arg.ontology_map_file}.",
+            file=sys.stderr,
+        )
+        arg_error = True
+
+    # Exit if error(s)
+    if arg_error:
+        sys.exit(1)
+
+    ################################################
+    ################ Prep input data ###############
+    ################################################
+
+    # Read in model
+    scimilarity_model = CellAnnotation(model_path=arg.model_dir)
+
+    # read in ontology identifiers
+    ontology_map = pandas.read_csv(
+        arg.ontology_map_file, sep="\t", index_col="scimilarity_celltype_annotation"
+    )
+
+    # Read and make sure object formatting is correct
+    processed_anndata = anndata.read_h5ad(arg.processed_h5ad_file)
+    processed_anndata = format_scimilarity(processed_anndata)
+
+    # Preprocess the data
+    # Align the query dataset to the reference model
+    processed_anndata = align_dataset(processed_anndata, scimilarity_model.gene_order)
+    # Log-normalize the counts
+    processed_anndata = lognorm_counts(processed_anndata)
+
+    ################################################
+    ############### Run Scimilarity ###############
+    ################################################
+
+    # compute embeddings
+    processed_anndata.obsm["X_scimilarity"] = scimilarity_model.get_embeddings(
+        processed_anndata.X
+    )
+
+    # Predict cell types
+    predictions, nn_idxs, nn_dists, nn_stats = scimilarity_model.get_predictions_knn(
+        processed_anndata.obsm["X_scimilarity"], weighting=True
+    )
+
+    ################################################
+    ################ Export annotations ############
+    ################################################
+
+    # prepare the predictions with min distance for export
+    predictions_df = pandas.DataFrame(
+        {
+            "barcode": processed_anndata.obs_names.to_list(),
+            "scimilarity_celltype_annotation": predictions.values,
+            "min_dist": nn_stats["min_dist"],
+        }
+    )
+    # add in ontology IDs
+    predictions_df = predictions_df.join(
+        ontology_map, on="scimilarity_celltype_annotation", validate="many_to_one"
+    )
+
+    # export TSV
+    predictions_df.to_csv(arg.predictions_tsv, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nextflow.config b/nextflow.config
@@ -113,8 +113,13 @@ profiles {
       sim_bucket = "test/stub/simulated" // local output
       project = "SCPCP000012" // a small project
 
+      // override large reference files for stub testing
       // Override full NBAtlas with empty file for stub testing
       cell_type_nb_04_nbatlas_url = "${projectDir}/modules/cell-type-neuroblastoma-04/resources/test/NBAtlas_test.rds"
+
+      // scimilarity model stub
+      cell_type_scimilarity_model = "modules/cell-type-scimilarity/resources/test/scimilarity_model" // local stub model
+
     }
     process {
       executor = 'local'

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -94,6 +94,10 @@
         "cell_type_nb_04_container": {
           "type": "string",
           "default": "public.ecr.aws/openscpca/cell-type-neuroblastoma-04:latest"
+        },
+        "cell_type_scimilarity_container": {
+          "type": "string",
+          "default": "public.ecr.aws/openscpca/cell-type-scimilarity:latest"
         }
       }
     },
@@ -194,6 +198,20 @@
           "mimetype": "text/tab-separated-values",
           "description": "Table with AUC thresholds to use for each gene set to define cell states"
         },
+        "cell_type_scimilarity_model": {
+          "type": "string",
+          "default": "s3://scpca-references/celltype/scimilarity_references/model_v1.1",
+          "format": "directory-path",
+          "description": "URI to SCimilarity model"
+        },
+        "cell_type_scimilarity_ontology_ref_file": {
+          "type": "string",
+          "default": "https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-scimilarity/references/scimilarity-mapped-ontologies.tsv",
+          "pattern": "\\.tsv$",
+          "format": "file-path",
+          "mimetype": "text/tab-separated-values",
+          "description": "TSV file with SCimilarity annotation labels and associated cell ontology identifiers"
+        },
         "cell_type_nb_04_scanvi_pp_threshold": {
           "type": "number",
           "default": 0.75,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		This directory contains files used in testing when running with the `stub` profile.

		* `scimilarity_model` is an empty directory to use in `stub` runs instead of downloading the full `SCimilarity` model