Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/containers.config
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ params{
cell_type_nb_04_container = 'public.ecr.aws/openscpca/cell-type-neuroblastoma-04:latest'


// cell-type-scimilarity module
cell_type_scimilarity_container = 'public.ecr.aws/openscpca/cell-type-scimilarity:latest'

}
5 changes: 5 additions & 0 deletions config/module_params.config
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,9 @@ params{
cell_type_nb_04_label_map_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-neuroblastoma-04/references/nbatlas-label-map.tsv'
cell_type_nb_04_ontology_map_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-neuroblastoma-04/references/nbatlas-ontology-ids.tsv'

// cell type scimilarity
// TODO: Replace with tagged links once we have a new OpenScPCA-analysis tag
cell_type_scimilarity_model = 's3://scpca-references/celltype/scimilarity_references/model_v1.1'
cell_type_scimilarity_ontology_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-scimilarity/references/scimilarity-mapped-ontologies.tsv'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

noting we'll want to update this one too with a tagged link, same as my NB urls above


}
4 changes: 4 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ include { cell_type_consensus } from './modules/cell-type-consensus'
include { cell_type_ewings } from './modules/cell-type-ewings'
include { cell_type_neuroblastoma_04 } from './modules/cell-type-neuroblastoma-04'
include { infercnv_gene_order } from './modules/infercnv-gene-order'
include { cell_type_scimilarity } from './modules/cell-type-scimilarity'

// **** Parameter checks ****
include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema'
Expand Down Expand Up @@ -76,4 +77,7 @@ workflow {
// Run the cell type neuroblastoma 04 workflow
// only runs on SCPCP000004
cell_type_neuroblastoma_04(sample_ch.filter{ it[1] == "SCPCP000004" })

// run the scimilarity cell type workflow
cell_type_scimilarity(sample_ch)
}
12 changes: 12 additions & 0 deletions modules/cell-type-scimilarity/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
This module performs cell type annotation using [`SCimilarity`](https://genentech.github.io/scimilarity/)
Scripts are derived from the the `cell-type-scimilarity` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.

Links to specific original scripts used in this module:

- `02-run-scimilarity.py`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-scimilarity/scripts/02-run-scimilarity.py>

This module uses the [`SCimilarity` foundational model](https://zenodo.org/records/10685499) which can be found at `s3://scpca-references/celltype/scimilarity_references/model_v1.1`.

This module also uses the following reference files found in the `OpenScPCA-analysis` repository:

- `scimilarity-mapped-ontologies.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-scimilarity/references/scimilarity-mapped-ontologies.tsv>
64 changes: 64 additions & 0 deletions modules/cell-type-scimilarity/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env nextflow

// Workflow to assign cell types using SCimilarity

process assign_scimilarity {
container params.cell_type_scimilarity_container
tag "${sample_id}"
label 'mem_32'
publishDir "${params.results_bucket}/${params.release_prefix}/cell-type-scimilarity/${project_id}/${sample_id}", mode: 'copy'
input:
tuple val(sample_id),
val(project_id),
path(library_files)
path scimilarity_model
path ontology_map_file
output:
tuple val(sample_id),
val(project_id),
path("*_scimilarity-celltype-assignments.tsv.gz")
script:
"""
for file in ${library_files}; do
run-scimilarity.py \
--model_dir ${scimilarity_model} \
--processed_h5ad_file \$file \
--ontology_map_file ${ontology_map_file} \
--predictions_tsv \$(basename \${file%_rna.h5ad}_scimilarity-celltype-assignments.tsv.gz) \
--seed 2025
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is assigned in the file so you probably don't need it, but doesn't

done
"""

stub:
"""
for file in ${library_files}; do
touch \$(basename \${file%_rna.h5ad}_scimilarity-celltype-assignments.tsv.gz)
done
"""
}



workflow cell_type_scimilarity {
take:
sample_ch // [sample_id, project_id, sample_path]
main:
// create [sample_id, project_id, [list of processed files]]
libraries_ch = sample_ch
.map{sample_id, project_id, sample_path ->
def library_files = Utils.getLibraryFiles(sample_path, format: "anndata", process_level: "processed")
// filter to only include _rna.h5ad files and remove any _adt.h5ad files
library_files = library_files.findAll{ it.name =~ /(?i)_rna.h5ad$/ }
return [sample_id, project_id, library_files]
}

// assign cell types using scimilarity
assign_scimilarity(
libraries_ch,
file(params.cell_type_scimilarity_model, type: 'dir'),
file(params.cell_type_scimilarity_ontology_ref_file)
)

emit:
assign_scimilarity.out // [sample_id, project_id, [list of scimilarity output files]]
}
3 changes: 3 additions & 0 deletions modules/cell-type-scimilarity/resources/test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This directory contains files used in testing when running with the `stub` profile.

* `scimilarity_model` is an empty directory to use in `stub` runs instead of downloading the full `SCimilarity` model
Empty file.
193 changes: 193 additions & 0 deletions modules/cell-type-scimilarity/resources/usr/bin/run-scimilarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
#!/usr/bin/env python3

# Script to annotate processed ScPCA objects with SCimilarity
# Follows this tutorial: https://genentech.github.io/scimilarity/notebooks/cell_annotation_tutorial.html

# Adapted from: https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-scimilarity/scripts/02-run-scimilarity.py

import argparse
import sys
from pathlib import Path
import random

import anndata
import pandas
from scipy.sparse import csr_matrix
from scimilarity import CellAnnotation
from scimilarity.utils import align_dataset, lognorm_counts


def format_scimilarity(adata: anndata.AnnData) -> anndata.AnnData:
"""
Creates a new AnnData object formatted for running SCimilarity:

- Gene symbols, taken from the 'gene_symbol' column in 'adata.var' are used as var_names
If any duplicate gene symbols are found, they are collapsed by summing counts.
- The summed counts matrix is then stored in the 'counts' layer of the new AnnData object.

Parameters
----------
adata : AnnData
Input AnnData object, with "gene_symbol" column in `adata.var`.

Returns
-------
AnnData
New AnnData object with gene symbols as var_names and counts stored in the 'counts' layer.
"""

# Check that gene symbol column exists
if not "gene_symbol" in adata.var.columns:
raise ValueError(
"The input AnnData object must have a 'gene_symbol' column in `adata.var`."
)

# set gene symbols as var_names and make sure X has the raw counts
adata.var_names = adata.var["gene_symbol"].astype(str)
adata.X = adata.raw.X

# create a DataFrame with raw counts, dropping anything that doesn't have a gene symbol
counts_df = adata.to_df().drop(columns=["nan"])
# Collapse duplicates by summing and make sparse
collapsed_df = counts_df.T.groupby(level=0).sum().T

# Build new AnnData with collapsed counts stored as layers
# this is expected by SCimilarity
adata_collapsed = anndata.AnnData(
obs=pandas.DataFrame(index=collapsed_df.index), # original cell barcodes
var=pandas.DataFrame(
index=collapsed_df.columns
), # gene symbols after collapsing
layers={"counts": csr_matrix(collapsed_df)},
)

return adata_collapsed


def main() -> None:
parser = argparse.ArgumentParser(
description="Annotate ScPCA samples using SCimilarity",
)
parser.add_argument(
"--model_dir",
type=Path,
required=True,
help="Path to the directory containing the SCimilarity foundational model",
)
parser.add_argument(
"--processed_h5ad_file",
type=Path,
required=True,
help="Path to the processed AnnData object stored as an h5ad file",
)
parser.add_argument(
"--ontology_map_file",
type=Path,
required=True,
help="Path to TSV file containing Cell Ontology identifiers for annotation terms",
)
parser.add_argument(
"--predictions_tsv",
type=Path,
required=True,
help="Path to the output TSV file with cell type annotations",
)
parser.add_argument(
"--seed",
type=int,
default=2025,
help="Random seed to ensure reproducibility",
)
arg = parser.parse_args()

# Set the seed
random.seed(arg.seed)

################################################
########### Input argument checks ##############
################################################
arg_error = False

# Check that input files exist
if not arg.model_dir.is_dir():
print(
f"The provided reference SCimilarity model could not be found at: {arg.model_dir}.",
file=sys.stderr,
)
arg_error = True
if not arg.processed_h5ad_file.is_file():
print(
f"The provided input H5AD file could not be found at: {arg.processed_h5ad_file}.",
file=sys.stderr,
)
arg_error = True
if not arg.ontology_map_file.is_file():
print(
f"The ontology map file could not be found at: {arg.ontology_map_file}.",
file=sys.stderr,
)
arg_error = True

# Exit if error(s)
if arg_error:
sys.exit(1)

################################################
################ Prep input data ###############
################################################

# Read in model
scimilarity_model = CellAnnotation(model_path=arg.model_dir)

# read in ontology identifiers
ontology_map = pandas.read_csv(
arg.ontology_map_file, sep="\t", index_col="scimilarity_celltype_annotation"
)

# Read and make sure object formatting is correct
processed_anndata = anndata.read_h5ad(arg.processed_h5ad_file)
processed_anndata = format_scimilarity(processed_anndata)

# Preprocess the data
# Align the query dataset to the reference model
processed_anndata = align_dataset(processed_anndata, scimilarity_model.gene_order)
# Log-normalize the counts
processed_anndata = lognorm_counts(processed_anndata)

################################################
############### Run Scimilarity ###############
################################################

# compute embeddings
processed_anndata.obsm["X_scimilarity"] = scimilarity_model.get_embeddings(
processed_anndata.X
)

# Predict cell types
predictions, nn_idxs, nn_dists, nn_stats = scimilarity_model.get_predictions_knn(
processed_anndata.obsm["X_scimilarity"], weighting=True
)

################################################
################ Export annotations ############
################################################

# prepare the predictions with min distance for export
predictions_df = pandas.DataFrame(
{
"barcode": processed_anndata.obs_names.to_list(),
"scimilarity_celltype_annotation": predictions.values,
"min_dist": nn_stats["min_dist"],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this column in this workflow?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes! This is a stat we are going to use to measure confidence, as recommended by SCimilarity docs, so we want to output it so we can use it for exploratory analysis.

}
)
# add in ontology IDs
predictions_df = predictions_df.join(
ontology_map, on="scimilarity_celltype_annotation", validate="many_to_one"
)

# export TSV
predictions_df.to_csv(arg.predictions_tsv, sep="\t", index=False)


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,13 @@ profiles {
sim_bucket = "test/stub/simulated" // local output
project = "SCPCP000012" // a small project

// override large reference files for stub testing
// Override full NBAtlas with empty file for stub testing
cell_type_nb_04_nbatlas_url = "${projectDir}/modules/cell-type-neuroblastoma-04/resources/test/NBAtlas_test.rds"

// scimilarity model stub
cell_type_scimilarity_model = "modules/cell-type-scimilarity/resources/test/scimilarity_model" // local stub model

}
process {
executor = 'local'
Expand Down
18 changes: 18 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@
"cell_type_nb_04_container": {
"type": "string",
"default": "public.ecr.aws/openscpca/cell-type-neuroblastoma-04:latest"
},
"cell_type_scimilarity_container": {
"type": "string",
"default": "public.ecr.aws/openscpca/cell-type-scimilarity:latest"
}
}
},
Expand Down Expand Up @@ -194,6 +198,20 @@
"mimetype": "text/tab-separated-values",
"description": "Table with AUC thresholds to use for each gene set to define cell states"
},
"cell_type_scimilarity_model": {
"type": "string",
"default": "s3://scpca-references/celltype/scimilarity_references/model_v1.1",
"format": "directory-path",
"description": "URI to SCimilarity model"
},
"cell_type_scimilarity_ontology_ref_file": {
"type": "string",
"default": "https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-scimilarity/references/scimilarity-mapped-ontologies.tsv",
"pattern": "\\.tsv$",
"format": "file-path",
"mimetype": "text/tab-separated-values",
"description": "TSV file with SCimilarity annotation labels and associated cell ontology identifiers"
},
"cell_type_nb_04_scanvi_pp_threshold": {
"type": "number",
"default": 0.75,
Expand Down