 import torch
 from huggingface_hub import hf_hub_download
 from scdataloader import Preprocessor
+from scdataloader.utils import load_genes
 from scprint import scPrint
 from scprint.tasks import Embedder
 
...
     repo_id="jkobject/scPRINT", filename=f"{par['model_name']}.ckpt"
 )
 
+print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True)
+
 if torch.cuda.is_available():
     print("CUDA is available, using GPU", flush=True)
-    precision = "16"
-    dtype = torch.float16
     transformer = "flash"
 else:
     print("CUDA is not available, using CPU", flush=True)
-    precision = "32"
-    dtype = torch.float32
     transformer = "normal"
 
-print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True)
+try:
+    m = torch.load(model_checkpoint_file)
+# if this fails, fall back to the call below: the checkpoint weights are saved with
+# GPU (CUDA) storage types by default, so on a CPU-only machine they have to be remapped
+except RuntimeError:
+    m = torch.load(model_checkpoint_file, map_location=torch.device("cpu"))
 
-m = torch.load(model_checkpoint_file, map_location=torch.device("cpu"))
+# the next two blocks handle compatibility issues with checkpoints saved by different
+# versions of the pretrained model; in both cases the model is loaded with the
+# transformer backend selected above
+if "prenorm" in m["hyper_parameters"]:
+    m["hyper_parameters"].pop("prenorm")
+    torch.save(m, model_checkpoint_file)
 if "label_counts" in m["hyper_parameters"]:
+    # precpt_gene_emb must be None, otherwise the model looks for its precomputed
+    # gene-embedding files; for a pretrained checkpoint these embeddings were already
+    # converted into model weights, so the file is not needed
     model = scPrint.load_from_checkpoint(
         model_checkpoint_file,
-        transformer=transformer,  # Don't use this for GPUs with flashattention
         precpt_gene_emb=None,
         classes=m["hyper_parameters"]["label_counts"],
+        transformer=transformer,
     )
 else:
     model = scPrint.load_from_checkpoint(
-        model_checkpoint_file,
-        transformer=transformer,  # Don't use this for GPUs with flashattention
-        precpt_gene_emb=None,
+        model_checkpoint_file, precpt_gene_emb=None, transformer=transformer
     )
 del m
+# a mismatch can occur when the model was trained on a gene set different from the one
+# in the current ontology (e.g. a newer ontology release); genes in the ontology that are
+# missing from the model are fine, but the opposite is not, so remove the genes that are
+# in the model but not in the ontology
+missing = set(model.genes) - set(load_genes(model.organisms).index)
+if len(missing) > 0:
+    print(
+        "Warning: gene mismatch between model and ontology: solving...",
+    )
+    model._rm_genes(missing)
+
+# as above, if no GPU is available the model needs to be converted to float32
+if not torch.cuda.is_available():
+    model = model.to(torch.float32)
+
+# inference can run in float16 on a GPU; on CPU use float32
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+# after loading, some submodules may still sit on "cuda" while others are on "cpu",
+# so make sure the whole model ends up on a single device
+model = model.to("cuda" if torch.cuda.is_available() else "cpu")
 
 print("\n>>> Embedding data...", flush=True)
 n_cores = min(len(os.sched_getaffinity(0)), 24)
...
     keep_all_cls_pred=False,
     output_expression="none",
     save_every=30_000,
-    precision=precision,
     dtype=dtype,
 )
 embedded, _ = embedder(model, adata, cache=False)
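For reference, a quick sanity check on what the Embedder returned before the embeddings are written out. This is a minimal sketch, assuming the returned `embedded` object is an AnnData; the exact `.obsm` key used by scPRINT is version dependent, so it is printed rather than hard-coded:

# inspect the fields scPRINT added to the returned AnnData
print(embedded)
# list the embedding matrices available for downstream use
print(list(embedded.obsm.keys()))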
|