
Commit a75dc6e

jkobject and lazappi authored
Update scPRINT to handle large datasets (#54)
* dbug scprint
* allowing flash attn
* Update _viash.yaml
* Update CHANGELOG
* adding some debug
* better model loading and new model
* final debug
* better now
* finish debug
* ending tests successfully
* removing flag
* new dataloader version
* Update CHANGELOG

Co-authored-by: Luke Zappia <[email protected]>

1 parent: 81856f1

File tree: 3 files changed (+43, −24 lines)

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -3,6 +3,7 @@
 ## Minor changes
 
 * Un-pin the scPRINT version and update parameters (PR #51)
+* Update scPRINT to better handle large datasets, including a new default model (PR #54)
 
 # task_batch_integration 2.0.0
 
```

src/methods/scprint/config.vsh.yaml

Lines changed: 8 additions & 6 deletions
```diff
@@ -35,7 +35,7 @@ info:
   scprint_large:
     model_name: "large"
   scprint_medium:
-    model_name: "medium"
+    model_name: "v2-medium"
   scprint_small:
     model_name: "small"
 test_setup:
```
```diff
@@ -48,8 +48,8 @@ arguments:
   - name: "--model_name"
     type: "string"
     description: Which model to use. Not used if --model is provided.
-    choices: ["large", "medium", "small"]
-    default: "large"
+    choices: ["large", "v2-medium", "small"]
+    default: "v2-medium"
   - name: --model
     type: file
     description: Path to the scPRINT model.
```
```diff
@@ -75,15 +75,17 @@ engines:
     setup:
       - type: python
         pip:
-          - scprint
+          - git+https://github.com/cantinilab/scPRINT.git@d8cc270b099c8d5dacf6913acc26f2b696685b2b
+          - gseapy==1.1.2
+          - git+https://github.com/jkobject/scDataLoader.git@c67c24a2e5c62399912be39169aae76e29e108aa
       - type: docker
         run: lamin init --storage ./main --name main --schema bionty
       - type: docker
         run: lamin load anonymous/main
-      - type: python
-        script: from scdataloader.utils import populate_my_ontology; populate_my_ontology()
       - type: python
         script: import bionty as bt; bt.core.sync_all_sources_to_latest()
+      - type: python
+        script: from scdataloader.utils import populate_my_ontology; populate_my_ontology()
 runners:
   - type: executable
   - type: nextflow
```
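The pip dependencies are now pinned to specific commits of scPRINT and scDataLoader (plus `gseapy==1.1.2`), and the ontology population step is moved to run after the bionty source sync. For reference, a minimal sketch of what the two Python setup steps execute, in their new order; this assumes the pinned packages above are installed and the `lamin` CLI steps have already created and loaded the `anonymous/main` instance:

```python
# Sketch of the config's Python setup steps, in their reordered sequence.
# Assumes `lamin init --storage ./main --name main --schema bionty` and
# `lamin load anonymous/main` have already run in the Docker build.
import bionty as bt
from scdataloader.utils import populate_my_ontology

# Sync all bionty ontology sources to their latest versions first...
bt.core.sync_all_sources_to_latest()

# ...then populate the local ontology tables used by scDataLoader.
populate_my_ontology()
```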

src/methods/scprint/script.py

Lines changed: 34 additions & 18 deletions
```diff
@@ -13,7 +13,7 @@
 par = {
     "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
     "output": "output.h5ad",
-    "model_name": "large",
+    "model_name": "v2-medium",
     "model": None,
 }
 meta = {"name": "scprint"}
```
```diff
@@ -30,14 +30,18 @@
 
 print("\n>>> Reading input data...", flush=True)
 input = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns")
-if input.uns["dataset_organism"] == "homo_sapiens":
-    input.obs["organism_ontology_term_id"] = "NCBITaxon:9606"
-elif input.uns["dataset_organism"] == "mus_musculus":
-    input.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
-else:
-    exit_non_applicable(
-        f"scPRINT requires human or mouse data, not '{input.uns['dataset_organism']}'"
-    )
+if (
+    "organism_ontology_term_id" not in input.obs.columns
+    and "dataset_organism" in input.uns
+):
+    if input.uns["dataset_organism"] == "homo_sapiens":
+        input.obs["organism_ontology_term_id"] = "NCBITaxon:9606"
+    elif input.uns["dataset_organism"] == "mus_musculus":
+        input.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
+    else:
+        exit_non_applicable(
+            f"scPRINT requires human or mouse data, not '{input.uns['dataset_organism']}'"
+        )
 adata = input.copy()
 
 print("\n>>> Preprocessing data...", flush=True)
```
```diff
@@ -59,25 +63,36 @@
         repo_id="jkobject/scPRINT", filename=f"{par['model_name']}.ckpt"
     )
 
-print("\n>>> Embedding data...", flush=True)
 if torch.cuda.is_available():
     print("CUDA is available, using GPU", flush=True)
     precision = "16"
     dtype = torch.float16
-    transformer="flash"
+    transformer = "flash"
 else:
     print("CUDA is not available, using CPU", flush=True)
     precision = "32"
     dtype = torch.float32
-    transformer="normal"
+    transformer = "normal"
 
 print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True)
-model = scPrint.load_from_checkpoint(
-    model_checkpoint_file,
-    transformer=transformer,  # Don't use this for GPUs with flashattention
-    precpt_gene_emb=None,
-)
 
+m = torch.load(model_checkpoint_file, map_location=torch.device("cpu"))
+if "label_counts" in m["hyper_parameters"]:
+    model = scPrint.load_from_checkpoint(
+        model_checkpoint_file,
+        transformer=transformer,  # Don't use this for GPUs with flashattention
+        precpt_gene_emb=None,
+        classes=m["hyper_parameters"]["label_counts"],
+    )
+else:
+    model = scPrint.load_from_checkpoint(
+        model_checkpoint_file,
+        transformer=transformer,  # Don't use this for GPUs with flashattention
+        precpt_gene_emb=None,
+    )
+del m
+
+print("\n>>> Embedding data...", flush=True)
 n_cores = min(len(os.sched_getaffinity(0)), 24)
 print(f"Using {n_cores} worker cores")
 embedder = Embedder(
```
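Loading the checkpoint once on CPU to inspect its `hyper_parameters` lets a single code path handle both the older checkpoints and the new v2 ones, which store per-label class counts that must be passed back in as `classes`. Since the two `load_from_checkpoint` branches differ only in that keyword, the pattern can be condensed; a sketch equivalent to the code above, under the same assumptions about the checkpoint layout:

```python
import torch
from scprint import scPrint

# Peek at the checkpoint's stored hyperparameters without building the
# model; map_location="cpu" keeps the raw dict off the GPU.
ckpt = torch.load(model_checkpoint_file, map_location=torch.device("cpu"))

extra_kwargs = {}
if "label_counts" in ckpt["hyper_parameters"]:
    # Newer (v2) checkpoints record class label counts that the model
    # expects at construction time.
    extra_kwargs["classes"] = ckpt["hyper_parameters"]["label_counts"]
del ckpt  # drop the raw checkpoint dict before the real load

model = scPrint.load_from_checkpoint(
    model_checkpoint_file,
    transformer=transformer,  # "normal" on CPU; "flash" on CUDA GPUs
    precpt_gene_emb=None,
    **extra_kwargs,
)
```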
```diff
@@ -91,6 +106,7 @@
     pred_embedding=["cell_type_ontology_term_id"],
     keep_all_cls_pred=False,
     output_expression="none",
+    save_every=30_000,
     precision=precision,
     dtype=dtype,
 )
```
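The new `save_every=30_000` argument is the heart of the large-dataset fix in this hunk: rather than accumulating embeddings for every cell at once, the embedder can process and persist results in bounded chunks of 30,000 cells. scPRINT's internals are not part of this diff, but the general shape of such chunking looks something like the following generic sketch (not scPRINT code; `embed_fn` is a hypothetical callable):

```python
import numpy as np

def embed_in_chunks(embed_fn, n_cells, save_every=30_000):
    """Embed cells in fixed-size chunks so peak memory stays bounded.

    `embed_fn(start, stop)` is a hypothetical callable returning the
    embedding matrix for cells [start, stop). Chunks are concatenated
    here, but they could equally be flushed to disk as they complete.
    """
    chunks = []
    for start in range(0, n_cells, save_every):
        stop = min(start + save_every, n_cells)
        chunks.append(embed_fn(start, stop))
    return np.concatenate(chunks, axis=0)
```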
```diff
@@ -101,7 +117,7 @@
     obs=input.obs[[]],
     var=input.var[[]],
     obsm={
-        "X_emb": embedded.obsm["scprint"],
+        "X_emb": embedded.obsm["scprint_emb"],
     },
     uns={
         "dataset_id": input.uns["dataset_id"],
```
