
Commit 81856f1

Un-pin scPRINT and update parameters (#51)

jkobject and lazappi authored
Co-authored-by: Luke Zappia <[email protected]>
1 parent 3794a92 commit 81856f1

File tree

4 files changed: +31 −18 lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions

@@ -1,3 +1,9 @@
+# task_batch_integration devel
+
+## Minor changes
+
+* Un-pin the scPRINT version and update parameters (PR #51)
+
 # task_batch_integration 2.0.0
 
 A major update to the OpenProblems framework, switching from a Python-based framework to a Viash + Nextflow-based framework. This update features the same concepts as the previous version, but with a new implementation that is more flexible, scalable, and maintainable.

_viash.yaml

Lines changed: 5 additions & 1 deletion

@@ -91,7 +91,11 @@ authors:
     info:
       github: sainirmayi
       orcid: 0009-0003-6319-9803
-
+  - name: Jeremie Kalfon
+    roles: [contributor]
+    info:
+      github: jkobject
+      orcid: 0000-0002-2818-9728
 config_mods: |
   .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
 
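The `config_mods` block above maps abstract resource labels to concrete Nextflow directives. As a rough illustration of that label-to-directive lookup (a plain dictionary sketch, not Viash's actual parser or data model):

```python
# Hypothetical sketch of the resource-label table defined in config_mods.
# Viash/Nextflow apply these as process directives; this only illustrates
# the mapping itself.
LABELS = {
    "lowmem": "memory = 20.Gb",
    "midmem": "memory = 50.Gb",
    "highmem": "memory = 100.Gb",
    "lowcpu": "cpus = 5",
    "midcpu": "cpus = 15",
    "highcpu": "cpus = 30",
    "lowtime": "time = 1.h",
    "midtime": "time = 4.h",
    "hightime": "time = 8.h",
    "veryhightime": "time = 24.h",
}

def directive(label: str) -> str:
    """Look up the Nextflow directive string for a resource label."""
    return LABELS[label]

print(directive("midmem"))  # memory = 50.Gb
```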

src/methods/scprint/config.vsh.yaml

Lines changed: 4 additions & 8 deletions

@@ -57,7 +57,7 @@ arguments:
   - name: --batch_size
     type: integer
     description: The size of the batches to be used in the DataLoader.
-    default: 64
+    default: 32
   - name: --max_len
     type: integer
     description: The maximum length of the gene sequence.
@@ -75,19 +75,15 @@ engines:
     setup:
       - type: python
         pip:
-          - huggingface_hub
-          # Can be unpinned after https://github.com/cantinilab/scPRINT/issues/14 is resolved
-          - scprint==1.6.2
-          - scdataloader==1.6.4
+          - scprint
       - type: docker
         run: lamin init --storage ./main --name main --schema bionty
-      - type: python
-        script: import bionty as bt; bt.core.sync_all_sources_to_latest()
       - type: docker
         run: lamin load anonymous/main
       - type: python
         script: from scdataloader.utils import populate_my_ontology; populate_my_ontology()
-
+      - type: python
+        script: import bionty as bt; bt.core.sync_all_sources_to_latest()
 runners:
   - type: executable
   - type: nextflow
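The engine `setup` list runs top to bottom, so this change does two things: it drops the version pins and moves the bionty sync step so it runs after `populate_my_ontology()`. A toy sketch of that ordered execution (a hypothetical runner that only records what would run; not Viash's implementation):

```python
# Hypothetical sketch: a setup section is an ordered list of steps, executed
# top to bottom. This toy planner just records the commands in order.
setup = [
    {"type": "python", "pip": ["scprint"]},
    {"type": "docker", "run": "lamin init --storage ./main --name main --schema bionty"},
    {"type": "docker", "run": "lamin load anonymous/main"},
    {"type": "python", "script": "populate_my_ontology()"},
    {"type": "python", "script": "sync_all_sources_to_latest()"},
]

def plan(steps):
    """Return a human-readable execution plan, preserving list order."""
    out = []
    for step in steps:
        if "pip" in step:
            out.append("pip install " + " ".join(step["pip"]))
        else:
            out.append(step.get("run") or step.get("script"))
    return out

for line in plan(setup):
    print(line)
```

Because order is preserved, the ontology is populated before the bionty sources are synced, mirroring the reordering in the diff above.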

src/methods/scprint/script.py

Lines changed: 16 additions & 9 deletions

@@ -58,32 +58,39 @@
 model_checkpoint_file = hf_hub_download(
     repo_id="jkobject/scPRINT", filename=f"{par['model_name']}.ckpt"
 )
-print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True)
-model = scPrint.load_from_checkpoint(
-    model_checkpoint_file,
-    transformer="normal",  # Don't use this for GPUs with flashattention
-    precpt_gene_emb=None,
-)
 
 print("\n>>> Embedding data...", flush=True)
 if torch.cuda.is_available():
     print("CUDA is available, using GPU", flush=True)
     precision = "16"
     dtype = torch.float16
+    transformer = "flash"
 else:
     print("CUDA is not available, using CPU", flush=True)
     precision = "32"
     dtype = torch.float32
-n_cores_available = len(os.sched_getaffinity(0))
-print(f"Using {n_cores_available} worker cores")
+    transformer = "normal"
+
+print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True)
+model = scPrint.load_from_checkpoint(
+    model_checkpoint_file,
+    transformer=transformer,  # Don't use this for GPUs with flashattention
+    precpt_gene_emb=None,
+)
+
+n_cores = min(len(os.sched_getaffinity(0)), 24)
+print(f"Using {n_cores} worker cores")
 embedder = Embedder(
     how="random expr",
     batch_size=par["batch_size"],
     max_len=par["max_len"],
     add_zero_genes=0,
-    num_workers=n_cores_available,
+    num_workers=n_cores,
     doclass=False,
     doplot=False,
+    pred_embedding=["cell_type_ontology_term_id"],
+    keep_all_cls_pred=False,
+    output_expression="none",
     precision=precision,
     dtype=dtype,
 )
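Besides choosing the transformer backend from CUDA availability, the updated script caps the number of DataLoader workers at 24 rather than using every available core. That capping logic in isolation (stdlib only; the `worker_cores` helper and the `os.cpu_count()` fallback are our additions for portability, since `os.sched_getaffinity` is Linux-specific):

```python
import os

def worker_cores(cap: int = 24) -> int:
    """Number of DataLoader workers: available CPU cores, capped at `cap`.

    os.sched_getaffinity(0) returns the set of cores this process may run
    on (Linux only); elsewhere fall back to os.cpu_count(). The fallback
    is our addition and is not in the original script.
    """
    try:
        available = len(os.sched_getaffinity(0))
    except AttributeError:
        available = os.cpu_count() or 1
    return min(available, cap)

print(f"Using {worker_cores()} worker cores")
```

Capping avoids spawning dozens of worker processes on large nodes, where per-worker overhead can outweigh any loading speedup.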
