
Commit dfbf973

Reduce sp and sc data to shared genes (#59)
1 parent 8297ead commit dfbf973

2 files changed: 10 additions, 7 deletions

src/data_processors/process_dataset/script.py

Lines changed: 6 additions, 7 deletions
@@ -19,13 +19,12 @@
 # Load the spatial data
 sdata = sd.read_zarr(par["input_sp"])
 
-# Subset the single-cell data to spatial genes
-genes_sp = []
-for key in sdata.tables.keys():
-    # todo: var column names need to be updated to match the rest of openproblems
-    genes_sp = genes_sp + sdata.tables[key].var_names.tolist()
-genes_sp = list(np.unique(genes_sp))
-adata = adata[:,adata.var["feature_name"].isin(genes_sp)].copy()
+# Subset single-cell and spatial data to shared genes
+sp_genes = sdata['transcripts']['feature_name'].unique().compute().tolist()
+sc_genes = adata.var["feature_name"].unique().tolist()
+shared_genes = list(set(sp_genes) & set(sc_genes))
+sdata['transcripts'] = sdata['transcripts'].loc[sdata['transcripts']['feature_name'].isin(shared_genes)]
+adata = adata[:,adata.var["feature_name"].isin(shared_genes)].copy()
 
 # Use feature names for adata instead of feature ids. convert to str
 adata.var.reset_index(inplace=True, drop=True)
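The new subsetting logic boils down to a set intersection over the `feature_name` values of the two modalities, followed by filtering each side to the shared set. A minimal, self-contained sketch of that logic, using hypothetical stand-in lists in place of the real SpatialData transcripts table and AnnData var table:

```python
# Hypothetical stand-ins: in the real script, sp_feature_names would come from
# sdata['transcripts']['feature_name'] and sc_feature_names from
# adata.var["feature_name"].
sp_feature_names = ["ACTB", "GAPDH", "XIST", "ACTB"]  # spatial transcripts
sc_feature_names = ["ACTB", "GAPDH", "EGFR"]          # single-cell genes

# Same idea as the commit: intersect the unique gene sets, then keep only
# the entries from the shared set on each side.
shared_genes = set(sp_feature_names) & set(sc_feature_names)
sp_kept = [g for g in sp_feature_names if g in shared_genes]
sc_kept = [g for g in sc_feature_names if g in shared_genes]

print(sorted(shared_genes))  # ['ACTB', 'GAPDH']
```

Note that genes present in only one modality (here `XIST` and `EGFR`) drop out of both tables, which is what lets downstream steps assume a common gene universe.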

src/workflows/process_datasets/test.sh

Lines changed: 4 additions, 0 deletions
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+# NOTE: For local testing you might need to reduce the memory in src/data_processors/process_dataset/config.vsh.yaml
+# Don't forget to rebuild that dependency of the workflow:
+# viash ns build src/data_processors/process_dataset/config.vsh.yaml --setup cachedbuild
+
 nextflow run . \
   -main-script target/nextflow/workflows/process_datasets/main.nf \
   -profile docker \
