openproblems-bio · ghar1821 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -159,4 +159,6 @@
   * All control methods are updated to cater the new schema.
   * All control methods are re-enabled. Selectively disable them when running the pipeline using method exclude.
 
+* Fix bug in EMD where NaN values could not be written out, and add scikit-learn dependency for cytovi (PR #110).
+
 
diff --git a/src/methods/cytovi/config.vsh.yaml b/src/methods/cytovi/config.vsh.yaml
@@ -72,6 +72,7 @@ engines:
           - pyyaml
           - requests
           - jsonschema
+          - scikit-learn
         github:
           - openproblems-bio/core#subdirectory=packages/python/openproblems
 

diff --git a/src/methods/cytovi/script.py b/src/methods/cytovi/script.py
@@ -1,8 +1,8 @@
 import anndata as ad
 import numpy as np
-import scanpy as sc
 from scvi.external import cytovi
 from sklearn.cluster import KMeans
+from threadpoolctl import threadpool_limits
 
 ## VIASH START
 par = {
@@ -38,11 +38,12 @@
 
 print("Clustering using k-means with k =", par["n_clusters"], flush=True)
 # cluster data using Kmeans
-adata_to_correct.obs["clusters"] = (
-    KMeans(n_clusters=par["n_clusters"], random_state=0)
-    .fit_predict(adata_to_correct.layers["scaled"])
-    .astype(str)
-)
+with threadpool_limits(limits=1):
+    adata_to_correct.obs["clusters"] = (
+        KMeans(n_clusters=par["n_clusters"], random_state=0)
+        .fit_predict(adata_to_correct.layers["scaled"])
+        .astype(str)
+    )
 # concatenate obs so we can use it for subsampling
 adata_to_correct.obs["sample_cluster"] = (
     adata_to_correct.obs["sample"].astype(str) + "_" + adata_to_correct.obs["clusters"]

diff --git a/src/metrics/emd/helper.py b/src/metrics/emd/helper.py
@@ -59,14 +59,12 @@ def calculate_vertical_emd(
 
         # mean cell type emd across all sample combinations, markers, and splits
         mean_emd_ct = np.nanmean(
-            emd_long[emd_long["cell_type"] != "global"]
-            .drop(columns=["cell_type", "first_sample", "second_sample"])
+            emd_long.drop(columns=["cell_type", "first_sample", "second_sample"])
             .to_numpy()
             .flatten()
         )
         max_emd_ct = np.nanmax(
-            emd_long[emd_long["cell_type"] != "global"]
-            .drop(columns=["cell_type", "first_sample", "second_sample"])
+            emd_long.drop(columns=["cell_type", "first_sample", "second_sample"])
             .to_numpy()
             .flatten()
         )
@@ -114,7 +112,7 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li
             f" at least 2 samples per group. Skipping EMD vertical calculation."
         )
 
-        return np.nan, np.nan
+        return np.nan
 
     cell_types = i_adata.obs["cell_type"].unique()
 
@@ -156,6 +154,7 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li
     # remove unparsable characters like "/"
     emd_vals.columns = emd_vals.columns.str.replace("/", "_")
 
+    # TODO: remove the heatmap data preparation below once we are happy with the results
     # prepare the data to draw the heatmap in cytonorm 2 supp paper.
     # 1 row/column = 1 sample, a cell is emd for a given marker
     # repeat for every marker assessed
Original file line number	Diff line number	Diff line change
Expand Up		@@ -159,4 +159,6 @@
		* All control methods are updated to cater the new schema.
		* All control methods are re-enabled. Selectively disable them when running the pipeline using method exclude.

		* Fix bug in EMD where NaN values could not be written out, and add scikit-learn dependency for cytovi (PR #110).