diff --git a/CHANGELOG.md b/CHANGELOG.md index 91dd0da3..2a0efbba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -159,4 +159,6 @@ * All control methods are updated to cater the new schema. * All control methods are re-enabled. Selectively disable them when running the pipeline using method exclude. +* Fix bug in EMD where nan cannot be written out and added sklearn dependency for cytovi (PR #110). + diff --git a/src/methods/cytovi/config.vsh.yaml b/src/methods/cytovi/config.vsh.yaml index 91095326..eb6ff7d8 100644 --- a/src/methods/cytovi/config.vsh.yaml +++ b/src/methods/cytovi/config.vsh.yaml @@ -72,6 +72,7 @@ engines: - pyyaml - requests - jsonschema + - scikit-learn github: - openproblems-bio/core#subdirectory=packages/python/openproblems diff --git a/src/methods/cytovi/script.py b/src/methods/cytovi/script.py index 6a6fd73f..6a9768af 100644 --- a/src/methods/cytovi/script.py +++ b/src/methods/cytovi/script.py @@ -1,8 +1,8 @@ import anndata as ad import numpy as np -import scanpy as sc from scvi.external import cytovi from sklearn.cluster import KMeans +from threadpoolctl import threadpool_limits ## VIASH START par = { @@ -38,11 +38,12 @@ print("Clustering using k-means with k =", par["n_clusters"], flush=True) # cluster data using Kmeans -adata_to_correct.obs["clusters"] = ( - KMeans(n_clusters=par["n_clusters"], random_state=0) - .fit_predict(adata_to_correct.layers["scaled"]) - .astype(str) -) +with threadpool_limits(limits=1): + adata_to_correct.obs["clusters"] = ( + KMeans(n_clusters=par["n_clusters"], random_state=0) + .fit_predict(adata_to_correct.layers["scaled"]) + .astype(str) + ) # concatenate obs so we can use it for subsampling adata_to_correct.obs["sample_cluster"] = ( adata_to_correct.obs["sample"].astype(str) + "_" + adata_to_correct.obs["clusters"] diff --git a/src/metrics/emd/helper.py b/src/metrics/emd/helper.py index c2f78a2c..14fb0b71 100644 --- a/src/metrics/emd/helper.py +++ b/src/metrics/emd/helper.py @@ -59,14 +59,12 @@ def calculate_vertical_emd( # mean cell type emd across all sample combinations, markers, and splits mean_emd_ct = np.nanmean( - emd_long[emd_long["cell_type"] != "global"] - .drop(columns=["cell_type", "first_sample", "second_sample"]) + emd_long.drop(columns=["cell_type", "first_sample", "second_sample"]) .to_numpy() .flatten() ) max_emd_ct = np.nanmax( - emd_long[emd_long["cell_type"] != "global"] - .drop(columns=["cell_type", "first_sample", "second_sample"]) + emd_long.drop(columns=["cell_type", "first_sample", "second_sample"]) .to_numpy() .flatten() ) @@ -114,7 +112,7 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li f" at least 2 samples per group. Skipping EMD vertical calculation." ) - return np.nan, np.nan + return np.nan cell_types = i_adata.obs["cell_type"].unique() @@ -156,6 +154,7 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li # remove unparsable characters like "/" emd_vals.columns = emd_vals.columns.str.replace("/", "_") + # TODO remove me once we are happy with the results # prepare the data to draw the heatmap in cytonorm 2 supp paper. # 1 row/column = 1 sample, a cell is emd for a given marker # repeat for every marker assessed