Skip to content

Commit fef6966

Browse files
selmanozleyentimtreispre-commit-ci[bot]
authored
Ligrec Reproducibility and Bug Fix in Sparse Data. (#991)
* cast data * unit test attempt? old code: 572k new: 424k nans * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update the fix and the test case * make the document clearer * added comment * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Tim Treis <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent e44440e commit fef6966

File tree

2 files changed

+108
-2
lines changed

2 files changed

+108
-2
lines changed

src/squidpy/gr/_ligrec.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -735,11 +735,15 @@ def extractor(res: Sequence[TempResult]) -> TempResult:
735735
clustering = np.array(data["clusters"].values, dtype=np.int32)
736736

737737
mean = groups.mean().values.T # (n_genes, n_clusters)
738-
mask = groups.apply(lambda c: ((c > 0).sum() / len(c)) >= threshold).values.T # (n_genes, n_clusters)
738+
# see https://github.com/scverse/squidpy/pull/991#issuecomment-2888506296
739+
# for why we need to cast to int64 here
740+
mask = groups.apply(
741+
lambda c: ((c > 0).astype(np.int64).sum() / len(c)) >= threshold
742+
).values.T # (n_genes, n_clusters)
743+
739744
# (n_cells, n_genes)
740745
data = np.array(data[data.columns.difference(["clusters"])].values, dtype=np.float64, order="C")
741746
# all 3 should be C contiguous
742-
743747
return parallelize( # type: ignore[no-any-return]
744748
_analysis_helper,
745749
np.arange(n_perms, dtype=np.int32).tolist(),

tests/graph/test_ligrec.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from pandas.testing import assert_frame_equal
1515
from scanpy import settings as s
1616
from scanpy.datasets import blobs
17+
from scipy.sparse import csc_matrix
1718

1819
from squidpy._constants._pkg_constants import Key
1920
from squidpy.gr import ligrec
@@ -461,3 +462,104 @@ def test_none_source_target(self, adata: AnnData):
461462
)
462463
assert isinstance(pt.interactions, pd.DataFrame)
463464
assert len(pt.interactions) == 1
465+
466+
def test_ligrec_nan_counts(self):
467+
"""
468+
For the test case with 2 clusters (A, B) and 3 gene pairs (Gene1→Gene2, Gene2→Gene3, Gene3→Gene1):
469+
470+
The mask is computed for each gene in each cluster as:
471+
mask[gene, cluster] = (number of cells with value > 0) / (total cells in cluster) >= threshold
472+
473+
Number of cells with value > 0 in each cluster:
474+
Cluster A: [1, 3, 0]
475+
Cluster B: [1, 0, 3]
476+
477+
Number of cells with value > 0 in each cluster divided by total number of cells in the cluster:
478+
Cluster A: [1/3, 3/3, 0/3] = [0.33, 1.0, 0.0]
479+
Cluster B: [1/3, 0/3, 3/3] = [0.33, 0.0, 1.0]
480+
481+
Using threshold=0.8 on this data, the mask is:
482+
Cluster A: [False, True, False]
483+
Cluster B: [False, False, True]
484+
485+
A value in the result becomes NaN if either:
486+
- The ligand's mask is False in the source cluster, OR
487+
- The receptor's mask is False in the target cluster
488+
489+
Only in one combination, the mask is both True in the source and target cluster.
490+
This is the case for Gene2→Gene3 in A→B.
491+
492+
This means from all the possible cluster pairs (A→A, A→B, B→A, B→B) and gene pairs (Gene1→Gene2, Gene2→Gene3, Gene3→Gene1),
493+
(4 cluster pairs * 3 gene pairs = 12 combinations) only one combination is non-NaN.
494+
495+
Therefore, the total number of NaNs is 11.
496+
497+
The expected p-values are:
498+
cluster_1 A B
499+
cluster_2 A B A B
500+
source target
501+
GENE1 GENE2 NaN NaN NaN NaN
502+
GENE2 GENE3 NaN 0.0 NaN NaN
503+
GENE3 GENE1 NaN NaN NaN NaN
504+
505+
"""
506+
# only Gene2→Gene3 is non-NaN
507+
#
508+
509+
expected_pvalues = np.array(
510+
[
511+
[
512+
np.nan,
513+
np.nan,
514+
np.nan,
515+
np.nan,
516+
],
517+
[
518+
np.nan,
519+
0.0,
520+
np.nan,
521+
np.nan,
522+
],
523+
[
524+
np.nan,
525+
np.nan,
526+
np.nan,
527+
np.nan,
528+
],
529+
]
530+
)
531+
532+
expected_nans = 11
533+
# Setup test data
534+
threshold = 0.8
535+
interactions = pd.DataFrame({"source": ["Gene1", "Gene2", "Gene3"], "target": ["Gene2", "Gene3", "Gene1"]})
536+
537+
# Create sparse matrix with test data
538+
X = csc_matrix(
539+
[
540+
[1.0, 0.1, 0.0], # A1
541+
[0.0, 1.0, 0.0], # A2
542+
[0.0, 1.0, 0.0], # A3
543+
[0.1, 0.0, 1.0], # B1
544+
[0.0, 0.0, 1.0], # B2
545+
[0.0, 0.0, 1.0], # B3
546+
]
547+
)
548+
549+
# Create AnnData object
550+
adata = AnnData(
551+
X=X,
552+
obs=pd.DataFrame({"cluster": ["A"] * 3 + ["B"] * 3}, index=[f"cell{i}" for i in range(1, 7)]),
553+
var=pd.DataFrame(index=["Gene1", "Gene2", "Gene3"]),
554+
)
555+
adata.obs["cluster"] = adata.obs["cluster"].astype("category")
556+
557+
# Run ligrec and compare NaN counts
558+
res = ligrec(
559+
adata, cluster_key="cluster", interactions=interactions, threshold=threshold, use_raw=False, copy=True
560+
)
561+
562+
actual_nans = np.sum(np.isnan(res["pvalues"].values))
563+
564+
assert actual_nans == expected_nans, f"NaN count mismatch: expected {expected_nans}, got {actual_nans}"
565+
np.testing.assert_array_equal(res["pvalues"].values, expected_pvalues)

0 commit comments

Comments
 (0)