Skip to content

Commit 55aa45b

Browse files
committed
fix bug in get_rankings()
1 parent 66dc980 commit 55aa45b

File tree

4 files changed

+68
-54
lines changed

4 files changed

+68
-54
lines changed

CHANGELOG.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,18 @@ and this project adheres to [Semantic Versioning][].
1414

1515
- First stable implementation of the UCell algorithm
1616
- Implements gene ranking and calculation of signature scores
17-
- Compared to the R version, we also include two different ways of
17+
- Compared to the R version, we also include two different ways of
1818
handling missing genes ("impute" or "skip", see the missing_genes parameter)
1919

2020
## Version 0.4.0
2121

2222
### Added
2323

24-
- Smoothing of UCell scores by k-neareast neighbors. Implemented
24+
- Smoothing of UCell scores by k-neareast neighbors. Implemented
2525
in new function `smooth_knn_scores()`
26+
27+
## Version 0.5.0
28+
29+
### Added
30+
31+
- Fixed a bug in `get_rankings()` where ties spanning max_rank could cause broadcasting errors.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ requires = [ "hatchling" ]
44

55
[project]
66
name = "pyucell"
7-
version = "0.4.0"
7+
version = "0.5.0"
88
description = "Gene signature scoring for single-cell data"
99
readme = "README.md"
1010
license = { file = "LICENSE" }

src/pyucell/ranks.py

Lines changed: 50 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
from warnings import warn
1+
import numpy as np
22
from anndata import AnnData
33
from scipy import sparse
44
from scipy.stats import rankdata
5-
import numpy as np
5+
66

77
def get_rankings(
88
data,
@@ -29,49 +29,62 @@ def get_rankings(
2929
ranks : csr_matrix of shape (genes, cells)
3030
Sparse matrix of ranks.
3131
"""
32-
33-
# Accept either AnnData or matrix directly
32+
# Load matrix
3433
if isinstance(data, AnnData):
3534
X = data.layers[layer] if layer else data.X
3635
else:
3736
X = data
3837

3938
n_cells, n_genes = X.shape
4039

41-
# Convert to array
42-
is_sparse = sparse.issparse(X)
43-
Xarr = X.toarray() if is_sparse else np.asarray(X)
40+
# Store COO components per cell in lists of arrays
41+
data_parts = []
42+
row_parts = []
43+
col_parts = []
4444

45-
# Allocate vectors, at most max_rank entries per cell
46-
n_cells, n_genes = X.shape
47-
nnz_per_cell = max_rank
48-
nnz_total = n_cells * nnz_per_cell
45+
for j in range(n_cells):
46+
col = X[j, :]
47+
if sparse.issparse(col):
48+
col = col.toarray().ravel()
49+
else:
50+
col = np.asarray(col, dtype=float)
4951

50-
data = np.empty(nnz_total, dtype=np.int32)
51-
rows = np.empty(nnz_total, dtype=np.int32)
52-
cols = np.empty(nnz_total, dtype=np.int32)
52+
# missing values
53+
np.nan_to_num(col, copy=False)
5354

54-
#Calculate ranks, while keeping the matrix sparse
55-
ptr = 0
56-
for j in range(n_cells):
57-
col = Xarr[j, :].astype(float)
58-
col[np.isnan(col)] = -np.inf
59-
ranks = rankdata(-col, method=ties_method)
60-
mask = ranks <= max_rank #mask out ranks to impose sparsity
61-
idx = np.nonzero(mask)[0]
62-
rks = ranks[idx].astype(np.int32)
63-
n = len(idx)
64-
65-
data[ptr:ptr+n] = rks
66-
rows[ptr:ptr+n] = idx
67-
cols[ptr:ptr+n] = j
68-
ptr += n
69-
70-
# slice arrays to actual size
71-
data = data[:ptr]
72-
rows = rows[:ptr]
73-
cols = cols[:ptr]
74-
75-
ranks_mat = sparse.coo_matrix((data, (rows,cols)), shape=(n_genes,n_cells)).tocsr()
76-
55+
# Only rank non-zero elements
56+
nz_idx = np.nonzero(col)[0]
57+
if len(nz_idx) == 0:
58+
continue
59+
60+
nz_vals = col[nz_idx]
61+
ranks = rankdata(-nz_vals, method=ties_method).astype(np.int32)
62+
63+
keep_mask = ranks <= max_rank
64+
kept_idx = nz_idx[keep_mask]
65+
kept_ranks = ranks[keep_mask]
66+
67+
if len(kept_idx) > max_rank:
68+
kept_idx = kept_idx[:max_rank]
69+
kept_ranks = kept_ranks[:max_rank]
70+
71+
n = len(kept_idx)
72+
if n == 0:
73+
continue
74+
75+
# Convert to small NumPy arrays per cell
76+
data_parts.append(kept_ranks)
77+
row_parts.append(kept_idx)
78+
col_parts.append(np.full(n, j, dtype=np.int32))
79+
80+
# All zeros
81+
if not data_parts:
82+
return sparse.csr_matrix((n_genes, n_cells), dtype=np.int32)
83+
84+
# Concatenate arrays only once at the end
85+
data_arr = np.concatenate(data_parts).astype(np.int32)
86+
rows_arr = np.concatenate(row_parts).astype(np.int32)
87+
cols_arr = np.concatenate(col_parts).astype(np.int32)
88+
89+
ranks_mat = sparse.csr_matrix((data_arr, (rows_arr, cols_arr)), shape=(n_genes, n_cells))
7790
return ranks_mat

src/pyucell/scoring.py

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,12 @@ def _calculate_U(ranks, idx, max_rank: int = 1500):
7575

7676
if len(present_idx) > 0:
7777
present_ranks = ranks[present_idx, :]
78-
if sparse.issparse(present_ranks):
79-
present_ranks = present_ranks.toarray()
80-
present_ranks = np.asarray(present_ranks, dtype=np.float32)
78+
# Always convert to dense safely
79+
present_ranks = present_ranks.toarray() if sparse.issparse(present_ranks) else np.asarray(present_ranks)
80+
# Ensure 2D shape even if single row
81+
if present_ranks.ndim == 1:
82+
present_ranks = present_ranks[np.newaxis, :]
83+
present_ranks = present_ranks.astype(np.float32)
8184
# rank==0 is equivalent to max_rank (for sparsity)
8285
present_ranks[present_ranks == 0] = max_rank
8386
rank_sum += present_ranks.sum(axis=0)
@@ -88,12 +91,7 @@ def _calculate_U(ranks, idx, max_rank: int = 1500):
8891
return score
8992

9093

91-
def _score_chunk(
92-
ranks: sparse.csr_matrix,
93-
sig_indices: dict,
94-
w_neg: float = 1.0,
95-
max_rank: int = 1500
96-
):
94+
def _score_chunk(ranks: sparse.csr_matrix, sig_indices: dict, w_neg: float = 1.0, max_rank: int = 1500):
9795
n_genes, n_cells = ranks.shape
9896
n_signatures = len(sig_indices)
9997
scores = np.zeros((n_cells, n_signatures), dtype=np.float32)
@@ -178,17 +176,14 @@ def process_chunk(start, end):
178176
# compute ranks
179177
ranks_chunk = get_rankings(chunk_X, max_rank=max_rank, ties_method=ties_method)
180178
# get UCell scores for chunk
181-
scores_chunk = _score_chunk(ranks_chunk, sig_indices, w_neg = w_neg, max_rank=max_rank)
179+
scores_chunk = _score_chunk(ranks_chunk, sig_indices, w_neg=w_neg, max_rank=max_rank)
182180
return (start, end, scores_chunk)
183181

184182
# Run chunks in serial or parallel
185183
if n_jobs == 1:
186184
results = [process_chunk(start, end) for start, end in chunks]
187185
else:
188-
results = Parallel(n_jobs=n_jobs, backend="loky")(
189-
delayed(process_chunk)(start, end)
190-
for start, end in chunks
191-
)
186+
results = Parallel(n_jobs=n_jobs, backend="loky")(delayed(process_chunk)(start, end) for start, end in chunks)
192187

193188
# Merge results back
194189
scores_all = np.zeros((n_cells, n_signatures), dtype=np.float32)

0 commit comments

Comments
 (0)