Fix broadcasting error in get_rankings() when ties span max_rank

mass-a · mass-a · commit 55aa45b8c471 · 2025-11-14T12:23:59.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,12 +14,18 @@ and this project adheres to [Semantic Versioning][].
 
 	- First stable implementation of the UCell algorithm
 	- Implements gene ranking and calculation of signature scores
-	- Compared to the R version, we also include two different ways of 
+	- Compared to the R version, we also include two different ways of
 	  handling missing genes ("impute" or "skip", see the missing_genes parameter)
 
 ## Version 0.4.0
 
 ### Added
 
-	- Smoothing of UCell scores by k-neareast neighbors. Implemented 
+	- Smoothing of UCell scores by k-nearest neighbors. Implemented
 	  in new function `smooth_knn_scores()`
+
+## Version 0.5.0
+
+### Fixed
+
+	- Fixed a bug in `get_rankings()` where ties spanning `max_rank` could cause broadcasting errors.
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ requires = [ "hatchling" ]
 
 [project]
 name = "pyucell"
-version = "0.4.0"
+version = "0.5.0"
 description = "Gene signature scoring for single-cell data"
 readme = "README.md"
 license = { file = "LICENSE" }
diff --git a/src/pyucell/ranks.py b/src/pyucell/ranks.py
@@ -1,8 +1,8 @@
-from warnings import warn
+import numpy as np
 from anndata import AnnData
 from scipy import sparse
 from scipy.stats import rankdata
-import numpy as np
+
 
 def get_rankings(
     data,
@@ -29,49 +29,62 @@ def get_rankings(
     ranks : csr_matrix of shape (genes, cells)
         Sparse matrix of ranks.
     """
-
-    # Accept either AnnData or matrix directly
+    # Load matrix
     if isinstance(data, AnnData):
         X = data.layers[layer] if layer else data.X
     else:
         X = data
 
     n_cells, n_genes = X.shape
 
-    # Convert to array
-    is_sparse = sparse.issparse(X)
-    Xarr = X.toarray() if is_sparse else np.asarray(X)
+    # Store COO components per cell in lists of arrays
+    data_parts = []
+    row_parts = []
+    col_parts = []
 
-    # Allocate vectors, at most max_rank entries per cell
-    n_cells, n_genes = X.shape
-    nnz_per_cell = max_rank 
-    nnz_total = n_cells * nnz_per_cell
+    for j in range(n_cells):
+        col = X[j, :]
+        if sparse.issparse(col):
+            col = col.toarray().ravel()
+        else:
+            col = np.asarray(col, dtype=float)
 
-    data = np.empty(nnz_total, dtype=np.int32)
-    rows = np.empty(nnz_total, dtype=np.int32)
-    cols = np.empty(nnz_total, dtype=np.int32)
+        # missing values
+        np.nan_to_num(col, copy=False)
 
-    #Calculate ranks, while keeping the matrix sparse
-    ptr = 0
-    for j in range(n_cells):
-        col = Xarr[j, :].astype(float)
-        col[np.isnan(col)] = -np.inf
-        ranks = rankdata(-col, method=ties_method)
-        mask = ranks <= max_rank  #mask out ranks to impose sparsity
-        idx = np.nonzero(mask)[0]
-        rks = ranks[idx].astype(np.int32)
-        n = len(idx)
-
-        data[ptr:ptr+n] = rks
-        rows[ptr:ptr+n] = idx
-        cols[ptr:ptr+n] = j
-        ptr += n
-
-    # slice arrays to actual size
-    data = data[:ptr]
-    rows = rows[:ptr]
-    cols = cols[:ptr]
-
-    ranks_mat = sparse.coo_matrix((data, (rows,cols)), shape=(n_genes,n_cells)).tocsr()
-    
+        # Only rank non-zero elements
+        nz_idx = np.nonzero(col)[0]
+        if len(nz_idx) == 0:
+            continue
+
+        nz_vals = col[nz_idx]
+        ranks = rankdata(-nz_vals, method=ties_method).astype(np.int32)
+
+        keep_mask = ranks <= max_rank
+        kept_idx = nz_idx[keep_mask]
+        kept_ranks = ranks[keep_mask]
+
+        if len(kept_idx) > max_rank:
+            kept_idx = kept_idx[:max_rank]
+            kept_ranks = kept_ranks[:max_rank]
+
+        n = len(kept_idx)
+        if n == 0:
+            continue
+
+        # Convert to small NumPy arrays per cell
+        data_parts.append(kept_ranks)
+        row_parts.append(kept_idx)
+        col_parts.append(np.full(n, j, dtype=np.int32))
+
+    # All zeros
+    if not data_parts:
+        return sparse.csr_matrix((n_genes, n_cells), dtype=np.int32)
+
+    # Concatenate arrays only once at the end
+    data_arr = np.concatenate(data_parts).astype(np.int32)
+    rows_arr = np.concatenate(row_parts).astype(np.int32)
+    cols_arr = np.concatenate(col_parts).astype(np.int32)
+
+    ranks_mat = sparse.csr_matrix((data_arr, (rows_arr, cols_arr)), shape=(n_genes, n_cells))
     return ranks_mat
diff --git a/src/pyucell/scoring.py b/src/pyucell/scoring.py
@@ -75,9 +75,12 @@ def _calculate_U(ranks, idx, max_rank: int = 1500):
 
     if len(present_idx) > 0:
         present_ranks = ranks[present_idx, :]
-        if sparse.issparse(present_ranks):
-            present_ranks = present_ranks.toarray()
-        present_ranks = np.asarray(present_ranks, dtype=np.float32)
+        # Always convert to dense safely
+        present_ranks = present_ranks.toarray() if sparse.issparse(present_ranks) else np.asarray(present_ranks)
+        # Ensure 2D shape even if single row
+        if present_ranks.ndim == 1:
+            present_ranks = present_ranks[np.newaxis, :]
+        present_ranks = present_ranks.astype(np.float32)
         # rank==0 is equivalent to max_rank (for sparsity)
         present_ranks[present_ranks == 0] = max_rank
         rank_sum += present_ranks.sum(axis=0)
@@ -88,12 +91,7 @@ def _calculate_U(ranks, idx, max_rank: int = 1500):
     return score
 
 
-def _score_chunk(
-    ranks: sparse.csr_matrix,
-    sig_indices: dict,
-    w_neg: float = 1.0,
-    max_rank: int = 1500
-    ):
+def _score_chunk(ranks: sparse.csr_matrix, sig_indices: dict, w_neg: float = 1.0, max_rank: int = 1500):
     n_genes, n_cells = ranks.shape
     n_signatures = len(sig_indices)
     scores = np.zeros((n_cells, n_signatures), dtype=np.float32)
@@ -178,17 +176,14 @@ def process_chunk(start, end):
         # compute ranks
         ranks_chunk = get_rankings(chunk_X, max_rank=max_rank, ties_method=ties_method)
         # get UCell scores for chunk
-        scores_chunk = _score_chunk(ranks_chunk, sig_indices, w_neg = w_neg, max_rank=max_rank)
+        scores_chunk = _score_chunk(ranks_chunk, sig_indices, w_neg=w_neg, max_rank=max_rank)
         return (start, end, scores_chunk)
 
     # Run chunks in serial or parallel
     if n_jobs == 1:
         results = [process_chunk(start, end) for start, end in chunks]
     else:
-        results = Parallel(n_jobs=n_jobs, backend="loky")(
-            delayed(process_chunk)(start, end)
-            for start, end in chunks
-        )
+        results = Parallel(n_jobs=n_jobs, backend="loky")(delayed(process_chunk)(start, end) for start, end in chunks)
 
     # Merge results back
     scores_all = np.zeros((n_cells, n_signatures), dtype=np.float32)