Skip to content

Commit d2bf5f1

Browse files
authored
Merge pull request #17 from earmingol/dev
Implemented memory-efficient normalization
2 parents 8dd0b48 + 83fb4c9 commit d2bf5f1

File tree

3 files changed

+101
-25
lines changed

3 files changed

+101
-25
lines changed

sccellfie/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@
1616
from .reaction_activity import (compute_reaction_activity)
1717
from .sccellfie_pipeline import (run_sccellfie_pipeline)
1818

19-
__version__ = "0.4.5"
19+
__version__ = "0.4.6"

sccellfie/preprocessing/adata_utils.py

Lines changed: 62 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,15 @@ def stratified_subsample_adata(adata, group_column, target_fraction=0.20, random
9393
return adata_subsampled
9494

9595

96-
def normalize_adata(adata, target_sum=10_000, n_counts_key='n_counts', copy=False):
96+
from scipy import sparse
97+
from scipy.sparse import issparse, csr_matrix, hstack
98+
99+
100+
def normalize_adata(adata, target_sum=10_000, n_counts_key='n_counts',
101+
chunk_size=None, copy=False):
97102
"""
98-
Preprocesses an AnnData object by normalizing the data to a target sum.
99-
Original adata object is updated in place.
103+
Memory-efficient normalization of AnnData object.
104+
Works directly on sparse matrices without converting to dense.
100105
101106
Parameters
102107
----------
@@ -109,6 +114,11 @@ def normalize_adata(adata, target_sum=10_000, n_counts_key='n_counts', copy=Fals
109114
n_counts_key : str, optional (default: 'n_counts')
110115
The key in adata.obs containing the total counts for each cell.
111116
117+
chunk_size : int or None, optional (default: None)
118+
If None, process entire matrix at once (faster, more memory).
119+
If int, process matrix in chunks of this size (slower, less memory).
120+
Recommended for very large datasets (>1M cells).
121+
112122
copy : bool, optional (default: False)
113123
If True, returns a copy of adata with the normalized data.
114124
"""
@@ -118,37 +128,65 @@ def normalize_adata(adata, target_sum=10_000, n_counts_key='n_counts', copy=Fals
118128
# Check if total counts are already calculated
119129
if n_counts_key not in adata.obs.columns:
120130
warnings.warn(f"{n_counts_key} not found in adata.obs. Calculating total counts.", UserWarning)
121-
n_counts_key = 'total_counts' # scanpy uses 'total_counts' as the key
122-
# Calculate total counts from the raw expression matrix
123-
adata.obs[n_counts_key] = adata.X.sum(axis=1)
124-
125-
# Input data
126-
X_view = adata.X
131+
n_counts_key = 'total_counts'
127132

128-
warnings.warn("Normalizing data.", UserWarning)
133+
if sparse.issparse(adata.X):
134+
if chunk_size is not None:
135+
# Chunked calculation for very large matrices
136+
n_cells = adata.X.shape[0]
137+
counts = np.zeros(n_cells)
129138

130-
# Check if matrix is sparse
131-
is_sparse = sparse.issparse(X_view)
139+
for start in range(0, n_cells, chunk_size):
140+
end = min(start + chunk_size, n_cells)
141+
counts[start:end] = np.array(adata.X[start:end].sum(axis=1)).flatten()
132142

133-
# Convert to dense if sparse
134-
if is_sparse:
135-
X_view = X_view.toarray()
143+
adata.obs[n_counts_key] = counts
144+
else:
145+
# Standard calculation
146+
adata.obs[n_counts_key] = np.array(adata.X.sum(axis=1)).flatten()
147+
else:
148+
# Dense matrix
149+
adata.obs[n_counts_key] = np.array(adata.X.sum(axis=1)).flatten()
136150

137-
# Normalize
138-
n_counts = adata.obs[n_counts_key].values[:, None]
139-
X_norm = X_view / n_counts * target_sum
151+
warnings.warn("Normalizing data.", UserWarning)
140152

141-
# Convert back to sparse if original was sparse
142-
if is_sparse:
143-
X_norm = sparse.csr_matrix(X_norm)
153+
# Get counts and calculate scaling factors
154+
n_counts = adata.obs[n_counts_key].values
155+
scaling_factors = target_sum / n_counts
156+
157+
# Perform normalization
158+
if sparse.issparse(adata.X):
159+
if chunk_size is not None:
160+
# Chunked processing for very large sparse matrices
161+
n_cells = adata.X.shape[0]
162+
normalized_chunks = []
163+
164+
for start in range(0, n_cells, chunk_size):
165+
end = min(start + chunk_size, n_cells)
166+
chunk_scaling = sparse.diags(scaling_factors[start:end], 0, format='csr')
167+
normalized_chunk = chunk_scaling @ adata.X[start:end]
168+
normalized_chunks.append(normalized_chunk)
169+
170+
# Combine chunks
171+
adata.X = sparse.vstack(normalized_chunks, format='csr')
172+
else:
173+
# Standard sparse matrix normalization (most efficient)
174+
scaling_matrix = sparse.diags(scaling_factors, 0, format='csr')
175+
adata.X = scaling_matrix @ adata.X
176+
else:
177+
# Dense matrix normalization
178+
adata.X = adata.X / n_counts[:, None] * target_sum
144179

145-
# Update adata
146-
adata.X = X_norm
180+
# Update metadata
147181
adata.uns['normalization'] = {
148182
'method': 'total_counts',
149183
'target_sum': target_sum,
150-
'n_counts_key': n_counts_key
184+
'n_counts_key': n_counts_key,
185+
'chunked': chunk_size is not None
151186
}
187+
if chunk_size is not None:
188+
adata.uns['normalization']['chunk_size'] = chunk_size
189+
152190
if copy:
153191
return adata
154192

sccellfie/preprocessing/tests/test_adata_utils.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,44 @@ def test_normalize_adata_dense():
9898
normalize_adata(adata, target_sum=1000)
9999

100100

101+
def test_normalize_adata_chunked():
    """Chunked and whole-matrix normalization must produce identical results."""
    # Controlled fixture with known expression values per cell
    adata = create_controlled_adata()

    # Provide precomputed per-cell totals so no counting branch is triggered
    adata.obs['n_counts'] = np.array([3, 9, 21, 21])

    # Reference: normalize the whole matrix in one pass
    whole = adata.copy()
    normalize_adata(whole, target_sum=1000, n_counts_key='n_counts', copy=False)

    # Candidate: normalize in chunks (chunk of 2 over 4 cells exercises >1 chunk)
    chunked = adata.copy()
    normalize_adata(chunked, target_sum=1000, n_counts_key='n_counts',
                    chunk_size=2, copy=False)

    # Both code paths must agree to high precision
    np.testing.assert_array_almost_equal(
        whole.X.toarray(),
        chunked.X.toarray(),
        decimal=10
    )

    # Spot-check against hand-computed values: value / n_counts * 1000
    expected_normalized_X = np.array([
        [333.33, 666.67, 0],
        [333.33, 444.44, 222.22],
        [238.10, 285.71, 476.19],
        [333.33, 380.95, 285.71]
    ])
    np.testing.assert_array_almost_equal(
        chunked.X.toarray(),
        expected_normalized_X,
        decimal=2
    )
137+
138+
101139
# Transform gene names tests
102140
# Mock data for testing
103141
MOCK_ENSEMBL2SYMBOL = {

0 commit comments

Comments
 (0)