Skip to content

Commit f3fcadf

Browse files
authored
Update normalize by volume (#65)
1 parent 4fed8c4 commit f3fcadf

File tree

2 files changed

+102
-12
lines changed

2 files changed

+102
-12
lines changed

src/methods_normalization/normalize_by_volume/config.vsh.yaml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,21 @@ links:
1010
references:
1111
doi: "10.1101/2023.02.13.528102"
1212

13+
arguments:
14+
- name: --target_volume
15+
type: double
16+
1317
resources:
1418
- type: python_script
1519
path: script.py
1620

1721
engines:
1822
- type: docker
1923
image: openproblems/base_python:1
20-
__merge__:
21-
- /src/base/setup_txsim_partial.yaml
24+
setup:
25+
- type: python
26+
pypi:
27+
- scanpy
2228
- type: native
2329

2430
runners:
Lines changed: 94 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,18 @@
1+
import numpy as np
12
import anndata as ad
2-
import txsim as tx
3+
import scanpy as sc
4+
from scipy import sparse
5+
from scipy.sparse import issparse
6+
from scanpy._utils import view_to_actual
7+
from sklearn.utils import sparsefuncs
38

49
## VIASH START
510
par = {
611
'input_spatial_aggregated_counts': 'resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_aggregated_counts.h5ad',
712
'input_cell_volumes': 'resources_test/task_ist_preprocessing/mouse_brain_combined/cell_volumes.h5ad',
813
'output': 'spatial_norm_counts.h5ad',
14+
15+
'target_volume': None,
916
}
1017
meta = {
1118
'name': 'normalize_by_volume',
@@ -14,20 +21,97 @@
1421

1522
assert par['input_cell_volumes'] is not None, 'Volume input is required for this normalization method.'
1623

24+
25+
### Function definition ###
26+
27+
def normalize_by_volume(
28+
adata: ad.AnnData,
29+
target_volume: float | None = None,
30+
volume_key: str = 'volume',
31+
key_added: str | None = None,
32+
layer: str | None = None,
33+
inplace: bool = True,
34+
) -> np.ndarray | sparse.spmatrix | None:
35+
"""Normalize counts by cell volume
36+
37+
Parameters
38+
----------
39+
adata : AnnData
40+
The annotated data matrix of shape `n_obs` x `n_vars`.
41+
Rows correspond to cells and columns to genes.
42+
target_volume
43+
If `None`, after normalization, each observation (cell) has a
44+
volume equal to the median of cell volumes. Note that when providing
45+
a float value it is crucial to know in which spatial units the cell volume
46+
was calculated.
47+
volume_key : str
48+
Name of the field in `adata.obs` where the cell volume (or area) is stored,
49+
by default 'volume'
50+
key_added
51+
Name of the field in `adata.obs` where the normalization factor is
52+
stored.
53+
layer
54+
Layer to normalize instead of `X`. If `None`, `X` is normalized.
55+
inplace
56+
Whether to update `adata` or return dictionary with normalized copies of
57+
`adata.X` and `adata.layers`.
58+
59+
60+
Returns
61+
-------
62+
np.ndarray
63+
If ``inplace=True``, ``adata.X`` is updated with the normalized values.
64+
Otherwise, returns the normalized numpy array or sparse matrix
65+
"""
66+
67+
view_to_actual(adata)
68+
69+
X = adata.X if (layer is None) else adata.layers[layer]
70+
71+
assert volume_key in adata.obs.columns, f"key {volume_key} not found in adata.obs"
72+
assert not adata.obs[volume_key].isnull().any(), "Nan values found in adata.obs[volume_key]. All cells must have valid volumes."
73+
assert (adata.obs[volume_key] > 0).all(), "All cell volumes in adata.obs[volume_key] must be > 0"
74+
75+
if target_volume is None:
76+
target_volume = adata.obs[volume_key].median()
77+
78+
volumes = adata.obs[volume_key].values.copy()
79+
norm_factors = target_volume / volumes
80+
81+
if issparse(X):
82+
if X.dtype != norm_factors.dtype:
83+
X = X.astype(np.float64)
84+
norm_factors = norm_factors.astype(np.float64)
85+
if inplace:
86+
sparsefuncs.inplace_row_scale(X, norm_factors)
87+
else:
88+
X_norm = X.copy()
89+
sparsefuncs.inplace_row_scale(X_norm, norm_factors)
90+
else:
91+
if inplace:
92+
np.divide(X, volumes[:,None], out=X)
93+
else:
94+
X_norm = X / volumes[:, None]
95+
96+
if key_added:
97+
adata.obs[key_added] = norm_factors
98+
99+
if not inplace:
100+
return X_norm
101+
102+
103+
### Main function ###
104+
17105
print('Reading input files', flush=True)
18-
adata = ad.read(par['input_spatial_aggregated_counts'])
19-
adata_volume = ad.read(par['input_cell_volumes'])
106+
adata = ad.read_h5ad(par['input_spatial_aggregated_counts'])
107+
adata_volume = ad.read_h5ad(par['input_cell_volumes'])
20108
assert adata.obs["cell_id"].astype(int).sort_values().equals(adata_volume.obs["cell_id"].astype(int).sort_values()), "Cell IDs do not match"
21109
adata.obs["volume"] = adata_volume.obs.loc[adata.obs["cell_id"].astype(str), "volume"]
22110

23111
print('Normalizing by volume', flush=True)
24-
# TODO: Add scaling parameter, also as input to the script
25-
tx.preprocessing.normalize_by_area(adata, area="volume")
26-
27-
adata.layers['normalized'] = adata.layers['lognorm'] #TODO: Check if openproblems "normalized" is understood as log normalized
28-
del adata.layers['norm']
29-
del adata.layers['lognorm']
30-
#del adata.layers['raw'] #TODO: Probably better to add this, otherwise some downstream methods' tests will succeed by chance
112+
adata.layers['normalized'] = adata.layers['counts'].copy()
113+
normalize_by_volume(adata, layer="normalized", target_volume=par["target_volume"])
114+
sc.pp.log1p(adata, layer="normalized")
31115

32116
print('Writing output', flush=True)
33117
adata.write(par['output'])

0 commit comments

Comments
 (0)