Skip to content

Commit 86f9251

Browse files
authored
Merge pull request #126 from haniffalab/dev
Create release 0.5.0
2 parents 9c68ff9 + 2d1f98a commit 86f9251

32 files changed

+478
-196
lines changed

bin/integrate_anndata.py

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
#!/usr/bin/env python3
22

33
from typing import Union
4+
import typing as T
45
import os
56
import fire
67
import zarr
78
import h5py
9+
import logging
810
import numpy as np
911
import pandas as pd
1012
import anndata as ad
@@ -13,25 +15,27 @@
1315
from pathlib import Path
1416

1517

16-
def reindex_and_concat(path: str, offset: int, features: str = None, **kwargs):
18+
def reindex_and_concat(
19+
path: str, offset: int, features: str = None, args: dict[str, T.Any] = {}, **kwargs
20+
):
1721
adata = read_anndata(path)
1822

19-
adata = reindex_anndata(adata, offset, no_save=True)
23+
adata = reindex_anndata(adata, offset, **args, **kwargs)
2024
if features:
21-
adata = concat_features(adata, features, no_save=True)
25+
adata = concat_features(adata, features, **args, **kwargs)
2226

2327
out_filename = "reindexed-concat-{}".format(
2428
os.path.splitext(os.path.basename(path))[0]
2529
)
26-
write_anndata(adata, out_filename, **kwargs)
30+
write_anndata(adata, out_filename, **args, **kwargs)
2731

2832
return
2933

3034

3135
def reindex_anndata(
3236
data: Union[ad.AnnData, str],
3337
offset: int,
34-
no_save: bool = False,
38+
no_save: bool = True,
3539
out_filename: str = None,
3640
**kwargs,
3741
):
@@ -55,7 +59,7 @@ def reindex_anndata(
5559
def concat_features(
5660
data: Union[ad.AnnData, str],
5761
features: str,
58-
no_save: bool = False,
62+
no_save: bool = True,
5963
out_filename: str = None,
6064
**kwargs,
6165
):
@@ -68,11 +72,11 @@ def concat_features(
6872
)
6973

7074
if features.endswith(".h5ad") and os.path.isfile(features):
71-
adata = concat_matrix_from_cell2location(adata, features)
75+
adata = concat_matrix_from_cell2location(adata, features, **kwargs)
7276
elif features.startswith("obs/"):
73-
adata = concat_matrix_from_obs(adata, features.split("/")[1])
77+
adata = concat_matrix_from_obs(adata, features.split("/")[1], **kwargs)
7478
elif features.startswith("obsm/"):
75-
adata = concat_matrix_from_obsm(adata, features.split("/")[1])
79+
adata = concat_matrix_from_obsm(adata, features.split("/")[1], **kwargs)
7680

7781
if no_save:
7882
return adata
@@ -134,10 +138,14 @@ def concat_matrix_from_cell2location(
134138
data: Union[ad.AnnData, str],
135139
c2l_file: str,
136140
q: str = "q05_cell_abundance_w_sf",
137-
sample: str = None,
141+
sample: tuple[str, str] = None,
138142
feature_name: str = "gene",
139143
obs_feature_name: str = None,
144+
sort: bool = True,
145+
sort_index: str = None,
146+
**kwargs,
140147
):
148+
sort = sort or sort_index is not None
141149
if isinstance(data, ad.AnnData):
142150
adata = data
143151
else:
@@ -153,6 +161,38 @@ def concat_matrix_from_cell2location(
153161
if sample:
154162
c2l_adata = c2l_adata[c2l_adata.obs[sample[0]] == sample[1]]
155163

164+
if sort:
165+
if not sort_index and adata.uns.get("webatlas_reindexed"):
166+
sort_index = "label_id"
167+
if sort_index:
168+
data_idx = adata.obs[sort_index]
169+
else:
170+
data_idx = adata.obs.index
171+
idx = c2l_adata.obs.index.get_indexer(data_idx.tolist())
172+
if -1 in idx: # Indices do not match
173+
logging.error(
174+
"Values do not match between AnnData object's"
175+
f" `{sort_index or 'index'}`"
176+
" and cell2location output index."
177+
)
178+
179+
logging.info("Attempting to match indices as substrings")
180+
try:
181+
data_idx = match_substring_indices(c2l_adata.obs.index, data_idx)
182+
if not data_idx.is_unique:
183+
raise Exception(
184+
"Found non-unique matches between indices as substrings."
185+
)
186+
idx = c2l_adata.obs.index.get_indexer(data_idx.tolist())
187+
if -1 in idx:
188+
raise Exception("Non-matching indices present.")
189+
except Exception:
190+
raise SystemError(
191+
"Failed to find a match between indices as substrings."
192+
)
193+
194+
c2l_adata = c2l_adata[idx,]
195+
156196
c2l_df = pd.DataFrame(
157197
c2l_adata.obsm[q].to_numpy(),
158198
index=c2l_adata.obs.index,
@@ -162,7 +202,9 @@ def concat_matrix_from_cell2location(
162202
dtype="float32",
163203
)
164204

165-
return concat_matrices(adata, c2l_df, "celltype", feature_name, obs_feature_name)
205+
return concat_matrices(
206+
adata, c2l_df, "celltype", feature_name, obs_feature_name, **kwargs
207+
)
166208

167209

168210
def concat_matrices(
@@ -269,5 +311,11 @@ def write_anndata(
269311
return
270312

271313

314+
def match_substring_indices(fullstring_idx, substring_idx):
315+
return pd.Series(substring_idx).apply(
316+
lambda x: fullstring_idx[fullstring_idx.str.contains(x)].values[0]
317+
)
318+
319+
272320
if __name__ == "__main__":
273321
fire.Fire()

bin/process_h5ad.py

Lines changed: 49 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,57 @@ def h5ad_to_zarr(
119119
return zarr_file
120120

121121

122+
def reindex_anndata_obs(adata: ad.AnnData) -> ad.AnnData:
123+
# check if index is numerical, if not reindex
124+
if not adata.obs.index.is_integer() and not (
125+
adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric())
126+
):
127+
IDX_NAME = "label_id"
128+
if IDX_NAME in adata.obs:
129+
adata.obs.rename(columns={IDX_NAME: f"_{IDX_NAME}"})
130+
adata.obs = adata.obs.reset_index(names=IDX_NAME)
131+
adata.obs.index = (
132+
pd.Categorical(adata.obs[IDX_NAME]).codes + 1
133+
) # avoid 0's for possible label images
134+
adata.uns["webatlas_reindexed"] = True
135+
adata.obs.index = adata.obs.index.astype(str)
136+
137+
return adata
138+
139+
140+
def subset_anndata(
141+
adata: ad.AnnData,
142+
obs_subset: tuple[str, T.Any] = None,
143+
var_subset: tuple[str, T.Any] = None,
144+
) -> ad.AnnData:
145+
# Subset adata by obs
146+
if obs_subset:
147+
obs_subset[1] = (
148+
[obs_subset[1]]
149+
if not isinstance(obs_subset[1], (list, tuple))
150+
else obs_subset[1]
151+
)
152+
adata = adata[adata.obs[obs_subset[0]].isin(obs_subset[1])]
153+
154+
# Subset adata by var
155+
if var_subset:
156+
var_subset[1] = (
157+
[var_subset[1]]
158+
if not isinstance(var_subset[1], (list, tuple))
159+
else var_subset[1]
160+
)
161+
adata = adata[:, adata.var[var_subset[0]].isin(var_subset[1])]
162+
163+
return adata
164+
165+
122166
def preprocess_anndata(
123167
adata: ad.AnnData,
124168
compute_embeddings: bool = False,
125169
var_index: str = None,
126170
obs_subset: tuple[str, T.Any] = None,
127171
var_subset: tuple[str, T.Any] = None,
172+
**kwargs,
128173
):
129174
"""This function preprocesses an AnnData object, ensuring correct dtypes for zarr conversion
130175
@@ -140,23 +185,7 @@ def preprocess_anndata(
140185
to use to subset the AnnData object. Defaults to None.
141186
"""
142187

143-
# Subset adata by obs
144-
if obs_subset:
145-
obs_subset[1] = (
146-
[obs_subset[1]]
147-
if not isinstance(obs_subset[1], (list, tuple))
148-
else obs_subset[1]
149-
)
150-
adata = adata[adata.obs[obs_subset[0]].isin(obs_subset[1])]
151-
152-
# Subset adata by var
153-
if var_subset:
154-
var_subset[1] = (
155-
[var_subset[1]]
156-
if not isinstance(var_subset[1], (list, tuple))
157-
else var_subset[1]
158-
)
159-
adata = adata[:, adata.var[var_subset[0]].isin(var_subset[1])]
188+
adata = subset_anndata(adata, obs_subset=obs_subset, var_subset=var_subset)
160189

161190
# reindex var with a specified column
162191
if var_index and var_index in adata.var:
@@ -165,14 +194,7 @@ def preprocess_anndata(
165194
adata.var.index = adata.var.index.astype(str)
166195
adata.var_names_make_unique()
167196

168-
# check if index is numerical, if not reindex
169-
if not adata.obs.index.is_integer() and not (
170-
adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric())
171-
):
172-
adata.obs["label_id"] = adata.obs.index
173-
adata.obs.index = pd.Categorical(adata.obs.index)
174-
adata.obs.index = adata.obs.index.codes
175-
adata.obs.index = adata.obs.index.astype(str)
197+
adata = reindex_anndata_obs(adata)
176198

177199
# turn obsm into a numpy array
178200
for k in adata.obsm_keys():
@@ -186,16 +208,14 @@ def preprocess_anndata(
186208
sc.pp.neighbors(adata)
187209
sc.tl.umap(adata)
188210

211+
# ensure data types for obs
189212
for col in adata.obs:
190-
# anndata >= 0.8.0
191-
# if data type is categorical vitessce will throw "path obs/X contains a group" and won"t find .zarray
192-
# if adata.obs[col].dtype == "category":
193-
# adata.obs[col] = adata.obs[col].cat.codes
194213
if adata.obs[col].dtype in ["int8", "int64"]:
195214
adata.obs[col] = adata.obs[col].astype("int32")
196215
if adata.obs[col].dtype == "bool":
197216
adata.obs[col] = adata.obs[col].astype(str).astype("category")
198217

218+
# ensure data types for obsm
199219
for col in adata.obsm:
200220
if type(adata.obsm[col]).__name__ in ["DataFrame", "Series"]:
201221
adata.obsm[col] = adata.obsm[col].to_numpy()

bin/process_spaceranger.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import tifffile as tf
1717
from pathlib import Path
1818
from skimage.draw import disk
19-
from process_h5ad import h5ad_to_zarr
19+
from process_h5ad import h5ad_to_zarr, reindex_anndata_obs, subset_anndata
2020

2121

2222
def spaceranger_to_anndata(
@@ -164,23 +164,9 @@ def visium_label(
164164
scalef = adata.uns["spatial"][sample_id]["scalefactors"]["tissue_hires_scalef"]
165165
shape = [int(hires_shape[0] / scalef), int(hires_shape[1] / scalef)]
166166

167-
# Subset adata by obs
168-
if obs_subset:
169-
obs_subset[1] = (
170-
[obs_subset[1]]
171-
if not isinstance(obs_subset[1], (list, tuple))
172-
else obs_subset[1]
173-
)
174-
adata = adata[adata.obs[obs_subset[0]].isin(obs_subset[1])]
167+
adata = subset_anndata(adata, obs_subset=obs_subset)
175168

176-
# check if index is numerical, if not reindex
177-
if not adata.obs.index.is_integer() and not (
178-
adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric())
179-
):
180-
adata.obs["label_id"] = adata.obs.index
181-
adata.obs.index = pd.Categorical(adata.obs.index)
182-
adata.obs.index = adata.obs.index.codes
183-
adata.obs.index = adata.obs.index.astype(str)
169+
adata = reindex_anndata_obs(adata)
184170

185171
# turn obsm into a numpy array
186172
for k in adata.obsm_keys():
@@ -189,7 +175,6 @@ def visium_label(
189175
spot_diameter_fullres = adata.uns["spatial"][sample_id]["scalefactors"][
190176
"spot_diameter_fullres"
191177
]
192-
# hires_scalef = adata.uns["spatial"][sample_id]["scalefactors"]["tissue_hires_scalef"]
193178
spot_coords = adata.obsm["spatial"]
194179
assert adata.obs.shape[0] == spot_coords.shape[0]
195180

bin/process_xenium.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def xenium_to_anndata(
9797
# pd.Categorical.codes converts them to int this is done manually at this step
9898
# instead of reindex_anndata so we control what matches the label image
9999
adata.obs = adata.obs.reset_index()
100-
adata.obs.index = pd.Categorical(adata.obs["cell_id"]).codes.astype(str)
100+
adata.obs.index = (pd.Categorical(adata.obs["cell_id"]).codes + 1).astype(str)
101101

102102
return adata
103103

@@ -163,7 +163,7 @@ def xenium_label(
163163
# starting on v1.3 cell_id looks like "aaabinlp-1"
164164
# pd.Categorical.codes converts them to int
165165
# this is required so the label image matches the h5ad ids
166-
ids = pd.Categorical(ids).codes
166+
ids = pd.Categorical(ids).codes + 1
167167

168168
pols = z["polygon_vertices"][1]
169169

0 commit comments

Comments
 (0)