Skip to content

Commit 3810797

Browse files
authored
Update adata_to_multivec (#326)
* Update adata_to_multivec * Update pyproject.toml * Update multivec.py
1 parent 989af83 commit 3810797

File tree

2 files changed

+38
-3
lines changed

2 files changed

+38
-3
lines changed

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ dependencies = [
3636
'scanpy>=1.9.3',
3737
'ome-zarr==0.8.3',
3838
'tifffile>=2020.10.1',
39-
'jsonschema>=3.2'
39+
'jsonschema>=3.2',
40+
'tqdm>=4.1.0'
4041
]
4142

4243
[project.optional-dependencies]

vitessce/data_utils/multivec.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,44 @@
22
import zarr
33
import numpy as np
44
import pandas as pd
5+
from tqdm import tqdm
56

67
from .anndata import to_dense
78
from .entities import GenomicProfiles
89

910

10-
def adata_to_multivec_zarr(adata, output_path, obs_set_col, obs_set_name, obs_set_vals=None, var_interval_col="interval", layer_key=None, assembly="hg38", starting_resolution=5000):
11+
def adata_to_multivec_zarr(adata, output_path, obs_set_col, obs_set_name, obs_set_vals=None, var_interval_col="interval", layer_key=None, assembly="hg38", starting_resolution=5000, chr_subset=None):
12+
"""
13+
Convert an AnnData object containing a cell-by-bin matrix to a Multivec-Zarr store.
14+
15+
:param adata: The object to convert.
16+
:type adata: anndata.AnnData
17+
:param output_path: The path to the output Zarr store.
18+
:type output_path: str
19+
:param obs_set_col: The name of the column in adata.obs that contains the cluster IDs.
20+
:type obs_set_col: str
21+
:param obs_set_name: The name of the cluster set.
22+
:type obs_set_name: str
23+
:param obs_set_vals: The cluster IDs to include in the output. If None, all cluster IDs will be included. This can be used to override the order of the cluster IDs.
24+
:type obs_set_vals: list[str] or None
25+
:param var_interval_col: The name of the column in adata.var that contains the bin interval strings. By default, "interval".
26+
:type var_interval_col: str
27+
:param layer_key: The name of the layer in adata.layers to use. If None, adata.X will be used. By default, None.
28+
:type layer_key: str or None
29+
:param assembly: The name of the genome assembly. By default, "hg38".
30+
:type assembly: str
31+
:param starting_resolution: The starting resolution of the data. By default, 5000.
32+
:type starting_resolution: int
33+
:param chr_subset: For debugging purposes, a subset of chromosomes to process. If None, all chromosomes in the assembly will be processed. By default, None.
34+
:type chr_subset: list[str] or None
35+
"""
1136
in_mtx = adata.layers[layer_key] if layer_key is not None else adata.X
1237
in_barcodes_df = adata.obs
1338
in_bins_df = adata.var
1439

40+
# Ensure that in_bins_df has a sequential integer index
41+
in_bins_df = in_bins_df.reset_index()
42+
1543
in_mtx = to_dense(in_mtx) # TODO: is this necessary?
1644

1745
# The bin dataframe consists of one column like chrName:binStart-binEnd
@@ -84,20 +112,26 @@ def convert_bin_name_to_chr_end(bin_name):
84112
)
85113
chrom_name_to_length = genomic_profiles.chrom_name_to_length
86114

115+
# For debugging purposes, allow the user to specify a subset of chromosomes to process.
116+
chrom_names = chr_subset if chr_subset is not None else list(chrom_name_to_length.keys())
117+
87118
# Create each chromosome dataset.
88-
for chr_name, chr_len in chrom_name_to_length.items():
119+
for chr_name in tqdm(chrom_names):
120+
chr_len = chrom_name_to_length[chr_name]
89121
# The bins dataframe frustratingly does not contain every bin.
90122
# We need to figure out which bins are missing.
91123

92124
# We want to check for missing bins in each chromosome separately,
93125
# otherwise too much memory is used during the join step.
94126
chr_bins_in_df = in_bins_df.loc[in_bins_df["chr_name"] == chr_name]
95127
if chr_bins_in_df.shape[0] == 0:
128+
print("Warning: No bins found for chromosome", chr_name)
96129
# No processing or output is necessary if there is no data for this chromosome.
97130
# Skip all resolutions of this chromosome and move on to the next chromosome.
98131
continue
99132
# Determine the indices of the matrix at which the bins for this chromosome start and end.
100133
chr_bin_i_start = int(chr_bins_in_df.head(1).iloc[0].name)
134+
# +1 because the end index is exclusive.
101135
chr_bin_i_end = int(chr_bins_in_df.tail(1).iloc[0].name) + 1
102136

103137
# Extract the part of the matrix corresponding to the current chromosome.

0 commit comments

Comments
 (0)