|
2 | 2 | import zarr |
3 | 3 | import numpy as np |
4 | 4 | import pandas as pd |
| 5 | +from tqdm import tqdm |
5 | 6 |
|
6 | 7 | from .anndata import to_dense |
7 | 8 | from .entities import GenomicProfiles |
8 | 9 |
|
9 | 10 |
|
10 | | -def adata_to_multivec_zarr(adata, output_path, obs_set_col, obs_set_name, obs_set_vals=None, var_interval_col="interval", layer_key=None, assembly="hg38", starting_resolution=5000): |
| 11 | +def adata_to_multivec_zarr(adata, output_path, obs_set_col, obs_set_name, obs_set_vals=None, var_interval_col="interval", layer_key=None, assembly="hg38", starting_resolution=5000, chr_subset=None): |
| 12 | + """ |
| 13 | + Convert an AnnData object containing a cell-by-bin matrix to a Multivec-Zarr store. |
| 14 | +
|
| 15 | + :param adata: The object to convert. |
| 16 | + :type adata: anndata.AnnData |
| 17 | + :param output_path: The path to the output Zarr store. |
| 18 | + :type output_path: str |
| 19 | + :param obs_set_col: The name of the column in adata.obs that contains the cluster IDs. |
| 20 | + :type obs_set_col: str |
| 21 | + :param obs_set_name: The name of the cluster set. |
| 22 | + :type obs_set_name: str |
| 23 | + :param obs_set_vals: The cluster IDs to include in the output. If None, all cluster IDs will be included. This can be used to override the order of the cluster IDs. |
| 24 | + :type obs_set_vals: list[str] or None |
| 25 | + :param var_interval_col: The name of the column in adata.var that contains the bin interval strings. By default, "interval". |
| 26 | + :type var_interval_col: str |
| 27 | + :param layer_key: The name of the layer in adata.layers to use. If None, adata.X will be used. By default, None. |
| 28 | + :type layer_key: str or None |
| 29 | + :param assembly: The name of the genome assembly. By default, "hg38". |
| 30 | + :type assembly: str |
| 31 | + :param starting_resolution: The starting resolution of the data. By default, 5000. |
| 32 | + :type starting_resolution: int |
| 33 | + :param chr_subset: For debugging purposes, a subset of chromosomes to process. If None, all chromosomes in the assembly will be processed. By default, None. |
| 34 | + :type chr_subset: list[str] or None |
| 35 | + """ |
11 | 36 | in_mtx = adata.layers[layer_key] if layer_key is not None else adata.X |
12 | 37 | in_barcodes_df = adata.obs |
13 | 38 | in_bins_df = adata.var |
14 | 39 |
|
| 40 | + # Ensure that in_bins_df has a sequential integer index |
| 41 | + in_bins_df = in_bins_df.reset_index() |
| 42 | + |
15 | 43 | in_mtx = to_dense(in_mtx) # TODO: is this necessary? |
16 | 44 |
|
17 | 45 | # The bin datafram consists of one column like chrName:binStart-binEnd |
@@ -84,20 +112,26 @@ def convert_bin_name_to_chr_end(bin_name): |
84 | 112 | ) |
85 | 113 | chrom_name_to_length = genomic_profiles.chrom_name_to_length |
86 | 114 |
|
| 115 | + # For debugging purposes, allow the user to specify a subset of chromosomes to process. |
| 116 | + chrom_names = chr_subset if chr_subset is not None else list(chrom_name_to_length.keys()) |
| 117 | + |
87 | 118 | # Create each chromosome dataset. |
88 | | - for chr_name, chr_len in chrom_name_to_length.items(): |
| 119 | + for chr_name in tqdm(chrom_names): |
| 120 | + chr_len = chrom_name_to_length[chr_name] |
89 | 121 | # The bins dataframe frustratingly does not contain every bin. |
90 | 122 | # We need to figure out which bins are missing. |
91 | 123 |
|
92 | 124 | # We want to check for missing bins in each chromosome separately, |
93 | 125 | # otherwise too much memory is used during the join step. |
94 | 126 | chr_bins_in_df = in_bins_df.loc[in_bins_df["chr_name"] == chr_name] |
95 | 127 | if chr_bins_in_df.shape[0] == 0: |
| 128 | + print("Warning: No bins found for chromosome", chr_name) |
96 | 129 | # No processing or output is necessary if there is no data for this chromosome. |
97 | 130 | # Continue on through all resolutions of this chromosome to the next chromosome. |
98 | 131 | continue |
99 | 132 | # Determine the indices of the matrix at which the bins for this chromosome start and end. |
100 | 133 | chr_bin_i_start = int(chr_bins_in_df.head(1).iloc[0].name) |
| 134 | + # +1 because the end index is exclusive. |
101 | 135 | chr_bin_i_end = int(chr_bins_in_df.tail(1).iloc[0].name) + 1 |
102 | 136 |
|
103 | 137 | # Extract the part of the matrix corresponding to the current chromosome. |
|
0 commit comments