|
1 |
| -from typing import Any, Dict, Mapping, Optional, Sequence, Tuple |
| 1 | +from typing import Mapping, Optional, Tuple |
2 | 2 |
|
3 |
| -import dask.array as da |
4 | 3 | import dask.dataframe as dd
|
5 | 4 | import numpy as np
|
6 |
| -import zarr |
7 | 5 |
|
8 | 6 | from ..typing import ArrayLike, DType
|
9 | 7 | from ..utils import encode_array, max_str_len
|
@@ -52,109 +50,3 @@ def encode_contigs(contig: ArrayLike) -> Tuple[ArrayLike, ArrayLike]:
|
52 | 50 | else:
|
53 | 51 | ids, names = encode_array(np.asarray(contig, dtype=str))
|
54 | 52 | return ids, names
|
55 |
| - |
56 |
| - |
57 |
def concatenate_and_rechunk(
    zarrs: Sequence[zarr.Array],
    chunks: Optional[Tuple[int, ...]] = None,
    dtype: DType = None,
) -> da.Array:
    """Perform a concatenate and rechunk operation on a collection of Zarr arrays
    to produce an array with a uniform chunking, suitable for saving as
    a single Zarr array.

    In contrast to Dask's ``rechunk`` method, the Dask computation graph
    is embarrassingly parallel and will make efficient use of memory,
    since no Zarr chunks are cached by the Dask scheduler.

    The Zarr arrays must have matching shapes except in the first
    dimension.

    Parameters
    ----------
    zarrs
        Collection of Zarr arrays to concatenate. Must not be empty.
    chunks : Optional[Tuple[int, ...]], optional
        The chunks to apply to the concatenated arrays. If not specified
        the chunks for the first array will be applied to the concatenated
        array.
    dtype
        The dtype of the concatenated array, by default the same as the
        first array.

    Returns
    -------
    A Dask array, suitable for saving as a single Zarr array.

    Raises
    ------
    ValueError
        If no Zarr arrays are supplied, or if the Zarr arrays do not have
        matching shapes (except in the first dimension).
    """

    # Fail early with a clear message instead of an IndexError on zarrs[0].
    if len(zarrs) == 0:
        raise ValueError("At least one Zarr array is required for concatenation")

    if len({z.shape[1:] for z in zarrs}) > 1:
        shapes = [z.shape for z in zarrs]
        raise ValueError(
            f"Zarr arrays must have matching shapes (except in the first dimension): {shapes}"
        )

    # offsets[i] is the row at which zarrs[i] starts in the concatenated
    # array; the final entry is the total number of rows.
    lengths = np.array([z.shape[0] for z in zarrs])
    lengths0 = np.insert(lengths, 0, 0, axis=0)  # type: ignore[no-untyped-call]
    offsets = np.cumsum(lengths0)
    total_length = offsets[-1]

    shape = (total_length, *zarrs[0].shape[1:])
    chunks = chunks or zarrs[0].chunks
    # Compare against None explicitly: do not rely on the truthiness of a
    # caller-supplied dtype object to decide whether one was given.
    if dtype is None:
        dtype = zarrs[0].dtype

    # Template array: only its block layout is used; its contents are ignored.
    ar = da.empty(shape, chunks=chunks)

    def load_chunk(
        x: ArrayLike,
        zarrs: Sequence[zarr.Array],
        offsets: ArrayLike,
        block_info: Dict[Any, Any],
    ) -> ArrayLike:
        # x is the (unused) template block; block_info tells us which region
        # of the concatenated array this block covers.
        return _slice_zarrs(zarrs, offsets, block_info[0]["array-location"])

    return ar.map_blocks(load_chunk, zarrs=zarrs, offsets=offsets, dtype=dtype)
122 |
| - |
123 |
| - |
124 |
| -def _zarr_index(offsets: ArrayLike, pos: int) -> int: |
125 |
| - """Return the index of the zarr file that pos falls in""" |
126 |
| - index: int = np.searchsorted(offsets, pos, side="right") - 1 # type: ignore[assignment] |
127 |
| - return index |
128 |
| - |
129 |
| - |
130 |
def _slice_zarrs(
    zarrs: Sequence[zarr.Array], offsets: ArrayLike, locs: Sequence[Tuple[int, ...]]
) -> ArrayLike:
    """Read the region ``locs`` out of the zarrs as if they were concatenated.

    ``locs`` holds one ``(start, stop)`` pair per dimension. The first pair
    is expressed in concatenated coordinates and may span several zarr
    files; the remaining pairs are applied unchanged to every file touched.
    """
    # One slice object per dimension.
    axis_slices = [slice(*loc) for loc in locs]
    trailing = tuple(axis_slices[1:])

    # The arrays are stacked along the first axis, so only that axis decides
    # which files are needed.
    start, stop = axis_slices[0].start, axis_slices[0].stop
    first = _zarr_index(offsets, start)
    last = _zarr_index(offsets, stop)

    if first == last:
        # The whole request lives inside one file.
        base = offsets[first]
        return zarrs[first][(slice(start - base, stop - base), *trailing)]

    # The request spans files: tail of the first file, every file strictly
    # in between, then (if anything remains) the head of the last file.
    plan = [(first, slice(start - offsets[first], None))]
    plan.extend((i, slice(None)) for i in range(first + 1, last))
    if stop > offsets[last]:
        plan.append((last, slice(0, stop - offsets[last])))

    pieces = [zarrs[i][(sel, *trailing)] for i, sel in plan]
    return np.concatenate(pieces)  # type: ignore[no-untyped-call]
152 |
| - |
153 |
| - |
154 |
def str_is_int(x: str) -> bool:
    """Report whether ``int(x)`` succeeds, i.e. whether ``x`` parses as an int."""
    try:
        int(x)
    except ValueError:
        parsed = False
    else:
        parsed = True
    return parsed
0 commit comments