|
1 |
| -import itertools |
2 | 1 | import struct
|
3 |
| -import tempfile |
4 |
| -import uuid |
5 |
| -from contextlib import contextmanager |
6 |
| -from typing import IO, Any, Dict, Hashable, Iterator, Optional, Sequence, TypeVar |
7 |
| -from urllib.parse import urlparse |
| 2 | +from typing import IO, Any, Dict, Optional, Sequence |
8 | 3 |
|
9 | 4 | import fsspec
|
10 |
| -from numcodecs import Delta, PackBits |
11 | 5 |
|
12 |
| -from sgkit.typing import PathType |
13 |
| - |
14 |
| -T = TypeVar("T") |
| 6 | +from bio2zarr.typing import PathType |
15 | 7 |
|
16 | 8 |
|
def ceildiv(a: int, b: int) -> int:
    """Return the ceiling of ``a / b`` using exact integer arithmetic.

    Negating before and after floor division gives ceiling division
    without the float rounding risk of ``math.ceil(a / b)``.
    """
    quotient = -(-a // b)
    return quotient
|
20 | 12 |
|
21 | 13 |
|
22 |
# Adapted from https://dev.to/orenovadia/solution-chunked-iterator-python-riddle-3ple
def chunks(iterator: "Iterator[T]", n: int) -> "Iterator[Iterator[T]]":
    """Lazily split *iterator* into sub-iterators of at most *n* items each.

    Every inner iterator yields ``n`` items except the last, which may
    yield fewer. An empty input yields a single empty inner iterator.
    The inner iterators share the underlying iterator, so each chunk
    should be consumed before requesting the next one.
    """
    produced_any = False
    while True:
        try:
            head = next(iterator)  # pull one item; stops the loop when exhausted
        except StopIteration:
            break
        produced_any = True
        # Re-attach the item we pulled off, then lazily take up to n - 1 more.
        yield itertools.chain([head], itertools.islice(iterator, n - 1))
    if not produced_any:
        yield iter([])
39 |
| - |
40 |
| - |
41 | 14 | def get_file_length(
|
42 | 15 | path: PathType, storage_options: Optional[Dict[str, str]] = None
|
43 | 16 | ) -> int:
|
@@ -107,128 +80,3 @@ def open_gzip(path: PathType, storage_options: Optional[Dict[str, str]]) -> IO[A
|
107 | 80 | storage_options = storage_options or {}
|
108 | 81 | openfile: IO[Any] = fsspec.open(url, compression="gzip", **storage_options)
|
109 | 82 | return openfile
|
110 |
| - |
111 |
| - |
112 |
def url_filename(url: str) -> str:
    """Extract the final path component (filename) from a URL."""
    name: str = URL(url).name
    return name
116 |
| - |
117 |
| - |
118 |
def build_url(dir_url: PathType, child_path: str) -> str:
    """Combine a URL for a directory with a child path."""
    base = URL(str(dir_url))
    joined = base / child_path
    # The / operator drops the query and fragment, so restore them explicitly.
    return str(joined.with_query(base.query).with_fragment(base.fragment))
123 |
| - |
124 |
| - |
125 |
@contextmanager
def temporary_directory(
    suffix: Optional[str] = None,
    prefix: Optional[str] = None,
    dir: Optional[PathType] = None,
    storage_options: Optional[Dict[str, str]] = None,
    retain_temp_files: Optional[bool] = None,
) -> Iterator[str]:
    """Create a temporary directory in a fsspec filesystem.

    Parameters
    ----------
    suffix : Optional[str], optional
        If not None, the name of the temporary directory will end with that suffix.
    prefix : Optional[str], optional
        If not None, the name of the temporary directory will start with that prefix.
    dir : Optional[PathType], optional
        If not None, the temporary directory will be created in that directory, otherwise
        the local filesystem directory returned by `tempfile.gettempdir()` will be used.
        The directory may be specified as any fsspec URL.
    storage_options : Optional[Dict[str, str]], optional
        Any additional parameters for the storage backend (see `fsspec.open`).
    retain_temp_files : Optional[bool], optional
        If True, the temporary directory will not be removed on exiting the context manager.
        Defaults to None, which means the directory will be removed.

    Yields
    -------
    Generator[str, None, None]
        A context manager yielding the fsspec URL to the created directory.
    """

    # Fill in defaults
    suffix = suffix or ""
    prefix = prefix or ""
    dir = dir or tempfile.gettempdir()
    storage_options = storage_options or {}

    # Find the filesystem by looking at the URL scheme (protocol), empty means local filesystem
    protocol = urlparse(str(dir)).scheme
    fs = fsspec.filesystem(protocol, **storage_options)

    # Construct a random directory name
    tempdir = build_url(dir, prefix + str(uuid.uuid4()) + suffix)
    # Create the directory *before* entering the try block: if mkdir fails we
    # must not run the finally clause, which would try to remove a directory
    # that was never created and mask the original error.
    fs.mkdir(tempdir)
    try:
        yield tempdir
    finally:
        # Remove the temporary directory on exiting the context manager
        if not retain_temp_files:
            fs.rm(tempdir, recursive=True)
175 |
| - |
176 |
| - |
177 |
def get_default_vcf_encoding(ds, chunk_length, chunk_width, compressor):
    """Build a default Zarr encoding for every data variable in *ds*.

    Chunks uniformly along the variants dimension (``chunk_length``) and the
    samples dimension (``chunk_width``); every other dimension is stored as a
    single chunk. Boolean variables get bit packing and ``variant_position``
    gets delta encoding.
    """

    def chunk_for(dim: Hashable, size: int) -> int:
        if dim == "variants":
            return chunk_length
        if dim == "samples":
            return chunk_width
        # Clamp to 1 so zero-length dimensions don't produce a chunk size of 0.
        return max(size, 1)

    encoding = {}
    for name in ds.data_vars:
        arr = ds[name]
        shape_chunks = tuple(chunk_for(d, s) for d, s in zip(arr.dims, arr.shape))
        encoding[name] = {"chunks": shape_chunks, "compressor": compressor}

        # Boolean arrays pack 8 values per byte with bit packing.
        if arr.dtype.kind == "b":
            encoding[name]["filters"] = [PackBits()]

        # Position is monotonically increasing (within a contig) so benefits
        # from delta encoding.
        if name == "variant_position":
            encoding[name]["filters"] = [Delta(arr.dtype)]

    return encoding
206 |
| - |
207 |
| - |
208 |
def merge_encodings(
    default_encoding: Dict[str, Dict[str, Any]], overrides: Dict[str, Dict[str, Any]]
) -> Dict[str, Dict[str, Any]]:
    """Merge a dictionary of dictionaries specifying encodings with another dictionary of dictionaries of overriding encodings.

    Parameters
    ----------
    default_encoding : Dict[str, Dict[str, Any]]
        The default encoding dictionary.
    overrides : Dict[str, Dict[str, Any]]
        A dictionary containing selective overrides.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        The merged encoding dictionary
    """
    # Per-variable merge: override keys win over defaults for the same variable.
    merged = {
        var: ({**enc, **overrides[var]} if var in overrides else enc)
        for var, enc in default_encoding.items()
    }
    # Variables present only in overrides are carried through unchanged.
    for var, enc in overrides.items():
        if var not in merged:
            merged[var] = enc
    return merged
0 commit comments