|
1 | 1 | from typing import IO, Any, Dict, Optional, Sequence, Union
|
2 | 2 | import contextlib
|
3 | 3 | import struct
|
4 |
| -import re |
5 | 4 | import pathlib
|
6 |
| -import itertools |
| 5 | +import gzip |
7 | 6 | from dataclasses import dataclass
|
| 7 | +import os |
8 | 8 |
|
9 |
| -import fsspec |
10 | 9 | import numpy as np
|
11 |
| -from cyvcf2 import VCF |
12 | 10 | import cyvcf2
|
13 | 11 | import humanfriendly
|
14 | 12 |
|
|
19 | 17 | TABIX_LINEAR_INDEX_INTERVAL_SIZE = 1 << 14 # 16kb interval size
|
20 | 18 |
|
21 | 19 |
|
22 |
| -def open_gzip(path: PathType, storage_options: Optional[Dict[str, str]]) -> IO[Any]: |
23 |
| - url = str(path) |
24 |
| - storage_options = storage_options or {} |
25 |
| - openfile: IO[Any] = fsspec.open(url, compression="gzip", **storage_options) |
26 |
| - return openfile |
27 |
| - |
28 |
| - |
29 | 20 | def ceildiv(a: int, b: int) -> int:
|
30 | 21 | """Safe integer ceil function"""
|
31 | 22 | return -(-a // b)
|
32 | 23 |
|
33 | 24 |
|
34 |
| -def get_file_length( |
35 |
| - path: PathType, storage_options: Optional[Dict[str, str]] = None |
36 |
| -) -> int: |
37 |
| - """Get the length of a file in bytes.""" |
38 |
| - url = str(path) |
39 |
| - storage_options = storage_options or {} |
40 |
| - with fsspec.open(url, **storage_options) as openfile: |
41 |
| - fs = openfile.fs |
42 |
| - size = fs.size(url) |
43 |
| - if size is None: |
44 |
| - raise IOError(f"Cannot determine size of file {url}") # pragma: no cover |
45 |
| - return int(size) |
46 |
| - |
47 |
| - |
48 | 25 | def get_file_offset(vfp: int) -> int:
|
49 | 26 | """Convert a block compressed virtual file pointer to a file offset."""
|
50 | 27 | address_mask = 0xFFFFFFFFFFFF
|
@@ -224,7 +201,7 @@ def read_csi(
|
224 | 201 | ValueError
|
225 | 202 | If the file is not a CSI file.
|
226 | 203 | """
|
227 |
| - with open_gzip(file, storage_options=storage_options) as f: |
| 204 | + with gzip.open(file) as f: |
228 | 205 | magic = read_bytes_as_value(f, "4s")
|
229 | 206 | if magic != b"CSI\x01":
|
230 | 207 | raise ValueError("File not in CSI format.")
|
@@ -337,7 +314,7 @@ def read_tabix(
|
337 | 314 | ValueError
|
338 | 315 | If the file is not a tabix file.
|
339 | 316 | """
|
340 |
| - with open_gzip(file, storage_options=storage_options) as f: |
| 317 | + with gzip.open(file) as f: |
341 | 318 | magic = read_bytes_as_value(f, "4s")
|
342 | 319 | if magic != b"TBI\x01":
|
343 | 320 | raise ValueError("File not in Tabix format.")
|
@@ -457,7 +434,7 @@ def partition_into_regions(
|
457 | 434 | raise ValueError("target_part_size must be positive")
|
458 | 435 |
|
459 | 436 | # Calculate the desired part file boundaries
|
460 |
| - file_length = get_file_length(self.vcf_path) |
| 437 | + file_length = os.stat(self.vcf_path).st_size |
461 | 438 | if num_parts is not None:
|
462 | 439 | target_part_size_bytes = file_length // num_parts
|
463 | 440 | elif target_part_size_bytes is not None:
|
|
0 commit comments