Skip to content

Commit 5c44313

Browse files
Factor out fsspec calls
1 parent 0d29702 commit 5c44313

File tree

1 file changed

+5
-28
lines changed

1 file changed

+5
-28
lines changed

bio2zarr/vcf_utils.py

Lines changed: 5 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,12 @@
11
from typing import IO, Any, Dict, Optional, Sequence, Union
22
import contextlib
33
import struct
4-
import re
54
import pathlib
6-
import itertools
5+
import gzip
76
from dataclasses import dataclass
7+
import os
88

9-
import fsspec
109
import numpy as np
11-
from cyvcf2 import VCF
1210
import cyvcf2
1311
import humanfriendly
1412

@@ -19,32 +17,11 @@
1917
TABIX_LINEAR_INDEX_INTERVAL_SIZE = 1 << 14 # 16kb interval size
2018

2119

22-
def open_gzip(path: PathType, storage_options: Optional[Dict[str, str]]) -> IO[Any]:
23-
url = str(path)
24-
storage_options = storage_options or {}
25-
openfile: IO[Any] = fsspec.open(url, compression="gzip", **storage_options)
26-
return openfile
27-
28-
2920
def ceildiv(a: int, b: int) -> int:
3021
"""Safe integer ceil function"""
3122
return -(-a // b)
3223

3324

34-
def get_file_length(
35-
path: PathType, storage_options: Optional[Dict[str, str]] = None
36-
) -> int:
37-
"""Get the length of a file in bytes."""
38-
url = str(path)
39-
storage_options = storage_options or {}
40-
with fsspec.open(url, **storage_options) as openfile:
41-
fs = openfile.fs
42-
size = fs.size(url)
43-
if size is None:
44-
raise IOError(f"Cannot determine size of file {url}") # pragma: no cover
45-
return int(size)
46-
47-
4825
def get_file_offset(vfp: int) -> int:
4926
"""Convert a block compressed virtual file pointer to a file offset."""
5027
address_mask = 0xFFFFFFFFFFFF
@@ -224,7 +201,7 @@ def read_csi(
224201
ValueError
225202
If the file is not a CSI file.
226203
"""
227-
with open_gzip(file, storage_options=storage_options) as f:
204+
with gzip.open(file) as f:
228205
magic = read_bytes_as_value(f, "4s")
229206
if magic != b"CSI\x01":
230207
raise ValueError("File not in CSI format.")
@@ -337,7 +314,7 @@ def read_tabix(
337314
ValueError
338315
If the file is not a tabix file.
339316
"""
340-
with open_gzip(file, storage_options=storage_options) as f:
317+
with gzip.open(file) as f:
341318
magic = read_bytes_as_value(f, "4s")
342319
if magic != b"TBI\x01":
343320
raise ValueError("File not in Tabix format.")
@@ -457,7 +434,7 @@ def partition_into_regions(
457434
raise ValueError("target_part_size must be positive")
458435

459436
# Calculate the desired part file boundaries
460-
file_length = get_file_length(self.vcf_path)
437+
file_length = os.stat(self.vcf_path).st_size
461438
if num_parts is not None:
462439
target_part_size_bytes = file_length // num_parts
463440
elif target_part_size_bytes is not None:

0 commit comments

Comments
 (0)