Skip to content

Commit 19c6ac2

Browse files
Good implementation of no-indexes for IndexedVcf
1 parent e741038 commit 19c6ac2

File tree

2 files changed

+109
-29
lines changed

2 files changed

+109
-29
lines changed

bio2zarr/vcf_utils.py

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,7 @@ class Region:
8989
end: Optional[int] = None
9090

9191
def __post_init__(self):
92-
if self.contig is None:
93-
return
94-
92+
assert self.contig is not None
9593
if self.start is not None:
9694
self.start = int(self.start)
9795
assert self.start > 0
@@ -399,6 +397,9 @@ class VcfIndexType(Enum):
399397
class IndexedVcf(contextlib.AbstractContextManager):
400398
def __init__(self, vcf_path, index_path=None):
401399
self.vcf = None
400+
self.file_type = None
401+
self.index_type = None
402+
402403
vcf_path = pathlib.Path(vcf_path)
403404
if not vcf_path.exists():
404405
raise FileNotFoundError(vcf_path)
@@ -411,27 +412,28 @@ def __init__(self, vcf_path, index_path=None):
411412
vcf_path.suffix + VcfIndexType.CSI.value
412413
)
413414
if not index_path.exists():
414-
# Use this as a proxy for "no index"
415-
index_path = vcf_path
415+
# No supported index found
416+
index_path = None
416417
else:
417418
index_path = pathlib.Path(index_path)
419+
if not index_path.exists():
420+
raise FileNotFoundError(
421+
f"Specified index path {index_path} does not exist"
422+
)
418423

419424
self.vcf_path = vcf_path
420425
self.index_path = index_path
421-
self.file_type = None
422-
self.index_type = None
423-
424-
if index_path.suffix == VcfIndexType.CSI.value:
425-
self.index_type = VcfIndexType.CSI
426-
elif index_path.suffix == VcfIndexType.TABIX.value:
427-
self.index_type = VcfIndexType.TABIX
428-
self.file_type = VcfFileType.VCF
429-
# else:
430-
431-
# raise ValueError("Only .tbi or .csi indexes are supported.")
426+
if index_path is not None:
427+
if index_path.suffix == VcfIndexType.CSI.value:
428+
self.index_type = VcfIndexType.CSI
429+
elif index_path.suffix == VcfIndexType.TABIX.value:
430+
self.index_type = VcfIndexType.TABIX
431+
self.file_type = VcfFileType.VCF
432+
else:
433+
raise ValueError("Only .tbi or .csi indexes are supported.")
432434

433435
self.vcf = cyvcf2.VCF(vcf_path)
434-
if self.index_type is not None:
436+
if self.index_path is not None:
435437
self.vcf.set_index(str(self.index_path))
436438

437439
logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
@@ -449,7 +451,15 @@ def __init__(self, vcf_path, index_path=None):
449451
self.sequence_names = self.vcf.seqnames
450452
elif self.index_type == VcfIndexType.TABIX:
451453
self.index = read_tabix(self.index_path)
454+
self.file_type = VcfFileType.VCF
452455
self.sequence_names = self.index.sequence_names
456+
else:
457+
assert self.index is None
458+
var = next(self.vcf)
459+
self.sequence_names = [var.CHROM]
460+
self.vcf.close()
461+
# There doesn't seem to be a way to reset the iterator
462+
self.vcf = cyvcf2.VCF(vcf_path)
453463

454464
def __exit__(self, exc_type, exc_val, exc_tb):
455465
if self.vcf is not None:
@@ -459,7 +469,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
459469

460470
def contig_record_counts(self):
461471
if self.index is None:
462-
return {None: np.inf}
472+
return {self.sequence_names[0]: np.inf}
463473
d = dict(zip(self.sequence_names, self.index.record_counts))
464474
if self.file_type == VcfFileType.BCF:
465475
d = {k: v for k, v in d.items() if v > 0}
@@ -468,10 +478,15 @@ def contig_record_counts(self):
468478
def count_variants(self, region):
469479
return sum(1 for _ in self.variants(region))
470480

471-
def variants(self, region):
481+
def variants(self, region=None):
472482
if self.index is None:
473-
assert region.contig is None
474-
yield from self.vcf
483+
contig = self.sequence_names[0]
484+
if region is not None:
485+
assert region.contig == contig
486+
for var in self.vcf:
487+
if var.CHROM != contig:
488+
raise ValueError("Multi-contig VCFs must be indexed")
489+
yield var
475490
else:
476491
start = 1 if region.start is None else region.start
477492
for var in self.vcf(str(region)):
@@ -498,9 +513,6 @@ def partition_into_regions(
498513
num_parts: Optional[int] = None,
499514
target_part_size: Union[None, int, str] = None,
500515
):
501-
if self.index is None:
502-
return [Region()]
503-
504516
if num_parts is None and target_part_size is None:
505517
raise ValueError("One of num_parts or target_part_size must be specified")
506518

@@ -520,6 +532,9 @@ def partition_into_regions(
520532
if target_part_size_bytes < 1:
521533
raise ValueError("target_part_size must be positive")
522534

535+
if self.index is None:
536+
return [Region(self.sequence_names[0])]
537+
523538
# Calculate the desired part file boundaries
524539
file_length = os.stat(self.vcf_path).st_size
525540
if num_parts is not None:

tests/test_vcf_utils.py

Lines changed: 70 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pathlib
2+
import shutil
23

34
import numpy as np
45
import pytest
@@ -159,6 +160,27 @@ def test_partition_into_n_parts(self, index_file, total_records, num_parts):
159160
assert np.sum(part_variant_counts) == total_records
160161
assert_part_counts_non_zero(part_variant_counts, index_file)
161162

163+
@pytest.mark.parametrize(
164+
("vcf_file", "total_records"),
165+
[
166+
("1kg_2020_chrM.vcf.gz", 23),
167+
("1kg_2020_chr20_annotations.bcf", 21),
168+
],
169+
)
170+
@pytest.mark.parametrize("num_parts", [1, 2, 3])
171+
def test_partition_into_n_parts_unindexed(
172+
self, tmp_path, vcf_file, total_records, num_parts
173+
):
174+
copy_path = tmp_path / vcf_file
175+
shutil.copyfile(data_path / vcf_file, copy_path)
176+
indexed_vcf = vcf_utils.IndexedVcf(copy_path)
177+
regions = list(indexed_vcf.partition_into_regions(num_parts=num_parts))
178+
assert len(regions) == 1
179+
part_variant_counts = np.array(
180+
[indexed_vcf.count_variants(region) for region in regions]
181+
)
182+
assert np.sum(part_variant_counts) == total_records
183+
162184
def test_tabix_multi_chrom_bug(self):
163185
indexed_vcf = self.get_instance("multi_contig.vcf.gz.tbi")
164186
regions = list(indexed_vcf.partition_into_regions(num_parts=10))
@@ -213,11 +235,54 @@ def test_partition_invalid_arguments(self):
213235
with pytest.raises(ValueError, match=r"target_part_size must be positive"):
214236
indexed_vcf.partition_into_regions(target_part_size=0)
215237

216-
def test_bad_index(self):
217-
with pytest.raises(
218-
ValueError, match=r"Only .tbi or .csi indexes are supported."
219-
):
220-
vcf_utils.IndexedVcf(data_path / "sample.vcf.gz", "y")
238+
@pytest.mark.parametrize("path", ["y", data_path / "xxx", "/x/y.csi"])
239+
def test_missing_index_file(self, path):
240+
with pytest.raises(FileNotFoundError, match="Specified index path"):
241+
vcf_utils.IndexedVcf(data_path / "sample.vcf.gz", path)
242+
243+
def test_bad_index_format(self):
244+
vcf_file = data_path / "sample.vcf.gz"
245+
with pytest.raises(ValueError, match="Only .tbi or .csi indexes"):
246+
vcf_utils.IndexedVcf(vcf_file, vcf_file)
247+
248+
@pytest.mark.parametrize(
249+
"filename",
250+
[
251+
"1kg_2020_chrM.vcf.gz",
252+
"1kg_2020_chrM.bcf",
253+
"1kg_2020_chr20_annotations.bcf",
254+
"chr_m_indels.vcf.gz",
255+
"NA12878.prod.chr20snippet.g.vcf.gz",
256+
],
257+
)
258+
def test_unindexed_single_contig(self, tmp_path, filename):
259+
f1 = vcf_utils.IndexedVcf(data_path / filename)
260+
assert f1.index is not None
261+
copy_path = tmp_path / filename
262+
shutil.copyfile(data_path / filename, copy_path)
263+
f2 = vcf_utils.IndexedVcf(copy_path)
264+
assert f2.index is None
265+
crc1 = f1.contig_record_counts()
266+
assert len(crc1) == 1
267+
contig = next(iter(crc1.keys()))
268+
assert f2.contig_record_counts() == {contig: np.inf}
269+
region = vcf_utils.Region(contig)
270+
# The full variants returned by cyvcf2 don't compare for equality,
271+
# so just check the chrom/pos values
272+
v1 = [(v.CHROM, v.POS) for v in f1.variants(region)]
273+
v2 = [(v.CHROM, v.POS) for v in f2.variants(region)]
274+
assert v1 == v2
275+
276+
@pytest.mark.parametrize(
277+
"filename",
278+
["sample.vcf.gz", "sample.bcf", "multi_contig.vcf.gz"],
279+
)
280+
def test_unindexed_multi_contig(self, tmp_path, filename):
281+
copy_path = tmp_path / filename
282+
shutil.copyfile(data_path / filename, copy_path)
283+
f = vcf_utils.IndexedVcf(copy_path)
284+
with pytest.raises(ValueError, match="Multi-contig VCFs must be indexed"):
285+
list(f.variants())
221286

222287

223288
class TestCsiIndex:

0 commit comments

Comments
 (0)