Skip to content

Commit 7796a04

Browse files
Add support for unindexed VCFs
1 parent 1f9274b commit 7796a04

File tree

4 files changed

+163
-31
lines changed

4 files changed

+163
-31
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# 0.1.5 2025-03-xx
22

3-
- Add support for merging contig IDs across multiple VCFs (#342)
3+
- Add support for merging contig IDs across multiple VCFs (#335)
4+
5+
- Add support for unindexed (and uncompressed) VCFs (#337)
46

57
# 0.1.4 2025-03-10
68

bio2zarr/vcf_utils.py

Lines changed: 53 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,10 @@ class Region:
8989
end: Optional[int] = None
9090

9191
def __post_init__(self):
92-
if self.start is not None:
92+
assert self.contig is not None
93+
if self.start is None:
94+
self.start = 1
95+
else:
9396
self.start = int(self.start)
9497
assert self.start > 0
9598
if self.end is not None:
@@ -396,6 +399,9 @@ class VcfIndexType(Enum):
396399
class IndexedVcf(contextlib.AbstractContextManager):
397400
def __init__(self, vcf_path, index_path=None):
398401
self.vcf = None
402+
self.file_type = None
403+
self.index_type = None
404+
399405
vcf_path = pathlib.Path(vcf_path)
400406
if not vcf_path.exists():
401407
raise FileNotFoundError(vcf_path)
@@ -408,30 +414,34 @@ def __init__(self, vcf_path, index_path=None):
408414
vcf_path.suffix + VcfIndexType.CSI.value
409415
)
410416
if not index_path.exists():
411-
raise FileNotFoundError(
412-
f"Cannot find .tbi or .csi file for {vcf_path}"
413-
)
417+
# No supported index found
418+
index_path = None
414419
else:
415420
index_path = pathlib.Path(index_path)
421+
if not index_path.exists():
422+
raise FileNotFoundError(
423+
f"Specified index path {index_path} does not exist"
424+
)
416425

417426
self.vcf_path = vcf_path
418427
self.index_path = index_path
419-
self.file_type = None
420-
self.index_type = None
421-
422-
if index_path.suffix == VcfIndexType.CSI.value:
423-
self.index_type = VcfIndexType.CSI
424-
elif index_path.suffix == VcfIndexType.TABIX.value:
425-
self.index_type = VcfIndexType.TABIX
426-
self.file_type = VcfFileType.VCF
427-
else:
428-
raise ValueError("Only .tbi or .csi indexes are supported.")
428+
if index_path is not None:
429+
if index_path.suffix == VcfIndexType.CSI.value:
430+
self.index_type = VcfIndexType.CSI
431+
elif index_path.suffix == VcfIndexType.TABIX.value:
432+
self.index_type = VcfIndexType.TABIX
433+
self.file_type = VcfFileType.VCF
434+
else:
435+
raise ValueError("Only .tbi or .csi indexes are supported.")
429436

430437
self.vcf = cyvcf2.VCF(vcf_path)
431-
self.vcf.set_index(str(self.index_path))
438+
if self.index_path is not None:
439+
self.vcf.set_index(str(self.index_path))
440+
432441
logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
433442
self.sequence_names = None
434443

444+
self.index = None
435445
if self.index_type == VcfIndexType.CSI:
436446
# Determine the file-type based on the "aux" field.
437447
self.index = read_csi(self.index_path)
@@ -441,9 +451,17 @@ def __init__(self, vcf_path, index_path=None):
441451
self.sequence_names = self.index.parse_vcf_aux()
442452
else:
443453
self.sequence_names = self.vcf.seqnames
444-
else:
454+
elif self.index_type == VcfIndexType.TABIX:
445455
self.index = read_tabix(self.index_path)
456+
self.file_type = VcfFileType.VCF
446457
self.sequence_names = self.index.sequence_names
458+
else:
459+
assert self.index is None
460+
var = next(self.vcf)
461+
self.sequence_names = [var.CHROM]
462+
self.vcf.close()
463+
# There doesn't seem to be a way to reset the iterator
464+
self.vcf = cyvcf2.VCF(vcf_path)
447465

448466
def __exit__(self, exc_type, exc_val, exc_tb):
449467
if self.vcf is not None:
@@ -452,6 +470,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
452470
return False
453471

454472
def contig_record_counts(self):
473+
if self.index is None:
474+
return {self.sequence_names[0]: np.inf}
455475
d = dict(zip(self.sequence_names, self.index.record_counts))
456476
if self.file_type == VcfFileType.BCF:
457477
d = {k: v for k, v in d.items() if v > 0}
@@ -460,12 +480,21 @@ def contig_record_counts(self):
460480
def count_variants(self, region):
461481
return sum(1 for _ in self.variants(region))
462482

463-
def variants(self, region):
464-
start = 1 if region.start is None else region.start
465-
for var in self.vcf(str(region)):
466-
# Need to filter because of indels overlapping the region
467-
if var.POS >= start:
483+
def variants(self, region=None):
484+
if self.index is None:
485+
contig = self.sequence_names[0]
486+
if region is not None:
487+
assert region.contig == contig
488+
for var in self.vcf:
489+
if var.CHROM != contig:
490+
raise ValueError("Multi-contig VCFs must be indexed")
468491
yield var
492+
else:
493+
start = 1 if region.start is None else region.start
494+
for var in self.vcf(str(region)):
495+
# Need to filter because of indels overlapping the region
496+
if var.POS >= start:
497+
yield var
469498

470499
def _filter_empty_and_refine(self, regions):
471500
"""
@@ -505,6 +534,9 @@ def partition_into_regions(
505534
if target_part_size_bytes < 1:
506535
raise ValueError("target_part_size must be positive")
507536

537+
if self.index is None:
538+
return [Region(self.sequence_names[0])]
539+
508540
# Calculate the desired part file boundaries
509541
file_length = os.stat(self.vcf_path).st_size
510542
if num_parts is not None:

tests/test_simulated_data.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,14 @@ def assert_ts_ds_equal(ts, ds, ploidy=1):
4242
nt.assert_equal(ds.variant_position, ts.sites_position)
4343

4444

45-
def write_vcf(ts, vcf_path, contig_id="1"):
45+
def write_vcf(ts, vcf_path, contig_id="1", indexed=False):
4646
with open(vcf_path, "w") as f:
4747
ts.write_vcf(f, contig_id=contig_id)
48-
# This also compresses the input file
49-
pysam.tabix_index(str(vcf_path), preset="vcf")
50-
return vcf_path.with_suffix(vcf_path.suffix + ".gz")
48+
if indexed:
49+
# This also compresses the input file
50+
pysam.tabix_index(str(vcf_path), preset="vcf")
51+
vcf_path = vcf_path.with_suffix(vcf_path.suffix + ".gz")
52+
return vcf_path
5153

5254

5355
# https://github.com/sgkit-dev/bio2zarr/issues/336
@@ -75,6 +77,9 @@ def test_multi_contig(self, contig_ids, tmp_path):
7577
vcfs.append(vcf_path)
7678
tss[contig_id] = ts
7779

80+
self.validate_tss_vcf_list(contig_ids, tss, vcfs, tmp_path)
81+
82+
def validate_tss_vcf_list(self, contig_ids, tss, vcfs, tmp_path):
7883
out = tmp_path / "example.vcf.zarr"
7984
vcf2zarr.convert(vcfs, out)
8085
ds = sg.load_dataset(out).set_index(
@@ -93,6 +98,34 @@ def test_multi_contig(self, contig_ids, tmp_path):
9398
dss = ds.sel(variants=(contig, slice(0, None)))
9499
assert_ts_ds_equal(tss[contig_id], dss)
95100

101+
@pytest.mark.parametrize("indexed", [True, False])
102+
def test_indexed(self, indexed, tmp_path):
103+
ts = run_simulation(num_samples=12, seed=34)
104+
vcf_path = write_vcf(ts, tmp_path / "sim.vcf", indexed=indexed)
105+
out = tmp_path / "example.vcf.zarr"
106+
vcf2zarr.convert([vcf_path], out)
107+
ds = sg.load_dataset(out)
108+
assert_ts_ds_equal(ts, ds)
109+
110+
@pytest.mark.parametrize("num_contigs", [2, 3, 6])
111+
def test_mixed_indexed(self, num_contigs, tmp_path):
112+
contig_ids = [f"x{j}" for j in range(num_contigs)]
113+
114+
vcfs = []
115+
tss = {}
116+
for seed, contig_id in enumerate(contig_ids, 1):
117+
ts = run_simulation(num_samples=3, seed=seed)
118+
vcf_path = write_vcf(
119+
ts,
120+
tmp_path / f"{contig_id}.vcf",
121+
contig_id=contig_id,
122+
indexed=seed % 2 == 0,
123+
)
124+
vcfs.append(vcf_path)
125+
tss[contig_id] = ts
126+
127+
self.validate_tss_vcf_list(contig_ids, tss, vcfs, tmp_path)
128+
96129

97130
# https://github.com/sgkit-dev/bio2zarr/issues/336
98131
@pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken")

tests/test_vcf_utils.py

Lines changed: 70 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pathlib
2+
import shutil
23

34
import numpy as np
45
import pytest
@@ -159,6 +160,27 @@ def test_partition_into_n_parts(self, index_file, total_records, num_parts):
159160
assert np.sum(part_variant_counts) == total_records
160161
assert_part_counts_non_zero(part_variant_counts, index_file)
161162

163+
@pytest.mark.parametrize(
164+
("vcf_file", "total_records"),
165+
[
166+
("1kg_2020_chrM.vcf.gz", 23),
167+
("1kg_2020_chr20_annotations.bcf", 21),
168+
],
169+
)
170+
@pytest.mark.parametrize("num_parts", [1, 2, 3])
171+
def test_partition_into_n_parts_unindexed(
172+
self, tmp_path, vcf_file, total_records, num_parts
173+
):
174+
copy_path = tmp_path / vcf_file
175+
shutil.copyfile(data_path / vcf_file, copy_path)
176+
indexed_vcf = vcf_utils.IndexedVcf(copy_path)
177+
regions = list(indexed_vcf.partition_into_regions(num_parts=num_parts))
178+
assert len(regions) == 1
179+
part_variant_counts = np.array(
180+
[indexed_vcf.count_variants(region) for region in regions]
181+
)
182+
assert np.sum(part_variant_counts) == total_records
183+
162184
def test_tabix_multi_chrom_bug(self):
163185
indexed_vcf = self.get_instance("multi_contig.vcf.gz.tbi")
164186
regions = list(indexed_vcf.partition_into_regions(num_parts=10))
@@ -213,11 +235,54 @@ def test_partition_invalid_arguments(self):
213235
with pytest.raises(ValueError, match=r"target_part_size must be positive"):
214236
indexed_vcf.partition_into_regions(target_part_size=0)
215237

216-
def test_bad_index(self):
217-
with pytest.raises(
218-
ValueError, match=r"Only .tbi or .csi indexes are supported."
219-
):
220-
vcf_utils.IndexedVcf(data_path / "sample.vcf.gz", "y")
238+
@pytest.mark.parametrize("path", ["y", data_path / "xxx", "/x/y.csi"])
239+
def test_missing_index_file(self, path):
240+
with pytest.raises(FileNotFoundError, match="Specified index path"):
241+
vcf_utils.IndexedVcf(data_path / "sample.vcf.gz", path)
242+
243+
def test_bad_index_format(self):
244+
vcf_file = data_path / "sample.vcf.gz"
245+
with pytest.raises(ValueError, match="Only .tbi or .csi indexes"):
246+
vcf_utils.IndexedVcf(vcf_file, vcf_file)
247+
248+
@pytest.mark.parametrize(
249+
"filename",
250+
[
251+
"1kg_2020_chrM.vcf.gz",
252+
"1kg_2020_chrM.bcf",
253+
"1kg_2020_chr20_annotations.bcf",
254+
"chr_m_indels.vcf.gz",
255+
"NA12878.prod.chr20snippet.g.vcf.gz",
256+
],
257+
)
258+
def test_unindexed_single_contig(self, tmp_path, filename):
259+
f1 = vcf_utils.IndexedVcf(data_path / filename)
260+
assert f1.index is not None
261+
copy_path = tmp_path / filename
262+
shutil.copyfile(data_path / filename, copy_path)
263+
f2 = vcf_utils.IndexedVcf(copy_path)
264+
assert f2.index is None
265+
crc1 = f1.contig_record_counts()
266+
assert len(crc1) == 1
267+
contig = next(iter(crc1.keys()))
268+
assert f2.contig_record_counts() == {contig: np.inf}
269+
region = vcf_utils.Region(contig)
270+
# The full variants returned by cyvcf2 don't compare for equality,
271+
# so just check the chrom/pos values
272+
v1 = [(v.CHROM, v.POS) for v in f1.variants(region)]
273+
v2 = [(v.CHROM, v.POS) for v in f2.variants(region)]
274+
assert v1 == v2
275+
276+
@pytest.mark.parametrize(
277+
"filename",
278+
["sample.vcf.gz", "sample.bcf", "multi_contig.vcf.gz"],
279+
)
280+
def test_unindexed_multi_contig(self, tmp_path, filename):
281+
copy_path = tmp_path / filename
282+
shutil.copyfile(data_path / filename, copy_path)
283+
f = vcf_utils.IndexedVcf(copy_path)
284+
with pytest.raises(ValueError, match="Multi-contig VCFs must be indexed"):
285+
list(f.variants())
221286

222287

223288
class TestCsiIndex:

0 commit comments

Comments
 (0)