Skip to content

Commit ec219e0

Browse files
committed
Error on overlapping partitions
1 parent c7b43ed commit ec219e0

File tree

2 files changed

+39
-2
lines changed

2 files changed

+39
-2
lines changed

bio2zarr/vcf.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,21 @@ def scan_vcf(path, target_num_partitions):
284284
return metadata, vcf.raw_header
285285

286286

287+
def check_overlap(partitions):
    """Raise if two neighbouring partitions on the same contig overlap.

    Only adjacent entries are compared, so ``partitions`` is expected to be
    ordered by contig and then by region start (``scan_vcfs`` sorts the list
    that way before calling this function). With that ordering, any overlap
    between two partitions implies an overlap between some adjacent pair.

    A partition whose ``region.end`` equals the next partition's
    ``region.start`` is not treated as overlapping.

    Args:
        partitions: Sequence of objects exposing ``region.contig``,
            ``region.start`` and ``region.end``.

    Raises:
        ValueError: If an adjacent pair shares a contig and the earlier
            region ends after the later one starts. The message names the
            region that the two partitions have in common.
    """
    for prev_partition, current_partition in zip(partitions, partitions[1:]):
        prev_region = prev_partition.region
        current_region = current_partition.region
        if prev_region.contig != current_region.contig:
            continue
        if prev_region.end > current_region.start:
            # The shared span starts where the later partition starts and
            # stops at whichever of the two regions ends first (the later
            # partition may be fully contained in the earlier one).
            overlap_end = min(prev_region.end, current_region.end)
            raise ValueError(
                f"Multiple VCFs have the region "
                f"{current_region.contig}:{current_region.start}-{overlap_end}"
            )
300+
301+
287302
def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
288303
logger.info(
289304
f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
@@ -331,6 +346,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
331346
all_partitions.sort(
332347
key=lambda x: (contig_index_map[x.region.contig], x.region.start)
333348
)
349+
check_overlap(all_partitions)
334350
icf_metadata.partitions = all_partitions
335351
logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
336352
return icf_metadata, header

tests/test_vcf.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
import json
2-
32
import pytest
43
import xarray.testing as xt
54
import sgkit as sg
65

7-
from bio2zarr import vcf
6+
from bio2zarr import vcf, vcf_utils
87

98

109
@pytest.fixture(scope="module")
@@ -203,3 +202,25 @@ def test_call_genotype_phased(self, schema):
203202
},
204203
"filters": [],
205204
}
205+
206+
207+
@pytest.mark.parametrize(
    "regions",
    [
        # Second region starts inside the first and runs past its end
        [("1", 100, 200), ("1", 150, 250)],
        # First region ends one position after the second begins
        [("1", 100, 201), ("1", 200, 300)],
        # Second region lies entirely within the first
        [("1", 100, 300), ("1", 150, 250)],
        # Both regions are identical
        [("1", 100, 200), ("1", 100, 200)],
    ],
)
def test_check_overlap(regions):
    """check_overlap must reject every overlapping pair of partitions."""
    partitions = []
    for contig, start, end in regions:
        region = vcf_utils.Region(contig, start, end)
        partitions.append(vcf.VcfPartition("", region=region))

    with pytest.raises(ValueError, match="Multiple VCFs have the region"):
        vcf.check_overlap(partitions)

0 commit comments

Comments
 (0)