Skip to content

Commit e65f888

Browse files
Merge pull request #109 from benjeffery/check_overlap
Error on overlapping partitions
2 parents 96bf60f + ec219e0 commit e65f888

File tree

2 files changed

+39
-2
lines changed

2 files changed

+39
-2
lines changed

bio2zarr/vcf.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,21 @@ def scan_vcf(path, target_num_partitions):
284284
return metadata, vcf.raw_header
285285

286286

287+
def check_overlap(partitions):
    """
    Raise an error if two neighbouring partitions cover overlapping regions.

    Only consecutive pairs are compared, so ``partitions`` must already be
    ordered by contig and then by region start (as the caller does before
    invoking this check) for every overlap to be detected.

    :param partitions: Sequence of objects exposing ``region.contig``,
        ``region.start`` and ``region.end``.
    :raises ValueError: If two consecutive partitions on the same contig
        overlap. The message names the region shared by both partitions.
    """
    for prev_partition, current_partition in zip(partitions, partitions[1:]):
        prev_region = prev_partition.region
        current_region = current_partition.region
        if prev_region.contig != current_region.contig:
            continue
        # NOTE(review): a strict ">" treats ``end`` as exclusive. If Region
        # coordinates are inclusive, touching regions (end == start) share a
        # position and this should be ">=" - confirm against Region.
        if prev_region.end > current_region.start:
            # The shared region starts where the later partition starts and
            # stops at whichever partition ends first. An open-ended (None)
            # end on the later partition cannot shorten the overlap.
            overlap_end = prev_region.end
            if current_region.end is not None:
                overlap_end = min(overlap_end, current_region.end)
            raise ValueError(
                "Multiple VCFs have the region "
                f"{current_region.contig}:{current_region.start}-{overlap_end}"
            )
300+
301+
287302
def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
288303
logger.info(
289304
f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
@@ -331,6 +346,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
331346
all_partitions.sort(
332347
key=lambda x: (contig_index_map[x.region.contig], x.region.start)
333348
)
349+
check_overlap(all_partitions)
334350
icf_metadata.partitions = all_partitions
335351
logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
336352
return icf_metadata, header

tests/test_vcf.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
import json
2-
32
import pytest
43
import xarray.testing as xt
54
import sgkit as sg
65
import zarr
76

8-
from bio2zarr import vcf
7+
from bio2zarr import vcf, vcf_utils
98

109

1110
@pytest.fixture(scope="module")
@@ -297,3 +296,25 @@ def test_call_genotype_phased(self, schema):
297296
},
298297
"filters": [],
299298
}
299+
300+
301+
@pytest.mark.parametrize(
    "regions",
    [
        # Second region starts inside the first
        [("1", 100, 200), ("1", 150, 250)],
        # Regions share a single position
        [("1", 100, 201), ("1", 200, 300)],
        # Second region lies entirely within the first
        [("1", 100, 300), ("1", 150, 250)],
        # Identical regions
        [("1", 100, 200), ("1", 100, 200)],
    ],
)
def test_check_overlap(regions):
    """Each overlapping pair of regions must make check_overlap raise."""
    partitions = []
    for contig, start, end in regions:
        region = vcf_utils.Region(contig, start, end)
        partitions.append(vcf.VcfPartition("", region=region))

    with pytest.raises(ValueError, match="Multiple VCFs have the region"):
        vcf.check_overlap(partitions)

0 commit comments

Comments
 (0)