Skip to content

Commit 496fa31

Browse files
Refactor vcf partition tests into test_vcf_utils
1 parent 0152662 commit 496fa31

File tree

2 files changed

+95
-99
lines changed

2 files changed

+95
-99
lines changed

tests/test_vcf_partition.py

Lines changed: 0 additions & 99 deletions
This file was deleted.

tests/test_vcf_utils.py

Lines changed: 95 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@
55
from bio2zarr.tbi import read_tabix
66
from bio2zarr.vcf_partition import get_csi_path
77
from bio2zarr.vcf_partition import get_tabix_path
8+
from bio2zarr.vcf_partition import partition_into_regions
9+
810

911
from .utils import count_variants, path_for_test
1012

@@ -67,3 +69,96 @@ def test_record_counts(self, shared_datadir, vcf_file):
6769
def test_read_tabix__invalid_tbi(self, shared_datadir, file):
6870
with pytest.raises(ValueError, match=r"File not in Tabix format."):
6971
read_tabix(path_for_test(shared_datadir, file, True))
72+
73+
74+
class TestPartitionIntoRegions:
    """Exercise ``partition_into_regions`` against the indexed VCF/BCF fixtures."""

    @staticmethod
    def _variants_across(vcf_path, regions):
        """Return the number of variants found by querying each region in turn."""
        running_total = 0
        for region in regions:
            running_total += count_variants(vcf_path, region)
        return running_total

    @pytest.mark.parametrize(
        "vcf_file",
        [
            "CEUTrio.20.21.gatk3.4.g.bcf",
            "CEUTrio.20.21.gatk3.4.g.vcf.bgz",
            "NA12878.prod.chr20snippet.g.vcf.gz",
        ],
    )
    def test_num_parts(self, shared_datadir, vcf_file):
        """Splitting into four parts must not lose or duplicate any variant."""
        vcf_path = path_for_test(shared_datadir, vcf_file, True)

        regions = partition_into_regions(vcf_path, num_parts=4)
        assert regions is not None

        # The per-region counts have to add up to the whole-file count.
        counted = self._variants_across(vcf_path, regions)
        assert counted == count_variants(vcf_path)

    def test_num_parts_large(self, shared_datadir):
        """Requesting 100 parts from this fixture yields 18 regions."""
        vcf_path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz")

        regions = partition_into_regions(vcf_path, num_parts=100)
        assert regions is not None
        assert len(regions) == 18

        counted = self._variants_across(vcf_path, regions)
        assert counted == count_variants(vcf_path)

    @pytest.mark.parametrize(
        "target_part_size",
        [
            100_000,
            "100KB",
            "100 kB",
        ],
    )
    def test_target_part_size(self, shared_datadir, target_part_size):
        """Integer and string spellings of the target size all give five regions."""
        vcf_path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz")

        regions = partition_into_regions(vcf_path, target_part_size=target_part_size)
        assert regions is not None
        assert len(regions) == 5

        counted = self._variants_across(vcf_path, regions)
        assert counted == count_variants(vcf_path)

    def test_invalid_arguments(self, shared_datadir):
        """Each invalid argument combination raises ``ValueError`` with its message."""
        vcf_path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz")

        # Neither sizing argument supplied.
        neither_given = r"One of num_parts or target_part_size must be specified"
        with pytest.raises(ValueError, match=neither_given):
            partition_into_regions(vcf_path)

        # Both sizing arguments supplied at once.
        both_given = r"Only one of num_parts or target_part_size may be specified"
        with pytest.raises(ValueError, match=both_given):
            partition_into_regions(vcf_path, num_parts=4, target_part_size=100_000)

        # Zero is rejected for either sizing argument.
        with pytest.raises(ValueError, match=r"num_parts must be positive"):
            partition_into_regions(vcf_path, num_parts=0)

        with pytest.raises(ValueError, match=r"target_part_size must be positive"):
            partition_into_regions(vcf_path, target_part_size=0)

    def test_one_part(self, shared_datadir):
        """A single requested part yields the two expected region strings."""
        vcf_path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz")
        assert partition_into_regions(vcf_path, num_parts=1) == ["20:1-", "21"]

    def test_missing_index(self, shared_datadir):
        """A missing index, or one with an unsupported suffix, raises ``ValueError``."""
        vcf_path = path_for_test(
            shared_datadir, "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz", True
        )
        no_index_found = r"Cannot find .tbi or .csi file."
        with pytest.raises(ValueError, match=no_index_found):
            partition_into_regions(vcf_path, num_parts=2)

        # An explicitly supplied index whose suffix is neither .tbi nor .csi.
        bogus_index_path = path_for_test(
            shared_datadir, "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz.index", True
        )
        unsupported_index = r"Only .tbi or .csi indexes are supported."
        with pytest.raises(ValueError, match=unsupported_index):
            partition_into_regions(vcf_path, index_path=bogus_index_path, num_parts=2)

0 commit comments

Comments
 (0)