2
2
3
3
import pytest
4
4
from cyvcf2 import VCF
5
+ import numpy as np
5
6
6
7
from bio2zarr import vcf_utils
7
8
16
17
17
18
data_path = pathlib .Path ("tests/data/vcf/" )
18
19
19
- # bcftools index -s
20
- @pytest .mark .parametrize (["index_file" , "expected" ], [
21
- ("sample.vcf.gz.tbi" , [2 , 6 , 1 ]),
22
- ("sample.bcf.csi" , [2 , 6 , 1 ]),
23
- ("sample_no_genotypes.vcf.gz.csi" , [2 , 6 , 1 ]),
24
- ("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi" , [3450 , 16460 ]),
25
- ("CEUTrio.20.21.gatk3.4.g.bcf.csi" , [3450 , 16460 ]),
26
- ("1kg_2020_chrM.vcf.gz.tbi" , [23 ]),
27
- ("1kg_2020_chrM.vcf.gz.csi" , [23 ]),
28
- # ("1kg_2020_chrM.bcf.csi", [23]),
29
- # ("1kg_2020_chr20_annotations.bcf.csi", [21]),
30
- ])
31
- def test_index_record_count (index_file , expected ):
32
- index = vcf_utils .read_index (data_path / index_file )
33
- assert index .record_counts == expected
34
-
35
-
36
-
37
- # class TestCEUTrio2021VcfExample:
38
- # data_path = "tests/data/vcf/CEUTrio.20.21.gatk3.4.g.vcf.bgz"
39
-
40
- # @pytest.fixture(scope="class")
41
- # def index(self):
42
- # tabix_path = get_tabix_path(self.data_path)
43
- # return read_tabix(tabix_path)
44
-
45
- # def test_record_counts(self, index):
46
- # assert index.record_counts == [3450, 16460]
47
- # # print(index)
48
- # # # print(index.sequence_names)
49
- # # print(index.record_counts)
50
- # # for i, contig in enumerate(tabix.sequence_names):
51
- # # assert tabix.record_counts[i] == count_variants(vcf_path, contig)
52
-
53
- # # def test_one_region(self, index):
54
- # # parts = partition_into_regions(self.data_path, num_parts=1)
55
- # # assert parts == ["20:1-", "21"]
56
-
57
-
58
- # class TestCEUTrio2021BcfExample(TestCEUTrio2021VcfExample):
59
- # data_path = "tests/data/vcf/CEUTrio.20.21.gatk3.4.g.bcf"
60
20
61
- # @pytest.fixture(scope="class")
62
- # def index(self):
63
- # csi_path = get_csi_path(self.data_path)
64
- # return read_csi(csi_path)
21
# Expected values were computed using `bcftools index -s`.
@pytest.mark.parametrize(
    ["index_file", "expected"],
    [
        ("sample.vcf.gz.tbi", {"19": 2, "20": 6, "X": 1}),
        ("sample.bcf.csi", {"19": 2, "20": 6, "X": 1}),
        ("sample_no_genotypes.vcf.gz.csi", {"19": 2, "20": 6, "X": 1}),
        ("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi", {"20": 3450, "21": 16460}),
        ("CEUTrio.20.21.gatk3.4.g.bcf.csi", {"20": 3450, "21": 16460}),
        ("1kg_2020_chrM.vcf.gz.tbi", {"chrM": 23}),
        ("1kg_2020_chrM.vcf.gz.csi", {"chrM": 23}),
        ("1kg_2020_chrM.bcf.csi", {"chrM": 23}),
        ("1kg_2020_chr20_annotations.bcf.csi", {"chr20": 21}),
        ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", {"20": 301778}),
    ],
)
def test_index_record_count(index_file, expected):
    """Check the per-contig record counts reported for an indexed VCF/BCF.

    ``index_file`` is the name of a ``.tbi`` or ``.csi`` index under
    ``data_path``; ``expected`` maps contig name to record count.
    """
    # The data file sits next to its index and has the same name minus the
    # final extension (e.g. "sample.vcf.gz.tbi" -> "sample.vcf.gz").
    vcf_name, _, _ = index_file.rpartition(".")
    vcf_path = data_path / vcf_name
    indexed_vcf = vcf_utils.IndexedVcf(vcf_path, data_path / index_file)
    assert indexed_vcf.contig_record_counts() == expected
41
+
42
+
43
@pytest.mark.parametrize(
    ["index_file", "expected"],
    [
        ("sample.vcf.gz.tbi", ["19:1-", "20", "X"]),
        ("sample.bcf.csi", ["19:1-", "20", "X"]),
        ("sample_no_genotypes.vcf.gz.csi", ["19:1-", "20", "X"]),
        ("CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi", ["20:1-", "21"]),
        ("CEUTrio.20.21.gatk3.4.g.bcf.csi", ["20:1-", "21"]),
        ("1kg_2020_chrM.vcf.gz.tbi", ["chrM:1-"]),
        ("1kg_2020_chrM.vcf.gz.csi", ["chrM:1-"]),
        ("1kg_2020_chrM.bcf.csi", ["chrM:1-"]),
        ("1kg_2020_chr20_annotations.bcf.csi", ["chr20:49153-"]),
        ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", ["20:1-"]),
    ],
)
def test_partition_into_one_part(index_file, expected):
    """Check the regions returned when partitioning with ``num_parts=1``.

    Every returned item must be a ``vcf_utils.Region``, and the string forms
    of the regions must equal ``expected``.
    """
    # The data file sits next to its index and has the same name minus the
    # final extension (e.g. "sample.bcf.csi" -> "sample.bcf").
    vcf_name, _, _ = index_file.rpartition(".")
    vcf_path = data_path / vcf_name
    indexed_vcf = vcf_utils.IndexedVcf(vcf_path, data_path / index_file)
    regions = indexed_vcf.partition_into_regions(num_parts=1)
    assert all(isinstance(r, vcf_utils.Region) for r in regions)
    assert [str(r) for r in regions] == expected
65
64
66
65
67
66
class TestCsiIndex :
68
-
69
-
70
67
@pytest .mark .parametrize (
71
68
"filename" ,
72
69
["CEUTrio.20.21.gatk3.4.g.vcf.bgz" , "CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi" ],
@@ -77,10 +74,12 @@ def test_invalid_csi(self, filename):
77
74
78
75
79
76
class TestTabixIndex :
80
-
81
77
@pytest .mark .parametrize (
82
78
"filename" ,
83
- ["CEUTrio.20.21.gatk3.4.g.vcf.bgz" , "CEUTrio.20.21.gatk3.4.g.bcf.csi" , ],
79
+ [
80
+ "CEUTrio.20.21.gatk3.4.g.vcf.bgz" ,
81
+ "CEUTrio.20.21.gatk3.4.g.bcf.csi" ,
82
+ ],
84
83
)
85
84
def test_invalid_tbi (self , filename ):
86
85
with pytest .raises (ValueError , match = r"File not in Tabix format." ):
@@ -159,11 +158,6 @@ def test_invalid_arguments(self):
159
158
with pytest .raises (ValueError , match = r"target_part_size must be positive" ):
160
159
partition_into_regions (vcf_path , target_part_size = 0 )
161
160
162
- def test_one_part (self ):
163
- vcf_path = data_path / "CEUTrio.20.21.gatk3.4.g.vcf.bgz"
164
- parts = partition_into_regions (vcf_path , num_parts = 1 )
165
- assert parts == ["20:1-" , "21" ]
166
-
167
161
@pytest .mark .skip ("TODO" )
168
162
def test_missing_index (self , temp_path ):
169
163
vcf_path = data_path / "CEUTrio.20.21.gatk3.4.g.vcf.bgz"
0 commit comments