1
1
import sys
2
2
3
3
import numpy .testing as nt
4
+ import pysam
4
5
import pytest
5
6
import sgkit as sg
6
7
@@ -41,12 +42,13 @@ def assert_ts_ds_equal(ts, ds, ploidy=1):
41
42
nt .assert_equal (ds .variant_position , ts .sites_position )
42
43
43
44
44
def write_vcf(ts, vcf_path, contig_id="1", indexed=False):
    """
    Write ``ts`` to ``vcf_path`` as a text VCF and return the resulting path.

    :param ts: Object exposing ``write_vcf(file, contig_id=...)``.
    :param vcf_path: Destination of the text VCF. When ``indexed`` is true
        it must provide ``suffix`` and ``with_suffix`` (e.g. a
        ``pathlib.Path``).
    :param contig_id: Contig identifier forwarded to ``ts.write_vcf``.
    :param indexed: If true, run ``pysam.tabix_index`` on the written file
        and return the path with ``.gz`` appended instead of ``vcf_path``.
    :return: ``vcf_path`` itself, or its ``.gz`` sibling when ``indexed``.
    """
    with open(vcf_path, "w") as f:
        ts.write_vcf(f, contig_id=contig_id)
    if indexed:
        # This also compresses the input file
        pysam.tabix_index(str(vcf_path), preset="vcf")
        vcf_path = vcf_path.with_suffix(vcf_path.suffix + ".gz")
    return vcf_path
51
53
52
54
@@ -75,6 +77,7 @@ def test_multi_contig(self, contig_ids, tmp_path):
75
77
vcfs .append (vcf_path )
76
78
tss [contig_id ] = ts
77
79
80
+ def validate_tss_vcf_list (self , contig_ids , tss , vcfs , tmp_path ):
78
81
out = tmp_path / "example.vcf.zarr"
79
82
vcf2zarr .convert (vcfs , out )
80
83
ds = sg .load_dataset (out ).set_index (
@@ -93,6 +96,34 @@ def test_multi_contig(self, contig_ids, tmp_path):
93
96
dss = ds .sel (variants = (contig , slice (0 , None )))
94
97
assert_ts_ds_equal (tss [contig_id ], dss )
95
98
99
@pytest.mark.parametrize("indexed", [True, False])
def test_indexed(self, indexed, tmp_path):
    """
    Convert a single simulated VCF, written either as plain text or with
    ``indexed=True``, and compare the loaded dataset with the simulation.
    """
    zarr_path = tmp_path / "example.vcf.zarr"
    sim_ts = run_simulation(num_samples=12, seed=34)
    source_vcf = write_vcf(sim_ts, tmp_path / "sim.vcf", indexed=indexed)
    vcf2zarr.convert([source_vcf], zarr_path)
    loaded = sg.load_dataset(zarr_path)
    assert_ts_ds_equal(sim_ts, loaded)
107
+
108
@pytest.mark.parametrize("num_contigs", [2, 3, 6])
def test_mixed_indexed(self, num_contigs, tmp_path):
    """
    Convert one VCF per contig, alternating between plain and indexed
    files, and validate the combined result.

    Contig IDs are ``x0``, ``x1``, ...; each contig is simulated with its
    own seed (starting at 1) and written to ``<contig_id>.vcf`` under
    ``tmp_path``.
    """
    # No whitespace inside the IDs: they are used both as VCF contig names
    # and as file-name stems.
    contig_ids = [f"x{j}" for j in range(num_contigs)]

    vcfs = []
    tss = {}
    for seed, contig_id in enumerate(contig_ids, 1):
        ts = run_simulation(num_samples=3, seed=seed)
        vcf_path = write_vcf(
            ts,
            tmp_path / f"{contig_id}.vcf",
            contig_id=contig_id,
            # Even seeds are written with indexed=True, odd seeds without,
            # so the input list mixes both kinds of file.
            indexed=seed % 2 == 0,
        )
        vcfs.append(vcf_path)
        tss[contig_id] = ts

    self.validate_tss_vcf_list(contig_ids, tss, vcfs, tmp_path)
126
+
96
127
97
128
# https://github.com/sgkit-dev/bio2zarr/issues/336
98
129
@pytest .mark .skipif (sys .platform == "darwin" , reason = "msprime OSX pip packages broken" )
0 commit comments