23
23
import zarr
24
24
25
25
from . import core
26
+ from . import provenance
26
27
27
28
logger = logging .getLogger (__name__ )
28
29
@@ -178,6 +179,7 @@ def make_field_def(name, vcf_type, vcf_number):
178
179
def scan_vcfs (paths , show_progress ):
179
180
partitions = []
180
181
vcf_metadata = None
182
+ header = None
181
183
logger .info (f"Scanning { len (paths )} VCFs" )
182
184
for path in tqdm .tqdm (paths , desc = "Scan " , disable = not show_progress ):
183
185
vcf = cyvcf2 .VCF (path )
@@ -215,6 +217,9 @@ def scan_vcfs(paths, show_progress):
215
217
216
218
if vcf_metadata is None :
217
219
vcf_metadata = metadata
220
+ # We just take the first header, assuming the others
221
+ # are compatible.
222
+ header = vcf .raw_header
218
223
else :
219
224
if metadata != vcf_metadata :
220
225
raise ValueError ("Incompatible VCF chunks" )
@@ -230,7 +235,7 @@ def scan_vcfs(paths, show_progress):
230
235
)
231
236
partitions .sort (key = lambda x : x .first_position )
232
237
vcf_metadata .partitions = partitions
233
- return vcf_metadata
238
+ return vcf_metadata , header
234
239
235
240
236
241
def sanitise_value_bool (buff , j , value ):
@@ -668,9 +673,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
668
673
669
674
670
675
class PickleChunkedVcf (collections .abc .Mapping ):
671
- def __init__ (self , path , metadata ):
676
+ def __init__ (self , path , metadata , vcf_header ):
672
677
self .path = path
673
678
self .metadata = metadata
679
+ self .vcf_header = vcf_header
674
680
675
681
self .columns = {}
676
682
for field in self .metadata .fields :
@@ -753,7 +759,9 @@ def load(path):
753
759
path = pathlib .Path (path )
754
760
with open (path / "metadata.json" ) as f :
755
761
metadata = VcfMetadata .fromdict (json .load (f ))
756
- return PickleChunkedVcf (path , metadata )
762
+ with open (path / "header.txt" ) as f :
763
+ header = f .read ()
764
+ return PickleChunkedVcf (path , metadata , header )
757
765
758
766
@staticmethod
759
767
def convert_partition (
@@ -820,8 +828,8 @@ def convert(
820
828
):
821
829
out_path = pathlib .Path (out_path )
822
830
# TODO make scan work in parallel using general progress code too
823
- vcf_metadata = scan_vcfs (vcfs , show_progress = show_progress )
824
- pcvcf = PickleChunkedVcf (out_path , vcf_metadata )
831
+ vcf_metadata , header = scan_vcfs (vcfs , show_progress = show_progress )
832
+ pcvcf = PickleChunkedVcf (out_path , vcf_metadata , header )
825
833
pcvcf .mkdirs ()
826
834
827
835
total_variants = sum (
@@ -855,6 +863,8 @@ def convert(
855
863
856
864
with open (out_path / "metadata.json" , "w" ) as f :
857
865
json .dump (vcf_metadata .asdict (), f , indent = 4 )
866
+ with open (out_path / "header.txt" , "w" ) as f :
867
+ f .write (header )
858
868
return pcvcf
859
869
860
870
@@ -1214,7 +1224,6 @@ def encode_contig(self, pcvcf, contig_names, contig_lengths):
1214
1224
logger .debug ("Contig done" )
1215
1225
1216
1226
def encode_filters (self , pcvcf , filter_names ):
1217
- self .root .attrs ["filters" ] = filter_names
1218
1227
array = self .root .array (
1219
1228
"filter_id" ,
1220
1229
filter_names ,
@@ -1277,6 +1286,10 @@ def convert(
1277
1286
for column in conversion_spec .columns .values ():
1278
1287
sgvcf .create_array (column )
1279
1288
1289
+ sgvcf .root .attrs ["vcf_zarr_version" ] = "0.2"
1290
+ sgvcf .root .attrs ["vcf_header" ] = pcvcf .vcf_header
1291
+ sgvcf .root .attrs ["source" ] = f"bio2zarr-{ provenance .__version__ } "
1292
+
1280
1293
progress_config = core .ProgressConfig (
1281
1294
total = pcvcf .total_uncompressed_bytes ,
1282
1295
title = "Encode" ,
0 commit comments