@@ -178,6 +178,7 @@ def make_field_def(name, vcf_type, vcf_number):
178
178
def scan_vcfs (paths , show_progress ):
179
179
partitions = []
180
180
vcf_metadata = None
181
+ header = None
181
182
logger .info (f"Scanning { len (paths )} VCFs" )
182
183
for path in tqdm .tqdm (paths , desc = "Scan " , disable = not show_progress ):
183
184
vcf = cyvcf2 .VCF (path )
@@ -215,6 +216,9 @@ def scan_vcfs(paths, show_progress):
215
216
216
217
if vcf_metadata is None :
217
218
vcf_metadata = metadata
219
+ # We just take the first header, assuming the others
220
+ # are compatible.
221
+ header = vcf .raw_header
218
222
else :
219
223
if metadata != vcf_metadata :
220
224
raise ValueError ("Incompatible VCF chunks" )
@@ -230,7 +234,7 @@ def scan_vcfs(paths, show_progress):
230
234
)
231
235
partitions .sort (key = lambda x : x .first_position )
232
236
vcf_metadata .partitions = partitions
233
- return vcf_metadata
237
+ return vcf_metadata , header
234
238
235
239
236
240
def sanitise_value_bool (buff , j , value ):
@@ -668,9 +672,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
668
672
669
673
670
674
class PickleChunkedVcf (collections .abc .Mapping ):
671
- def __init__ (self , path , metadata ):
675
+ def __init__ (self , path , metadata , vcf_header ):
672
676
self .path = path
673
677
self .metadata = metadata
678
+ self .vcf_header = vcf_header
674
679
675
680
self .columns = {}
676
681
for field in self .metadata .fields :
@@ -753,7 +758,9 @@ def load(path):
753
758
path = pathlib .Path (path )
754
759
with open (path / "metadata.json" ) as f :
755
760
metadata = VcfMetadata .fromdict (json .load (f ))
756
- return PickleChunkedVcf (path , metadata )
761
+ with open (path / "header.txt" ) as f :
762
+ header = f .read ()
763
+ return PickleChunkedVcf (path , metadata , header )
757
764
758
765
@staticmethod
759
766
def convert_partition (
@@ -820,8 +827,8 @@ def convert(
820
827
):
821
828
out_path = pathlib .Path (out_path )
822
829
# TODO make scan work in parallel using general progress code too
823
- vcf_metadata = scan_vcfs (vcfs , show_progress = show_progress )
824
- pcvcf = PickleChunkedVcf (out_path , vcf_metadata )
830
+ vcf_metadata , header = scan_vcfs (vcfs , show_progress = show_progress )
831
+ pcvcf = PickleChunkedVcf (out_path , vcf_metadata , header )
825
832
pcvcf .mkdirs ()
826
833
827
834
total_variants = sum (
@@ -855,6 +862,8 @@ def convert(
855
862
856
863
with open (out_path / "metadata.json" , "w" ) as f :
857
864
json .dump (vcf_metadata .asdict (), f , indent = 4 )
865
+ with open (out_path / "header.txt" , "w" ) as f :
866
+ f .write (header )
858
867
return pcvcf
859
868
860
869
@@ -1214,7 +1223,6 @@ def encode_contig(self, pcvcf, contig_names, contig_lengths):
1214
1223
logger .debug ("Contig done" )
1215
1224
1216
1225
def encode_filters (self , pcvcf , filter_names ):
1217
- self .root .attrs ["filters" ] = filter_names
1218
1226
array = self .root .array (
1219
1227
"filter_id" ,
1220
1228
filter_names ,
@@ -1277,6 +1285,9 @@ def convert(
1277
1285
for column in conversion_spec .columns .values ():
1278
1286
sgvcf .create_array (column )
1279
1287
1288
+ sgvcf .root .attrs ["vcf_zarr_version" ] = "0.2"
1289
+ sgvcf .root .attrs ["vcf_header" ] = pcvcf .vcf_header
1290
+
1280
1291
progress_config = core .ProgressConfig (
1281
1292
total = pcvcf .total_uncompressed_bytes ,
1282
1293
title = "Encode" ,
0 commit comments