@@ -159,7 +159,6 @@ class VcfMetadata:
    fields: list
    partitions: list = None
    contig_lengths: list = None
-   finalised: bool = False

    @property
    def info_fields(self):
@@ -884,10 +883,15 @@ def mkdirs(self):
                part_path = col.path / f"p{j}"
                part_path.mkdir()

-    def write_metadata(self):
-        logger.info(f"Writing metadata")
-        with open(self.path / "metadata.json", "w") as f:
+    def write_metadata(self, final=False):
+        logger.info(f"Writing metadata ({'final' if final else 'initial'})")
+        with open(self.path / f"metadata.{'wip.' if not final else ''}json", "w") as f:
            json.dump(self.metadata.asdict(), f, indent=4)
+        if final:
+            try:
+                os.remove(self.path / "metadata.wip.json")
+            except FileNotFoundError:
+                pass

    def write_header(self):
        logger.info(f"Writing header")
@@ -899,7 +903,7 @@ def load_partition_summaries(self):
        not_found = []
        for j in range(self.num_partitions):
            try:
-                with open(self.path / f"partition_{j}_metadata.json") as f:
+                with open(self.path / f"p{j}_metadata.json") as f:
                    summary = json.load(f)
                    for k, v in summary['field_summaries'].items():
                        summary['field_summaries'][k] = VcfFieldSummary(**v)
@@ -916,14 +920,20 @@ def load_partition_summaries(self):
    @staticmethod
    def load(path):
        path = pathlib.Path(path)
-        with open(path / "metadata.json") as f:
-            metadata = VcfMetadata.fromdict(json.load(f))
+        final = True
+        try:
+            with open(path / "metadata.json") as f:
+                metadata = VcfMetadata.fromdict(json.load(f))
+        except FileNotFoundError:
+            with open(path / "metadata.wip.json") as f:
+                metadata = VcfMetadata.fromdict(json.load(f))
+            final = False
        with open(path / "header.txt") as f:
            header = f.read()
        pcvcf = PickleChunkedVcf(path, metadata, header)
        logger.info(
            f"Loaded PickleChunkedVcf(partitions={pcvcf.num_partitions}, "
-            f"records={pcvcf.num_records}, columns={pcvcf.num_columns})"
+            f"records={pcvcf.num_records}, columns={pcvcf.num_columns}), final={final}"
        )
        return pcvcf
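
On the reading side, load() now prefers the finalised metadata.json and falls back to metadata.wip.json, so a partially-converted directory can still be opened. Note that final is only logged, not returned; a caller that needs to branch on completion state could use an external check along these lines (a hypothetical helper, not part of the diff):

import pathlib

def is_finalised(path):
    # convert_finalise() is the only step that writes "metadata.json" and
    # removes the WIP copy, so the file's presence marks completion.
    return (pathlib.Path(path) / "metadata.json").exists()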
@@ -987,7 +997,7 @@ def convert_partition(
            "num_records": num_records,
            "field_summaries": {k: v.asdict() for k, v in tcw.field_summaries.items()}
        }
-        with open(out_path / f"partition_{partition_index}_metadata.json", "w") as f:
+        with open(out_path / f"p{partition_index}_metadata.json", "w") as f:
            json.dump(partition_metadata, f, indent=4)
        logger.info(
            f"Finish p{partition_index} {partition.vcf_path}__{partition.region}="
@@ -1007,7 +1017,7 @@ def convert_init(vcfs, out_path, *, num_partitions=1, worker_processes=1, show_p
        pcvcf = PickleChunkedVcf(out_path, vcf_metadata, header)
        pcvcf.mkdirs()

-        pcvcf.write_metadata()
+        pcvcf.write_metadata(final=False)
        pcvcf.write_header()
        return pcvcf
@@ -1035,30 +1045,19 @@ def convert_slice(self, start, stop, *, worker_processes=1, show_progress=False,
            title="Explode",
            show=show_progress,
        )
-        if stop - start == 1:
-            PickleChunkedVcf.convert_partition(
-                self.metadata,
-                start,
-                self.path,
-                column_chunk_size=column_chunk_size,
-            )
-        else:
-            with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-                for j in range(start, stop):
-                    pwm.submit(
-                        PickleChunkedVcf.convert_partition,
-                        self.metadata,
-                        j,
-                        self.path,
-                        column_chunk_size=column_chunk_size,
-                    )
-                for _ in pwm.results_as_completed():
-                    pass
+        with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
+            for j in range(start, stop):
+                pwm.submit(
+                    PickleChunkedVcf.convert_partition,
+                    self.metadata,
+                    j,
+                    self.path,
+                    column_chunk_size=column_chunk_size,
+                )
+            for _ in pwm.results_as_completed():
+                pass

    def convert_finalise(self):
-        if self.metadata.finalised:
-            raise ValueError("Already finalised")
-
        partition_summaries = self.load_partition_summaries()
        for index, summary in enumerate(partition_summaries):
            self.metadata.partitions[index].num_records = summary['num_records']
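
convert_slice also drops its single-partition special case: every range, even of length one, now goes through core.ParallelWorkManager, leaving a single code path to exercise and one place for worker errors to surface. A rough sketch of the same shape using concurrent.futures as a stand-in for the repo's own manager (an illustration, not the actual implementation):

from concurrent.futures import ProcessPoolExecutor, as_completed

def convert_range(convert_partition, metadata, path, start, stop, workers=1):
    with ProcessPoolExecutor(max_workers=workers) as pool:
        # Submit every partition through the pool, even when there is
        # only one, so a single code path handles all range sizes.
        futures = [
            pool.submit(convert_partition, metadata, j, path)
            for j in range(start, stop)
        ]
        for future in as_completed(futures):
            future.result()  # re-raise any exception from a worker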
@@ -1074,14 +1073,13 @@ def convert_finalise(self):
        for summary in partition_summaries:
            field.summary.update(summary["field_summaries"][field.full_name])

-        self.metadata.finalised = True
-        self.write_metadata()
+        self.write_metadata(final=True)

    @staticmethod
    def convert(
        vcfs, out_path, *, column_chunk_size=16, worker_processes=1, show_progress=False
    ):
-        pcvcf = PickleChunkedVcf.convert_init(vcfs, out_path, num_partitions=max(1, worker_processes * 40), worker_processes=worker_processes, show_progress=show_progress)
+        pcvcf = PickleChunkedVcf.convert_init(vcfs, out_path, num_partitions=max(1, worker_processes * 4), worker_processes=worker_processes, show_progress=show_progress)
        pcvcf.convert_slice(0, len(pcvcf.metadata.partitions), worker_processes=worker_processes, show_progress=show_progress, column_chunk_size=column_chunk_size)
        pcvcf.convert_finalise()
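
Taken together, the three-stage flow after this change looks roughly like the sketch below (names are from the diff; the surrounding wiring is hypothetical). With the finalised flag and its guard gone, completion is tracked purely by which metadata file exists on disk:

# Stage 1: plan partitions and write metadata.wip.json plus the header.
pcvcf = PickleChunkedVcf.convert_init(
    vcfs, out_path,
    num_partitions=max(1, worker_processes * 4),
    worker_processes=worker_processes,
)
# Stage 2: explode partitions; this can cover the whole range in one call
# or be split across separate invocations, since the metadata stays in
# the WIP file throughout.
pcvcf.convert_slice(
    0, len(pcvcf.metadata.partitions), worker_processes=worker_processes
)
# Stage 3: merge per-partition summaries and promote metadata.json.
pcvcf.convert_finalise()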