@@ -69,6 +69,13 @@ def update(self, other):
        self.min_value = min(self.min_value, other.min_value)
        self.max_value = max(self.max_value, other.max_value)

+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    @staticmethod
+    def fromdict(d):
+        return VcfFieldSummary(**d)
+

@dataclasses.dataclass
class VcfField:
@@ -185,6 +192,8 @@ def fromdict(d):
        )
        fields = [VcfField.fromdict(fd) for fd in d["fields"]]
        partitions = [VcfPartition(**pd) for pd in d["partitions"]]
+        for p in partitions:
+            p.region = vcf_utils.Region(**p.region)
        d = d.copy()
        d["fields"] = fields
        d["partitions"] = partitions
@@ -270,7 +279,7 @@ def scan_vcf(path, target_num_partitions):


def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
-    logger.info(f"Scanning {len(paths)} VCFs")
+    logger.info(f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions.")
    progress_config = core.ProgressConfig(
        total=len(paths),
        units="files",
@@ -279,7 +288,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
    )
    with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
        for path in paths:
-            pwm.submit(scan_vcf, path, target_num_partitions)
+            pwm.submit(scan_vcf, path, max(1, target_num_partitions // len(paths)))
        results = list(pwm.results_as_completed())

    # Sort to make the ordering deterministic
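For orientation: the per-file target passed to `scan_vcf` is now the overall target divided across the input files. A quick illustration of that arithmetic with hypothetical numbers:

```python
# Hypothetical numbers: three input VCFs, ten partitions requested overall.
paths = ["a.vcf.gz", "b.vcf.gz", "c.vcf.gz"]
target_num_partitions = 10
per_file_target = max(1, target_num_partitions // len(paths))
assert per_file_target == 3  # each scan_vcf call is asked for at most 3 partitions
```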
@@ -308,6 +317,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
        key=lambda x: (contig_index_map[x.region.contig], x.region.start)
    )
    vcf_metadata.partitions = all_partitions
+    logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
    return vcf_metadata, header

@@ -872,24 +882,65 @@ def num_columns(self):
        return len(self.columns)

    def mkdirs(self):
+        logger.info(f"Creating {len(self.columns) * self.num_partitions} directories")
        self.path.mkdir()
        for col in self.columns.values():
            col.path.mkdir(parents=True)
            for j in range(self.num_partitions):
                part_path = col.path / f"p{j}"
                part_path.mkdir()

+    def write_metadata(self, final=False):
+        logger.info(f"Writing metadata ({'final' if final else 'initial'})")
+        with open(self.path / f"metadata.{'wip.' if not final else ''}json", "w") as f:
+            json.dump(self.metadata.asdict(), f, indent=4)
+        if final:
+            try:
+                os.remove(self.path / "metadata.wip.json")
+            except FileNotFoundError:
+                pass
+
+    def write_header(self):
+        logger.info(f"Writing header")
+        with open(self.path / "header.txt", "w") as f:
+            f.write(self.vcf_header)
+
+    def load_partition_summaries(self):
+        summaries = []
+        not_found = []
+        for j in range(self.num_partitions):
+            try:
+                with open(self.path / f"p{j}_metadata.json") as f:
+                    summary = json.load(f)
+                    for k, v in summary['field_summaries'].items():
+                        summary['field_summaries'][k] = VcfFieldSummary(**v)
+                    summaries.append(summary)
+            except FileNotFoundError:
+                not_found.append(j)
+        if not_found:
+            raise FileNotFoundError(
+                f"Partition metadata not found for {len(not_found)} partitions: {not_found}"
+            )
+        return summaries
+
+
    @staticmethod
    def load(path):
        path = pathlib.Path(path)
-        with open(path / "metadata.json") as f:
-            metadata = VcfMetadata.fromdict(json.load(f))
+        final = True
+        try:
+            with open(path / "metadata.json") as f:
+                metadata = VcfMetadata.fromdict(json.load(f))
+        except FileNotFoundError:
+            with open(path / "metadata.wip.json") as f:
+                metadata = VcfMetadata.fromdict(json.load(f))
+            final = False
        with open(path / "header.txt") as f:
            header = f.read()
        pcvcf = PickleChunkedVcf(path, metadata, header)
        logger.info(
            f"Loaded PickleChunkedVcf(partitions={pcvcf.num_partitions}, "
-            f"records={pcvcf.num_records}, columns={pcvcf.num_columns})"
+            f"records={pcvcf.num_records}, columns={pcvcf.num_columns}), final={final}"
        )
        return pcvcf

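For reference, the files these new helpers read and write under the output directory; the file names are taken from the diff, while the directory name below is hypothetical:

```python
from pathlib import Path

out = Path("sample.exploded")                 # hypothetical output directory
wip_metadata = out / "metadata.wip.json"      # written by write_metadata(final=False)
final_metadata = out / "metadata.json"        # written by write_metadata(final=True); WIP file removed
header_file = out / "header.txt"              # written by write_header()
partition_summary = out / "p0_metadata.json"  # one per partition, read by load_partition_summaries()
```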
@@ -949,70 +1000,96 @@ def convert_partition(
        # is that we get a significant pause at the end of the counter while
        # all the "small" fields get flushed. Possibly not much to be done about it.
        core.update_progress(1)
-
+        partition_metadata = {
+            "num_records": num_records,
+            "field_summaries": {k: v.asdict() for k, v in tcw.field_summaries.items()}
+        }
+        with open(out_path / f"p{partition_index}_metadata.json", "w") as f:
+            json.dump(partition_metadata, f, indent=4)
        logger.info(
            f"Finish p{partition_index} {partition.vcf_path}__{partition.region}="
            f"{num_records} records"
        )
-        return partition_index, tcw.field_summaries, num_records

    @staticmethod
-    def convert(
-        vcfs, out_path, *, column_chunk_size=16, worker_processes=1, show_progress=False
-    ):
+    def convert_init(vcfs, out_path, *, num_partitions=1, worker_processes=1, show_progress=False):
        out_path = pathlib.Path(out_path)
        # TODO make scan work in parallel using general progress code too
-        target_num_partitions = max(1, worker_processes * 4)
        vcf_metadata, header = scan_vcfs(
            vcfs,
            worker_processes=worker_processes,
            show_progress=show_progress,
-            target_num_partitions=target_num_partitions,
+            target_num_partitions=num_partitions,
        )
        pcvcf = PickleChunkedVcf(out_path, vcf_metadata, header)
        pcvcf.mkdirs()

-        logger.info(
-            f"Exploding {pcvcf.num_columns} columns {vcf_metadata.num_records} variants "
-            f"{pcvcf.num_samples} samples"
-        )
+        pcvcf.write_metadata(final=False)
+        pcvcf.write_header()
+        return pcvcf
+
+    def convert_slice(self, start, stop, *, worker_processes=1, show_progress=False, column_chunk_size=16):
+        if start < 0:
+            raise ValueError(f"start={start} must be non-negative")
+        if stop > self.num_partitions:
+            raise ValueError(f"stop={stop} must be less than the number of partitions")
+        if start == 0 and stop == self.num_partitions:
+            num_records_to_process = self.num_records
+            logger.info(
+                f"Exploding {self.num_columns} columns {self.metadata.num_records} variants "
+                f"{self.num_samples} samples"
+            )
+        else:
+            num_records_to_process = None
+            logger.info(
+                f"Exploding {self.num_columns} columns {self.num_samples} samples"
+                f" from partitions {start} to {stop}"
+            )
+
        progress_config = core.ProgressConfig(
-            total=vcf_metadata.num_records,
+            total=num_records_to_process,
            units="vars",
            title="Explode",
            show=show_progress,
        )
        with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            for j, partition in enumerate(vcf_metadata.partitions):
+            for j in range(start, stop):
                pwm.submit(
                    PickleChunkedVcf.convert_partition,
-                    vcf_metadata,
+                    self.metadata,
                    j,
-                    out_path,
+                    self.path,
                    column_chunk_size=column_chunk_size,
                )
-            num_records = 0
-            partition_summaries = []
-            for index, summary, num_records in pwm.results_as_completed():
-                partition_summaries.append(summary)
-                vcf_metadata.partitions[index].num_records = num_records
+            for _ in pwm.results_as_completed():
+                pass

+    def convert_finalise(self):
+        partition_summaries = self.load_partition_summaries()
+        for index, summary in enumerate(partition_summaries):
+            self.metadata.partitions[index].num_records = summary['num_records']
        total_records = sum(
-            partition.num_records for partition in vcf_metadata.partitions
+            partition.num_records for partition in self.metadata.partitions
        )
-        assert total_records == pcvcf.num_records
+        assert total_records == self.num_records

-        for field in vcf_metadata.fields:
+        for field in self.metadata.fields:
            # Clear the summary to avoid problems when running in debug
-            # syncronous mode
+            # synchronous mode
            field.summary = VcfFieldSummary()
            for summary in partition_summaries:
-                field.summary.update(summary[field.full_name])
+                field.summary.update(summary["field_summaries"][field.full_name])
+
+        self.write_metadata(final=True)
+
+    @staticmethod
+    def convert(
+        vcfs, out_path, *, column_chunk_size=16, worker_processes=1, show_progress=False
+    ):
+        pcvcf = PickleChunkedVcf.convert_init(vcfs, out_path, num_partitions=max(1, worker_processes * 4), worker_processes=worker_processes, show_progress=show_progress)
+        pcvcf.convert_slice(0, len(pcvcf.metadata.partitions), worker_processes=worker_processes, show_progress=show_progress, column_chunk_size=column_chunk_size)
+        pcvcf.convert_finalise()

-        with open(out_path / "metadata.json", "w") as f:
-            json.dump(vcf_metadata.asdict(), f, indent=4)
-        with open(out_path / "header.txt", "w") as f:
-            f.write(header)



def explode(
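`convert_partition` now persists its results on disk instead of returning them. A sketch of the shape of a per-partition summary file such as `p0_metadata.json` (values are made up; only the min/max keys visible in this diff are shown, and `VcfFieldSummary` has further attributes omitted here):

```python
# Illustrative only: num_records plus one VcfFieldSummary.asdict() per field.
partition_metadata = {
    "num_records": 10000,
    "field_summaries": {
        "INFO/DP": {"min_value": 0, "max_value": 980},  # hypothetical field, truncated
    },
}
```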
@@ -1036,6 +1113,33 @@ def explode(
    )
    return PickleChunkedVcf.load(out_path)

+def explode_init(vcfs, out_path, *, target_num_partitions=1, worker_processes=1, show_progress=False):
+    out_path = pathlib.Path(out_path)
+    if out_path.exists():
+        shutil.rmtree(out_path)
+    # Error if num_parts less than number of files
+    if target_num_partitions < len(vcfs):
+        raise ValueError("num_partitions must be greater than or equal to the number of input VCFs")
+    return PickleChunkedVcf.convert_init(
+        vcfs,
+        out_path,
+        num_partitions=target_num_partitions,
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+    )
+
+def explode_partition_count(out_path):
+    pcvcf = PickleChunkedVcf.load(out_path)
+    return pcvcf.num_partitions
+
+
+def explode_slice(out_path, start, stop, *, worker_processes=1, show_progress=False, column_chunk_size=16):
+    pcvcf = PickleChunkedVcf.load(out_path)
+    pcvcf.convert_slice(start, stop, worker_processes=worker_processes, show_progress=show_progress, column_chunk_size=column_chunk_size)
+
+def explode_finalise(out_path):
+    pcvcf = PickleChunkedVcf.load(out_path)
+    pcvcf.convert_finalise()

def inspect(path):
    path = pathlib.Path(path)
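Taken together, the new module-level functions let the explode step be driven in stages, for example from separate cluster jobs. A minimal sketch of the intended calling sequence, assuming these functions are importable from the module shown in this diff (the import path and file names below are hypothetical):

```python
from bio2zarr import vcf  # hypothetical import path

# One-off initialisation: scan the inputs, write metadata.wip.json and header.txt.
vcf.explode_init(
    ["chr20.vcf.gz", "chr21.vcf.gz"],
    "sample.exploded",
    target_num_partitions=8,
    worker_processes=4,
)

# Each worker (possibly a separate job) converts its own slice of partitions.
n = vcf.explode_partition_count("sample.exploded")
for start in range(0, n, 2):
    vcf.explode_slice("sample.exploded", start, min(start + 2, n))

# Finalise: gather the p{j}_metadata.json summaries and write the final metadata.json.
vcf.explode_finalise("sample.exploded")
```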