@@ -325,12 +325,15 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
325
325
all_partitions .append (partition )
326
326
total_records += metadata .num_records
327
327
metadata .num_records = 0
328
+ metadata .partitions = []
328
329
329
330
icf_metadata , header = results [0 ]
330
331
for metadata , _ in results [1 :]:
331
332
if metadata != icf_metadata :
332
333
raise ValueError ("Incompatible VCF chunks" )
333
334
335
+ # Note: this will be infinity here if any of the chunks has an index
336
+ # that doesn't keep track of the number of records per-contig
334
337
icf_metadata .num_records = total_records
335
338
336
339
# Sort by contig (in the order they appear in the header) first,
@@ -1057,32 +1060,20 @@ def process_partition(self, partition_index):
1057
1060
f"{ num_records } records"
1058
1061
)
1059
1062
1060
- def process_partition_slice (
1061
- self ,
1062
- start ,
1063
- stop ,
1064
- * ,
1065
- worker_processes = 1 ,
1066
- show_progress = False ,
1067
- ):
1063
+ def explode (self , * , worker_processes = 1 , show_progress = False ):
1068
1064
self .load_metadata ()
1069
- if start == 0 and stop == self .num_partitions :
1070
- num_records = self .metadata .num_records
1071
- if np .isinf (num_records ):
1072
- logger .warning (
1073
- "Total records unknown, cannot show progress; "
1074
- "reindex VCFs with bcftools index to fix"
1075
- )
1076
- num_records = None
1077
- else :
1078
- # We only know the number of records if all partitions are done at once,
1079
- # and we signal this to tqdm by passing None as the total.
1065
+ num_records = self .metadata .num_records
1066
+ if np .isinf (num_records ):
1067
+ logger .warning (
1068
+ "Total records unknown, cannot show progress; "
1069
+ "reindex VCFs with bcftools index to fix"
1070
+ )
1080
1071
num_records = None
1081
1072
num_columns = len (self .metadata .fields )
1082
1073
num_samples = len (self .metadata .samples )
1083
1074
logger .info (
1084
1075
f"Exploding columns={ num_columns } samples={ num_samples } ; "
1085
- f"partitions={ stop - start } "
1076
+ f"partitions={ self . num_partitions } "
1086
1077
f"variants={ 'unknown' if num_records is None else num_records } "
1087
1078
)
1088
1079
progress_config = core .ProgressConfig (
@@ -1092,30 +1083,16 @@ def process_partition_slice(
1092
1083
show = show_progress ,
1093
1084
)
1094
1085
with core .ParallelWorkManager (worker_processes , progress_config ) as pwm :
1095
- for j in range (start , stop ):
1086
+ for j in range (self . num_partitions ):
1096
1087
pwm .submit (self .process_partition , j )
1097
1088
1098
- def explode (self , * , worker_processes = 1 , show_progress = False ):
1099
- self .load_metadata ()
1100
- return self .process_partition_slice (
1101
- 0 ,
1102
- self .num_partitions ,
1103
- worker_processes = worker_processes ,
1104
- show_progress = show_progress ,
1105
- )
1106
-
1107
- def explode_partition (self , partition , * , show_progress = False , worker_processes = 1 ):
1089
+ def explode_partition (self , partition ):
1108
1090
self .load_metadata ()
1109
1091
if partition < 0 or partition >= self .num_partitions :
1110
1092
raise ValueError (
1111
1093
"Partition index must be in the range 0 <= index < num_partitions"
1112
1094
)
1113
- return self .process_partition_slice (
1114
- partition ,
1115
- partition + 1 ,
1116
- worker_processes = worker_processes ,
1117
- show_progress = show_progress ,
1118
- )
1095
+ self .process_partition (partition )
1119
1096
1120
1097
def finalise (self ):
1121
1098
self .load_metadata ()
@@ -1190,14 +1167,9 @@ def explode_init(
1190
1167
)
1191
1168
1192
1169
1193
- # NOTE only including worker_processes here so we can use the 0 option to get the
1194
- # work done syncronously and so we can get test coverage on it. Should find a
1195
- # better way to do this.
1196
- def explode_partition (icf_path , partition , * , show_progress = False , worker_processes = 1 ):
1170
+ def explode_partition (icf_path , partition ):
1197
1171
writer = IntermediateColumnarFormatWriter (icf_path )
1198
- writer .explode_partition (
1199
- partition , show_progress = show_progress , worker_processes = worker_processes
1200
- )
1172
+ writer .explode_partition (partition )
1201
1173
1202
1174
1203
1175
def explode_finalise (icf_path ):
0 commit comments