@@ -745,9 +745,9 @@ def convert(
745
745
progress_config = None
746
746
if show_progress :
747
747
progress_config = core .ProgressConfig (
748
- total = total_variants , units = "vars" , title = "Explode" )
748
+ total = total_variants , units = "vars" , title = "Explode"
749
+ )
749
750
with core .ParallelWorkManager (worker_processes , progress_config ) as pwm :
750
-
751
751
futures = []
752
752
for j , partition in enumerate (vcf_metadata .partitions ):
753
753
futures .append (
@@ -763,44 +763,6 @@ def convert(
763
763
future .result () for future in cf .as_completed (futures )
764
764
]
765
765
766
- # global progress_counter
767
- # progress_counter = multiprocessing.Value("Q", 0)
768
-
769
- # # start update progress bar process
770
- # bar_thread = None
771
- # if show_progress:
772
- # bar_thread = threading.Thread(
773
- # target=update_bar,
774
- # args=(progress_counter, total_variants, "Explode", "vars"),
775
- # name="progress",
776
- # daemon=True,
777
- # )
778
- # bar_thread.start()
779
-
780
- # with cf.ProcessPoolExecutor(
781
- # max_workers=worker_processes,
782
- # initializer=init_workers,
783
- # initargs=(progress_counter,),
784
- # ) as executor:
785
- # futures = []
786
- # for j, partition in enumerate(vcf_metadata.partitions):
787
- # futures.append(
788
- # executor.submit(
789
- # PickleChunkedVcf.convert_partition,
790
- # vcf_metadata,
791
- # j,
792
- # out_path,
793
- # column_chunk_size=column_chunk_size,
794
- # )
795
- # )
796
- # partition_summaries = [
797
- # future.result() for future in cf.as_completed(futures)
798
- # ]
799
-
800
- # assert progress_counter.value == total_variants
801
- # if bar_thread is not None:
802
- # bar_thread.join()
803
-
804
766
for field in vcf_metadata .fields :
805
767
for summary in partition_summaries :
806
768
field .summary .update (summary [field .full_name ])
@@ -884,7 +846,6 @@ def service_futures(max_waiting=2 * flush_threads):
884
846
885
847
service_futures ()
886
848
887
-
888
849
# Note: an issue with updating the progress per variant here like this
889
850
# is that we get a significant pause at the end of the counter while
890
851
# all the "small" fields get flushed. Possibly not much to be done about it.
@@ -898,23 +859,6 @@ def service_futures(max_waiting=2 * flush_threads):
898
859
return summaries
899
860
900
861
901
- # def update_bar(progress_counter, total, title, units):
902
- # pbar = tqdm.tqdm(
903
- # total=total, desc=title, unit_scale=True, unit=units, smoothing=0.1
904
- # )
905
-
906
- # while (current := progress_counter.value) < total:
907
- # inc = current - pbar.n
908
- # pbar.update(inc)
909
- # time.sleep(0.1)
910
- # pbar.close()
911
-
912
-
913
- # def init_workers(counter):
914
- # global progress_counter
915
- # progress_counter = counter
916
-
917
-
918
862
def explode (
919
863
vcfs ,
920
864
out_path ,
@@ -1160,9 +1104,9 @@ def encode_column(self, pcvcf, column, encoder_threads=4):
1160
1104
for value , bytes_read in source_col .iter_values_bytes ():
1161
1105
j = te .next_buffer_row ()
1162
1106
sanitiser (ba .buff , j , value )
1107
+ # print(bytes_read, last_bytes_read, value)
1163
1108
if last_bytes_read != bytes_read :
1164
- with progress_counter .get_lock ():
1165
- progress_counter .value += bytes_read - last_bytes_read
1109
+ core .update_progress (bytes_read - last_bytes_read )
1166
1110
last_bytes_read = bytes_read
1167
1111
1168
1112
def encode_genotypes (self , pcvcf , encoder_threads = 4 ):
@@ -1181,10 +1125,8 @@ def encode_genotypes(self, pcvcf, encoder_threads=4):
1181
1125
# TODO check whether this is the correct semantics when we are padding
1182
1126
# with mixed ploidies?
1183
1127
gt_mask .buff [j ] = gt .buff [j ] < 0
1184
-
1185
1128
if last_bytes_read != bytes_read :
1186
- with progress_counter .get_lock ():
1187
- progress_counter .value += bytes_read - last_bytes_read
1129
+ core .update_progress (bytes_read - last_bytes_read )
1188
1130
last_bytes_read = bytes_read
1189
1131
1190
1132
def encode_alleles (self , pcvcf ):
@@ -1200,10 +1142,10 @@ def encode_alleles(self, pcvcf):
1200
1142
alleles [j , 0 ] = ref
1201
1143
alleles [j , 1 : 1 + len (alt )] = alt
1202
1144
allele_array [:] = alleles
1203
-
1204
- with progress_counter . get_lock ():
1205
- for col in [ ref_col , alt_col ]:
1206
- progress_counter . value += col . vcf_field . summary . uncompressed_size
1145
+ size = sum (
1146
+ col . vcf_field . summary . uncompressed_size for col in [ ref_col , alt_col ]
1147
+ )
1148
+ core . update_progress ( size )
1207
1149
logger .debug ("alleles done" )
1208
1150
1209
1151
def encode_samples (self , pcvcf , sample_id , chunk_width ):
@@ -1249,8 +1191,7 @@ def encode_contig(self, pcvcf, contig_names, contig_lengths):
1249
1191
1250
1192
array [:] = buff
1251
1193
1252
- with progress_counter .get_lock ():
1253
- progress_counter .value += col .vcf_field .summary .uncompressed_size
1194
+ core .update_progress (col .vcf_field .summary .uncompressed_size )
1254
1195
logger .debug ("Contig done" )
1255
1196
1256
1197
def encode_filters (self , pcvcf , filter_names ):
@@ -1277,8 +1218,7 @@ def encode_filters(self, pcvcf, filter_names):
1277
1218
1278
1219
array [:] = buff
1279
1220
1280
- with progress_counter .get_lock ():
1281
- progress_counter .value += col .vcf_field .summary .uncompressed_size
1221
+ core .update_progress (col .vcf_field .summary .uncompressed_size )
1282
1222
logger .debug ("Filters done" )
1283
1223
1284
1224
def encode_id (self , pcvcf ):
@@ -1298,8 +1238,7 @@ def encode_id(self, pcvcf):
1298
1238
id_array [:] = id_buff
1299
1239
id_mask_array [:] = id_mask_buff
1300
1240
1301
- with progress_counter .get_lock ():
1302
- progress_counter .value += col .vcf_field .summary .uncompressed_size
1241
+ core .update_progress (col .vcf_field .summary .uncompressed_size )
1303
1242
logger .debug ("ID done" )
1304
1243
1305
1244
@staticmethod
@@ -1319,41 +1258,30 @@ def convert(
1319
1258
for variable in conversion_spec .variables [:]:
1320
1259
sgvcf .create_array (variable )
1321
1260
1322
- global progress_counter
1323
- progress_counter = multiprocessing .Value ("Q" , 0 )
1324
-
1325
- # start update progress bar process
1326
- bar_thread = None
1261
+ progress_config = None
1327
1262
if show_progress :
1328
- bar_thread = threading .Thread (
1329
- target = update_bar ,
1330
- args = (progress_counter , pcvcf .total_uncompressed_bytes , "Encode" , "b" ),
1331
- name = "progress" ,
1332
- daemon = True ,
1263
+ progress_config = core .ProgressConfig (
1264
+ total = pcvcf .total_uncompressed_bytes , title = "Encode" , units = "b"
1333
1265
)
1334
- bar_thread .start ()
1335
-
1336
- with cf .ProcessPoolExecutor (
1337
- max_workers = worker_processes ,
1338
- initializer = init_workers ,
1339
- initargs = (progress_counter ,),
1340
- ) as executor :
1266
+ with core .ParallelWorkManager (worker_processes , progress_config ) as pwm :
1341
1267
futures = [
1342
- executor .submit (
1268
+ pwm . executor .submit (
1343
1269
sgvcf .encode_samples ,
1344
1270
pcvcf ,
1345
1271
conversion_spec .sample_id ,
1346
1272
conversion_spec .chunk_width ,
1347
1273
),
1348
- executor .submit (sgvcf .encode_alleles , pcvcf ),
1349
- executor .submit (sgvcf .encode_id , pcvcf ),
1350
- executor .submit (
1274
+ pwm . executor .submit (sgvcf .encode_alleles , pcvcf ),
1275
+ pwm . executor .submit (sgvcf .encode_id , pcvcf ),
1276
+ pwm . executor .submit (
1351
1277
sgvcf .encode_contig ,
1352
1278
pcvcf ,
1353
1279
conversion_spec .contig_id ,
1354
1280
conversion_spec .contig_length ,
1355
1281
),
1356
- executor .submit (sgvcf .encode_filters , pcvcf , conversion_spec .filter_id ),
1282
+ pwm .executor .submit (
1283
+ sgvcf .encode_filters , pcvcf , conversion_spec .filter_id
1284
+ ),
1357
1285
]
1358
1286
has_gt = False
1359
1287
for variable in conversion_spec .variables [:]:
@@ -1364,14 +1292,14 @@ def convert(
1364
1292
# long wait for the largest GT columns to finish.
1365
1293
# Straightforward to do because we can chunk-align the work
1366
1294
# packages.
1367
- future = executor .submit (sgvcf .encode_column , pcvcf , variable )
1295
+ future = pwm . executor .submit (sgvcf .encode_column , pcvcf , variable )
1368
1296
futures .append (future )
1369
1297
else :
1370
1298
if variable .name == "call_genotype" :
1371
1299
has_gt = True
1372
1300
if has_gt :
1373
1301
# TODO add mixed ploidy
1374
- futures .append (executor .submit (sgvcf .encode_genotypes , pcvcf ))
1302
+ futures .append (pwm . executor .submit (sgvcf .encode_genotypes , pcvcf ))
1375
1303
1376
1304
flush_futures (futures )
1377
1305
@@ -1471,8 +1399,7 @@ def encode_bed_partition_genotypes(
1471
1399
dest [values == 1 , 0 ] = 1
1472
1400
gt_phased .buff [j ] = False
1473
1401
gt_mask .buff [j ] = dest == - 1
1474
- with progress_counter .get_lock ():
1475
- progress_counter .value += 1
1402
+ core .update_progress (1 )
1476
1403
start = stop
1477
1404
1478
1405
@@ -1669,21 +1596,7 @@ def convert_plink(
1669
1596
)
1670
1597
a .attrs ["_ARRAY_DIMENSIONS" ] = list (dimensions )
1671
1598
1672
- global progress_counter
1673
- progress_counter = multiprocessing .Value ("Q" , 0 )
1674
-
1675
- # start update progress bar process
1676
- bar_thread = None
1677
- if show_progress :
1678
- bar_thread = threading .Thread (
1679
- target = update_bar ,
1680
- args = (progress_counter , m , "Write" , "vars" ),
1681
- name = "progress" ,
1682
- daemon = True ,
1683
- )
1684
- bar_thread .start ()
1685
-
1686
- num_chunks = m // chunk_length
1599
+ num_chunks = max (1 , m // chunk_length )
1687
1600
worker_processes = min (worker_processes , num_chunks )
1688
1601
if num_chunks == 1 or worker_processes == 1 :
1689
1602
partitions = [(0 , m )]
@@ -1704,19 +1617,14 @@ def convert_plink(
1704
1617
partitions .append ((last_stop , m ))
1705
1618
# print(partitions)
1706
1619
1707
- with cf .ProcessPoolExecutor (
1708
- max_workers = worker_processes ,
1709
- initializer = init_workers ,
1710
- initargs = (progress_counter ,),
1711
- ) as executor :
1620
+ progress_config = None
1621
+ if show_progress :
1622
+ progress_config = core .ProgressConfig (total = m , title = "Convert" , units = "vars" )
1623
+ with core .ParallelWorkManager (worker_processes , progress_config ) as pwm :
1712
1624
futures = [
1713
- executor .submit (
1625
+ pwm . executor .submit (
1714
1626
encode_bed_partition_genotypes , bed_path , zarr_path , start , end
1715
1627
)
1716
1628
for start , end in partitions
1717
1629
]
1718
1630
flush_futures (futures )
1719
- # print("progress counter = ", m, progress_counter.value)
1720
- assert progress_counter .value == m
1721
-
1722
- # print(root["call_genotype"][:])
0 commit comments