1
1
import concurrent .futures as cf
2
2
import dataclasses
3
- import multiprocessing
4
3
import functools
5
4
import logging
6
5
import os
7
- import threading
8
6
import pathlib
9
- import time
10
7
import pickle
11
8
import sys
12
9
import shutil
@@ -111,15 +108,6 @@ def assert_prefix_float_equal_2d(vcf_val, zarr_val):
111
108
# nt.assert_array_equal(v, z[:k])
112
109
113
110
114
- # TODO rename to wait_and_check_futures
115
- def flush_futures (futures ):
116
- # Make sure previous futures have completed
117
- for future in cf .as_completed (futures ):
118
- exception = future .exception ()
119
- if exception is not None :
120
- raise exception
121
-
122
-
123
111
@dataclasses .dataclass
124
112
class VcfFieldSummary :
125
113
num_chunks : int = 0
@@ -742,26 +730,19 @@ def convert(
742
730
f"Exploding { pcvcf .num_columns } columns { total_variants } variants "
743
731
f"{ pcvcf .num_samples } samples"
744
732
)
745
- progress_config = None
746
- if show_progress :
747
- progress_config = core .ProgressConfig (
748
- total = total_variants , units = "vars" , title = "Explode"
749
- )
733
+ progress_config = core .ProgressConfig (
734
+ total = total_variants , units = "vars" , title = "Explode" , show = show_progress
735
+ )
750
736
with core .ParallelWorkManager (worker_processes , progress_config ) as pwm :
751
- futures = []
752
737
for j , partition in enumerate (vcf_metadata .partitions ):
753
- futures .append (
754
- pwm .executor .submit (
755
- PickleChunkedVcf .convert_partition ,
756
- vcf_metadata ,
757
- j ,
758
- out_path ,
759
- column_chunk_size = column_chunk_size ,
760
- )
738
+ pwm .submit (
739
+ PickleChunkedVcf .convert_partition ,
740
+ vcf_metadata ,
741
+ j ,
742
+ out_path ,
743
+ column_chunk_size = column_chunk_size ,
761
744
)
762
- partition_summaries = [
763
- future .result () for future in cf .as_completed (futures )
764
- ]
745
+ partition_summaries = list (pwm .results_as_completed ())
765
746
766
747
for field in vcf_metadata .fields :
767
748
for summary in partition_summaries :
@@ -1258,31 +1239,28 @@ def convert(
1258
1239
for variable in conversion_spec .variables [:]:
1259
1240
sgvcf .create_array (variable )
1260
1241
1261
- progress_config = None
1262
- if show_progress :
1263
- progress_config = core .ProgressConfig (
1264
- total = pcvcf .total_uncompressed_bytes , title = "Encode" , units = "b"
1265
- )
1242
+ progress_config = core .ProgressConfig (
1243
+ total = pcvcf .total_uncompressed_bytes ,
1244
+ title = "Encode" ,
1245
+ units = "b" ,
1246
+ show = show_progress ,
1247
+ )
1266
1248
with core .ParallelWorkManager (worker_processes , progress_config ) as pwm :
1267
- futures = [
1268
- pwm .executor .submit (
1269
- sgvcf .encode_samples ,
1270
- pcvcf ,
1271
- conversion_spec .sample_id ,
1272
- conversion_spec .chunk_width ,
1273
- ),
1274
- pwm .executor .submit (sgvcf .encode_alleles , pcvcf ),
1275
- pwm .executor .submit (sgvcf .encode_id , pcvcf ),
1276
- pwm .executor .submit (
1277
- sgvcf .encode_contig ,
1278
- pcvcf ,
1279
- conversion_spec .contig_id ,
1280
- conversion_spec .contig_length ,
1281
- ),
1282
- pwm .executor .submit (
1283
- sgvcf .encode_filters , pcvcf , conversion_spec .filter_id
1284
- ),
1285
- ]
1249
+ pwm .submit (
1250
+ sgvcf .encode_samples ,
1251
+ pcvcf ,
1252
+ conversion_spec .sample_id ,
1253
+ conversion_spec .chunk_width ,
1254
+ )
1255
+ pwm .submit (sgvcf .encode_alleles , pcvcf )
1256
+ pwm .submit (sgvcf .encode_id , pcvcf )
1257
+ pwm .submit (
1258
+ sgvcf .encode_contig ,
1259
+ pcvcf ,
1260
+ conversion_spec .contig_id ,
1261
+ conversion_spec .contig_length ,
1262
+ )
1263
+ pwm .submit (sgvcf .encode_filters , pcvcf , conversion_spec .filter_id )
1286
1264
has_gt = False
1287
1265
for variable in conversion_spec .variables [:]:
1288
1266
if variable .vcf_field is not None :
@@ -1292,21 +1270,14 @@ def convert(
1292
1270
# long wait for the largest GT columns to finish.
1293
1271
# Straightforward to do because we can chunk-align the work
1294
1272
# packages.
1295
- future = pwm .executor .submit (sgvcf .encode_column , pcvcf , variable )
1296
- futures .append (future )
1273
+ pwm .submit (sgvcf .encode_column , pcvcf , variable )
1297
1274
else :
1298
1275
if variable .name == "call_genotype" :
1299
1276
has_gt = True
1300
1277
if has_gt :
1301
1278
# TODO add mixed ploidy
1302
- futures .append (pwm .executor .submit (sgvcf .encode_genotypes , pcvcf ))
1303
-
1304
- flush_futures (futures )
1279
+ pwm .submit (sgvcf .encode_genotypes , pcvcf )
1305
1280
1306
- # FIXME can't join the bar_thread because we never get to the correct
1307
- # number of bytes
1308
- # if bar_thread is not None:
1309
- # bar_thread.join()
1310
1281
zarr .consolidate_metadata (write_path )
1311
1282
# Atomic swap, now we've completely finished.
1312
1283
logger .info (f"Moving to final path { path } " )
@@ -1617,14 +1588,9 @@ def convert_plink(
1617
1588
partitions .append ((last_stop , m ))
1618
1589
# print(partitions)
1619
1590
1620
- progress_config = None
1621
- if show_progress :
1622
- progress_config = core . ProgressConfig ( total = m , title = "Convert" , units = "vars" )
1591
+ progress_config = core . ProgressConfig (
1592
+ total = m , title = "Convert" , units = "vars" , show = show_progress
1593
+ )
1623
1594
with core .ParallelWorkManager (worker_processes , progress_config ) as pwm :
1624
- futures = [
1625
- pwm .executor .submit (
1626
- encode_bed_partition_genotypes , bed_path , zarr_path , start , end
1627
- )
1628
- for start , end in partitions
1629
- ]
1630
- flush_futures (futures )
1595
+ for start , end in partitions :
1596
+ pwm .submit (encode_bed_partition_genotypes , bed_path , zarr_path , start , end )
0 commit comments