@@ -932,10 +932,10 @@ def init(
932
932
# dependencies as well.
933
933
self .metadata .provenance = {"source" : f"bio2zarr-{ provenance .__version__ } " }
934
934
935
- self .mkdirs ()
935
+ self .mkdirs (worker_processes )
936
936
937
937
# Note: this is needed for the current version of the vcfzarr spec, but it's
938
- # probably goint to be dropped.
938
+ # probably going to be dropped.
939
939
# https://github.com/pystatgen/vcf-zarr-spec/issues/15
940
940
# May be useful to keep lying around still though?
941
941
logger .info (f"Writing VCF header" )
@@ -947,20 +947,30 @@ def init(
947
947
json .dump (self .metadata .asdict (), f , indent = 4 )
948
948
return self .num_partitions
949
949
950
def mkdirs(self, worker_processes=1):
    """Create the on-disk directory layout for every field and partition.

    ``self.path`` and ``self.wip_path`` are created first, in this
    process, without ``exist_ok``; ``Path.mkdir`` therefore raises
    ``FileExistsError`` if either already exists. One directory per
    field, plus one ``p{j}`` sub-directory per partition of each field,
    is then submitted to a ``core.ParallelWorkManager``.

    Args:
        worker_processes: Forwarded unchanged to
            ``core.ParallelWorkManager``. Defaults to 1.
    """
    num_dirs = len(self.metadata.fields) * self.num_partitions
    logger.info(f"Creating {num_dirs} directories")
    self.path.mkdir()
    self.wip_path.mkdir()
    # Due to high latency batch system filesystems, we create all the
    # directories in parallel.
    # NOTE(review): ``total`` counts only the partition directories, while
    # one extra task per field is submitted below. Confirm how
    # ParallelWorkManager advances the progress counter.
    progress_config = core.ProgressConfig(
        total=num_dirs,
        units="dir",
        title="Creating directories",
        show=True,
    )
    with core.ParallelWorkManager(
        worker_processes=worker_processes,
        progress_config=progress_config,
    ) as manager:
        for field in self.metadata.fields:
            col_path = get_vcf_field_path(self.path, field)
            # exist_ok=True: the submitted tasks are independent, so a
            # partition task may run first and create col_path as one of
            # its parents. Without exist_ok the field-level mkdir would
            # then fail with FileExistsError.
            manager.submit(col_path.mkdir, parents=True, exist_ok=True)
            for j in range(self.num_partitions):
                part_path = col_path / f"p{j}"
                manager.submit(part_path.mkdir, parents=True, exist_ok=True)
964
974
965
975
def load_partition_summaries (self ):
966
976
summaries = []
0 commit comments