@@ -151,7 +151,7 @@ class VcfPartition:
151151
152152ICF_METADATA_FORMAT_VERSION = "0.2"
# Module-level default Blosc compressor (presumably for the intermediate
# columnar format, going by the ICF prefix -- confirm against its users).
# The removed/added pair below switches the codec name from lz4 to zstd;
# the compression level (7) and the no-shuffle setting are unchanged.
153153ICF_DEFAULT_COMPRESSOR = numcodecs .Blosc (
154- cname = "lz4 " , clevel = 7 , shuffle = numcodecs .Blosc .NOSHUFFLE
154+ cname = "zstd " , clevel = 7 , shuffle = numcodecs .Blosc .NOSHUFFLE
155155)
156156
157157
@@ -890,6 +890,15 @@ def num_columns(self):
890890 return len (self .columns )
891891
892892
893+
def mkdir_with_progress(path):
    """Create the directory ``path`` and advance the progress counter by one.

    ``mkdirs`` submits this as a unit of parallel work so that every
    partition directory it creates is counted towards the progress total.

    :param path: Directory to create; must provide ``mkdir`` (``mkdirs``
        passes a ``pathlib.Path``). Missing parents are created as well.
    :raises FileExistsError: If ``path`` itself already exists.
    """
    # The previous f-string had a stray literal "f" before the placeholder,
    # so every message read "mkdir f<path>". Lazy %-args also avoid building
    # the message when debug logging is disabled.
    logger.debug("mkdir %s", path)
    # NOTE: several workers may try to create the same parent directory at
    # once. parents=True is expected to cope with that, since only ``path``
    # itself has to be new -- TODO confirm on high-latency filesystems.
    path.mkdir(parents=True)
    core.update_progress(1)
900+
901+
893902class IntermediateColumnarFormatWriter :
894903 def __init__ (self , path ):
895904 self .path = pathlib .Path (path )
@@ -932,7 +941,7 @@ def init(
932941 # dependencies as well.
933942 self .metadata .provenance = {"source" : f"bio2zarr-{ provenance .__version__ } " }
934943
935- self .mkdirs (worker_processes )
944+ self .mkdirs (worker_processes , show_progress = show_progress )
936945
937946 # Note: this is needed for the current version of the vcfzarr spec, but it's
938947 # probably going to be dropped.
@@ -947,30 +956,30 @@ def init(
947956 json .dump (self .metadata .asdict (), f , indent = 4 )
948957 return self .num_partitions
949958
950- def mkdirs (self , worker_processes = 1 ):
951- logger .info (
952- f"Creating { len (self .metadata .fields ) * self .num_partitions } directories"
953- )
959+ def mkdirs (self , worker_processes = 1 , show_progress = False ):
960+ num_dirs = len (self .metadata .fields ) * self .num_partitions
961+ logger .info (f"Creating { num_dirs } directories" )
954962 self .path .mkdir ()
955963 self .wip_path .mkdir ()
956964 # Due to high latency batch system filesystems, we create all the directories in
957965 # parallel
958966 progress_config = core .ProgressConfig (
959- total = len ( self . metadata . fields ) * self . num_partitions ,
960- units = "dir " ,
961- title = "Creating directories " ,
962- show = True
967+ total = num_dirs ,
968+ units = "dirs " ,
969+ title = "Mkdirs " ,
970+ show = show_progress ,
963971 )
964972 with core .ParallelWorkManager (
965- worker_processes = worker_processes ,
966- progress_config = progress_config
973+ worker_processes = worker_processes , progress_config = progress_config
967974 ) as manager :
968975 for field in self .metadata .fields :
969976 col_path = get_vcf_field_path (self .path , field )
977+ # Don't bother trying to count the intermediate directories towards
978+ # progress
970979 manager .submit (col_path .mkdir , parents = True )
971980 for j in range (self .num_partitions ):
972981 part_path = col_path / f"p{ j } "
973- manager .submit (part_path . mkdir , parents = True )
982+ manager .submit (mkdir_with_progress , part_path )
974983
975984 def load_partition_summaries (self ):
976985 summaries = []
@@ -1499,15 +1508,17 @@ def parse_max_memory(max_memory):
14991508
15001509
15011510class VcfZarrWriter :
1502- def __init__ (self , path , icf , schema ):
1511+ def __init__ (self , path , icf , schema , dimension_separator = None ):
15031512 self .path = pathlib .Path (path )
15041513 self .icf = icf
15051514 self .schema = schema
1515+ # Default to using nested directories following the Zarr v3 default.
1516+ # This seems to require version 2.17+ to work properly
1517+ self .dimension_separator = "/" if dimension_separator is None else dimension_separator
15061518 store = zarr .DirectoryStore (self .path )
15071519 self .root = zarr .group (store = store )
15081520
15091521 def init_array (self , variable ):
1510- # print("CREATE", variable)
15111522 object_codec = None
15121523 if variable .dtype == "O" :
15131524 object_codec = numcodecs .VLenUTF8 ()
@@ -1519,7 +1530,9 @@ def init_array(self, variable):
15191530 compressor = numcodecs .get_codec (variable .compressor ),
15201531 filters = [numcodecs .get_codec (filt ) for filt in variable .filters ],
15211532 object_codec = object_codec ,
1533+ dimension_separator = self .dimension_separator ,
15221534 )
1535+ # Dimension names are part of the spec in Zarr v3
15231536 a .attrs ["_ARRAY_DIMENSIONS" ] = variable .dimensions
15241537
15251538 def get_array (self , name ):
@@ -1657,6 +1670,7 @@ def encode_contig_id(self):
16571670 "contig_length" ,
16581671 self .schema .contig_length ,
16591672 dtype = np .int64 ,
1673+ compressor = DEFAULT_ZARR_COMPRESSOR ,
16601674 )
16611675 array .attrs ["_ARRAY_DIMENSIONS" ] = ["contigs" ]
16621676 return {v : j for j , v in enumerate (self .schema .contig_id )}
@@ -1849,6 +1863,7 @@ def encode(
18491863 variants_chunk_size = None ,
18501864 samples_chunk_size = None ,
18511865 max_v_chunks = None ,
1866+ dimension_separator = None ,
18521867 max_memory = None ,
18531868 worker_processes = 1 ,
18541869 show_progress = False ,
@@ -1872,7 +1887,7 @@ def encode(
18721887 if zarr_path .exists ():
18731888 logger .warning (f"Deleting existing { zarr_path } " )
18741889 shutil .rmtree (zarr_path )
1875- vzw = VcfZarrWriter (zarr_path , icf , schema )
1890+ vzw = VcfZarrWriter (zarr_path , icf , schema , dimension_separator = dimension_separator )
18761891 vzw .init ()
18771892 vzw .encode (
18781893 max_v_chunks = max_v_chunks ,
0 commit comments