@@ -1323,12 +1323,9 @@ def variant_chunk_nbytes(self):
13231323 """
13241324 Returns the nbytes for a single variant chunk of this array.
13251325 """
1326- chunk_items = 1
1327- for dim , size in enumerate (self .shape ):
1328- chunk_dim_size = size
1329- if dim < len (self .chunks ):
1330- chunk_dim_size = self .chunks [dim ]
1331- chunk_items *= chunk_dim_size
1326+ chunk_items = self .chunks [0 ]
1327+ for size in self .shape [1 :]:
1328+ chunk_items *= size
13321329 dt = np .dtype (self .dtype )
13331330 return chunk_items * dt .itemsize
13341331
@@ -1616,6 +1613,10 @@ def __init__(self, path):
16161613 def schema (self ):
16171614 return self .metadata .schema
16181615
1616+ @property
1617+ def num_partitions (self ):
1618+ return len (self .metadata .partitions )
1619+
16191620 #######################
16201621 # init
16211622 #######################
@@ -1778,10 +1779,10 @@ def encode_partition(self, partition_index):
17781779 self .encode_id_partition (partition_index )
17791780 self .encode_filters_partition (partition_index )
17801781 self .encode_contig_partition (partition_index )
1781- for col in self .metadata . schema .columns .values ():
1782+ for col in self .schema .columns .values ():
17821783 if col .vcf_field is not None :
17831784 self .encode_array_partition (col , partition_index )
1784- if "call_genotype" in self .metadata . schema .columns :
1785+ if "call_genotype" in self .schema .columns :
17851786 self .encode_genotypes_partition (partition_index )
17861787
17871788 def init_partition_array (self , partition_index , name ):
@@ -1954,6 +1955,7 @@ def finalise_array(self, name):
19541955 # Move all the files in partition dir to dest dir
19551956 src = self .partition_array_path (partition , name )
19561957 if not src .exists ():
1958+ # Needs test
19571959 raise ValueError (f"Partition { partition } of { name } does not exist" )
19581960 dest = self .arrays_path / name
19581960 # This is Zarr v2 specific. Chunks in v3 will start with a "c" prefix.
@@ -1977,7 +1979,7 @@ def finalise(self, show_progress=False):
19771979 self .load_metadata ()
19781980
19791981 progress_config = core .ProgressConfig (
1980- total = len (self .metadata . schema .columns ),
1982+ total = len (self .schema .columns ),
19811983 title = "Finalise" ,
19821984 units = "array" ,
19831985 show = show_progress ,
@@ -1991,7 +1993,7 @@ def finalise(self, show_progress=False):
19911993 # for multiple workers, or making a standard wrapper for tqdm
19921994 # that allows us to have a consistent look and feel.
19931995 with core .ParallelWorkManager (0 , progress_config ) as pwm :
1994- for name in self .metadata . schema .columns :
1996+ for name in self .schema .columns :
19951997 pwm .submit (self .finalise_array , name )
19961998 zarr .consolidate_metadata (self .path )
19971999
@@ -2003,16 +2005,28 @@ def get_max_encoding_memory(self):
20032005 """
20042006 Return the approximate maximum memory used to encode a variant chunk.
20052007 """
2006- return max (
2007- col .variant_chunk_nbytes for col in self .metadata . schema .columns .values ()
2008+ max_encoding_mem = max (
2009+ col .variant_chunk_nbytes for col in self .schema .columns .values ()
20082010 )
2011+ gt_mem = 0
2012+ if "call_genotype" in self .schema .columns :
2013+ encoded_together = [
2014+ "call_genotype" ,
2015+ "call_genotype_phased" ,
2016+ "call_genotype_mask" ,
2017+ ]
2018+ gt_mem = sum (
2019+ self .schema .columns [col ].variant_chunk_nbytes
2020+ for col in encoded_together
2021+ )
2022+ return max (max_encoding_mem , gt_mem )
20092023
20102024 def encode_all_partitions (
20112025 self , * , worker_processes = 1 , show_progress = False , max_memory = None
20122026 ):
20132027 max_memory = parse_max_memory (max_memory )
20142028 self .load_metadata ()
2015- num_partitions = len (self .metadata . partitions )
2029+ num_partitions = self .num_partitions
20162030 per_worker_memory = self .get_max_encoding_memory ()
20172031 logger .info (
20182032 f"Encoding Zarr over { num_partitions } partitions with "
@@ -2120,13 +2134,14 @@ def encode_init(
21202134 schema = VcfZarrSchema .fromjson (f .read ())
21212135 zarr_path = pathlib .Path (zarr_path )
21222136 vzw = VcfZarrWriter (zarr_path )
2123- return vzw .init (
2137+ vzw .init (
21242138 icf ,
21252139 target_num_partitions = target_num_partitions ,
21262140 schema = schema ,
21272141 dimension_separator = dimension_separator ,
21282142 max_variant_chunks = max_variant_chunks ,
21292143 )
2144+ return vzw .num_partitions , vzw .get_max_encoding_memory ()
21302145
21312146
21322147def encode_partition (zarr_path , partition ):
0 commit comments