@@ -204,6 +204,10 @@ def num_contigs(self):
204
204
def num_filters (self ):
205
205
return len (self .filters )
206
206
207
@property
def num_samples(self):
    """
    Returns the number of entries in this object's ``samples`` sequence.
    """
    sample_count = len(self.samples)
    return sample_count
210
+
207
211
@staticmethod
208
212
def fromdict (d ):
209
213
if d ["format_version" ] != ICF_METADATA_FORMAT_VERSION :
@@ -982,6 +986,19 @@ def check_field_clobbering(icf_metadata):
982
986
)
983
987
984
988
989
@dataclasses.dataclass
class IcfWriteSummary:
    """
    Summary of the partition, sample and variant counts reported when
    an intermediate columnar format write is initialised.
    """

    # Field order is significant: it fixes both the positional constructor
    # signature and the key order of the dict/JSON output.
    num_partitions: int
    num_samples: int
    num_variants: int

    def asdict(self):
        """
        Returns the fields of this summary as a dictionary.
        """
        return dataclasses.asdict(self)

    def asjson(self):
        """
        Returns the fields of this summary as a JSON string, indented
        by four spaces.
        """
        summary_fields = self.asdict()
        return json.dumps(summary_fields, indent=4)
1000
+
1001
+
985
1002
class IntermediateColumnarFormatWriter :
986
1003
def __init__ (self , path ):
987
1004
self .path = pathlib .Path (path )
@@ -1038,7 +1055,11 @@ def init(
1038
1055
logger .info ("Writing WIP metadata" )
1039
1056
with open (self .wip_path / "metadata.json" , "w" ) as f :
1040
1057
json .dump (self .metadata .asdict (), f , indent = 4 )
1041
- return self .num_partitions
1058
+ return IcfWriteSummary (
1059
+ num_partitions = self .num_partitions ,
1060
+ num_variants = icf_metadata .num_records ,
1061
+ num_samples = icf_metadata .num_samples ,
1062
+ )
1042
1063
1043
1064
def mkdirs (self ):
1044
1065
num_dirs = len (self .metadata .fields )
@@ -1371,6 +1392,7 @@ def variant_chunk_nbytes(self):
1371
1392
"""
1372
1393
Returns the nbytes for a single variant chunk of this array.
1373
1394
"""
1395
+ # TODO: warn if this is a string field (the nbytes estimate is unreliable for strings).
1374
1396
chunk_items = self .chunks [0 ]
1375
1397
for size in self .shape [1 :]:
1376
1398
chunk_items *= size
@@ -1643,6 +1665,21 @@ def fromdict(d):
1643
1665
return ret
1644
1666
1645
1667
1668
@dataclasses.dataclass
class VcfZarrWriteSummary:
    """
    Summary of the counts and memory estimate reported when a VCF Zarr
    write is initialised.
    """

    # Field order is significant: it fixes both the positional constructor
    # signature and the key order of the dict/JSON output.
    num_partitions: int
    num_samples: int
    num_variants: int
    num_chunks: int
    # Stored as a string: the writer passes the value through display_size.
    max_encoding_memory: str

    def asdict(self):
        """
        Returns the fields of this summary as a dictionary.
        """
        return dataclasses.asdict(self)

    def asjson(self):
        """
        Returns the fields of this summary as a JSON string, indented
        by four spaces.
        """
        summary_fields = self.asdict()
        return json.dumps(summary_fields, indent=4)
1681
+
1682
+
1646
1683
class VcfZarrWriter :
1647
1684
def __init__ (self , path ):
1648
1685
self .path = pathlib .Path (path )
@@ -1718,13 +1755,22 @@ def init(
1718
1755
store = zarr .DirectoryStore (self .arrays_path )
1719
1756
root = zarr .group (store = store )
1720
1757
1721
- for column in self .schema .fields .values ():
1722
- self .init_array (root , column , partitions [- 1 ].stop )
1758
+ total_chunks = 0
1759
+ for field in self .schema .fields .values ():
1760
+ a = self .init_array (root , field , partitions [- 1 ].stop )
1761
+ total_chunks += a .nchunks
1723
1762
1724
1763
logger .info ("Writing WIP metadata" )
1725
1764
with open (self .wip_path / "metadata.json" , "w" ) as f :
1726
1765
json .dump (self .metadata .asdict (), f , indent = 4 )
1727
- return len (partitions )
1766
+
1767
+ return VcfZarrWriteSummary (
1768
+ num_variants = self .icf .num_records ,
1769
+ num_samples = self .icf .num_samples ,
1770
+ num_partitions = self .num_partitions ,
1771
+ num_chunks = total_chunks ,
1772
+ max_encoding_memory = display_size (self .get_max_encoding_memory ()),
1773
+ )
1728
1774
1729
1775
def encode_samples (self , root ):
1730
1776
if self .schema .samples != self .icf .metadata .samples :
@@ -1794,6 +1840,7 @@ def init_array(self, root, variable, variants_dim_size):
1794
1840
}
1795
1841
)
1796
1842
logger .debug (f"Initialised { a } " )
1843
+ return a
1797
1844
1798
1845
#######################
1799
1846
# encode_partition
@@ -2062,6 +2109,9 @@ def get_max_encoding_memory(self):
2062
2109
"""
2063
2110
Return the approximate maximum memory used to encode a variant chunk.
2064
2111
"""
2112
+ # NOTE: This size estimate is also not quite enough on its own; you need a
2113
+ # bit of headroom on top of it (probably 10% or so). We should include this.
2114
+ # FIXME: This is actively wrong for string columns. See if we can do better.
2065
2115
max_encoding_mem = max (
2066
2116
col .variant_chunk_nbytes for col in self .schema .fields .values ()
2067
2117
)
@@ -2190,14 +2240,13 @@ def encode_init(
2190
2240
schema = VcfZarrSchema .fromjson (f .read ())
2191
2241
zarr_path = pathlib .Path (zarr_path )
2192
2242
vzw = VcfZarrWriter (zarr_path )
2193
- vzw .init (
2243
+ return vzw .init (
2194
2244
icf ,
2195
2245
target_num_partitions = target_num_partitions ,
2196
2246
schema = schema ,
2197
2247
dimension_separator = dimension_separator ,
2198
2248
max_variant_chunks = max_variant_chunks ,
2199
2249
)
2200
- return vzw .num_partitions , vzw .get_max_encoding_memory ()
2201
2250
2202
2251
2203
2252
def encode_partition (zarr_path , partition ):
0 commit comments