Improve dencode interface

jeromekelleher · jeromekelleher · commit 6eb33a77f272 · 2024-04-24T12:39:37.000+01:00
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
@@ -5,6 +5,7 @@
 
 import click
 import coloredlogs
+import humanfriendly
 import numcodecs
 import tabulate
 
@@ -335,7 +336,7 @@ def dencode_init(
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
-    num_partitions = vcf.encode_init(
+    num_partitions, max_memory = vcf.encode_init(
         icf_path,
         zarr_path,
         target_num_partitions=num_partitions,
@@ -345,7 +346,15 @@ def dencode_init(
         max_variant_chunks=max_variant_chunks,
         show_progress=True,
     )
-    click.echo(num_partitions)
+    formatted_size = humanfriendly.format_size(max_memory, binary=True)
+    # NOTE: the size is added to stdout here so that users can parse it
+    # and use it in their submission scripts. This is a first pass, and
+    # will most likely change as we see what works and what doesn't.
+    # NOTE: we probably want to format this as a table that lists
+    # some other properties, line by line.
+    # NOTE: this size is also not quite enough on its own; you need a bit
+    # of headroom on top of it (probably 10% or so). We should include this.
+    click.echo(f"{num_partitions}\t{formatted_size}")
 
 
 @click.command
diff --git a/bio2zarr/core.py b/bio2zarr/core.py
@@ -110,6 +110,7 @@ def flush(self):
                 sync_flush_2d_array(
                     self.buff[: self.buffer_row], self.array, self.array_offset
                 )
+            # FIXME the array.name doesn't seem to be working here for some reason
             logger.debug(
                 f"Flushed <{self.array.name} {self.array.shape} "
                 f"{self.array.dtype}> "
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -1323,12 +1323,9 @@ def variant_chunk_nbytes(self):
         """
         Returns the nbytes for a single variant chunk of this array.
         """
-        chunk_items = 1
-        for dim, size in enumerate(self.shape):
-            chunk_dim_size = size
-            if dim < len(self.chunks):
-                chunk_dim_size = self.chunks[dim]
-            chunk_items *= chunk_dim_size
+        chunk_items = self.chunks[0]
+        for size in self.shape[1:]:
+            chunk_items *= size
         dt = np.dtype(self.dtype)
         return chunk_items * dt.itemsize
 
@@ -1616,6 +1613,10 @@ def __init__(self, path):
     def schema(self):
         return self.metadata.schema
 
+    @property
+    def num_partitions(self):
+        return len(self.metadata.partitions)
+
     #######################
     # init
     #######################
@@ -1778,10 +1779,10 @@ def encode_partition(self, partition_index):
         self.encode_id_partition(partition_index)
         self.encode_filters_partition(partition_index)
         self.encode_contig_partition(partition_index)
-        for col in self.metadata.schema.columns.values():
+        for col in self.schema.columns.values():
             if col.vcf_field is not None:
                 self.encode_array_partition(col, partition_index)
-        if "call_genotype" in self.metadata.schema.columns:
+        if "call_genotype" in self.schema.columns:
             self.encode_genotypes_partition(partition_index)
 
     def init_partition_array(self, partition_index, name):
@@ -1954,6 +1955,7 @@ def finalise_array(self, name):
             # Move all the files in partition dir to dest dir
             src = self.partition_array_path(partition, name)
             if not src.exists():
+                # Needs test
                 raise ValueError(f"Partition {partition} of {name} does not exist")
             dest = self.arrays_path / name
             # This is Zarr v2 specific. Chunks in v3 will start with "c" prefix.
@@ -1977,7 +1979,7 @@ def finalise(self, show_progress=False):
         self.load_metadata()
 
         progress_config = core.ProgressConfig(
-            total=len(self.metadata.schema.columns),
+            total=len(self.schema.columns),
             title="Finalise",
             units="array",
             show=show_progress,
@@ -1991,7 +1993,7 @@ def finalise(self, show_progress=False):
         # for multiple workers, or making a standard wrapper for tqdm
         # that allows us to have a consistent look and feel.
         with core.ParallelWorkManager(0, progress_config) as pwm:
-            for name in self.metadata.schema.columns:
+            for name in self.schema.columns:
                 pwm.submit(self.finalise_array, name)
         zarr.consolidate_metadata(self.path)
 
@@ -2003,16 +2005,28 @@ def get_max_encoding_memory(self):
         """
         Return the approximate maximum memory used to encode a variant chunk.
         """
-        return max(
-            col.variant_chunk_nbytes for col in self.metadata.schema.columns.values()
+        max_encoding_mem = max(
+            col.variant_chunk_nbytes for col in self.schema.columns.values()
         )
+        gt_mem = 0
+        if "call_genotype" in self.schema.columns:
+            encoded_together = [
+                "call_genotype",
+                "call_genotype_phased",
+                "call_genotype_mask",
+            ]
+            gt_mem = sum(
+                self.schema.columns[col].variant_chunk_nbytes
+                for col in encoded_together
+            )
+        return max(max_encoding_mem, gt_mem)
 
     def encode_all_partitions(
         self, *, worker_processes=1, show_progress=False, max_memory=None
     ):
         max_memory = parse_max_memory(max_memory)
         self.load_metadata()
-        num_partitions = len(self.metadata.partitions)
+        num_partitions = len(self.num_partitions)
         per_worker_memory = self.get_max_encoding_memory()
         logger.info(
             f"Encoding Zarr over {num_partitions} partitions with "
@@ -2120,13 +2134,14 @@ def encode_init(
             schema = VcfZarrSchema.fromjson(f.read())
     zarr_path = pathlib.Path(zarr_path)
     vzw = VcfZarrWriter(zarr_path)
-    return vzw.init(
+    vzw.init(
         icf,
         target_num_partitions=target_num_partitions,
         schema=schema,
         dimension_separator=dimension_separator,
         max_variant_chunks=max_variant_chunks,
     )
+    return vzw.num_partitions, vzw.get_max_encoding_memory()
 
 
 def encode_partition(zarr_path, partition):

Original file line number	Diff line number	Diff line change
`@@ -110,6 +110,7 @@ def flush(self):`
`110`	`110`	`sync_flush_2d_array(`
`111`	`111`	`self.buff[: self.buffer_row], self.array, self.array_offset`
`112`	`112`	`)`
	`113`	`+ # FIXME the array.name doesn't seem to be working here for some reason`
`113`	`114`	`logger.debug(`
`114`	`115`	`f"Flushed <{self.array.name} {self.array.shape} "`
`115`	`116`	`f"{self.array.dtype}> "`