Add memory budget

jeromekelleher · jeromekelleher · commit 526cce1e81d6 · 2024-03-20T23:37:52.000Z
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
@@ -109,6 +109,13 @@ def mkschema(if_path):
         "schema tuning."
     ),
 )
+@click.option(
+    "-M",
+    "--max-memory",
+    type=int,
+    default=None,
+    help="An approximate bound on overall memory usage in megabytes",
+)
 @worker_processes
 def encode(
     if_path,
@@ -118,6 +125,7 @@ def encode(
     chunk_length,
     chunk_width,
     max_variant_chunks,
+    max_memory,
     worker_processes,
 ):
     """
@@ -132,6 +140,7 @@ def encode(
         chunk_width=chunk_width,
         max_v_chunks=max_variant_chunks,
         worker_processes=worker_processes,
+        max_memory=max_memory,
         show_progress=True,
     )
 
diff --git a/bio2zarr/core.py b/bio2zarr/core.py
@@ -180,7 +180,7 @@ def __init__(self, worker_processes=1, progress_config=None):
             self.executor = cf.ProcessPoolExecutor(
                 max_workers=worker_processes,
             )
-        self.futures = []
+        self.futures = set()
 
         set_progress(0)
         if progress_config is None:
@@ -219,7 +219,19 @@ def _update_progress_worker(self):
         logger.debug("Exit progress thread")
 
     def submit(self, *args, **kwargs):
-        self.futures.append(self.executor.submit(*args, **kwargs))
+        future = self.executor.submit(*args, **kwargs)
+        self.futures.add(future)
+        return future
+
+    def wait_for_completed(self, timeout):
+        done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
+        for future in done:
+            exception = future.exception()
+            # TODO do the check for BrokenProcessPool here
+            if exception is not None:
+                raise exception
+        self.futures = not_done
+        return done
 
     def results_as_completed(self):
         for future in cf.as_completed(self.futures):
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -48,7 +48,7 @@ def display_number(x):
 
 
 def display_size(n):
-    return humanfriendly.format_size(n)
+    return humanfriendly.format_size(n, binary=True)
 
 
 @dataclasses.dataclass
@@ -1289,6 +1289,7 @@ class EncodingWork:
     func: callable
     start: int
     stop: int
+    memory: int = 0
 
 
 class VcfZarrWriter:
@@ -1478,7 +1479,17 @@ def encode(
         worker_processes=1,
         max_v_chunks=None,
         show_progress=False,
+        max_memory=None,
     ):
+        if max_memory is None:
+            # Unbounded
+            max_memory = 2**63
+        else:
+            # Value is specified in mebibytes (MiB)
+            max_memory *= 2**20
+
+        # TODO this will move into the setup logic later when we're making it possible
+        # to split the work by slice
         num_slices = max(1, worker_processes * 4)
         # Using POS arbitrarily to get the array slices
         slices = core.chunk_aligned_slices(
@@ -1492,8 +1503,16 @@ def encode(
                 array.resize(shape)
 
         total_bytes = 0
+        encoding_memory_requirements = {}
         for col in self.schema.columns.values():
             array = self.get_array(col.name)
+            # NOTE!! this is bad, we're potentially creating quite a large
+            # numpy array for basically nothing. We can compute this.
+            variant_chunk_size = array.blocks[0].nbytes
+            encoding_memory_requirements[col.name] = variant_chunk_size
+            logger.debug(
+                f"{col.name} requires at least {display_size(variant_chunk_size)} per worker"
+            )
             total_bytes += array.nbytes
 
         filter_id_map = self.encode_filter_id()
@@ -1504,7 +1523,11 @@ def encode(
             for col in self.schema.columns.values():
                 if col.vcf_field is not None:
                     f = functools.partial(self.encode_array_slice, col)
-                    work.append(EncodingWork(f, start, stop))
+                    work.append(
+                        EncodingWork(
+                            f, start, stop, encoding_memory_requirements[col.name]
+                        )
+                    )
             work.append(EncodingWork(self.encode_alleles_slice, start, stop))
             work.append(EncodingWork(self.encode_id_slice, start, stop))
             work.append(
@@ -1522,18 +1545,51 @@ def encode(
                 )
             )
             if "call_genotype" in self.schema.columns:
-                work.append(EncodingWork(self.encode_genotypes_slice, start, stop))
+                gt_memory = sum(
+                    encoding_memory_requirements[name]
+                    for name in [
+                        "call_genotype",
+                        "call_genotype_phased",
+                        "call_genotype_mask",
+                    ]
+                )
+                work.append(
+                    EncodingWork(self.encode_genotypes_slice, start, stop, gt_memory)
+                )
+        # Fail early if we can't fit a particular column into memory
+        for wp in work:
+            if wp.memory >= max_memory:
+                raise ValueError(f"Insufficient memory for {wp.func}: "
+                    f"{display_size(wp.memory)} > {display_size(max_memory)}")
+
 
         progress_config = core.ProgressConfig(
             total=total_bytes,
             title="Encode",
             units="B",
             show=show_progress,
         )
+        used_memory = 0
         with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            pwm.submit(self.encode_samples)
+            future = pwm.submit(self.encode_samples)
+            future_to_memory_use = {future: 0}
             for wp in work:
-                pwm.submit(wp.func, wp.start, wp.stop)
+                while used_memory + wp.memory >= max_memory:
+                    logger.info(
+                        f"Memory budget {display_size(max_memory)} exceeded: "
+                        f"used={display_size(used_memory)} needed={display_size(wp.memory)}"
+                    )
+                    futures = pwm.wait_for_completed(timeout=5)
+                    released_mem = sum(
+                        future_to_memory_use.pop(future) for future in futures
+                    )
+                    logger.info(
+                        f"{len(futures)} completed, released {display_size(released_mem)}"
+                    )
+                    used_memory -= released_mem
+                future = pwm.submit(wp.func, wp.start, wp.stop)
+                used_memory += wp.memory
+                future_to_memory_use[future] = wp.memory
 
 
 def mkschema(if_path, out):
@@ -1549,6 +1605,7 @@ def encode(
     chunk_length=None,
     chunk_width=None,
     max_v_chunks=None,
+    max_memory=None,
     worker_processes=1,
     show_progress=False,
 ):
@@ -1574,6 +1631,7 @@ def encode(
     vzw.encode(
         max_v_chunks=max_v_chunks,
         worker_processes=worker_processes,
+        max_memory=max_memory,
         show_progress=show_progress,
     )
     vzw.finalise()
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -68,6 +68,7 @@ def test_encode(self):
                 chunk_width=None,
                 max_v_chunks=None,
                 worker_processes=1,
+                max_memory=None,
                 show_progress=True,
             )
 

Original file line number	Diff line number	Diff line change
`@@ -68,6 +68,7 @@ def test_encode(self):`
`68`	`68`	`chunk_width=None,`
`69`	`69`	`max_v_chunks=None,`
`70`	`70`	`worker_processes=1,`
	`71`	`+ max_memory=None,`
`71`	`72`	`show_progress=True,`
`72`	`73`	`)`
`73`	`74`