Commit 01559ce

Refactored encode write path
1 parent f184967 commit 01559ce

3 files changed: +139 -223 lines changed


bio2zarr/core.py

Lines changed: 38 additions & 82 deletions
@@ -24,6 +24,23 @@
 )
 
 
+def chunk_aligned_slices(z, n):
+    """
+    Returns at most n slices in the specified zarr array, aligned
+    with its chunks.
+    """
+    chunk_size = z.chunks[0]
+    num_chunks = int(np.ceil(z.shape[0] / chunk_size))
+    slices = []
+    splits = np.array_split(np.arange(num_chunks), min(n, num_chunks))
+    for split in splits:
+        start = split[0] * chunk_size
+        stop = (split[-1] + 1) * chunk_size
+        stop = min(stop, z.shape[0])
+        slices.append((start, stop))
+    return slices
+
+
 class SynchronousExecutor(cf.Executor):
     def submit(self, fn, /, *args, **kwargs):
         future = cf.Future()
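To make the slicing concrete, here is a quick sketch of what the new helper returns (assuming zarr and numpy are installed, plus bio2zarr at this commit; the array shape and chunk size are made up for the example):

    import zarr
    from bio2zarr import core

    # A 10-element array in chunks of 3 has 4 chunks. Asking for 3 slices
    # groups the chunks as [0, 1], [2], [3], so every boundary except the
    # ragged final one falls on a multiple of the chunk size.
    z = zarr.zeros(10, chunks=3)
    print(core.chunk_aligned_slices(z, 3))
    # slices covering (0, 6), (6, 9), (9, 10)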
@@ -46,23 +63,38 @@ def cancel_futures(futures):
 @dataclasses.dataclass
 class BufferedArray:
     array: zarr.Array
+    array_offset: int
     buff: np.ndarray
+    buffer_row: int
 
-    def __init__(self, array):
+    def __init__(self, array, offset):
         self.array = array
+        self.array_offset = offset
+        assert offset % array.chunks[0] == 0
         dims = list(array.shape)
         dims[0] = min(array.chunks[0], array.shape[0])
         self.buff = np.zeros(dims, dtype=array.dtype)
+        self.buffer_row = 0
 
     @property
     def chunk_length(self):
         return self.buff.shape[0]
 
-    def swap_buffers(self):
-        self.buff = np.zeros_like(self.buff)
-
-    def async_flush(self, executor, offset, buff_stop=None):
-        return async_flush_array(executor, self.buff[:buff_stop], self.array, offset)
+    def next_buffer_row(self):
+        if self.buffer_row == self.chunk_length:
+            self.flush()
+        row = self.buffer_row
+        self.buffer_row += 1
+        return row
+
+    def flush(self):
+        # TODO just move sync_flush_array in here
+        if self.buffer_row != 0:
+            sync_flush_array(
+                self.buff[: self.buffer_row], self.array, self.array_offset
+            )
+        self.array_offset += self.chunk_length
+        self.buffer_row = 0
 
 
 # TODO: factor these functions into the BufferedArray class
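The new fill/flush cycle is easiest to see in isolation. A minimal usage sketch (assuming bio2zarr at this commit; the array is a stand-in): next_buffer_row hands out the next free row of the in-memory buffer, flushing a completed chunk to the zarr array first, and a final flush writes out whatever partial chunk remains.

    import numpy as np
    import zarr
    from bio2zarr import core

    z = zarr.zeros(10, chunks=3, dtype=np.int32)
    ba = core.BufferedArray(z, 0)  # offset 0 is trivially chunk-aligned
    for value in range(10):
        j = ba.next_buffer_row()  # flushes automatically when the buffer fills
        ba.buff[j] = value
    ba.flush()  # write out the final, partially filled chunk
    print(z[:])  # -> [0 1 2 3 4 5 6 7 8 9]

Because the constructor asserts offset % array.chunks[0] == 0 and the buffer is exactly one chunk long, each BufferedArray only ever writes to its own chunk-aligned range and needs no locking against other writers.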
@@ -72,82 +104,6 @@ def sync_flush_array(np_buffer, zarr_array, offset):
     zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
 
 
-def async_flush_array(executor, np_buffer, zarr_array, offset):
-    """
-    Flush the specified chunk aligned buffer to the specified zarr array.
-    """
-    logger.debug(f"Schedule flush {zarr_array} @ {offset}")
-    assert zarr_array.shape[1:] == np_buffer.shape[1:]
-    # print("sync", zarr_array, np_buffer)
-
-    if len(np_buffer.shape) == 1:
-        futures = [executor.submit(sync_flush_array, np_buffer, zarr_array, offset)]
-    else:
-        futures = async_flush_2d_array(executor, np_buffer, zarr_array, offset)
-    return futures
-
-
-def async_flush_2d_array(executor, np_buffer, zarr_array, offset):
-    # Flush each of the chunks in the second dimension separately
-    s = slice(offset, offset + np_buffer.shape[0])
-
-    def flush_chunk(start, stop):
-        zarr_array[s, start:stop] = np_buffer[:, start:stop]
-
-    chunk_width = zarr_array.chunks[1]
-    zarr_array_width = zarr_array.shape[1]
-    start = 0
-    futures = []
-    while start < zarr_array_width:
-        stop = min(start + chunk_width, zarr_array_width)
-        future = executor.submit(flush_chunk, start, stop)
-        futures.append(future)
-        start = stop
-
-    return futures
-
-
-class ThreadedZarrEncoder(contextlib.AbstractContextManager):
-    # TODO (maybe) add option with encoder_threads=None to run synchronously for
-    # debugging using a mock Executor
-    def __init__(self, buffered_arrays, encoder_threads=1):
-        self.buffered_arrays = buffered_arrays
-        self.executor = cf.ThreadPoolExecutor(max_workers=encoder_threads)
-        self.chunk_length = buffered_arrays[0].chunk_length
-        assert all(ba.chunk_length == self.chunk_length for ba in self.buffered_arrays)
-        self.futures = []
-        self.array_offset = 0
-        self.next_row = -1
-
-    def next_buffer_row(self):
-        self.next_row += 1
-        if self.next_row == self.chunk_length:
-            self.swap_buffers()
-            self.array_offset += self.chunk_length
-            self.next_row = 0
-        return self.next_row
-
-    def swap_buffers(self):
-        wait_on_futures(self.futures)
-        self.futures = []
-        for ba in self.buffered_arrays:
-            self.futures.extend(
-                ba.async_flush(self.executor, self.array_offset, self.next_row)
-            )
-            ba.swap_buffers()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is None:
-            # Normal exit condition
-            self.next_row += 1
-            self.swap_buffers()
-            wait_on_futures(self.futures)
-        else:
-            cancel_futures(self.futures)
-        self.executor.shutdown()
-        return False
-
-
 @dataclasses.dataclass
 class ProgressConfig:
     total: int = 0

bio2zarr/vcf.py

Lines changed: 39 additions & 52 deletions
@@ -1262,54 +1262,39 @@ def create_array(self, variable):
         )
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
 
-    def encode_column(self, pcvcf, column, encoder_threads=4):
-        # TODO we're doing this the wrong way at the moment, overcomplicating
-        # things by having the ThreadedZarrEncoder. It would be simpler if
-        # we split the columns into vertical chunks, and just pushed a bunch
-        # of futures for encoding start:end slices of each column. The
-        # complicating factor here is that we need to get these slices
-        # out of the pcvcf, which takes a little bit of doing (but fine,
-        # because we know the number of records in each partition).
-        # An annoying factor then is how to update the progress meter,
-        # because the "bytes read" approach becomes problematic
-        # when we might access the same chunk several times.
-        # Would perhaps be better to call sys.getsizeof() on the stored
-        # value each time.
-
+    def encode_column_slice(self, pcvcf, column, start, stop):
         source_col = pcvcf.columns[column.vcf_field]
         array = self.root[column.name]
-        ba = core.BufferedArray(array)
+        ba = core.BufferedArray(array, start)
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)
 
-        with core.ThreadedZarrEncoder([ba], encoder_threads) as te:
-            last_bytes_read = 0
-            for value, bytes_read in source_col.iter_values_bytes():
-                j = te.next_buffer_row()
-                sanitiser(ba.buff, j, value)
-                # print(bytes_read, last_bytes_read, value)
-                if last_bytes_read != bytes_read:
-                    core.update_progress(bytes_read - last_bytes_read)
-                    last_bytes_read = bytes_read
-
-    def encode_genotypes(self, pcvcf, encoder_threads=4):
+        for value in source_col.iter_values(start, stop):
+            # We write directly into the buffer in the sanitiser function
+            # to make it easier to reason about dimension padding
+            j = ba.next_buffer_row()
+            sanitiser(ba.buff, j, value)
+            core.update_progress(sys.getsizeof(value))
+        ba.flush()
+
+    def encode_genotypes_slice(self, pcvcf, start, stop):
         source_col = pcvcf.columns["FORMAT/GT"]
-        gt = core.BufferedArray(self.root["call_genotype"])
-        gt_mask = core.BufferedArray(self.root["call_genotype_mask"])
-        gt_phased = core.BufferedArray(self.root["call_genotype_phased"])
-        buffered_arrays = [gt, gt_phased, gt_mask]
-
-        with core.ThreadedZarrEncoder(buffered_arrays, encoder_threads) as te:
-            last_bytes_read = 0
-            for value, bytes_read in source_col.iter_values_bytes():
-                j = te.next_buffer_row()
-                sanitise_value_int_2d(gt.buff, j, value[:, :-1])
-                sanitise_value_int_1d(gt_phased.buff, j, value[:, -1])
-                # TODO check is this the correct semantics when we are padding
-                # with mixed ploidies?
-                gt_mask.buff[j] = gt.buff[j] < 0
-                if last_bytes_read != bytes_read:
-                    core.update_progress(bytes_read - last_bytes_read)
-                    last_bytes_read = bytes_read
+        gt = core.BufferedArray(self.root["call_genotype"], start)
+        gt_mask = core.BufferedArray(self.root["call_genotype_mask"], start)
+        gt_phased = core.BufferedArray(self.root["call_genotype_phased"], start)
+
+        for value in source_col.iter_values(start, stop):
+            j = gt.next_buffer_row()
+            sanitise_value_int_2d(gt.buff, j, value[:, :-1])
+            j = gt_phased.next_buffer_row()
+            sanitise_value_int_1d(gt_phased.buff, j, value[:, -1])
+            # TODO check is this the correct semantics when we are padding
+            # with mixed ploidies?
+            j = gt_mask.next_buffer_row()
+            gt_mask.buff[j] = gt.buff[j] < 0
+            core.update_progress(sys.getsizeof(value))
+        gt.flush()
+        gt_phased.flush()
+        gt_mask.flush()
 
     def encode_alleles(self, pcvcf):
         ref_col = pcvcf.columns["REF"]
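Two things change in these methods beyond the renaming. First, each slice gets its own BufferedArray anchored at a chunk-aligned start, so independent workers write disjoint chunk ranges of the same arrays without coordination (a minimal sketch with stand-in data; in the real code the values come from pcvcf's columns):

    import numpy as np
    import zarr
    from bio2zarr import core

    z = zarr.zeros(20, chunks=5, dtype=np.int32)
    # Two chunk-aligned slices, as produced by core.chunk_aligned_slices(z, 2)
    for start, stop in [(0, 10), (10, 20)]:
        ba = core.BufferedArray(z, start)  # each writer owns disjoint chunks
        for value in range(start, stop):
            ba.buff[ba.next_buffer_row()] = value
        ba.flush()
    print(z[:])  # -> [0 1 2 ... 19]

Second, the progress metric changes: tracking bytes read from the intermediate columnar store would double-count when several slices touch the same stored chunk, so each worker now reports sys.getsizeof(value) for the values it encodes, as the deleted TODO comment anticipated.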
@@ -1449,6 +1434,7 @@ def convert(
             units="b",
             show=show_progress,
         )
+        num_slices = max(1, worker_processes * 4)
         with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
             pwm.submit(
                 sgvcf.encode_samples,
@@ -1465,22 +1451,23 @@ def convert(
                 conversion_spec.contig_length,
             )
             pwm.submit(sgvcf.encode_filters, pcvcf, conversion_spec.filter_id)
+            # Using POS arbitrarily to get the array slices
+            slices = core.chunk_aligned_slices(
+                sgvcf.root["variant_position"], num_slices
+            )
             has_gt = False
             for variable in conversion_spec.columns.values():
                 if variable.vcf_field is not None:
-                    # print("Encode", variable.name)
-                    # TODO for large columns it's probably worth splitting up
-                    # these into vertical chunks. Otherwise we tend to get a
-                    # long wait for the largest GT columns to finish.
-                    # Straightforward to do because we can chunk-align the work
-                    # packages.
-                    pwm.submit(sgvcf.encode_column, pcvcf, variable)
+                    for start, stop in slices:
+                        pwm.submit(
+                            sgvcf.encode_column_slice, pcvcf, variable, start, stop
+                        )
                 else:
                     if variable.name == "call_genotype":
                         has_gt = True
             if has_gt:
-                # TODO add mixed ploidy
-                pwm.executor.submit(sgvcf.encode_genotypes, pcvcf)
+                for start, stop in slices:
+                    pwm.submit(sgvcf.encode_genotypes_slice, pcvcf, start, stop)
 
     zarr.consolidate_metadata(write_path)
     # Atomic swap, now we've completely finished.
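The resulting work layout in convert is a plain fan-out: every (column, slice) pair becomes one independent future, so a single large column such as the genotypes no longer serialises the tail of the run, and num_slices = max(1, worker_processes * 4) gives each worker roughly four packages to balance over. A sketch of the same pattern using concurrent.futures directly (the function and names here are stand-ins, not the bio2zarr API, which wraps this in ParallelWorkManager):

    import concurrent.futures as cf

    def encode_column_slice(column, start, stop):
        # Stand-in for sgvcf.encode_column_slice: each call writes only
        # the chunk-aligned rows in [start, stop), so calls never overlap.
        print(f"encoding {column}[{start}:{stop}]")

    if __name__ == "__main__":
        columns = ["variant_position", "call_genotype"]
        slices = [(0, 6), (6, 9), (9, 10)]  # e.g. from core.chunk_aligned_slices
        with cf.ProcessPoolExecutor(max_workers=4) as executor:
            futures = [
                executor.submit(encode_column_slice, col, start, stop)
                for col in columns
                for start, stop in slices
            ]
            for future in cf.as_completed(futures):
                future.result()  # propagate any worker exceptions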
