
Commit 75e8039

Merge pull request #21 from jeromekelleher/more-refactoring
Abstract threaded zarr encoding to class
2 parents: 39c53f2 + 9e3eb74


3 files changed: +67 -50 lines


bio2zarr/core.py

Lines changed: 55 additions & 0 deletions
@@ -1,4 +1,6 @@
 import dataclasses
+import contextlib
+import concurrent.futures as cf
 import logging
 
 import zarr
@@ -19,6 +21,10 @@ def __init__(self, array):
         dims[0] = min(array.chunks[0], array.shape[0])
         self.buff = np.zeros(dims, dtype=array.dtype)
 
+    @property
+    def chunk_length(self):
+        return self.buff.shape[0]
+
     def swap_buffers(self):
         self.buff = np.zeros_like(self.buff)
 
@@ -63,3 +69,52 @@ def flush_chunk(start, stop):
         start = stop
 
     return futures
+
+
+class ThreadedZarrEncoder(contextlib.AbstractContextManager):
+    def __init__(self, buffered_arrays, encoder_threads):
+        self.buffered_arrays = buffered_arrays
+        self.executor = cf.ThreadPoolExecutor(max_workers=encoder_threads)
+        self.chunk_length = buffered_arrays[0].chunk_length
+        assert all(ba.chunk_length == self.chunk_length for ba in self.buffered_arrays)
+        self.futures = []
+        self.array_offset = 0
+        self.next_row = -1
+
+    def next_buffer_row(self):
+        self.next_row += 1
+        if self.next_row == self.chunk_length:
+            self.swap_buffers()
+            self.array_offset += self.chunk_length
+            self.next_row = 0
+        return self.next_row
+
+    def wait_on_futures(self):
+        for future in cf.as_completed(self.futures):
+            exception = future.exception()
+            if exception is not None:
+                raise exception
+
+    def swap_buffers(self):
+        self.wait_on_futures()
+        self.futures = []
+        for ba in self.buffered_arrays:
+            # TODO add debug log
+            # print("Scheduling", ba.array, offset, buff_stop)
+            self.futures.extend(
+                ba.async_flush(self.executor, self.array_offset, self.next_row)
+            )
+            ba.swap_buffers()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is None:
+            # Normal exit condition
+            self.next_row += 1
+            self.swap_buffers()
+            self.wait_on_futures()
+        # TODO add arguments to wait and cancel_futures appropriate
+        # for an error condition occurring here. Generally need
+        # to think about the error exit condition here (like running
+        # out of disk space) to see what the right behaviour is.
+        self.executor.shutdown()
+        return False
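
As a usage illustration only (not part of this commit): a minimal sketch of how ThreadedZarrEncoder is intended to be driven, mirroring the loop shape in the vcf.py changes below. The toy zarr array, its chunking, and the values written are invented for the example; BufferedArray and ThreadedZarrEncoder are assumed to behave as defined in bio2zarr/core.py above.

    import numpy as np
    import zarr

    from bio2zarr import core

    # Toy in-memory zarr array with 10-row chunks (invented for illustration).
    array = zarr.zeros((25, 4), chunks=(10, 4), dtype=np.int32)
    ba = core.BufferedArray(array)

    with core.ThreadedZarrEncoder([ba], encoder_threads=2) as te:
        for i in range(array.shape[0]):
            # next_buffer_row() returns the buffer row to fill; once a full
            # chunk of rows has been written it schedules an async flush on
            # the thread pool and swaps in a fresh buffer.
            j = te.next_buffer_row()
            ba.buff[j] = i

    # On normal exit the final partial chunk is flushed and the pool shut down.
    print(array[:])
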

bio2zarr/vcf.py

Lines changed: 10 additions & 50 deletions
@@ -1127,77 +1127,44 @@ def create_array(self, variable):
         )
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
 
-    def encode_column(self, pcvcf, column):
+    def encode_column(self, pcvcf, column, encoder_threads=4):
         source_col = pcvcf.columns[column.vcf_field]
         array = self.root[column.name]
         ba = core.BufferedArray(array)
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)
-        chunk_length = array.chunks[0]
 
-        with cf.ThreadPoolExecutor(max_workers=4) as executor:
-            futures = []
-            chunk_start = 0
-            j = 0
+        with core.ThreadedZarrEncoder([ba], encoder_threads) as te:
             last_bytes_read = 0
             for value, bytes_read in source_col.iter_values_bytes():
+                j = te.next_buffer_row()
                 sanitiser(ba.buff, j, value)
-                j += 1
-                if j == chunk_length:
-                    flush_futures(futures)
-                    futures.extend(ba.async_flush(executor, chunk_start))
-                    ba.swap_buffers()
-                    j = 0
-                    chunk_start += chunk_length
                 if last_bytes_read != bytes_read:
                     with progress_counter.get_lock():
                         progress_counter.value += bytes_read - last_bytes_read
                     last_bytes_read = bytes_read
 
-            if j != 0:
-                flush_futures(futures)
-                futures.extend(ba.async_flush(executor, chunk_start, j))
-            flush_futures(futures)
-
-    def encode_genotypes(self, pcvcf):
+    def encode_genotypes(self, pcvcf, encoder_threads=4):
         source_col = pcvcf.columns["FORMAT/GT"]
         gt = core.BufferedArray(self.root["call_genotype"])
         gt_mask = core.BufferedArray(self.root["call_genotype_mask"])
         gt_phased = core.BufferedArray(self.root["call_genotype_phased"])
-        chunk_length = gt.array.chunks[0]
-
         buffered_arrays = [gt, gt_phased, gt_mask]
 
-        with cf.ThreadPoolExecutor(max_workers=4) as executor:
-            futures = []
-            chunk_start = 0
-            j = 0
+        with core.ThreadedZarrEncoder(buffered_arrays, encoder_threads) as te:
             last_bytes_read = 0
             for value, bytes_read in source_col.iter_values_bytes():
+                j = te.next_buffer_row()
                 sanitise_value_int_2d(gt.buff, j, value[:, :-1])
                 sanitise_value_int_1d(gt_phased.buff, j, value[:, -1])
                 # TODO check is this the correct semantics when we are padding
                 # with mixed ploidies?
                 gt_mask.buff[j] = gt.buff[j] < 0
 
-                j += 1
-                if j == chunk_length:
-                    flush_futures(futures)
-                    for ba in buffered_arrays:
-                        futures.extend(ba.async_flush(executor, chunk_start))
-                        ba.swap_buffers()
-                    j = 0
-                    chunk_start += chunk_length
                 if last_bytes_read != bytes_read:
                     with progress_counter.get_lock():
                         progress_counter.value += bytes_read - last_bytes_read
                     last_bytes_read = bytes_read
 
-            if j != 0:
-                flush_futures(futures)
-                for ba in buffered_arrays:
-                    futures.extend(ba.async_flush(executor, chunk_start, j))
-            flush_futures(futures)
-
     def encode_alleles(self, pcvcf):
         ref_col = pcvcf.columns["REF"]
         alt_col = pcvcf.columns["ALT"]
@@ -1451,7 +1418,7 @@ def convert_vcf(
     )
 
 
-def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_variant):
+def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_variant, encoder_threads=8):
     bed = bed_reader.open_bed(bed_path, num_threads=1)
 
     store = zarr.DirectoryStore(zarr_path)
@@ -1464,8 +1431,7 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia
 
     buffered_arrays = [gt, gt_phased, gt_mask]
 
-    with cf.ThreadPoolExecutor(max_workers=8) as executor:
-        futures = []
+    with core.ThreadedZarrEncoder(buffered_arrays, encoder_threads) as te:
 
         start = start_variant
        while start < end_variant:
@@ -1474,7 +1440,8 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia
            # Note could do this without iterating over rows, but it's a bit
            # simpler and the bottleneck is in the encoding step anyway. It's
            # also nice to have updates on the progress monitor.
-            for j, values in enumerate(bed_chunk):
+            for values in bed_chunk:
+                j = te.next_buffer_row()
                 dest = gt.buff[j]
                 dest[values == -127] = -1
                 dest[values == 2] = 1
@@ -1483,14 +1450,7 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia
                 gt_mask.buff[j] = dest == -1
                 with progress_counter.get_lock():
                     progress_counter.value += 1
-
-            assert j <= chunk_length
-            flush_futures(futures)
-            for ba in buffered_arrays:
-                ba.async_flush(extend, start, j)
-                ba.swap_buffers()
             start = stop
-        flush_futures(futures)
 
 
 def validate(vcf_path, zarr_path, show_progress=False):

tests/test_vcf.py

Lines changed: 2 additions & 0 deletions
@@ -247,6 +247,8 @@ def test_chunk_size(
         out = tmp_path / "example.vcf.zarr"
         vcf.convert_vcf([path], out, chunk_length=chunk_length, chunk_width=chunk_width)
         ds2 = sg.load_dataset(out)
+        # print(ds2.call_genotype.values)
+        # print(ds.call_genotype.values)
         xt.assert_equal(ds, ds2)
         assert ds2.call_DP.chunks == (y_chunks, x_chunks)
         assert ds2.call_GQ.chunks == (y_chunks, x_chunks)
