Add tests for ZarrEncoder

jeromekelleher · jeromekelleher · commit 4ab8dec7e956 · 2024-02-21T15:54:01.000Z
diff --git a/bio2zarr/core.py b/bio2zarr/core.py
@@ -31,6 +31,7 @@ def swap_buffers(self):
     def async_flush(self, executor, offset, buff_stop=None):
         return async_flush_array(executor, self.buff[:buff_stop], self.array, offset)
 
+# TODO: factor these functions into the BufferedArray class
 
 def sync_flush_array(np_buffer, zarr_array, offset):
     zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
@@ -72,7 +73,9 @@ def flush_chunk(start, stop):
 
 
 class ThreadedZarrEncoder(contextlib.AbstractContextManager):
-    def __init__(self, buffered_arrays, encoder_threads):
+    # TODO (maybe) add option with encoder_threads=None to run synchronously for
+    # debugging using a mock Executor
+    def __init__(self, buffered_arrays, encoder_threads=1):
         self.buffered_arrays = buffered_arrays
         self.executor = cf.ThreadPoolExecutor(max_workers=encoder_threads)
         self.chunk_length = buffered_arrays[0].chunk_length
@@ -99,8 +102,6 @@ def swap_buffers(self):
         self.wait_on_futures()
         self.futures = []
         for ba in self.buffered_arrays:
-            # TODO add debug log
-            # print("Scheduling", ba.array, offset, buff_stop)
             self.futures.extend(
                 ba.async_flush(self.executor, self.array_offset, self.next_row)
             )
@@ -112,9 +113,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
             self.next_row += 1
             self.swap_buffers()
             self.wait_on_futures()
-        # TODO add arguments to wait and cancel_futures appropriate
-        # for the an error condition occuring here. Generally need
-        # to think about the error exit condition here (like running
-        # out of disk space) to see what the right behaviour is.
+        else:
+            for future in self.futures:
+                future.cancel()
         self.executor.shutdown()
         return False
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -0,0 +1,95 @@
+import numpy as np
+import numpy.testing as nt
+import pytest
+import zarr
+
+from bio2zarr import core
+
+
+def encode_arrays(arrays, data, encoder_threads=1):
+    """Encode each array in data, row by row, into the matching zarr array."""
+    buffered_arrays = [core.BufferedArray(a) for a in arrays]
+    assert len(arrays) == len(data)
+    for a, d in zip(arrays, data):
+        assert a.shape == d.shape
+        assert a.shape[0] == arrays[0].shape[0]
+    with core.ThreadedZarrEncoder(buffered_arrays, encoder_threads) as tze:
+        for data_row in range(len(data[0])):
+            j = tze.next_buffer_row()
+            for ba, data_array in zip(buffered_arrays, data):
+                ba.buff[j] = data_array[data_row]
+
+
+class TestZarrEncoder:
+    """Tests for core.ThreadedZarrEncoder driven through BufferedArray."""
+    @pytest.mark.parametrize(
+        ["data", "chunk_size"],
+        [
+            (np.arange(10), (1,)),
+            (np.arange(10), (3,)),
+            (np.arange(10), (5,)),
+            (np.arange(10), (10,)),
+            (np.arange(10, dtype=np.int8), (3,)),
+            (np.arange(10, dtype=np.int32), (3,)),
+            (np.arange(10, dtype=np.float32), (3,)),
+            (np.arange(10, dtype=np.float64), (3,)),
+            (-1 * np.arange(100, dtype=np.int32)[::-1], (7,)),
+            # 2D arrays
+            (np.arange(16).reshape((4, 4)), (1, 4)),
+            (np.arange(16).reshape((4, 4)), (3, 3)),
+            (np.arange(16).reshape((4, 4)), (16, 1)),
+            # 3D arrays
+            (np.arange(32).reshape((8, 2, 2)), (1, 4, 2)),
+        ],
+    )
+    def test_single_array(self, data, chunk_size):
+        a = zarr.empty_like(data, chunks=chunk_size)
+        encode_arrays([a], [data])
+        nt.assert_array_equal(a[:], data)
+
+    @pytest.mark.parametrize("chunk_size", range(1, 6))
+    def test_multi_array(self, chunk_size):
+        n = 33
+        data = [
+            np.arange(n),
+            np.arange(n, dtype=np.int32),
+            np.arange(n, dtype=np.float64),
+        ]
+        arrays = [zarr.empty_like(d, chunks=(chunk_size,)) for d in data]
+        encode_arrays(arrays, data)
+        for a, d in zip(arrays, data):
+            nt.assert_array_equal(a[:], d)
+
+    @pytest.mark.parametrize("threads", range(1, 6))
+    def test_single_array_threads(self, threads):
+        data = np.arange(10_333)
+        a = zarr.empty_like(data, chunks=(100,))
+        encode_arrays([a], [data], threads)
+        nt.assert_array_equal(a[:], data)
+
+    def test_error_in_user_code(self):
+        """An error raised inside the with block propagates to the caller."""
+        data = list(range(10)) + ["string"]
+        a = zarr.empty(len(data), chunks=(1,), dtype=int)
+        ba = core.BufferedArray(a)
+        with pytest.raises(ValueError, match="int()"):
+            with core.ThreadedZarrEncoder([ba]) as tze:
+                for d in data:
+                    j = tze.next_buffer_row()
+                    # This raises an error when "string" inserted to buffer
+                    ba.buff[j] = d
+
+    def test_error_in_encode(self):
+        """An error raised while flushing a buffer surfaces on context exit."""
+        data = np.array([1])
+        a = zarr.empty_like(data, chunks=(1,))
+        ba = core.BufferedArray(a)
+        with pytest.raises(ValueError, match="int()"):
+            with core.ThreadedZarrEncoder([ba]) as tze:
+                for d in data:
+                    j = tze.next_buffer_row()
+                    ba.buff[j] = d
+                # We only flush on exiting the context manager, so swap in a bad
+                # buffer. NB: this is the only reliable way I can think of to raise
+                # an error in the futures; real failures (e.g. a full disk) are
+                # hard to simulate.
+                ba.buff = np.array(["not an integer"])