Skip to content

Commit 95a265f

Browse files
Merge pull request #55 from jeromekelleher/slice-pcvcf
Slice pcvcf
2 parents caa3398 + 6cdbe8d commit 95a265f

File tree

6 files changed

+508
-337
lines changed

6 files changed

+508
-337
lines changed

bio2zarr/cli.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414
"-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
1515
)
1616

17+
# TODO help text
18+
chunk_length = click.option("-l", "--chunk-length", type=int, default=None)
19+
20+
chunk_width = click.option("-w", "--chunk-width", type=int, default=None)
21+
1722
version = click.version_option(version=provenance.__version__)
1823

1924

@@ -79,8 +84,13 @@ def mkschema(if_path):
7984
@click.argument("zarr_path", type=click.Path())
8085
@verbose
8186
@click.option("-s", "--schema", default=None)
87+
# TODO: these are mutually exclusive with schema, tell click this
88+
@chunk_length
89+
@chunk_width
8290
@worker_processes
83-
def encode(if_path, zarr_path, verbose, schema, worker_processes):
91+
def encode(
92+
if_path, zarr_path, verbose, schema, chunk_length, chunk_width, worker_processes
93+
):
8494
"""
8595
Encode intermediate format (see explode) to vcfzarr
8696
"""
@@ -89,6 +99,8 @@ def encode(if_path, zarr_path, verbose, schema, worker_processes):
8999
if_path,
90100
zarr_path,
91101
schema,
102+
chunk_length=chunk_length,
103+
chunk_width=chunk_width,
92104
worker_processes=worker_processes,
93105
show_progress=True,
94106
)
@@ -97,14 +109,23 @@ def encode(if_path, zarr_path, verbose, schema, worker_processes):
97109
@click.command(name="convert")
98110
@click.argument("vcfs", nargs=-1, required=True)
99111
@click.argument("out_path", type=click.Path())
112+
@chunk_length
113+
@chunk_width
100114
@verbose
101115
@worker_processes
102-
def convert_vcf(vcfs, out_path, verbose, worker_processes):
116+
def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_processes):
103117
"""
104118
Convert input VCF(s) directly to vcfzarr (not recommended for large files)
105119
"""
106120
setup_logging(verbose)
107-
vcf.convert(vcfs, out_path, show_progress=True, worker_processes=worker_processes)
121+
vcf.convert(
122+
vcfs,
123+
out_path,
124+
chunk_length=chunk_length,
125+
chunk_width=chunk_width,
126+
show_progress=True,
127+
worker_processes=worker_processes,
128+
)
108129

109130

110131
@click.command

bio2zarr/core.py

Lines changed: 67 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,23 @@
2424
)
2525

2626

27+
def chunk_aligned_slices(z, n):
28+
"""
29+
Returns at n slices in the specified zarr array, aligned
30+
with its chunks
31+
"""
32+
chunk_size = z.chunks[0]
33+
num_chunks = int(np.ceil(z.shape[0] / chunk_size))
34+
slices = []
35+
splits = np.array_split(np.arange(num_chunks), min(n, num_chunks))
36+
for split in splits:
37+
start = split[0] * chunk_size
38+
stop = (split[-1] + 1) * chunk_size
39+
stop = min(stop, z.shape[0])
40+
slices.append((start, stop))
41+
return slices
42+
43+
2744
class SynchronousExecutor(cf.Executor):
2845
def submit(self, fn, /, *args, **kwargs):
2946
future = cf.Future()
@@ -46,107 +63,66 @@ def cancel_futures(futures):
4663
@dataclasses.dataclass
4764
class BufferedArray:
4865
array: zarr.Array
66+
array_offset: int
4967
buff: np.ndarray
68+
buffer_row: int
5069

51-
def __init__(self, array):
70+
def __init__(self, array, offset):
5271
self.array = array
72+
self.array_offset = offset
73+
assert offset % array.chunks[0] == 0
5374
dims = list(array.shape)
5475
dims[0] = min(array.chunks[0], array.shape[0])
5576
self.buff = np.zeros(dims, dtype=array.dtype)
77+
self.buffer_row = 0
5678

5779
@property
5880
def chunk_length(self):
5981
return self.buff.shape[0]
6082

61-
def swap_buffers(self):
62-
self.buff = np.zeros_like(self.buff)
63-
64-
def async_flush(self, executor, offset, buff_stop=None):
65-
return async_flush_array(executor, self.buff[:buff_stop], self.array, offset)
66-
67-
68-
# TODO: factor these functions into the BufferedArray class
83+
def next_buffer_row(self):
84+
if self.buffer_row == self.chunk_length:
85+
self.flush()
86+
row = self.buffer_row
87+
self.buffer_row += 1
88+
return row
89+
90+
def flush(self):
91+
# TODO just move sync_flush_array in here
92+
if self.buffer_row != 0:
93+
if len(self.array.chunks) <= 1:
94+
sync_flush_1d_array(
95+
self.buff[: self.buffer_row], self.array, self.array_offset
96+
)
97+
else:
98+
sync_flush_2d_array(
99+
self.buff[: self.buffer_row], self.array, self.array_offset
100+
)
101+
logger.debug(
102+
f"Flushed chunk {self.array} {self.array_offset} + {self.buffer_row}")
103+
self.array_offset += self.chunk_length
104+
self.buffer_row = 0
69105

70106

71-
def sync_flush_array(np_buffer, zarr_array, offset):
107+
def sync_flush_1d_array(np_buffer, zarr_array, offset):
72108
zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
109+
update_progress(1)
73110

74111

75-
def async_flush_array(executor, np_buffer, zarr_array, offset):
76-
"""
77-
Flush the specified chunk aligned buffer to the specified zarr array.
78-
"""
79-
logger.debug(f"Schedule flush {zarr_array} @ {offset}")
80-
assert zarr_array.shape[1:] == np_buffer.shape[1:]
81-
# print("sync", zarr_array, np_buffer)
82-
83-
if len(np_buffer.shape) == 1:
84-
futures = [executor.submit(sync_flush_array, np_buffer, zarr_array, offset)]
85-
else:
86-
futures = async_flush_2d_array(executor, np_buffer, zarr_array, offset)
87-
return futures
88-
89-
90-
def async_flush_2d_array(executor, np_buffer, zarr_array, offset):
91-
# Flush each of the chunks in the second dimension separately
112+
def sync_flush_2d_array(np_buffer, zarr_array, offset):
113+
# Write chunks in the second dimension 1-by-1 to make progress more
114+
# incremental, and to avoid large memcopies in the underlying
115+
# encoder implementations.
92116
s = slice(offset, offset + np_buffer.shape[0])
93-
94-
def flush_chunk(start, stop):
95-
zarr_array[s, start:stop] = np_buffer[:, start:stop]
96-
97117
chunk_width = zarr_array.chunks[1]
98118
zarr_array_width = zarr_array.shape[1]
99119
start = 0
100-
futures = []
101120
while start < zarr_array_width:
102121
stop = min(start + chunk_width, zarr_array_width)
103-
future = executor.submit(flush_chunk, start, stop)
104-
futures.append(future)
122+
zarr_array[s, start:stop] = np_buffer[:, start:stop]
123+
update_progress(1)
105124
start = stop
106125

107-
return futures
108-
109-
110-
class ThreadedZarrEncoder(contextlib.AbstractContextManager):
111-
# TODO (maybe) add option with encoder_threads=None to run synchronously for
112-
# debugging using a mock Executor
113-
def __init__(self, buffered_arrays, encoder_threads=1):
114-
self.buffered_arrays = buffered_arrays
115-
self.executor = cf.ThreadPoolExecutor(max_workers=encoder_threads)
116-
self.chunk_length = buffered_arrays[0].chunk_length
117-
assert all(ba.chunk_length == self.chunk_length for ba in self.buffered_arrays)
118-
self.futures = []
119-
self.array_offset = 0
120-
self.next_row = -1
121-
122-
def next_buffer_row(self):
123-
self.next_row += 1
124-
if self.next_row == self.chunk_length:
125-
self.swap_buffers()
126-
self.array_offset += self.chunk_length
127-
self.next_row = 0
128-
return self.next_row
129-
130-
def swap_buffers(self):
131-
wait_on_futures(self.futures)
132-
self.futures = []
133-
for ba in self.buffered_arrays:
134-
self.futures.extend(
135-
ba.async_flush(self.executor, self.array_offset, self.next_row)
136-
)
137-
ba.swap_buffers()
138-
139-
def __exit__(self, exc_type, exc_val, exc_tb):
140-
if exc_type is None:
141-
# Normal exit condition
142-
self.next_row += 1
143-
self.swap_buffers()
144-
wait_on_futures(self.futures)
145-
else:
146-
cancel_futures(self.futures)
147-
self.executor.shutdown()
148-
return False
149-
150126

151127
@dataclasses.dataclass
152128
class ProgressConfig:
@@ -157,6 +133,10 @@ class ProgressConfig:
157133
poll_interval: float = 0.001
158134

159135

136+
# NOTE: this approach means that we cannot have more than one
137+
# progressable thing happening per source process. This is
138+
# probably fine in practice, but there could be corner cases
139+
# where it's not. Something to watch out for.
160140
_progress_counter = multiprocessing.Value("Q", 0)
161141

162142

@@ -190,7 +170,16 @@ def progress_thread_worker(config):
190170
inc = current - pbar.n
191171
pbar.update(inc)
192172
time.sleep(config.poll_interval)
173+
# TODO figure out why we're sometimes going over total
174+
# if get_progress() != config.total:
175+
# print("HOW DID THIS HAPPEN!!")
176+
# print(get_progress())
177+
# print(config)
178+
# assert get_progress() == config.total
179+
inc = config.total - pbar.n
180+
pbar.update(inc)
193181
pbar.close()
182+
# print("EXITING PROGRESS THREAD")
194183

195184

196185
class ParallelWorkManager(contextlib.AbstractContextManager):
@@ -228,7 +217,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
228217
# Note: this doesn't seem to be working correctly. If
229218
# we set a timeout of None we get deadlocks
230219
set_progress(self.progress_config.total)
231-
timeout = 1
220+
timeout = None
232221
else:
233222
cancel_futures(self.futures)
234223
timeout = 0

0 commit comments

Comments
 (0)