Commit fdaaefb

Add CLI for explode slices
1 parent fbe585e

4 files changed (+110, -6 lines)

bio2zarr/cli.py

Lines changed: 60 additions & 1 deletion
```diff
@@ -14,8 +14,17 @@
     "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
 )
 
+column_chunk_size = click.option(
+    "-c",
+    "--column-chunk-size",
+    type=int,
+    default=64,
+    help="Chunk size in the columns dimension",
+)
+
 # Note: -l and -w were chosen when these were called "width" and "length".
 # possibly there are better letters now.
+# TODO help text
 variants_chunk_size = click.option(
     "-l",
     "--variants-chunk-size",
@@ -55,7 +64,7 @@ def setup_logging(verbosity):
 @click.argument("out_path", type=click.Path())
 @verbose
 @worker_processes
-@click.option("-c", "--column-chunk-size", type=int, default=64)
+@column_chunk_size
 def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
     """
     Convert VCF(s) to columnar intermediate format
@@ -69,6 +78,53 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
         show_progress=True,
     )
 
+@click.command
+@click.argument("vcfs", nargs=-1, required=True)
+@click.argument("out_path", type=click.Path())
+@verbose
+@worker_processes
+def explode_init(vcfs, out_path, verbose, worker_processes):
+    """
+    Initial step for parallel conversion of VCF(s) to columnar intermediate format
+    """
+    setup_logging(verbose)
+    vcf.explode_init(
+        vcfs,
+        out_path,
+        worker_processes=worker_processes,
+        show_progress=True,
+    )
+
+@click.command
+@click.argument("out_path", type=click.Path(), required=True)
+@click.argument("start", type=int, required=True)
+@click.argument("end", type=int, required=True)
+@verbose
+@worker_processes
+@column_chunk_size
+def explode_slice(out_path, start, end, verbose, worker_processes, column_chunk_size):
+    """
+    Convert a slice of the partitions to columnar intermediate format
+    """
+    setup_logging(verbose)
+    vcf.explode_slice(
+        out_path,
+        start,
+        end,
+        worker_processes=worker_processes,
+        column_chunk_size=column_chunk_size,
+        show_progress=True,
+    )
+
+@click.command
+@click.argument("out_path", type=click.Path(), required=True)
+@verbose
+def explode_finalise(out_path, verbose):
+    """
+    Final step for parallel conversion of VCF(s) to columnar intermediate format
+    """
+    setup_logging(verbose)
+    vcf.explode_finalise(out_path)
 
 @click.command
 @click.argument("if_path", type=click.Path())
@@ -189,6 +245,9 @@ def vcf2zarr():
 
 # TODO figure out how to get click to list these in the given order.
 vcf2zarr.add_command(explode)
+vcf2zarr.add_command(explode_init)
+vcf2zarr.add_command(explode_slice)
+vcf2zarr.add_command(explode_finalise)
 vcf2zarr.add_command(inspect)
 vcf2zarr.add_command(mkschema)
 vcf2zarr.add_command(encode)
```
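
This diff also hoists the `-c/--column-chunk-size` option into a module-level `column_chunk_size` decorator, so `explode` and `explode_slice` share a single definition. A minimal standalone sketch of this click pattern (the `demo` command is illustrative, not part of the commit):

```python
import click

# A shared option: click.option() returns a decorator, so it can be
# defined once and applied to any number of commands.
column_chunk_size = click.option(
    "-c",
    "--column-chunk-size",
    type=int,
    default=64,
    help="Chunk size in the columns dimension",
)

@click.command()
@column_chunk_size
def demo(column_chunk_size):
    """Echo the shared option's value."""
    click.echo(f"column-chunk-size = {column_chunk_size}")

if __name__ == "__main__":
    demo()
```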

bio2zarr/vcf.py

Lines changed: 30 additions & 4 deletions
```diff
@@ -282,7 +282,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     )
     with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
         for path in paths:
-            pwm.submit(scan_vcf, path, target_num_partitions)
+            pwm.submit(scan_vcf, path, target_num_partitions // len(paths))
     results = list(pwm.results_as_completed())
 
     # Sort to make the ordering deterministic
@@ -885,6 +885,10 @@ def mkdirs(self):
     def write_metadata(self):
         with open(self.path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
+        # Write the number of partitions to a convenience file for
+        # workflows
+        with open(self.path / "num_partitions.txt", "w") as f:
+            f.write(str(self.num_partitions))
 
     def write_header(self):
         with open(self.path / "header.txt", "w") as f:
@@ -983,15 +987,14 @@ def convert_partition(
         )
 
     @staticmethod
-    def convert_init(vcfs, out_path, *, worker_processes=1, show_progress=False):
+    def convert_init(vcfs, out_path, *, num_partitions=1, worker_processes=1, show_progress=False):
         out_path = pathlib.Path(out_path)
         # TODO make scan work in parallel using general progress code too
-        target_num_partitions = max(1, worker_processes * 4)
         vcf_metadata, header = scan_vcfs(
             vcfs,
             worker_processes=worker_processes,
             show_progress=show_progress,
-            target_num_partitions=target_num_partitions,
+            target_num_partitions=num_partitions,
         )
         pcvcf = PickleChunkedVcf(out_path, vcf_metadata, header)
         pcvcf.mkdirs()
@@ -1075,6 +1078,29 @@ def explode(
     )
     return PickleChunkedVcf.load(out_path)
 
+def explode_init(vcfs, out_path, *, num_partitions=1, worker_processes=1, show_progress=False):
+    out_path = pathlib.Path(out_path)
+    if out_path.exists():
+        shutil.rmtree(out_path)
+    # Error if num_partitions is less than the number of input files
+    if num_partitions < len(vcfs):
+        raise ValueError("num_partitions must be greater than or equal to the number of input VCFs")
+    return PickleChunkedVcf.convert_init(
+        vcfs,
+        out_path,
+        num_partitions=num_partitions,
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+    )
+
+
+def explode_slice(out_path, start, stop, *, worker_processes=1, show_progress=False, column_chunk_size=16):
+    pcvcf = PickleChunkedVcf.load(out_path)
+    pcvcf.convert_slice(start, stop, worker_processes=worker_processes, show_progress=show_progress, column_chunk_size=column_chunk_size)
+
+def explode_finalise(out_path):
+    pcvcf = PickleChunkedVcf.load(out_path)
+    pcvcf.convert_finalise()
 
 def inspect(path):
     path = pathlib.Path(path)
```
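
Taken together, these hooks split `explode` into three phases that an external workflow engine can schedule independently, with `num_partitions.txt` telling the scheduler how many slice jobs to launch. A minimal sketch of that pattern against the Python API added here (the input paths and requested partition count are illustrative):

```python
import pathlib

from bio2zarr import vcf

vcfs = ["a.vcf.gz", "b.vcf.gz"]  # illustrative inputs
out = pathlib.Path("sample.exploded")

# Phase 1: scan the inputs and write the partition layout, including
# the num_partitions.txt convenience file.
vcf.explode_init(vcfs, out, num_partitions=8)

# Phase 2: slices are independent of each other, so a workflow engine
# could dispatch each call as a separate job rather than a local loop.
num_partitions = int((out / "num_partitions.txt").read_text())
for start in range(num_partitions):
    vcf.explode_slice(out, start, start + 1)

# Phase 3: finalise the intermediate columnar format.
vcf.explode_finalise(out)
```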

bio2zarr/vcf_utils.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -464,7 +464,6 @@ def partition_into_regions(
         elif target_part_size_bytes is not None:
             num_parts = ceildiv(file_length, target_part_size_bytes)
             part_lengths = np.array([i * target_part_size_bytes for i in range(num_parts)])
-
         file_offsets, region_contig_indexes, region_positions = self.index.offsets()
 
         # Search the file offsets to find which indexes the part lengths fall at
```

tests/test_vcf_examples.py

Lines changed: 20 additions & 0 deletions
```diff
@@ -782,3 +782,23 @@ def test_by_validating_split(source, suffix, files, tmp_path):
     out = tmp_path / "test.zarr"
     vcf.convert(split_files, out, worker_processes=0)
     vcf.validate(source_path, out)
+
+
+def test_split_explode(tmp_path):
+    paths = [
+        "tests/data/vcf/sample.vcf.gz.3.split/19:1-.vcf.gz",
+        "tests/data/vcf/sample.vcf.gz.3.split/20.vcf.gz",
+        "tests/data/vcf/sample.vcf.gz.3.split/X.vcf.gz",
+    ]
+    out = tmp_path / "test.explode"
+    pcvcf = vcf.explode_init(paths, out, num_partitions=15)
+    with open(out / "num_partitions.txt", "r") as f:
+        num_partitions = int(f.read())
+    assert pcvcf.num_partitions == num_partitions
+    assert num_partitions == 3
+    vcf.explode_slice(out, 0, num_partitions)
+    vcf.explode_finalise(out)
+
+    vcf.encode(out, tmp_path / "test.zarr")
+
+    vcf.validate("tests/data/vcf/sample.vcf.gz", tmp_path / "test.zarr")
```
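
Note that the test requests 15 partitions but asserts that exactly 3 are created, one per input file: `target_num_partitions` is a target rather than a guarantee, and these small sample VCFs evidently cannot be subdivided any further by the index-based partitioner.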
