sgkit-dev
diff --git a/‎bio2zarr/cli.py‎
Lines changed: 106 additions & 56 deletions b/‎bio2zarr/cli.py‎
Lines changed: 106 additions & 56 deletions
@@ -7,9 +7,21 @@
 from . import plink
 from . import provenance
 
+
+class NaturalOrderGroup(click.Group):
+    """
+    List commands in the order they are provided in the help text.
+    """
+
+    def list_commands(self, ctx):
+        return self.commands.keys()
+
+
 # Common arguments/options
 verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
 
+version = click.version_option(version=f"{provenance.__version__}")
+
 worker_processes = click.option(
     "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
 )
@@ -19,12 +31,11 @@
     "--column-chunk-size",
     type=int,
     default=64,
-    help="Size of exploded column chunks",
+    help="Approximate uncompressed size of exploded column chunks in MiB",
 )
 
 # Note: -l and -w were chosen when these were called "width" and "length".
 # possibly there are better letters now.
-# TODO help text
 variants_chunk_size = click.option(
     "-l",
     "--variants-chunk-size",
@@ -41,69 +52,59 @@
     help="Chunk size in the samples dimension",
 )
 
-version = click.version_option(version=f"{provenance.__version__}")
-
 
-# Note: logging hasn't been implemented in the code at all, this is just
-# a first pass to try out some ways of doing things to see what works.
 def setup_logging(verbosity):
     level = "WARNING"
     if verbosity == 1:
         level = "INFO"
     elif verbosity >= 2:
         level = "DEBUG"
     # NOTE: I'm not that excited about coloredlogs, just trying it out
-    # as it is installed by cyvcf2 anyway. We will have some complicated
-    # stuff doing on with threads and processes, to logs might not work
-    # so well anyway.
+    # as it is installed by cyvcf2 anyway.
     coloredlogs.install(level=level)
 
 
 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @verbose
 @worker_processes
 @column_chunk_size
-def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
+def explode(vcfs, zarr_path, verbose, worker_processes, column_chunk_size):
     """
     Convert VCF(s) to columnar intermediate format
     """
     setup_logging(verbose)
     vcf.explode(
         vcfs,
-        out_path,
+        zarr_path,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
         show_progress=True,
     )
 
+
 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
-@click.option("-n", "--target-num-partitions", type=int, required=True)
+@click.argument("icf_path", type=click.Path())
+@click.argument("num_partitions", type=int)
 @verbose
 @worker_processes
-def explode_init(vcfs, out_path, target_num_partitions, verbose, worker_processes):
+def dexplode_init(vcfs, icf_path, num_partitions, verbose, worker_processes):
     """
     Initial step for parallel conversion of VCF(s) to columnar intermediate format
+    over the requested number of paritions.
     """
     setup_logging(verbose)
-    vcf.explode_init(
+    num_partitions = vcf.explode_init(
         vcfs,
-        out_path,
-        target_num_partitions=target_num_partitions,
+        icf_path,
+        target_num_partitions=num_partitions,
         worker_processes=worker_processes,
         show_progress=True,
     )
+    click.echo(num_partitions)
 
-@click.command
-@click.argument("path", type=click.Path())
-def explode_partition_count(path):
-    """
-    Count the actual number of partitions in a parallel conversion of VCF(s) to columnar intermediate format
-    """
-    print(vcf.explode_partition_count(path))
 
 @click.command
 @click.argument("path", type=click.Path(), required=True)
@@ -112,7 +113,7 @@ def explode_partition_count(path):
 @verbose
 @worker_processes
 @column_chunk_size
-def explode_slice(path, start, end, verbose, worker_processes, column_chunk_size):
+def dexplode_slice(path, start, end, verbose, worker_processes, column_chunk_size):
     """
     Convert VCF(s) to columnar intermediate format
     """
@@ -126,40 +127,42 @@ def explode_slice(path, start, end, verbose, worker_processes, column_chunk_size
         show_progress=True,
     )
 
+
 @click.command
 @click.argument("path", type=click.Path(), required=True)
 @verbose
-def explode_finalise(path, verbose):
+def dexplode_finalise(path, verbose):
     """
     Final step for parallel conversion of VCF(s) to columnar intermediate format
     """
     setup_logging(verbose)
     vcf.explode_finalise(path)
 
+
 @click.command
-@click.argument("if_path", type=click.Path())
+@click.argument("icf_path", type=click.Path())
 @verbose
-def inspect(if_path, verbose):
+def inspect(icf_path, verbose):
     """
-    Inspect an intermediate format file
+    Inspect an intermediate format or Zarr path.
     """
     setup_logging(verbose)
-    data = vcf.inspect(if_path)
+    data = vcf.inspect(icf_path)
     click.echo(tabulate.tabulate(data, headers="keys"))
 
 
 @click.command
-@click.argument("if_path", type=click.Path())
-def mkschema(if_path):
+@click.argument("icf_path", type=click.Path())
+def mkschema(icf_path):
     """
     Generate a schema for zarr encoding
     """
     stream = click.get_text_stream("stdout")
-    vcf.mkschema(if_path, stream)
+    vcf.mkschema(icf_path, stream)
 
 
 @click.command
-@click.argument("if_path", type=click.Path())
+@click.argument("icf_path", type=click.Path())
 @click.argument("zarr_path", type=click.Path())
 @verbose
 @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
@@ -185,7 +188,7 @@ def mkschema(if_path):
 )
 @worker_processes
 def encode(
-    if_path,
+    icf_path,
     zarr_path,
     verbose,
     schema,
@@ -196,11 +199,11 @@ def encode(
     worker_processes,
 ):
     """
-    Encode intermediate format (see explode) to vcfzarr
+    Encode intermediate columnar format (see explode) to vcfzarr.
     """
     setup_logging(verbose)
     vcf.encode(
-        if_path,
+        icf_path,
         zarr_path,
         schema,
         variants_chunk_size=variants_chunk_size,
@@ -214,21 +217,21 @@ def encode(
 
 @click.command(name="convert")
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @variants_chunk_size
 @samples_chunk_size
 @verbose
 @worker_processes
 def convert_vcf(
-    vcfs, out_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
+    vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
 ):
     """
-    Convert input VCF(s) directly to vcfzarr (not recommended for large files)
+    Convert input VCF(s) directly to vcfzarr (not recommended for large files).
     """
     setup_logging(verbose)
     vcf.convert(
         vcfs,
-        out_path,
+        zarr_path,
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
         show_progress=True,
@@ -238,44 +241,91 @@ def convert_vcf(
 
 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
-def validate(vcfs, out_path):
+@click.argument("zarr_path", type=click.Path())
+def validate(vcfs, zarr_path):
     """
     Development only, do not use. Will be removed before release.
     """
     # FIXME! Will silently not look at remaining VCFs
-    vcf.validate(vcfs[0], out_path, show_progress=True)
+    vcf.validate(vcfs[0], zarr_path, show_progress=True)
 
 
 @version
-@click.group()
+@click.group(cls=NaturalOrderGroup)
 def vcf2zarr():
-    pass
+    """
+    Convert VCF file(s) to the vcfzarr format.
+
+    The simplest usage is:
+
+    $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
+
+    This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
+    step. As this writes the intermediate columnar format to a temporary directory,
+    we only recommend this approach for small files (< 1GB, say).
+
+    The recommended approach is to run the conversion in two passes, and
+    to keep the intermediate columnar format ("exploded") around to facilitate
+    experimentation with chunk sizes and compression settings:
+
+    \b
+    $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
+
+    The inspect command provides a way to view contents of an exploded ICF
+    or Zarr:
+
+    $ vcf2zarr inspect [PATH]
+
+    This is useful when tweaking chunk sizes and compression settings to suit
+    your dataset, using the mkschema command and --schema option to encode:
+
+    \b
+    $ vcf2zarr mkschema [ICF_PATH] > schema.json
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
+
+    By editing the schema.json file you can drop columns that are not of interest
+    and edit column specific compression settings. The --max-variant-chunks option
+    to encode allows you to try out these options on small subsets, hopefully
+    arriving at settings with the desired balance of compression and query
+    performance.
+
+    ADVANCED USAGE
+
+    For very large datasets (terabyte scale) it may be necessary to distribute the
+    explode and encode steps across a cluster:
+
+    \b
+    $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
+    $ vcf2zarr dexplode-slice [ICF_PATH] [START] [STOP]
+    $ vcf2zarr dexplode-finalise [ICF_PATH]
+
+    See the online documentation at [FIXME] for more details on distributed explode.
+    """
 
 
 # TODO figure out how to get click to list these in the given order.
-vcf2zarr.add_command(explode)
-vcf2zarr.add_command(explode_init)
-vcf2zarr.add_command(explode_partition_count)
-vcf2zarr.add_command(explode_slice)
-vcf2zarr.add_command(explode_finalise)
+vcf2zarr.add_command(convert_vcf)
 vcf2zarr.add_command(inspect)
+vcf2zarr.add_command(explode)
 vcf2zarr.add_command(mkschema)
 vcf2zarr.add_command(encode)
-vcf2zarr.add_command(convert_vcf)
+vcf2zarr.add_command(dexplode_init)
+vcf2zarr.add_command(dexplode_slice)
+vcf2zarr.add_command(dexplode_finalise)
 vcf2zarr.add_command(validate)
 
 
 @click.command(name="convert")
 @click.argument("in_path", type=click.Path())
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @worker_processes
 @verbose
 @variants_chunk_size
 @samples_chunk_size
 def convert_plink(
     in_path,
-    out_path,
+    zarr_path,
     verbose,
     worker_processes,
     variants_chunk_size,
@@ -287,7 +337,7 @@ def convert_plink(
     setup_logging(verbose)
     plink.convert(
         in_path,
-        out_path,
+        zarr_path,
         show_progress=True,
         worker_processes=worker_processes,
         samples_chunk_size=samples_chunk_size,