Skip to content

Commit 73558fc

Browse files
Document the vcf2zarr command and refactor dexplode
1 parent fcffead commit 73558fc

File tree

4 files changed

+156
-80
lines changed

4 files changed

+156
-80
lines changed

bio2zarr/cli.py

Lines changed: 82 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from . import plink
88
from . import provenance
99

10+
1011
class NaturalOrderGroup(click.Group):
1112
"""
1213
List commands in the order they are provided in the help text.
@@ -65,18 +66,18 @@ def setup_logging(verbosity):
6566

6667
@click.command
6768
@click.argument("vcfs", nargs=-1, required=True)
68-
@click.argument("out_path", type=click.Path())
69+
@click.argument("zarr_path", type=click.Path())
6970
@verbose
7071
@worker_processes
7172
@column_chunk_size
72-
def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
73+
def explode(vcfs, zarr_path, verbose, worker_processes, column_chunk_size):
7374
"""
7475
Convert VCF(s) to columnar intermediate format
7576
"""
7677
setup_logging(verbose)
7778
vcf.explode(
7879
vcfs,
79-
out_path,
80+
zarr_path,
8081
worker_processes=worker_processes,
8182
column_chunk_size=column_chunk_size,
8283
show_progress=True,
@@ -85,31 +86,24 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
8586

8687
@click.command
8788
@click.argument("vcfs", nargs=-1, required=True)
88-
@click.argument("out_path", type=click.Path())
89-
@click.option("-n", "--target-num-partitions", type=int, required=True)
89+
@click.argument("icf_path", type=click.Path())
90+
@click.argument("num_partitions", type=int)
9091
@verbose
9192
@worker_processes
92-
def dexplode_init(vcfs, out_path, target_num_partitions, verbose, worker_processes):
93+
def dexplode_init(vcfs, icf_path, num_partitions, verbose, worker_processes):
9394
"""
9495
Initial step for parallel conversion of VCF(s) to columnar intermediate format
96+
over the requested number of partitions.
9597
"""
9698
setup_logging(verbose)
97-
vcf.explode_init(
99+
num_partitions = vcf.explode_init(
98100
vcfs,
99-
out_path,
100-
target_num_partitions=target_num_partitions,
101+
icf_path,
102+
target_num_partitions=num_partitions,
101103
worker_processes=worker_processes,
102104
show_progress=True,
103105
)
104-
105-
106-
@click.command
107-
@click.argument("path", type=click.Path())
108-
def dexplode_partition_count(path):
109-
"""
110-
Count the actual number of partitions in a parallel conversion of VCF(s) to columnar intermediate format
111-
"""
112-
click.echo(vcf.explode_partition_count(path))
106+
click.echo(num_partitions)
113107

114108

115109
@click.command
@@ -146,29 +140,29 @@ def dexplode_finalise(path, verbose):
146140

147141

148142
@click.command
149-
@click.argument("if_path", type=click.Path())
143+
@click.argument("icf_path", type=click.Path())
150144
@verbose
151-
def inspect(if_path, verbose):
145+
def inspect(icf_path, verbose):
152146
"""
153147
Inspect an intermediate format or Zarr path.
154148
"""
155149
setup_logging(verbose)
156-
data = vcf.inspect(if_path)
150+
data = vcf.inspect(icf_path)
157151
click.echo(tabulate.tabulate(data, headers="keys"))
158152

159153

160154
@click.command
161-
@click.argument("if_path", type=click.Path())
162-
def mkschema(if_path):
155+
@click.argument("icf_path", type=click.Path())
156+
def mkschema(icf_path):
163157
"""
164158
Generate a schema for zarr encoding
165159
"""
166160
stream = click.get_text_stream("stdout")
167-
vcf.mkschema(if_path, stream)
161+
vcf.mkschema(icf_path, stream)
168162

169163

170164
@click.command
171-
@click.argument("if_path", type=click.Path())
165+
@click.argument("icf_path", type=click.Path())
172166
@click.argument("zarr_path", type=click.Path())
173167
@verbose
174168
@click.option("-s", "--schema", default=None, type=click.Path(exists=True))
@@ -194,7 +188,7 @@ def mkschema(if_path):
194188
)
195189
@worker_processes
196190
def encode(
197-
if_path,
191+
icf_path,
198192
zarr_path,
199193
verbose,
200194
schema,
@@ -205,11 +199,11 @@ def encode(
205199
worker_processes,
206200
):
207201
"""
208-
Encode intermediate format (see explode) to vcfzarr
202+
Encode intermediate columnar format (see explode) to vcfzarr.
209203
"""
210204
setup_logging(verbose)
211205
vcf.encode(
212-
if_path,
206+
icf_path,
213207
zarr_path,
214208
schema,
215209
variants_chunk_size=variants_chunk_size,
@@ -223,21 +217,21 @@ def encode(
223217

224218
@click.command(name="convert")
225219
@click.argument("vcfs", nargs=-1, required=True)
226-
@click.argument("out_path", type=click.Path())
220+
@click.argument("zarr_path", type=click.Path())
227221
@variants_chunk_size
228222
@samples_chunk_size
229223
@verbose
230224
@worker_processes
231225
def convert_vcf(
232-
vcfs, out_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
226+
vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
233227
):
234228
"""
235-
Convert input VCF(s) directly to vcfzarr (not recommended for large files)
229+
Convert input VCF(s) directly to vcfzarr (not recommended for large files).
236230
"""
237231
setup_logging(verbose)
238232
vcf.convert(
239233
vcfs,
240-
out_path,
234+
zarr_path,
241235
variants_chunk_size=variants_chunk_size,
242236
samples_chunk_size=samples_chunk_size,
243237
show_progress=True,
@@ -247,44 +241,91 @@ def convert_vcf(
247241

248242
@click.command
249243
@click.argument("vcfs", nargs=-1, required=True)
250-
@click.argument("out_path", type=click.Path())
251-
def validate(vcfs, out_path):
244+
@click.argument("zarr_path", type=click.Path())
245+
def validate(vcfs, zarr_path):
252246
"""
253247
Development only, do not use. Will be removed before release.
254248
"""
255249
# FIXME! Will silently not look at remaining VCFs
256-
vcf.validate(vcfs[0], out_path, show_progress=True)
250+
vcf.validate(vcfs[0], zarr_path, show_progress=True)
257251

258252

259253
@version
260254
@click.group(cls=NaturalOrderGroup)
261255
def vcf2zarr():
262-
pass
256+
"""
257+
Convert VCF file(s) to the vcfzarr format.
258+
259+
The simplest usage is:
260+
261+
$ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
262+
263+
This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
264+
step. As this writes the intermediate columnar format to a temporary directory,
265+
we only recommend this approach for small files (< 1GB, say).
266+
267+
The recommended approach is to run the conversion in two passes, and
268+
to keep the intermediate columnar format ("exploded") around to facilitate
269+
experimentation with chunk sizes and compression settings:
270+
271+
\b
272+
$ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
273+
$ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
274+
275+
The inspect command provides a way to view the contents of an exploded ICF
276+
or Zarr:
277+
278+
$ vcf2zarr inspect [PATH]
279+
280+
This is useful when tweaking chunk sizes and compression settings to suit
281+
your dataset, using the mkschema command and --schema option to encode:
282+
283+
\b
284+
$ vcf2zarr mkschema [ICF_PATH] > schema.json
285+
$ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
286+
287+
By editing the schema.json file you can drop columns that are not of interest
288+
and edit column specific compression settings. The --max-variant-chunks option
289+
to encode allows you to try out these options on small subsets, hopefully
290+
arriving at settings with the desired balance of compression and query
291+
performance.
292+
293+
ADVANCED USAGE
294+
295+
For very large datasets (terabyte scale) it may be necessary to distribute the
296+
explode and encode steps across a cluster:
297+
298+
\b
299+
$ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
300+
$ vcf2zarr dexplode-slice [ICF_PATH] [START] [STOP]
301+
$ vcf2zarr dexplode-finalise [ICF_PATH]
302+
303+
See the online documentation at [FIXME] for more details on distributed explode.
304+
"""
263305

264306

265307
# TODO figure out how to get click to list these in the given order.
266308
vcf2zarr.add_command(convert_vcf)
267-
vcf2zarr.add_command(explode)
268309
vcf2zarr.add_command(inspect)
310+
vcf2zarr.add_command(explode)
269311
vcf2zarr.add_command(mkschema)
270312
vcf2zarr.add_command(encode)
271313
vcf2zarr.add_command(dexplode_init)
272-
vcf2zarr.add_command(dexplode_partition_count)
273314
vcf2zarr.add_command(dexplode_slice)
274315
vcf2zarr.add_command(dexplode_finalise)
275316
vcf2zarr.add_command(validate)
276317

277318

278319
@click.command(name="convert")
279320
@click.argument("in_path", type=click.Path())
280-
@click.argument("out_path", type=click.Path())
321+
@click.argument("zarr_path", type=click.Path())
281322
@worker_processes
282323
@verbose
283324
@variants_chunk_size
284325
@samples_chunk_size
285326
def convert_plink(
286327
in_path,
287-
out_path,
328+
zarr_path,
288329
verbose,
289330
worker_processes,
290331
variants_chunk_size,
@@ -296,7 +337,7 @@ def convert_plink(
296337
setup_logging(verbose)
297338
plink.convert(
298339
in_path,
299-
out_path,
340+
zarr_path,
300341
show_progress=True,
301342
worker_processes=worker_processes,
302343
samples_chunk_size=samples_chunk_size,

0 commit comments

Comments
 (0)