Skip to content

Commit 427c1b4

Browse files
Merge pull request #88 from jeromekelleher/explode-refactors
Explode refactors
2 parents 384e92f + 73558fc commit 427c1b4

File tree

4 files changed

+180
-95
lines changed

4 files changed

+180
-95
lines changed

bio2zarr/cli.py

Lines changed: 106 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,21 @@
77
from . import plink
88
from . import provenance
99

10+
11+
class NaturalOrderGroup(click.Group):
    """
    Click group that lists its commands in the order they were registered,
    so the help text can present them in a deliberate sequence.
    """

    def list_commands(self, ctx):
        """
        Return the registered command names in insertion order.

        ``ctx`` is accepted to match the click.Group signature and is unused.
        """
        # Return a real list rather than a dict_keys view, so callers that
        # index or slice the result get the sequence they expect.
        return list(self.commands)
18+
19+
1020
# Common arguments/options
# Each of these is a reusable click decorator applied to the commands below.
verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")

version = click.version_option(version=f"{provenance.__version__}")

worker_processes = click.option(
    "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
)
@@ -19,12 +31,11 @@
1931
"--column-chunk-size",
2032
type=int,
2133
default=64,
22-
help="Size of exploded column chunks",
34+
help="Approximate uncompressed size of exploded column chunks in MiB",
2335
)
2436

2537
# Note: -l and -w were chosen when these were called "width" and "length".
2638
# Possibly there are better letters now.
27-
# TODO help text
2839
variants_chunk_size = click.option(
2940
"-l",
3041
"--variants-chunk-size",
@@ -41,69 +52,59 @@
4152
help="Chunk size in the samples dimension",
4253
)
4354

44-
version = click.version_option(version=f"{provenance.__version__}")
45-
4655

47-
# Note: logging hasn't been implemented in the code at all, this is just
48-
# a first pass to try out some ways of doing things to see what works.
4956
def setup_logging(verbosity):
    """
    Install coloredlogs at a level chosen from the verbosity count.

    A count of 1 selects INFO, 2 or more selects DEBUG, and any other
    value leaves the level at WARNING.
    """
    if verbosity >= 2:
        level = "DEBUG"
    elif verbosity == 1:
        level = "INFO"
    else:
        level = "WARNING"
    # NOTE: I'm not that excited about coloredlogs, just trying it out
    # as it is installed by cyvcf2 anyway.
    coloredlogs.install(level=level)
6065

6166

6267
@click.command
@click.argument("vcfs", nargs=-1, required=True)
# The output of explode is the intermediate columnar format, not Zarr, so the
# argument is named icf_path (shown as ICF_PATH in the usage line).
@click.argument("icf_path", type=click.Path())
@verbose
@worker_processes
@column_chunk_size
def explode(vcfs, icf_path, verbose, worker_processes, column_chunk_size):
    """
    Convert VCF(s) to columnar intermediate format
    """
    setup_logging(verbose)
    vcf.explode(
        vcfs,
        icf_path,
        worker_processes=worker_processes,
        column_chunk_size=column_chunk_size,
        show_progress=True,
    )
8085

86+
8187
@click.command
@click.argument("vcfs", nargs=-1, required=True)
@click.argument("icf_path", type=click.Path())
@click.argument("num_partitions", type=int)
@verbose
@worker_processes
def dexplode_init(vcfs, icf_path, num_partitions, verbose, worker_processes):
    """
    Initial step for parallel conversion of VCF(s) to columnar intermediate format
    over the requested number of partitions.
    """
    setup_logging(verbose)
    # Keep the returned value separate from the requested count. Presumably
    # it is the number of partitions actually created, which may differ from
    # the target -- TODO confirm against vcf.explode_init.
    actual_num_partitions = vcf.explode_init(
        vcfs,
        icf_path,
        target_num_partitions=num_partitions,
        worker_processes=worker_processes,
        show_progress=True,
    )
    # Print the returned count on stdout so it can be captured by a caller.
    click.echo(actual_num_partitions)
99107

100-
@click.command
101-
@click.argument("path", type=click.Path())
102-
def explode_partition_count(path):
103-
"""
104-
Count the actual number of partitions in a parallel conversion of VCF(s) to columnar intermediate format
105-
"""
106-
print(vcf.explode_partition_count(path))
107108

108109
@click.command
109110
@click.argument("path", type=click.Path(), required=True)
@@ -112,7 +113,7 @@ def explode_partition_count(path):
112113
@verbose
113114
@worker_processes
114115
@column_chunk_size
115-
def explode_slice(path, start, end, verbose, worker_processes, column_chunk_size):
116+
def dexplode_slice(path, start, end, verbose, worker_processes, column_chunk_size):
116117
"""
117118
Convert VCF(s) to columnar intermediate format
118119
"""
@@ -126,40 +127,42 @@ def explode_slice(path, start, end, verbose, worker_processes, column_chunk_size
126127
show_progress=True,
127128
)
128129

130+
129131
@click.command
@click.argument("path", type=click.Path(), required=True)
@verbose
def dexplode_finalise(path, verbose):
    """
    Final step for parallel conversion of VCF(s) to columnar intermediate format
    """
    # Configure logging from the -v count before handing off to the library.
    setup_logging(verbose)
    vcf.explode_finalise(path)
138140

141+
139142
@click.command
@click.argument("icf_path", type=click.Path())
@verbose
def inspect(icf_path, verbose):
    """
    Inspect an intermediate format or Zarr path.
    """
    setup_logging(verbose)
    # vcf.inspect returns tabular data; render it with its keys as the
    # column headers.
    summary = vcf.inspect(icf_path)
    click.echo(tabulate.tabulate(summary, headers="keys"))
149152

150153

151154
@click.command
@click.argument("icf_path", type=click.Path())
def mkschema(icf_path):
    """
    Generate a schema for zarr encoding
    """
    # The schema is written to click's stdout text stream, so it can be
    # redirected to a file by the shell.
    out = click.get_text_stream("stdout")
    vcf.mkschema(icf_path, out)
159162

160163

161164
@click.command
162-
@click.argument("if_path", type=click.Path())
165+
@click.argument("icf_path", type=click.Path())
163166
@click.argument("zarr_path", type=click.Path())
164167
@verbose
165168
@click.option("-s", "--schema", default=None, type=click.Path(exists=True))
@@ -185,7 +188,7 @@ def mkschema(if_path):
185188
)
186189
@worker_processes
187190
def encode(
188-
if_path,
191+
icf_path,
189192
zarr_path,
190193
verbose,
191194
schema,
@@ -196,11 +199,11 @@ def encode(
196199
worker_processes,
197200
):
198201
"""
199-
Encode intermediate format (see explode) to vcfzarr
202+
Encode intermediate columnar format (see explode) to vcfzarr.
200203
"""
201204
setup_logging(verbose)
202205
vcf.encode(
203-
if_path,
206+
icf_path,
204207
zarr_path,
205208
schema,
206209
variants_chunk_size=variants_chunk_size,
@@ -214,21 +217,21 @@ def encode(
214217

215218
@click.command(name="convert")
216219
@click.argument("vcfs", nargs=-1, required=True)
217-
@click.argument("out_path", type=click.Path())
220+
@click.argument("zarr_path", type=click.Path())
218221
@variants_chunk_size
219222
@samples_chunk_size
220223
@verbose
221224
@worker_processes
222225
def convert_vcf(
223-
vcfs, out_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
226+
vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
224227
):
225228
"""
226-
Convert input VCF(s) directly to vcfzarr (not recommended for large files)
229+
Convert input VCF(s) directly to vcfzarr (not recommended for large files).
227230
"""
228231
setup_logging(verbose)
229232
vcf.convert(
230233
vcfs,
231-
out_path,
234+
zarr_path,
232235
variants_chunk_size=variants_chunk_size,
233236
samples_chunk_size=samples_chunk_size,
234237
show_progress=True,
@@ -238,44 +241,91 @@ def convert_vcf(
238241

239242
@click.command
@click.argument("vcfs", nargs=-1, required=True)
@click.argument("zarr_path", type=click.Path())
def validate(vcfs, zarr_path):
    """
    Development only, do not use. Will be removed before release.
    """
    # FIXME! Only the first VCF is validated; the remaining ones are not
    # looked at. Warn on stderr so this is no longer silent.
    if len(vcfs) > 1:
        click.echo(
            f"Warning: only validating {vcfs[0]}; "
            f"ignoring {len(vcfs) - 1} other VCF(s)",
            err=True,
        )
    vcf.validate(vcfs[0], zarr_path, show_progress=True)
248251

249252

250253
# NOTE: the docstring below is the group's help text. It must stay a normal
# (non-raw) string: click looks for a line holding only the backspace
# character, which is what the \b escape produces, to disable re-wrapping of
# the paragraph that follows.
@version
@click.group(cls=NaturalOrderGroup)
def vcf2zarr():
    """
    Convert VCF file(s) to the vcfzarr format.

    The simplest usage is:

    $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]

    This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
    step. As this writes the intermediate columnar format to a temporary directory,
    we only recommend this approach for small files (< 1GB, say).

    The recommended approach is to run the conversion in two passes, and
    to keep the intermediate columnar format ("exploded") around to facilitate
    experimentation with chunk sizes and compression settings:

    \b
    $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]

    The inspect command provides a way to view contents of an exploded ICF
    or Zarr:

    $ vcf2zarr inspect [PATH]

    This is useful when tweaking chunk sizes and compression settings to suit
    your dataset, using the mkschema command and --schema option to encode:

    \b
    $ vcf2zarr mkschema [ICF_PATH] > schema.json
    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json

    By editing the schema.json file you can drop columns that are not of interest
    and edit column specific compression settings. The --max-variant-chunks option
    to encode allows you to try out these options on small subsets, hopefully
    arriving at settings with the desired balance of compression and query
    performance.

    ADVANCED USAGE

    For very large datasets (terabyte scale) it may be necessary to distribute the
    explode and encode steps across a cluster:

    \b
    $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
    $ vcf2zarr dexplode-slice [ICF_PATH] [START] [STOP]
    $ vcf2zarr dexplode-finalise [ICF_PATH]

    See the online documentation at [FIXME] for more details on distributed explode.
    """
254305

255306

256307
# Commands are registered in the order they should appear in the help text.
# The vcf2zarr group is created with cls=NaturalOrderGroup, which lists
# commands in registration order.
vcf2zarr.add_command(convert_vcf)
vcf2zarr.add_command(inspect)
vcf2zarr.add_command(explode)
vcf2zarr.add_command(mkschema)
vcf2zarr.add_command(encode)
vcf2zarr.add_command(dexplode_init)
vcf2zarr.add_command(dexplode_slice)
vcf2zarr.add_command(dexplode_finalise)
vcf2zarr.add_command(validate)
267317

268318

269319
@click.command(name="convert")
270320
@click.argument("in_path", type=click.Path())
271-
@click.argument("out_path", type=click.Path())
321+
@click.argument("zarr_path", type=click.Path())
272322
@worker_processes
273323
@verbose
274324
@variants_chunk_size
275325
@samples_chunk_size
276326
def convert_plink(
277327
in_path,
278-
out_path,
328+
zarr_path,
279329
verbose,
280330
worker_processes,
281331
variants_chunk_size,
@@ -287,7 +337,7 @@ def convert_plink(
287337
setup_logging(verbose)
288338
plink.convert(
289339
in_path,
290-
out_path,
340+
zarr_path,
291341
show_progress=True,
292342
worker_processes=worker_processes,
293343
samples_chunk_size=samples_chunk_size,

0 commit comments

Comments
 (0)