7
7
from . import plink
8
8
from . import provenance
9
9
10
+
10
11
class NaturalOrderGroup (click .Group ):
11
12
"""
12
13
List commands in the order they are provided in the help text.
@@ -65,18 +66,18 @@ def setup_logging(verbosity):
65
66
66
67
@click .command
67
68
@click .argument ("vcfs" , nargs = - 1 , required = True )
68
- @click .argument ("out_path " , type = click .Path ())
69
+ @click .argument ("zarr_path " , type = click .Path ())
69
70
@verbose
70
71
@worker_processes
71
72
@column_chunk_size
72
- def explode (vcfs , out_path , verbose , worker_processes , column_chunk_size ):
73
+ def explode (vcfs , zarr_path , verbose , worker_processes , column_chunk_size ):
73
74
"""
74
75
Convert VCF(s) to columnar intermediate format
75
76
"""
76
77
setup_logging (verbose )
77
78
vcf .explode (
78
79
vcfs ,
79
- out_path ,
80
+ zarr_path ,
80
81
worker_processes = worker_processes ,
81
82
column_chunk_size = column_chunk_size ,
82
83
show_progress = True ,
@@ -85,31 +86,24 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
85
86
86
87
@click .command
87
88
@click .argument ("vcfs" , nargs = - 1 , required = True )
88
- @click .argument ("out_path " , type = click .Path ())
89
- @click .option ( "-n " , "--target-num-partitions" , type = int , required = True )
89
+ @click .argument ("icf_path " , type = click .Path ())
90
+ @click .argument ( "num_partitions " , type = int )
90
91
@verbose
91
92
@worker_processes
92
- def dexplode_init (vcfs , out_path , target_num_partitions , verbose , worker_processes ):
93
+ def dexplode_init (vcfs , icf_path , num_partitions , verbose , worker_processes ):
93
94
"""
94
95
Initial step for parallel conversion of VCF(s) to columnar intermediate format
96
+ over the requested number of partitions.
95
97
"""
96
98
setup_logging (verbose )
97
- vcf .explode_init (
99
+ num_partitions = vcf .explode_init (
98
100
vcfs ,
99
- out_path ,
100
- target_num_partitions = target_num_partitions ,
101
+ icf_path ,
102
+ target_num_partitions = num_partitions ,
101
103
worker_processes = worker_processes ,
102
104
show_progress = True ,
103
105
)
104
-
105
-
106
- @click .command
107
- @click .argument ("path" , type = click .Path ())
108
- def dexplode_partition_count (path ):
109
- """
110
- Count the actual number of partitions in a parallel conversion of VCF(s) to columnar intermediate format
111
- """
112
- click .echo (vcf .explode_partition_count (path ))
106
+ click .echo (num_partitions )
113
107
114
108
115
109
@click .command
@@ -146,29 +140,29 @@ def dexplode_finalise(path, verbose):
146
140
147
141
148
142
@click .command
149
- @click .argument ("if_path " , type = click .Path ())
143
+ @click .argument ("icf_path " , type = click .Path ())
150
144
@verbose
151
- def inspect (if_path , verbose ):
145
+ def inspect (icf_path , verbose ):
152
146
"""
153
147
Inspect an intermediate format or Zarr path.
154
148
"""
155
149
setup_logging (verbose )
156
- data = vcf .inspect (if_path )
150
+ data = vcf .inspect (icf_path )
157
151
click .echo (tabulate .tabulate (data , headers = "keys" ))
158
152
159
153
160
154
@click .command
161
- @click .argument ("if_path " , type = click .Path ())
162
- def mkschema (if_path ):
155
+ @click .argument ("icf_path " , type = click .Path ())
156
+ def mkschema (icf_path ):
163
157
"""
164
158
Generate a schema for zarr encoding
165
159
"""
166
160
stream = click .get_text_stream ("stdout" )
167
- vcf .mkschema (if_path , stream )
161
+ vcf .mkschema (icf_path , stream )
168
162
169
163
170
164
@click .command
171
- @click .argument ("if_path " , type = click .Path ())
165
+ @click .argument ("icf_path " , type = click .Path ())
172
166
@click .argument ("zarr_path" , type = click .Path ())
173
167
@verbose
174
168
@click .option ("-s" , "--schema" , default = None , type = click .Path (exists = True ))
@@ -194,7 +188,7 @@ def mkschema(if_path):
194
188
)
195
189
@worker_processes
196
190
def encode (
197
- if_path ,
191
+ icf_path ,
198
192
zarr_path ,
199
193
verbose ,
200
194
schema ,
@@ -205,11 +199,11 @@ def encode(
205
199
worker_processes ,
206
200
):
207
201
"""
208
- Encode intermediate format (see explode) to vcfzarr
202
+ Encode intermediate columnar format (see explode) to vcfzarr.
209
203
"""
210
204
setup_logging (verbose )
211
205
vcf .encode (
212
- if_path ,
206
+ icf_path ,
213
207
zarr_path ,
214
208
schema ,
215
209
variants_chunk_size = variants_chunk_size ,
@@ -223,21 +217,21 @@ def encode(
223
217
224
218
@click .command (name = "convert" )
225
219
@click .argument ("vcfs" , nargs = - 1 , required = True )
226
- @click .argument ("out_path " , type = click .Path ())
220
+ @click .argument ("zarr_path " , type = click .Path ())
227
221
@variants_chunk_size
228
222
@samples_chunk_size
229
223
@verbose
230
224
@worker_processes
231
225
def convert_vcf (
232
- vcfs , out_path , variants_chunk_size , samples_chunk_size , verbose , worker_processes
226
+ vcfs , zarr_path , variants_chunk_size , samples_chunk_size , verbose , worker_processes
233
227
):
234
228
"""
235
- Convert input VCF(s) directly to vcfzarr (not recommended for large files)
229
+ Convert input VCF(s) directly to vcfzarr (not recommended for large files).
236
230
"""
237
231
setup_logging (verbose )
238
232
vcf .convert (
239
233
vcfs ,
240
- out_path ,
234
+ zarr_path ,
241
235
variants_chunk_size = variants_chunk_size ,
242
236
samples_chunk_size = samples_chunk_size ,
243
237
show_progress = True ,
@@ -247,44 +241,91 @@ def convert_vcf(
247
241
248
242
@click .command
249
243
@click .argument ("vcfs" , nargs = - 1 , required = True )
250
- @click .argument ("out_path " , type = click .Path ())
251
- def validate (vcfs , out_path ):
244
+ @click .argument ("zarr_path " , type = click .Path ())
245
+ def validate (vcfs , zarr_path ):
252
246
"""
253
247
Development only, do not use. Will be removed before release.
254
248
"""
255
249
# FIXME! Will silently not look at remaining VCFs
256
- vcf .validate (vcfs [0 ], out_path , show_progress = True )
250
+ vcf .validate (vcfs [0 ], zarr_path , show_progress = True )
257
251
258
252
259
253
@version
260
254
@click .group (cls = NaturalOrderGroup )
261
255
def vcf2zarr ():
262
- pass
256
+ """
257
+ Convert VCF file(s) to the vcfzarr format.
258
+
259
+ The simplest usage is:
260
+
261
+ $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
262
+
263
+ This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
264
+ step. As this writes the intermediate columnar format to a temporary directory,
265
+ we only recommend this approach for small files (< 1GB, say).
266
+
267
+ The recommended approach is to run the conversion in two passes, and
268
+ to keep the intermediate columnar format ("exploded") around to facilitate
269
+ experimentation with chunk sizes and compression settings:
270
+
271
+ \b
272
+ $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
273
+ $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
274
+
275
+ The inspect command provides a way to view contents of an exploded ICF
276
+ or Zarr:
277
+
278
+ $ vcf2zarr inspect [PATH]
279
+
280
+ This is useful when tweaking chunk sizes and compression settings to suit
281
+ your dataset, using the mkschema command and --schema option to encode:
282
+
283
+ \b
284
+ $ vcf2zarr mkschema [ICF_PATH] > schema.json
285
+ $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
286
+
287
+ By editing the schema.json file you can drop columns that are not of interest
288
+ and edit column specific compression settings. The --max-variant-chunks option
289
+ to encode allows you to try out these options on small subsets, hopefully
290
+ arriving at settings with the desired balance of compression and query
291
+ performance.
292
+
293
+ ADVANCED USAGE
294
+
295
+ For very large datasets (terabyte scale) it may be necessary to distribute the
296
+ explode and encode steps across a cluster:
297
+
298
+ \b
299
+ $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
300
+ $ vcf2zarr dexplode-slice [ICF_PATH] [START] [STOP]
301
+ $ vcf2zarr dexplode-finalise [ICF_PATH]
302
+
303
+ See the online documentation at [FIXME] for more details on distributed explode.
304
+ """
263
305
264
306
265
307
# TODO figure out how to get click to list these in the given order.
266
308
vcf2zarr .add_command (convert_vcf )
267
- vcf2zarr .add_command (explode )
268
309
vcf2zarr .add_command (inspect )
310
+ vcf2zarr .add_command (explode )
269
311
vcf2zarr .add_command (mkschema )
270
312
vcf2zarr .add_command (encode )
271
313
vcf2zarr .add_command (dexplode_init )
272
- vcf2zarr .add_command (dexplode_partition_count )
273
314
vcf2zarr .add_command (dexplode_slice )
274
315
vcf2zarr .add_command (dexplode_finalise )
275
316
vcf2zarr .add_command (validate )
276
317
277
318
278
319
@click .command (name = "convert" )
279
320
@click .argument ("in_path" , type = click .Path ())
280
- @click .argument ("out_path " , type = click .Path ())
321
+ @click .argument ("zarr_path " , type = click .Path ())
281
322
@worker_processes
282
323
@verbose
283
324
@variants_chunk_size
284
325
@samples_chunk_size
285
326
def convert_plink (
286
327
in_path ,
287
- out_path ,
328
+ zarr_path ,
288
329
verbose ,
289
330
worker_processes ,
290
331
variants_chunk_size ,
@@ -296,7 +337,7 @@ def convert_plink(
296
337
setup_logging (verbose )
297
338
plink .convert (
298
339
in_path ,
299
- out_path ,
340
+ zarr_path ,
300
341
show_progress = True ,
301
342
worker_processes = worker_processes ,
302
343
samples_chunk_size = samples_chunk_size ,
0 commit comments