 from . import plink
 from . import provenance

+
+class NaturalOrderGroup(click.Group):
+    """
+    List commands in the order they are provided in the help text.
+    """
+
+    def list_commands(self, ctx):
+        return self.commands.keys()
+
+
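As an aside (not part of this patch), a minimal runnable sketch of what the NaturalOrderGroup override buys: click's stock Group.list_commands() returns command names in sorted order, so returning self.commands.keys() from the underlying dict preserves registration order, and --help lists subcommands in the order add_command() was called. The cli/explode/encode names here are toy placeholders.

import click


class NaturalOrderGroup(click.Group):
    def list_commands(self, ctx):
        # self.commands is a dict, so keys() preserves add_command() order.
        return self.commands.keys()


@click.group(cls=NaturalOrderGroup)
def cli():
    pass


@cli.command()
def explode():
    pass


@cli.command()
def encode():
    pass


if __name__ == "__main__":
    # "--help" now lists explode before encode; the stock click.Group would
    # list them alphabetically (encode first).
    cli()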
 # Common arguments/options
 verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")

+version = click.version_option(version=f"{provenance.__version__}")
+
 worker_processes = click.option(
     "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
 )
...
     "--column-chunk-size",
     type=int,
     default=64,
-    help="Size of exploded column chunks",
+    help="Approximate uncompressed size of exploded column chunks in MiB",
 )

 # Note: -l and -w were chosen when these were called "width" and "length".
 # possibly there are better letters now.
-# TODO help text
 variants_chunk_size = click.option(
     "-l",
     "--variants-chunk-size",
...
     help="Chunk size in the samples dimension",
 )

-version = click.version_option(version=f"{provenance.__version__}")
-

-# Note: logging hasn't been implemented in the code at all, this is just
-# a first pass to try out some ways of doing things to see what works.
 def setup_logging(verbosity):
     level = "WARNING"
     if verbosity == 1:
         level = "INFO"
     elif verbosity >= 2:
         level = "DEBUG"
     # NOTE: I'm not that excited about coloredlogs, just trying it out
-    # as it is installed by cyvcf2 anyway. We will have some complicated
-    # stuff doing on with threads and processes, to logs might not work
-    # so well anyway.
+    # as it is installed by cyvcf2 anyway.
     coloredlogs.install(level=level)


 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @verbose
 @worker_processes
 @column_chunk_size
-def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
+def explode(vcfs, zarr_path, verbose, worker_processes, column_chunk_size):
     """
     Convert VCF(s) to columnar intermediate format
     """
     setup_logging(verbose)
     vcf.explode(
         vcfs,
-        out_path,
+        zarr_path,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
         show_progress=True,
     )

+
 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
-@click.option("-n", "--target-num-partitions", type=int, required=True)
+@click.argument("icf_path", type=click.Path())
+@click.argument("num_partitions", type=int)
 @verbose
 @worker_processes
-def explode_init(vcfs, out_path, target_num_partitions, verbose, worker_processes):
+def dexplode_init(vcfs, icf_path, num_partitions, verbose, worker_processes):
     """
     Initial step for parallel conversion of VCF(s) to columnar intermediate format
+    over the requested number of partitions.
     """
     setup_logging(verbose)
-    vcf.explode_init(
+    num_partitions = vcf.explode_init(
         vcfs,
-        out_path,
-        target_num_partitions=target_num_partitions,
+        icf_path,
+        target_num_partitions=num_partitions,
         worker_processes=worker_processes,
         show_progress=True,
     )
+    click.echo(num_partitions)

-@click.command
-@click.argument("path", type=click.Path())
-def explode_partition_count(path):
-    """
-    Count the actual number of partitions in a parallel conversion of VCF(s) to columnar intermediate format
-    """
-    print(vcf.explode_partition_count(path))

 @click.command
 @click.argument("path", type=click.Path(), required=True)
@@ -112,7 +113,7 @@ def explode_partition_count(path):
 @verbose
 @worker_processes
 @column_chunk_size
-def explode_slice(path, start, end, verbose, worker_processes, column_chunk_size):
+def dexplode_slice(path, start, end, verbose, worker_processes, column_chunk_size):
     """
     Convert VCF(s) to columnar intermediate format
     """
@@ -126,40 +127,42 @@ def explode_slice(path, start, end, verbose, worker_processes, column_chunk_size
         show_progress=True,
     )

+
 @click.command
 @click.argument("path", type=click.Path(), required=True)
 @verbose
-def explode_finalise(path, verbose):
+def dexplode_finalise(path, verbose):
     """
     Final step for parallel conversion of VCF(s) to columnar intermediate format
     """
     setup_logging(verbose)
     vcf.explode_finalise(path)

+
 @click.command
-@click.argument("if_path", type=click.Path())
+@click.argument("icf_path", type=click.Path())
 @verbose
-def inspect(if_path, verbose):
+def inspect(icf_path, verbose):
     """
-    Inspect an intermediate format file
+    Inspect an intermediate format or Zarr path.
     """
     setup_logging(verbose)
-    data = vcf.inspect(if_path)
+    data = vcf.inspect(icf_path)
     click.echo(tabulate.tabulate(data, headers="keys"))


 @click.command
-@click.argument("if_path", type=click.Path())
-def mkschema(if_path):
+@click.argument("icf_path", type=click.Path())
+def mkschema(icf_path):
     """
     Generate a schema for zarr encoding
     """
     stream = click.get_text_stream("stdout")
-    vcf.mkschema(if_path, stream)
+    vcf.mkschema(icf_path, stream)


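The mkschema command above and the encode command below (together with the group help text added later in this patch) support an iterative tuning loop: generate a schema, edit it, and re-encode. A hypothetical sketch of that loop, assuming the vcf2zarr entry point is installed; the file names are placeholders and the internal structure of schema.json (whatever mkschema emits) is not assumed here.

import json
import subprocess

# Generate a starting schema from the exploded intermediate columnar format.
with open("schema.json", "w") as f:
    subprocess.run(["vcf2zarr", "mkschema", "dataset.icf"], stdout=f, check=True)

# Edit the schema, e.g. drop unwanted columns or change compression settings.
# The exact keys depend on the schema format emitted by mkschema.
with open("schema.json") as f:
    schema = json.load(f)
# ... modify `schema` here ...
with open("schema.json", "w") as f:
    json.dump(schema, f, indent=2)

# Re-encode against the edited schema. The --max-variant-chunks option
# mentioned in the group help can be added to trial a small subset first.
subprocess.run(
    ["vcf2zarr", "encode", "dataset.icf", "trial.zarr", "--schema", "schema.json"],
    check=True,
)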
 @click.command
-@click.argument("if_path", type=click.Path())
+@click.argument("icf_path", type=click.Path())
 @click.argument("zarr_path", type=click.Path())
 @verbose
 @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
@@ -185,7 +188,7 @@ def mkschema(if_path):
 )
 @worker_processes
 def encode(
-    if_path,
+    icf_path,
     zarr_path,
     verbose,
     schema,
@@ -196,11 +199,11 @@ def encode(
     worker_processes,
 ):
     """
-    Encode intermediate format (see explode) to vcfzarr
+    Encode intermediate columnar format (see explode) to vcfzarr.
     """
     setup_logging(verbose)
     vcf.encode(
-        if_path,
+        icf_path,
         zarr_path,
         schema,
         variants_chunk_size=variants_chunk_size,
@@ -214,21 +217,21 @@ def encode(

 @click.command(name="convert")
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @variants_chunk_size
 @samples_chunk_size
 @verbose
 @worker_processes
 def convert_vcf(
-    vcfs, out_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
+    vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
 ):
     """
-    Convert input VCF(s) directly to vcfzarr (not recommended for large files)
+    Convert input VCF(s) directly to vcfzarr (not recommended for large files).
     """
     setup_logging(verbose)
     vcf.convert(
         vcfs,
-        out_path,
+        zarr_path,
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
         show_progress=True,
@@ -238,44 +241,91 @@ def convert_vcf(

 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
-def validate(vcfs, out_path):
+@click.argument("zarr_path", type=click.Path())
+def validate(vcfs, zarr_path):
     """
     Development only, do not use. Will be removed before release.
     """
     # FIXME! Will silently not look at remaining VCFs
-    vcf.validate(vcfs[0], out_path, show_progress=True)
+    vcf.validate(vcfs[0], zarr_path, show_progress=True)


 @version
-@click.group()
+@click.group(cls=NaturalOrderGroup)
 def vcf2zarr():
-    pass
+    """
+    Convert VCF file(s) to the vcfzarr format.
+
+    The simplest usage is:
+
+    $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
+
+    This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
+    step. As this writes the intermediate columnar format to a temporary directory,
+    we only recommend this approach for small files (< 1GB, say).
+
+    The recommended approach is to run the conversion in two passes, and
+    to keep the intermediate columnar format ("exploded") around to facilitate
+    experimentation with chunk sizes and compression settings:
+
+    \b
+    $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
+
+    The inspect command provides a way to view contents of an exploded ICF
+    or Zarr:
+
+    $ vcf2zarr inspect [PATH]
+
+    This is useful when tweaking chunk sizes and compression settings to suit
+    your dataset, using the mkschema command and --schema option to encode:
+
+    \b
+    $ vcf2zarr mkschema [ICF_PATH] > schema.json
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
+
+    By editing the schema.json file you can drop columns that are not of interest
+    and edit column specific compression settings. The --max-variant-chunks option
+    to encode allows you to try out these options on small subsets, hopefully
+    arriving at settings with the desired balance of compression and query
+    performance.
+
+    ADVANCED USAGE
+
+    For very large datasets (terabyte scale) it may be necessary to distribute the
+    explode and encode steps across a cluster:
+
+    \b
+    $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
+    $ vcf2zarr dexplode-slice [ICF_PATH] [START] [STOP]
+    $ vcf2zarr dexplode-finalise [ICF_PATH]
+
+    See the online documentation at [FIXME] for more details on distributed explode.
+    """


 # TODO figure out how to get click to list these in the given order.
-vcf2zarr.add_command(explode)
-vcf2zarr.add_command(explode_init)
-vcf2zarr.add_command(explode_partition_count)
-vcf2zarr.add_command(explode_slice)
-vcf2zarr.add_command(explode_finalise)
+vcf2zarr.add_command(convert_vcf)
 vcf2zarr.add_command(inspect)
+vcf2zarr.add_command(explode)
 vcf2zarr.add_command(mkschema)
 vcf2zarr.add_command(encode)
-vcf2zarr.add_command(convert_vcf)
+vcf2zarr.add_command(dexplode_init)
+vcf2zarr.add_command(dexplode_slice)
+vcf2zarr.add_command(dexplode_finalise)
 vcf2zarr.add_command(validate)


 @click.command(name="convert")
 @click.argument("in_path", type=click.Path())
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @worker_processes
 @verbose
 @variants_chunk_size
 @samples_chunk_size
 def convert_plink(
     in_path,
-    out_path,
+    zarr_path,
     verbose,
     worker_processes,
     variants_chunk_size,
@@ -287,7 +337,7 @@ def convert_plink(
     setup_logging(verbose)
     plink.convert(
         in_path,
-        out_path,
+        zarr_path,
         show_progress=True,
         worker_processes=worker_processes,
         samples_chunk_size=samples_chunk_size,