
Commit c455d3c

Merge pull request #3 from jeromekelleher/cli-fixup
Initial outline of CLI
2 parents c7f8831 + 64f0c36 commit c455d3c

6 files changed: +216 -118 lines changed

bio2zarr/__main__.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+import click
+
+from . import cli
+
+@click.group()
+def top_level():
+    pass
+
+# Provide a single top-level interface to all of the functionality.
+# This probably isn't the recommended way of interacting, as we
+# install individual commands as console scripts. However, this
+# is handy for development and for those whose PATHs aren't set
+# up in the right way.
+top_level.add_command(cli.vcf2zarr)
+top_level.add_command(cli.plink2zarr)
+
+if __name__ == "__main__":
+    top_level()
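
As a sketch of how this combined entry point behaves, the group can be exercised with click's CliRunner, with no console scripts installed; the "tests.vcf" and "out.zarr" paths below are hypothetical:

    from click.testing import CliRunner

    from bio2zarr.__main__ import top_level

    runner = CliRunner()
    result = runner.invoke(top_level, ["vcf2zarr", "convert", "tests.vcf", "out.zarr"])
    print(result.exit_code, result.output)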

bio2zarr/cli.py

Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
+import click
+import tabulate
+import coloredlogs
+
+from . import vcf
+
+# Common arguments/options
+verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
+
+worker_processes = click.option(
+    "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
+)
+
+
+# Note: logging hasn't been implemented in the code at all, this is just
+# a first pass to try out some ways of doing things to see what works.
+def setup_logging(verbosity):
+    level = "WARNING"
+    if verbosity == 1:
+        level = "INFO"
+    elif verbosity >= 2:
+        level = "DEBUG"
+    # NOTE: I'm not that excited about coloredlogs, just trying it out
+    # as it is installed by cyvcf2 anyway. We will have some complicated
+    # stuff going on with threads and processes, so logs might not work
+    # so well anyway.
+    coloredlogs.install(level=level)
+
+
+@click.command
+@click.argument("vcfs", nargs=-1, required=True)
+@click.argument("out_path", type=click.Path())
+@verbose
+@worker_processes
+@click.option("-c", "--column-chunk-size", type=int, default=64)
+def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
+    setup_logging(verbose)
+    vcf.explode(
+        vcfs,
+        out_path,
+        worker_processes=worker_processes,
+        column_chunk_size=column_chunk_size,
+        show_progress=True,
+    )
+
+
+@click.command
+@click.argument("columnarised", type=click.Path())
+@verbose
+def summarise(columnarised, verbose):
+    setup_logging(verbose)
+    pcvcf = vcf.PickleChunkedVcf.load(columnarised)
+    data = pcvcf.summary_table()
+    click.echo(tabulate.tabulate(data, headers="keys"))
+
+
+@click.command
+@click.argument("columnarised", type=click.Path())
+# @click.argument("specfile", type=click.Path())
+def genspec(columnarised):
+    stream = click.get_text_stream("stdout")
+    vcf.generate_spec(columnarised, stream)
+
+
+@click.command
+@click.argument("columnarised", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
+@verbose
+@click.option("-s", "--conversion-spec", default=None)
+@worker_processes
+def to_zarr(columnarised, zarr_path, verbose, conversion_spec, worker_processes):
+    setup_logging(verbose)
+    vcf.to_zarr(
+        columnarised,
+        zarr_path,
+        conversion_spec,
+        worker_processes=worker_processes,
+        show_progress=True,
+    )
+
+
+@click.command(name="convert")
+@click.argument("vcfs", nargs=-1, required=True)
+@click.argument("out_path", type=click.Path())
+@verbose
+@worker_processes
+def convert_vcf(vcfs, out_path, verbose, worker_processes):
+    setup_logging(verbose)
+    vcf.convert_vcf(
+        vcfs, out_path, show_progress=True, worker_processes=worker_processes
+    )
+
+
+@click.command
+@click.argument("vcfs", nargs=-1, required=True)
+@click.argument("out_path", type=click.Path())
+def validate(vcfs, out_path):
+    vcf.validate(vcfs[0], out_path, show_progress=True)
+
+
+@click.group()
+def vcf2zarr():
+    pass
+
+
+vcf2zarr.add_command(explode)
vcf2zarr.add_command(summarise)
+vcf2zarr.add_command(genspec)
+vcf2zarr.add_command(to_zarr)
+vcf2zarr.add_command(convert_vcf)
+vcf2zarr.add_command(validate)
+
+
+@click.command(name="convert")
+@click.argument("plink", type=click.Path())
+@click.argument("out_path", type=click.Path())
+@worker_processes
+@click.option("--chunk-width", type=int, default=None)
+@click.option("--chunk-length", type=int, default=None)
+def convert_plink(plink, out_path, worker_processes, chunk_width, chunk_length):
+    vcf.convert_plink(
+        plink,
+        out_path,
+        show_progress=True,
+        worker_processes=worker_processes,
+        chunk_width=chunk_width,
+        chunk_length=chunk_length,
+    )
+
+
+@click.group()
+def plink2zarr():
+    pass
+
+
+plink2zarr.add_command(convert_plink)
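
The shared verbose and worker_processes decorators above rely on the fact that click.option() returns an ordinary decorator, so it can be bound to a name once and stacked onto any number of commands. A minimal self-contained sketch of the pattern (the "greet" command is hypothetical):

    import click

    common_verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")

    @click.command()
    @common_verbose
    def greet(verbose):
        click.echo(f"verbosity={verbose}")

    if __name__ == "__main__":
        greet()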

bio2zarr/vcf.py

Lines changed: 50 additions & 14 deletions
@@ -2,6 +2,8 @@
 import dataclasses
 import multiprocessing
 import functools
+import logging
+import os
 import threading
 import pathlib
 import time
@@ -13,7 +15,7 @@
 import tempfile
 from typing import Any
 
-import humanize
+import humanfriendly
 import cyvcf2
 import numcodecs
 import numpy as np
@@ -23,6 +25,8 @@
 
 import bed_reader
 
+logger = logging.getLogger(__name__)
+
 INT_MISSING = -1
 INT_FILL = -2
 STR_MISSING = "."
@@ -271,8 +275,10 @@ def make_field_def(name, vcf_type, vcf_number):
 def scan_vcfs(paths, show_progress):
     partitions = []
     vcf_metadata = None
+    logger.info(f"Scanning {len(paths)} VCFs")
     for path in tqdm.tqdm(paths, desc="Scan ", disable=not show_progress):
         vcf = cyvcf2.VCF(path)
+        logger.debug(f"Scanning {path}")
 
         filters = [
             h["ID"]
@@ -459,8 +465,12 @@ def __repr__(self):
         # TODO add class name
         return repr({"path": str(self.path), **self.vcf_field.summary.asdict()})
 
+    def chunk_path(self, partition_index, chunk_index):
+        return self.path / f"p{partition_index}" / f"c{chunk_index}"
+
     def write_chunk(self, partition_index, chunk_index, data):
-        path = self.path / f"p{partition_index}" / f"c{chunk_index}"
+        path = self.chunk_path(partition_index, chunk_index)
+        logger.debug(f"Start write: {path}")
         pkl = pickle.dumps(data)
         # NOTE assuming that reusing the same compressor instance
         # from multiple threads is OK!
@@ -472,9 +482,10 @@ def write_chunk(self, partition_index, chunk_index, data):
         self.vcf_field.summary.num_chunks += 1
         self.vcf_field.summary.compressed_size += len(compressed)
         self.vcf_field.summary.uncompressed_size += len(pkl)
+        logger.debug(f"Finish write: {path}")
 
     def read_chunk(self, partition_index, chunk_index):
-        path = self.path / f"p{partition_index}" / f"c{chunk_index}"
+        path = self.chunk_path(partition_index, chunk_index)
         with open(path, "rb") as f:
             pkl = self.compressor.decode(f.read())
         return pickle.loads(pkl), len(pkl)
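
The write_chunk/read_chunk pair amounts to a pickle-then-compress round-trip through a numcodecs codec. A rough sketch of the idea; the codec configuration isn't shown in this diff, so Blosc with zstd here is an assumption:

    import pickle

    import numcodecs

    compressor = numcodecs.Blosc(cname="zstd")  # assumed codec
    data = {"POS": [101, 102, 105]}
    compressed = compressor.encode(pickle.dumps(data))      # what write_chunk stores
    restored = pickle.loads(compressor.decode(compressed))  # what read_chunk returns
    assert restored == data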
@@ -615,6 +626,8 @@ def append(self, val):
 
     def flush(self):
         if len(self.buffer) > 0:
+            path = self.column.chunk_path(self.partition_index, self.chunk_index)
+            logger.debug(f"Schedule write: {path}")
             future = self.executor.submit(
                 self.column.write_chunk,
                 self.partition_index,
@@ -649,7 +662,7 @@ def display_number(x):
             return ret
 
         def display_size(n):
-            return humanize.naturalsize(n, binary=True)
+            return humanfriendly.format_size(n)
 
         data = []
         for name, col in self.columns.items():
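
One behavioural detail of this swap: humanize.naturalsize(n, binary=True) reports binary units (KiB, MiB), while humanfriendly.format_size(n) defaults to decimal units, so the summary sizes change style slightly. For example (output shapes approximate):

    import humanfriendly

    print(humanfriendly.format_size(1048576))               # "1.05 MB"
    print(humanfriendly.format_size(1048576, binary=True))  # "1 MiB"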
@@ -688,6 +701,10 @@ def num_partitions(self):
     def num_samples(self):
         return len(self.metadata.samples)
 
+    @property
+    def num_columns(self):
+        return len(self.columns)
+
     def mkdirs(self):
         self.path.mkdir()
         for col in self.columns.values():
@@ -716,6 +733,10 @@ def convert(
             partition.num_records for partition in vcf_metadata.partitions
         )
 
+        logger.info(
+            f"Exploding {pcvcf.num_columns} columns {total_variants} variants "
+            f"{pcvcf.num_samples} samples"
+        )
         global progress_counter
         progress_counter = multiprocessing.Value("Q", 0)
 
@@ -774,6 +795,7 @@ def convert_partition(
     partition = vcf_metadata.partitions[partition_index]
     vcf = cyvcf2.VCF(partition.vcf_path)
     futures = set()
+    logger.info(f"Start partition {partition_index} {partition.vcf_path}")
 
     def service_futures(max_waiting=2 * flush_threads):
         while len(futures) > max_waiting:
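
service_futures() is only partially visible in this diff; from the loop condition it appears to implement a bounded drain over the in-flight chunk writes. A hedged sketch of that pattern:

    import concurrent.futures

    def service_futures(futures, max_waiting):
        # Block until the number of outstanding writes drops to the threshold.
        while len(futures) > max_waiting:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                future.result()  # surface any write errors
                futures.discard(future)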
@@ -824,12 +846,7 @@ def service_futures(max_waiting=2 * flush_threads):
            gt.append(variant.genotype.array())
 
        for name, buff in info_fields:
-            val = None
-            try:
-                val = variant.INFO[name]
-            except KeyError:
-                pass
-            buff.append(val)
+            buff.append(variant.INFO.get(name, None))
 
        for name, buff in format_fields:
            val = None
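
This simplification works because cyvcf2 exposes variant.INFO as a mapping-like object with a dict-style .get(), as the new line itself shows. A small illustrative sketch ("sample.vcf" and the "DP" field are hypothetical):

    import cyvcf2

    for variant in cyvcf2.VCF("sample.vcf"):
        depth = variant.INFO.get("DP", None)  # None when the field is absent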
@@ -841,11 +858,15 @@
 
        service_futures()
 
+        # Note: an issue with updating the progress per variant here like this
+        # is that we get a significant pause at the end of the counter while
+        # all the "small" fields get flushed. Possibly not much to be done about it.
        with progress_counter.get_lock():
            progress_counter.value += 1
 
    for col in columns.values():
        col.flush()
+    logger.info(f"VCF read finished; waiting on {len(futures)} chunk writes")
    service_futures(0)
 
    return summaries
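
The progress_counter being bumped here is the shared unsigned 64-bit multiprocessing.Value("Q", 0) created in convert(); each worker increments it under its lock while the progress bar polls it. A minimal sketch of that pattern in isolation:

    import multiprocessing

    def work(counter, n):
        for _ in range(n):
            with counter.get_lock():
                counter.value += 1

    if __name__ == "__main__":
        counter = multiprocessing.Value("Q", 0)
        procs = [
            multiprocessing.Process(target=work, args=(counter, 1000))
            for _ in range(4)
        ]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
        print(counter.value)  # 4000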
@@ -1213,6 +1234,7 @@ def encode_alleles(self, pcvcf):
         with progress_counter.get_lock():
             for col in [ref_col, alt_col]:
                 progress_counter.value += col.vcf_field.summary.uncompressed_size
+        logger.debug("alleles done")
 
     def encode_samples(self, pcvcf, sample_id, chunk_width):
         if not np.array_equal(sample_id, pcvcf.metadata.samples):
@@ -1225,6 +1247,7 @@
             chunks=(chunk_width,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
+        logger.debug("Samples done")
 
     def encode_contig(self, pcvcf, contig_names, contig_lengths):
         array = self.root.array(
@@ -1258,6 +1281,7 @@
 
         with progress_counter.get_lock():
             progress_counter.value += col.vcf_field.summary.uncompressed_size
+        logger.debug("Contig done")
 
     def encode_filters(self, pcvcf, filter_names):
         self.root.attrs["filters"] = filter_names
@@ -1285,6 +1309,7 @@
 
         with progress_counter.get_lock():
             progress_counter.value += col.vcf_field.summary.uncompressed_size
+        logger.debug("Filters done")
 
     def encode_id(self, pcvcf):
         col = pcvcf.columns["ID"]
@@ -1305,14 +1330,21 @@
 
         with progress_counter.get_lock():
             progress_counter.value += col.vcf_field.summary.uncompressed_size
+        logger.debug("ID done")
 
     @staticmethod
     def convert(
         pcvcf, path, conversion_spec, *, worker_processes=1, show_progress=False
     ):
-        store = zarr.DirectoryStore(path)
-        # FIXME
-        sgvcf = SgvcfZarr(path)
+        path = pathlib.Path(path)
+        # TODO: we should do this as a future to avoid blocking
+        if path.exists():
+            shutil.rmtree(path)
+        write_path = path.with_suffix(path.suffix + f".{os.getpid()}.build")
+        store = zarr.DirectoryStore(write_path)
+        # FIXME, duplicating logic about the store
+        logger.info(f"Create zarr at {write_path}")
+        sgvcf = SgvcfZarr(write_path)
         sgvcf.root = zarr.group(store=store, overwrite=True)
         for variable in conversion_spec.variables[:]:
             sgvcf.create_array(variable)
@@ -1373,11 +1405,14 @@ def convert(
 
         flush_futures(futures)
 
-        zarr.consolidate_metadata(path)
         # FIXME can't join the bar_thread because we never get to the correct
         # number of bytes
         # if bar_thread is not None:
         #     bar_thread.join()
+        zarr.consolidate_metadata(write_path)
+        # Atomic swap, now we've completely finished.
+        logger.info(f"Moving to final path {path}")
+        os.rename(write_path, path)
 
 
 def sync_flush_array(np_buffer, zarr_array, offset):
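
Taken together with the previous hunk, this is a build-then-rename scheme: the store is written to a scratch path that embeds the PID, metadata is consolidated there, and only then is the directory renamed into place, so readers never observe a half-written Zarr. A sketch under the zarr v2 API ("out.zarr" is a hypothetical target; os.rename is atomic on POSIX within one filesystem):

    import os
    import pathlib
    import shutil

    import zarr

    path = pathlib.Path("out.zarr")
    if path.exists():
        shutil.rmtree(path)
    write_path = path.with_suffix(path.suffix + f".{os.getpid()}.build")
    root = zarr.group(store=zarr.DirectoryStore(write_path), overwrite=True)
    root.attrs["status"] = "complete"
    zarr.consolidate_metadata(str(write_path))
    os.rename(write_path, path)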
@@ -1388,6 +1423,7 @@ def async_flush_array(executor, np_buffer, zarr_array, offset):
     """
     Flush the specified chunk aligned buffer to the specified zarr array.
     """
+    logger.debug(f"Schedule flush {zarr_array} @ {offset}")
     assert zarr_array.shape[1:] == np_buffer.shape[1:]
     # print("sync", zarr_array, np_buffer)
13931429
