Skip to content

Commit 5f19063

Browse files
Merge pull request #37 from jeromekelleher/renaming-things
Renaming things
2 parents 37d618a + 6e5feb5 commit 5f19063

File tree

7 files changed

+184
-64
lines changed

7 files changed

+184
-64
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,22 +29,22 @@ python3 -m bio2zarr vcf2zarr explode tests/data/vcf/sample.vcf.gz tmp/sample.exp
2929

3030
Then, (optionally) inspect this representation to get a feel for your dataset:
3131
```
32-
python3 -m bio2zarr vcf2zarr summarise tmp/sample.exploded
32+
python3 -m bio2zarr vcf2zarr inspect tmp/sample.exploded
3333
```
3434

3535
Then, (optionally) generate a conversion schema to describe the corresponding
3636
Zarr arrays:
3737

3838
```
39-
python3 -m bio2zarr vcf2zarr genspec tmp/sample.exploded > sample.schema.json
39+
python3 -m bio2zarr vcf2zarr mkschema tmp/sample.exploded > sample.schema.json
4040
```
4141

4242
View and edit the schema, deleting any columns you don't want.
4343

4444
Finally, convert to Zarr:
4545

4646
```
47-
python3 -m bio2zarr vcf2zarr to-zarr tmp/sample.exploded tmp/sample.zarr -s sample.schema.json
47+
python3 -m bio2zarr vcf2zarr encode tmp/sample.exploded tmp/sample.zarr -s sample.schema.json
4848
```
4949

5050
Use the ``-p, --worker-processes`` argument to control the number of workers used

bio2zarr/cli.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ def setup_logging(verbosity):
3535
@worker_processes
3636
@click.option("-c", "--column-chunk-size", type=int, default=64)
3737
def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
38+
"""
39+
Convert VCF(s) to columnar intermediate format
40+
"""
3841
setup_logging(verbose)
3942
vcf.explode(
4043
vcfs,
@@ -46,34 +49,42 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
4649

4750

4851
@click.command
49-
@click.argument("columnarised", type=click.Path())
52+
@click.argument("if_path", type=click.Path())
5053
@verbose
51-
def summarise(columnarised, verbose):
54+
def inspect(if_path, verbose):
55+
"""
56+
Inspect an intermediate format file
57+
"""
5258
setup_logging(verbose)
53-
data = vcf.summarise(columnarised)
59+
data = vcf.inspect(if_path)
5460
click.echo(tabulate.tabulate(data, headers="keys"))
5561

5662

5763
@click.command
58-
@click.argument("columnarised", type=click.Path())
59-
# @click.argument("specfile", type=click.Path())
60-
def genspec(columnarised):
64+
@click.argument("if_path", type=click.Path())
65+
def mkschema(if_path):
66+
"""
67+
Generate a schema for zarr encoding
68+
"""
6169
stream = click.get_text_stream("stdout")
62-
vcf.generate_spec(columnarised, stream)
70+
vcf.mkschema(if_path, stream)
6371

6472

6573
@click.command
66-
@click.argument("columnarised", type=click.Path())
74+
@click.argument("if_path", type=click.Path())
6775
@click.argument("zarr_path", type=click.Path())
6876
@verbose
69-
@click.option("-s", "--conversion-spec", default=None)
77+
@click.option("-s", "--schema", default=None)
7078
@worker_processes
71-
def to_zarr(columnarised, zarr_path, verbose, conversion_spec, worker_processes):
79+
def encode(if_path, zarr_path, verbose, schema, worker_processes):
80+
"""
81+
Encode intermediate format (see explode) to vcfzarr
82+
"""
7283
setup_logging(verbose)
73-
vcf.to_zarr(
74-
columnarised,
84+
vcf.encode(
85+
if_path,
7586
zarr_path,
76-
conversion_spec,
87+
schema,
7788
worker_processes=worker_processes,
7889
show_progress=True,
7990
)
@@ -85,16 +96,18 @@ def to_zarr(columnarised, zarr_path, verbose, conversion_spec, worker_processes)
8596
@verbose
8697
@worker_processes
8798
def convert_vcf(vcfs, out_path, verbose, worker_processes):
99+
"""
100+
Convert input VCF(s) directly to vcfzarr (not recommended for large files)
101+
"""
88102
setup_logging(verbose)
89-
vcf.convert_vcf(
90-
vcfs, out_path, show_progress=True, worker_processes=worker_processes
91-
)
103+
vcf.convert(vcfs, out_path, show_progress=True, worker_processes=worker_processes)
92104

93105

94106
@click.command
95107
@click.argument("vcfs", nargs=-1, required=True)
96108
@click.argument("out_path", type=click.Path())
97109
def validate(vcfs, out_path):
110+
# FIXME! Will silently not look at remaining VCFs
98111
vcf.validate(vcfs[0], out_path, show_progress=True)
99112

100113

@@ -103,10 +116,11 @@ def vcf2zarr():
103116
pass
104117

105118

119+
# TODO figure out how to get click to list these in the given order.
106120
vcf2zarr.add_command(explode)
107-
vcf2zarr.add_command(summarise)
108-
vcf2zarr.add_command(genspec)
109-
vcf2zarr.add_command(to_zarr)
121+
vcf2zarr.add_command(inspect)
122+
vcf2zarr.add_command(mkschema)
123+
vcf2zarr.add_command(encode)
110124
vcf2zarr.add_command(convert_vcf)
111125
vcf2zarr.add_command(validate)
112126

@@ -118,6 +132,9 @@ def vcf2zarr():
118132
@click.option("--chunk-width", type=int, default=None)
119133
@click.option("--chunk-length", type=int, default=None)
120134
def convert_plink(in_path, out_path, worker_processes, chunk_width, chunk_length):
135+
"""
136+
In development; DO NOT USE!
137+
"""
121138
plink.convert(
122139
in_path,
123140
out_path,

bio2zarr/vcf.py

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -879,14 +879,14 @@ def explode(
879879
)
880880

881881

882-
def summarise(columnarised):
883-
pcvcf = vcf.PickleChunkedVcf.load(columnarised)
882+
def inspect(if_path):
883+
# TODO add support for the Zarr format also
884+
pcvcf = PickleChunkedVcf.load(if_path)
884885
return pcvcf.summary_table()
885886

886887

887888
@dataclasses.dataclass
888889
class ZarrColumnSpec:
889-
# TODO change to "variable_name"
890890
name: str
891891
dtype: str
892892
shape: tuple
@@ -897,6 +897,11 @@ class ZarrColumnSpec:
897897
compressor: dict
898898
# TODO add filters
899899

900+
def __post_init__(self):
901+
self.shape = tuple(self.shape)
902+
self.chunks = tuple(self.chunks)
903+
self.dimensions = tuple(self.dimensions)
904+
900905

901906
@dataclasses.dataclass
902907
class ZarrConversionSpec:
@@ -907,17 +912,26 @@ class ZarrConversionSpec:
907912
contig_id: list
908913
contig_length: list
909914
filter_id: list
910-
variables: list
915+
columns: dict
911916

912917
def asdict(self):
913918
return dataclasses.asdict(self)
914919

920+
def asjson(self):
921+
return json.dumps(self.asdict(), indent=4)
922+
915923
@staticmethod
916924
def fromdict(d):
917925
ret = ZarrConversionSpec(**d)
918-
ret.variables = [ZarrColumnSpec(**cd) for cd in d["variables"]]
926+
ret.columns = {
927+
key: ZarrColumnSpec(**value) for key, value in d["columns"].items()
928+
}
919929
return ret
920930

931+
@staticmethod
932+
def fromjson(s):
933+
return ZarrConversionSpec.fromdict(json.loads(s))
934+
921935
@staticmethod
922936
def generate(pcvcf, chunk_length=None, chunk_width=None):
923937
m = pcvcf.num_records
@@ -1069,7 +1083,7 @@ def fixed_field_spec(
10691083
return ZarrConversionSpec(
10701084
chunk_width=chunk_width,
10711085
chunk_length=chunk_length,
1072-
variables=colspecs,
1086+
columns={col.name: col for col in colspecs},
10731087
dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
10741088
sample_id=pcvcf.metadata.samples,
10751089
contig_id=pcvcf.metadata.contig_names,
@@ -1260,8 +1274,8 @@ def convert(
12601274
logger.info(f"Create zarr at {write_path}")
12611275
sgvcf = SgvcfZarr(write_path)
12621276
sgvcf.root = zarr.group(store=store, overwrite=True)
1263-
for variable in conversion_spec.variables[:]:
1264-
sgvcf.create_array(variable)
1277+
for column in conversion_spec.columns.values():
1278+
sgvcf.create_array(column)
12651279

12661280
progress_config = core.ProgressConfig(
12671281
total=pcvcf.total_uncompressed_bytes,
@@ -1286,7 +1300,7 @@ def convert(
12861300
)
12871301
pwm.submit(sgvcf.encode_filters, pcvcf, conversion_spec.filter_id)
12881302
has_gt = False
1289-
for variable in conversion_spec.variables[:]:
1303+
for variable in conversion_spec.columns.values():
12901304
if variable.vcf_field is not None:
12911305
# print("Encode", variable.name)
12921306
# TODO for large columns it's probably worth splitting up
@@ -1308,32 +1322,29 @@ def convert(
13081322
os.rename(write_path, path)
13091323

13101324

1311-
def generate_spec(columnarised, out):
1312-
pcvcf = PickleChunkedVcf.load(columnarised)
1325+
def mkschema(if_path, out):
1326+
pcvcf = PickleChunkedVcf.load(if_path)
13131327
spec = ZarrConversionSpec.generate(pcvcf)
1314-
json.dump(spec.asdict(), out, indent=4)
1328+
out.write(spec.asjson())
13151329

13161330

1317-
def to_zarr(
1318-
columnarised, zarr_path, conversion_spec, worker_processes=1, show_progress=False
1319-
):
1320-
pcvcf = PickleChunkedVcf.load(columnarised)
1321-
if conversion_spec is None:
1322-
spec = ZarrConversionSpec.generate(pcvcf)
1331+
def encode(if_path, zarr_path, schema_path, worker_processes=1, show_progress=False):
1332+
pcvcf = PickleChunkedVcf.load(if_path)
1333+
if schema_path is None:
1334+
schema = ZarrConversionSpec.generate(pcvcf)
13231335
else:
1324-
with open(conversion_spec, "r") as f:
1325-
d = json.load(f)
1326-
spec = ZarrConversionSpec.fromdict(d)
1336+
with open(schema_path, "r") as f:
1337+
schema = ZarrConversionSpec.fromjson(f.read())
13271338
SgvcfZarr.convert(
13281339
pcvcf,
13291340
zarr_path,
1330-
conversion_spec=spec,
1341+
conversion_spec=schema,
13311342
worker_processes=worker_processes,
13321343
show_progress=show_progress,
13331344
)
13341345

13351346

1336-
def convert_vcf(
1347+
def convert(
13371348
vcfs,
13381349
out_path,
13391350
*,

tests/test_cli.py

Lines changed: 87 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,13 @@
44

55
from bio2zarr import cli
66

7-
class TestWithMocks:
87

8+
class TestWithMocks:
99
def test_vcf_explode(self):
1010
runner = ct.CliRunner(mix_stderr=False)
1111
with mock.patch("bio2zarr.vcf.explode") as mocked:
1212
result = runner.invoke(
1313
cli.vcf2zarr, ["explode", "source", "dest"], catch_exceptions=False
14-
1514
)
1615
assert result.exit_code == 0
1716
assert len(result.stdout) == 0
@@ -24,15 +23,97 @@ def test_vcf_explode(self):
2423
show_progress=True,
2524
)
2625

27-
def test_summarise(self):
26+
def test_inspect(self):
2827
runner = ct.CliRunner(mix_stderr=False)
29-
with mock.patch("bio2zarr.vcf.summarise", return_value={}) as mocked:
28+
with mock.patch("bio2zarr.vcf.inspect", return_value={}) as mocked:
3029
result = runner.invoke(
31-
cli.vcf2zarr, ["summarise", "path"], catch_exceptions=False
32-
30+
cli.vcf2zarr, ["inspect", "path"], catch_exceptions=False
3331
)
3432
assert result.exit_code == 0
3533
assert result.stdout == "\n"
3634
assert len(result.stderr) == 0
3735
mocked.assert_called_once_with("path")
3836

37+
def test_mkschema(self):
38+
runner = ct.CliRunner(mix_stderr=False)
39+
with mock.patch("bio2zarr.vcf.mkschema") as mocked:
40+
result = runner.invoke(
41+
cli.vcf2zarr, ["mkschema", "path"], catch_exceptions=False
42+
)
43+
assert result.exit_code == 0
44+
assert len(result.stdout) == 0
45+
assert len(result.stderr) == 0
46+
# TODO figure out how to test that we call it with stdout from
47+
# the CliRunner
48+
# mocked.assert_called_once_with("path", stdout)
49+
mocked.assert_called_once()
50+
51+
def test_encode(self):
52+
runner = ct.CliRunner(mix_stderr=False)
53+
with mock.patch("bio2zarr.vcf.encode") as mocked:
54+
result = runner.invoke(
55+
cli.vcf2zarr, ["encode", "if_path", "zarr_path"], catch_exceptions=False
56+
)
57+
assert result.exit_code == 0
58+
assert len(result.stdout) == 0
59+
assert len(result.stderr) == 0
60+
mocked.assert_called_once_with(
61+
"if_path",
62+
"zarr_path",
63+
None,
64+
worker_processes=1,
65+
show_progress=True,
66+
)
67+
68+
def test_convert_vcf(self):
69+
runner = ct.CliRunner(mix_stderr=False)
70+
with mock.patch("bio2zarr.vcf.convert") as mocked:
71+
result = runner.invoke(
72+
cli.vcf2zarr,
73+
["convert", "vcf_path", "zarr_path"],
74+
catch_exceptions=False,
75+
)
76+
assert result.exit_code == 0
77+
assert len(result.stdout) == 0
78+
assert len(result.stderr) == 0
79+
mocked.assert_called_once_with(
80+
("vcf_path",),
81+
"zarr_path",
82+
worker_processes=1,
83+
show_progress=True,
84+
)
85+
86+
def test_validate(self):
87+
runner = ct.CliRunner(mix_stderr=False)
88+
with mock.patch("bio2zarr.vcf.validate") as mocked:
89+
result = runner.invoke(
90+
cli.vcf2zarr,
91+
["validate", "vcf_path", "zarr_path"],
92+
catch_exceptions=False,
93+
)
94+
assert result.exit_code == 0
95+
assert len(result.stdout) == 0
96+
assert len(result.stderr) == 0
97+
mocked.assert_called_once_with(
98+
"vcf_path",
99+
"zarr_path",
100+
show_progress=True,
101+
)
102+
103+
def test_convert_plink(self):
104+
runner = ct.CliRunner(mix_stderr=False)
105+
with mock.patch("bio2zarr.plink.convert") as mocked:
106+
result = runner.invoke(
107+
cli.plink2zarr, ["convert", "in", "out"], catch_exceptions=False
108+
)
109+
assert result.exit_code == 0
110+
assert len(result.stdout) == 0
111+
assert len(result.stderr) == 0
112+
mocked.assert_called_once_with(
113+
"in",
114+
"out",
115+
worker_processes=1,
116+
chunk_width=None,
117+
chunk_length=None,
118+
show_progress=True,
119+
)

0 commit comments

Comments
 (0)