
Commit 4a8e0ff

Merge pull request #184 from jeromekelleher/encode-explode-init-interface
Improved info interface on encode/explode init
2 parents f2ec234 + eeeceea commit 4a8e0ff

7 files changed: +118 -45 lines changed


CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -1,3 +1,6 @@
+# 0.0.10 2024-05-XX
+- Change output format of dexplode-init and dencode-init
+
 # 0.0.9 2024-05-02
 
 - Change on-disk format for explode and schema
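
For context, the changed output looks roughly like this in the two supported modes. This is a sketch that mirrors the show_work_summary helper added to bio2zarr/cli.py below; the field names match the new summary dataclasses, but the values here are invented.

    # Sketch only: approximates the new summary rendering; the values are made up.
    import json

    import tabulate

    summary = {"num_partitions": 3, "num_samples": 3, "num_variants": 9}

    # Default output: a plain key/value table, one property per line.
    print(tabulate.tabulate(list(summary.items()), tablefmt="plain"))

    # With --json: an indented JSON object that scripts can parse.
    print(json.dumps(summary, indent=4))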

bio2zarr/cli.py

Lines changed: 25 additions & 13 deletions

@@ -5,7 +5,6 @@
 
 import click
 import coloredlogs
-import humanfriendly
 import numcodecs
 import tabulate
 
@@ -65,6 +64,13 @@ def list_commands(self, ctx):
     help="Partition indexes are interpreted as one-based",
 )
 
+json = click.option(
+    "--json",
+    is_flag=True,
+    flag_value=True,
+    help="Output summary data in JSON format",
+)
+
 version = click.version_option(version=f"{provenance.__version__}")
 
 worker_processes = click.option(
@@ -166,6 +172,16 @@ def get_compressor(cname):
     return numcodecs.get_codec(config)
 
 
+def show_work_summary(work_summary, json):
+    if json:
+        output = work_summary.asjson()
+    else:
+        data = work_summary.asdict()
+        output = tabulate.tabulate(list(data.items()), tablefmt="plain")
+        # output = "\n".join(f"{k}\t{v}" for k, v in data.items())
+    click.echo(output)
+
+
 @click.command
 @vcfs
 @new_icf_path
@@ -199,6 +215,7 @@ def explode(
 @force
 @column_chunk_size
 @compressor
+@json
 @verbose
 @worker_processes
 def dexplode_init(
@@ -208,6 +225,7 @@ def dexplode_init(
     force,
     column_chunk_size,
     compressor,
+    json,
     verbose,
     worker_processes,
 ):
@@ -217,7 +235,7 @@
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
-    num_partitions = vcf.explode_init(
+    work_summary = vcf.explode_init(
         icf_path,
         vcfs,
         target_num_partitions=num_partitions,
@@ -226,7 +244,7 @@
         compressor=get_compressor(compressor),
         show_progress=True,
     )
-    click.echo(num_partitions)
+    show_work_summary(work_summary, json)
 
 
 @click.command
@@ -331,6 +349,7 @@ def encode(
 @variants_chunk_size
 @samples_chunk_size
 @max_variant_chunks
+@json
 @verbose
 def dencode_init(
     icf_path,
@@ -341,6 +360,7 @@ def dencode_init(
     variants_chunk_size,
     samples_chunk_size,
     max_variant_chunks,
+    json,
     verbose,
 ):
     """
@@ -358,7 +378,7 @@
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
-    num_partitions, max_memory = vcf.encode_init(
+    work_summary = vcf.encode_init(
         icf_path,
         zarr_path,
         target_num_partitions=num_partitions,
@@ -368,15 +388,7 @@
         max_variant_chunks=max_variant_chunks,
         show_progress=True,
     )
-    formatted_size = humanfriendly.format_size(max_memory, binary=True)
-    # NOTE adding the size to the stdout here so that users can parse it
-    # and use in their submission scripts. This is a first pass, and
-    # will most likely change as we see what works and doesn't.
-    # NOTE we probably want to format this as a table, which lists
-    # some other properties, line by line
-    # NOTE This size number is also not quite enough, you need a bit of
-    # headroom with it (probably 10% or so). We should include this.
-    click.echo(f"{num_partitions}\t{formatted_size}")
+    show_work_summary(work_summary, json)
 
 
 @click.command
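
The NOTE comments removed above explained that the old dencode-init output was meant to be parsed by submission scripts; with this change the --json flag is the machine-readable path for that. A minimal sketch of such a consumer, assuming the vcf2zarr entry point is installed and using placeholder paths and partition count:

    # Sketch only: paths, partition count, and entry-point name are assumptions.
    import json
    import subprocess

    result = subprocess.run(
        ["vcf2zarr", "dencode-init", "sample.icf", "sample.vcf.zarr", "20", "--json"],
        capture_output=True,
        text=True,
        check=True,
    )
    summary = json.loads(result.stdout)
    # num_partitions and max_encoding_memory can feed cluster job parameters.
    print(summary["num_partitions"], summary["max_encoding_memory"])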

bio2zarr/vcf.py

Lines changed: 55 additions & 6 deletions

@@ -204,6 +204,10 @@ def num_contigs(self):
     def num_filters(self):
         return len(self.filters)
 
+    @property
+    def num_samples(self):
+        return len(self.samples)
+
     @staticmethod
     def fromdict(d):
         if d["format_version"] != ICF_METADATA_FORMAT_VERSION:
@@ -982,6 +986,19 @@ def check_field_clobbering(icf_metadata):
         )
 
 
+@dataclasses.dataclass
+class IcfWriteSummary:
+    num_partitions: int
+    num_samples: int
+    num_variants: int
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    def asjson(self):
+        return json.dumps(self.asdict(), indent=4)
+
+
 class IntermediateColumnarFormatWriter:
     def __init__(self, path):
         self.path = pathlib.Path(path)
@@ -1038,7 +1055,11 @@ def init(
         logger.info("Writing WIP metadata")
         with open(self.wip_path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
-        return self.num_partitions
+        return IcfWriteSummary(
+            num_partitions=self.num_partitions,
+            num_variants=icf_metadata.num_records,
+            num_samples=icf_metadata.num_samples,
+        )
 
     def mkdirs(self):
         num_dirs = len(self.metadata.fields)
@@ -1371,6 +1392,7 @@ def variant_chunk_nbytes(self):
         """
         Returns the nbytes for a single variant chunk of this array.
         """
+        # TODO WARNING IF this is a string
         chunk_items = self.chunks[0]
         for size in self.shape[1:]:
             chunk_items *= size
@@ -1643,6 +1665,21 @@ def fromdict(d):
         return ret
 
 
+@dataclasses.dataclass
+class VcfZarrWriteSummary:
+    num_partitions: int
+    num_samples: int
+    num_variants: int
+    num_chunks: int
+    max_encoding_memory: str
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    def asjson(self):
+        return json.dumps(self.asdict(), indent=4)
+
+
 class VcfZarrWriter:
     def __init__(self, path):
         self.path = pathlib.Path(path)
@@ -1718,13 +1755,22 @@ def init(
         store = zarr.DirectoryStore(self.arrays_path)
         root = zarr.group(store=store)
 
-        for column in self.schema.fields.values():
-            self.init_array(root, column, partitions[-1].stop)
+        total_chunks = 0
+        for field in self.schema.fields.values():
+            a = self.init_array(root, field, partitions[-1].stop)
+            total_chunks += a.nchunks
 
         logger.info("Writing WIP metadata")
         with open(self.wip_path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
-        return len(partitions)
+
+        return VcfZarrWriteSummary(
+            num_variants=self.icf.num_records,
+            num_samples=self.icf.num_samples,
+            num_partitions=self.num_partitions,
+            num_chunks=total_chunks,
+            max_encoding_memory=display_size(self.get_max_encoding_memory()),
+        )
 
     def encode_samples(self, root):
         if self.schema.samples != self.icf.metadata.samples:
@@ -1794,6 +1840,7 @@ def init_array(self, root, variable, variants_dim_size):
             }
         )
         logger.debug(f"Initialised {a}")
+        return a
 
     #######################
     # encode_partition
@@ -2062,6 +2109,9 @@ def get_max_encoding_memory(self):
         """
         Return the approximate maximum memory used to encode a variant chunk.
         """
+        # NOTE This size number is also not quite enough, you need a bit of
+        # headroom with it (probably 10% or so). We should include this.
+        # FIXME this is actively wrong for String columns. See if we can do better.
        max_encoding_mem = max(
             col.variant_chunk_nbytes for col in self.schema.fields.values()
         )
@@ -2190,14 +2240,13 @@ def encode_init(
         schema = VcfZarrSchema.fromjson(f.read())
     zarr_path = pathlib.Path(zarr_path)
     vzw = VcfZarrWriter(zarr_path)
-    vzw.init(
+    return vzw.init(
         icf,
         target_num_partitions=target_num_partitions,
         schema=schema,
         dimension_separator=dimension_separator,
         max_variant_chunks=max_variant_chunks,
     )
-    return vzw.num_partitions, vzw.get_max_encoding_memory()
 
 
 def encode_partition(zarr_path, partition):
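
At the Python API level, explode_init and encode_init now return summary objects instead of bare counts, as the updated tests below exercise. A small sketch of the distributed explode workflow driven by the returned summary, using placeholder paths:

    # Sketch only: "sample.vcf.gz" and "sample.icf" are placeholder paths.
    from bio2zarr import vcf

    summary = vcf.explode_init("sample.icf", ["sample.vcf.gz"])
    print(summary.asjson())  # num_partitions, num_samples, num_variants

    # Each partition can then be exploded independently, e.g. as separate cluster jobs.
    for j in range(summary.num_partitions):
        vcf.explode_partition("sample.icf", j)
    vcf.explode_finalise("sample.icf")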

tests/test_cli.py

Lines changed: 21 additions & 8 deletions

@@ -1,3 +1,5 @@
+import dataclasses
+import json
 from unittest import mock
 
 import click.testing as ct
@@ -46,6 +48,17 @@
 DEFAULT_DENCODE_FINALISE_ARGS = dict(show_progress=True)
 
 
+@dataclasses.dataclass
+class FakeWorkSummary:
+    num_partitions: int
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    def asjson(self):
+        return json.dumps(self.asdict())
+
+
 class TestWithMocks:
     vcf_path = "tests/data/vcf/sample.vcf.gz"
 
@@ -262,7 +275,7 @@ def test_vcf_explode_missing_and_existing_vcf(self, mocked, tmp_path):
         assert "'no_such_file' does not exist" in result.stderr
         mocked.assert_not_called()
 
-    @mock.patch("bio2zarr.vcf.explode_init", return_value=5)
+    @mock.patch("bio2zarr.vcf.explode_init", return_value=FakeWorkSummary(5))
     def test_vcf_dexplode_init(self, mocked, tmp_path):
         runner = ct.CliRunner(mix_stderr=False)
         icf_path = tmp_path / "icf"
@@ -273,7 +286,7 @@ def test_vcf_dexplode_init(self, mocked, tmp_path):
         )
         assert result.exit_code == 0
         assert len(result.stderr) == 0
-        assert result.stdout == "5\n"
+        assert list(result.stdout.split()) == ["num_partitions", "5"]
         mocked.assert_called_once_with(
             str(icf_path),
             (self.vcf_path,),
@@ -415,7 +428,7 @@ def test_encode(self, mocked, tmp_path):
             **DEFAULT_ENCODE_ARGS,
         )
 
-    @mock.patch("bio2zarr.vcf.encode_init", return_value=(10, 1024))
+    @mock.patch("bio2zarr.vcf.encode_init", return_value=FakeWorkSummary(10))
     def test_dencode_init(self, mocked, tmp_path):
         icf_path = tmp_path / "icf"
         icf_path.mkdir()
@@ -427,7 +440,7 @@ def test_dencode_init(self, mocked, tmp_path):
             catch_exceptions=False,
         )
         assert result.exit_code == 0
-        assert result.stdout == "10\t1 KiB\n"
+        assert list(result.stdout.split()) == ["num_partitions", "10"]
         assert len(result.stderr) == 0
         mocked.assert_called_once_with(
             str(icf_path),
@@ -529,11 +542,11 @@ def test_dexplode(self, tmp_path, one_based):
         runner = ct.CliRunner(mix_stderr=False)
         result = runner.invoke(
             cli.vcf2zarr,
-            f"dexplode-init {self.vcf_path} {icf_path} 5",
+            f"dexplode-init {self.vcf_path} {icf_path} 5 --json",
             catch_exceptions=False,
         )
         assert result.exit_code == 0
-        assert result.stdout.strip() == "3"
+        assert json.loads(result.stdout)["num_partitions"] == 3
 
         for j in range(3):
             if one_based:
@@ -598,11 +611,11 @@ def test_dencode(self, tmp_path, one_based):
         assert result.exit_code == 0
         result = runner.invoke(
             cli.vcf2zarr,
-            f"dencode-init {icf_path} {zarr_path} 5 --variants-chunk-size=3",
+            f"dencode-init {icf_path} {zarr_path} 5 --variants-chunk-size=3 --json",
             catch_exceptions=False,
         )
         assert result.exit_code == 0
-        assert result.stdout.split()[0] == "3"
+        assert json.loads(result.stdout)["num_partitions"] == 3
 
         for j in range(3):
             if one_based:

tests/test_icf.py

Lines changed: 6 additions & 6 deletions

@@ -105,8 +105,8 @@ class TestIcfWriterExample:
     def test_init_paths(self, tmp_path):
         icf_path = tmp_path / "x.icf"
         assert not icf_path.exists()
-        num_partitions = vcf.explode_init(icf_path, [self.data_path])
-        assert num_partitions == 3
+        summary = vcf.explode_init(icf_path, [self.data_path])
+        assert summary.num_partitions == 3
         assert icf_path.exists()
         wip_path = icf_path / "wip"
         assert wip_path.exists()
@@ -118,9 +118,9 @@ def test_init_paths(self, tmp_path):
     def test_finalise_paths(self, tmp_path):
         icf_path = tmp_path / "x.icf"
         wip_path = icf_path / "wip"
-        num_partitions = vcf.explode_init(icf_path, [self.data_path])
+        summary = vcf.explode_init(icf_path, [self.data_path])
         assert icf_path.exists()
-        for j in range(num_partitions):
+        for j in range(summary.num_partitions):
             vcf.explode_partition(icf_path, j)
         assert wip_path.exists()
         vcf.explode_finalise(icf_path)
@@ -270,8 +270,8 @@ def run_explode(self, tmp_path, **kwargs):
 
     def run_dexplode(self, tmp_path, **kwargs):
         icf_path = tmp_path / "icf"
-        partitions = vcf.explode_init(icf_path, [self.data_path], **kwargs)
-        for j in range(partitions):
+        summary = vcf.explode_init(icf_path, [self.data_path], **kwargs)
+        for j in range(summary.num_partitions):
             vcf.explode_partition(icf_path, j)
         vcf.explode_finalise(icf_path)
         return vcf.IntermediateColumnarFormat(icf_path)
