Skip to content

Commit 6eb33a7

Browse files
Improve dencode interface
1 parent 687c715 commit 6eb33a7

File tree

3 files changed

+41
-16
lines changed

3 files changed

+41
-16
lines changed

bio2zarr/cli.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import click
77
import coloredlogs
8+
import humanfriendly
89
import numcodecs
910
import tabulate
1011

@@ -335,7 +336,7 @@ def dencode_init(
335336
"""
336337
setup_logging(verbose)
337338
check_overwrite_dir(zarr_path, force)
338-
num_partitions = vcf.encode_init(
339+
num_partitions, max_memory = vcf.encode_init(
339340
icf_path,
340341
zarr_path,
341342
target_num_partitions=num_partitions,
@@ -345,7 +346,15 @@ def dencode_init(
345346
max_variant_chunks=max_variant_chunks,
346347
show_progress=True,
347348
)
348-
click.echo(num_partitions)
349+
formatted_size = humanfriendly.format_size(max_memory, binary=True)
350+
# NOTE adding the size to the stdout here so that users can parse it
351+
# and use in their submission scripts. This is a first pass, and
352+
# will most likely change as we see what works and doesn't.
353+
# NOTE we probably want to format this as a table, which lists
354+
# some other properties, line by line
355+
# NOTE This size number is also not quite enough, you need a bit of
356+
# headroom with it (probably 10% or so). We should include this.
357+
click.echo(f"{num_partitions}\t{formatted_size}")
349358

350359

351360
@click.command

bio2zarr/core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ def flush(self):
110110
sync_flush_2d_array(
111111
self.buff[: self.buffer_row], self.array, self.array_offset
112112
)
113+
# FIXME the array.name doesn't seem to be working here for some reason
113114
logger.debug(
114115
f"Flushed <{self.array.name} {self.array.shape} "
115116
f"{self.array.dtype}> "

bio2zarr/vcf.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1323,12 +1323,9 @@ def variant_chunk_nbytes(self):
13231323
"""
13241324
Returns the nbytes for a single variant chunk of this array.
13251325
"""
1326-
chunk_items = 1
1327-
for dim, size in enumerate(self.shape):
1328-
chunk_dim_size = size
1329-
if dim < len(self.chunks):
1330-
chunk_dim_size = self.chunks[dim]
1331-
chunk_items *= chunk_dim_size
1326+
chunk_items = self.chunks[0]
1327+
for size in self.shape[1:]:
1328+
chunk_items *= size
13321329
dt = np.dtype(self.dtype)
13331330
return chunk_items * dt.itemsize
13341331

@@ -1616,6 +1613,10 @@ def __init__(self, path):
16161613
def schema(self):
16171614
return self.metadata.schema
16181615

1616+
@property
1617+
def num_partitions(self):
1618+
return len(self.metadata.partitions)
1619+
16191620
#######################
16201621
# init
16211622
#######################
@@ -1778,10 +1779,10 @@ def encode_partition(self, partition_index):
17781779
self.encode_id_partition(partition_index)
17791780
self.encode_filters_partition(partition_index)
17801781
self.encode_contig_partition(partition_index)
1781-
for col in self.metadata.schema.columns.values():
1782+
for col in self.schema.columns.values():
17821783
if col.vcf_field is not None:
17831784
self.encode_array_partition(col, partition_index)
1784-
if "call_genotype" in self.metadata.schema.columns:
1785+
if "call_genotype" in self.schema.columns:
17851786
self.encode_genotypes_partition(partition_index)
17861787

17871788
def init_partition_array(self, partition_index, name):
@@ -1954,6 +1955,7 @@ def finalise_array(self, name):
19541955
# Move all the files in partition dir to dest dir
19551956
src = self.partition_array_path(partition, name)
19561957
if not src.exists():
1958+
# Needs test
19571959
raise ValueError(f"Partition {partition} of {name} does not exist")
19581960
dest = self.arrays_path / name
19591961
# This is Zarr v2 specific. Chunks in v3 will start with a "c" prefix.
@@ -1977,7 +1979,7 @@ def finalise(self, show_progress=False):
19771979
self.load_metadata()
19781980

19791981
progress_config = core.ProgressConfig(
1980-
total=len(self.metadata.schema.columns),
1982+
total=len(self.schema.columns),
19811983
title="Finalise",
19821984
units="array",
19831985
show=show_progress,
@@ -1991,7 +1993,7 @@ def finalise(self, show_progress=False):
19911993
# for multiple workers, or making a standard wrapper for tqdm
19921994
# that allows us to have a consistent look and feel.
19931995
with core.ParallelWorkManager(0, progress_config) as pwm:
1994-
for name in self.metadata.schema.columns:
1996+
for name in self.schema.columns:
19951997
pwm.submit(self.finalise_array, name)
19961998
zarr.consolidate_metadata(self.path)
19971999

@@ -2003,16 +2005,28 @@ def get_max_encoding_memory(self):
20032005
"""
20042006
Return the approximate maximum memory used to encode a variant chunk.
20052007
"""
2006-
return max(
2007-
col.variant_chunk_nbytes for col in self.metadata.schema.columns.values()
2008+
max_encoding_mem = max(
2009+
col.variant_chunk_nbytes for col in self.schema.columns.values()
20082010
)
2011+
gt_mem = 0
2012+
if "call_genotype" in self.schema.columns:
2013+
encoded_together = [
2014+
"call_genotype",
2015+
"call_genotype_phased",
2016+
"call_genotype_mask",
2017+
]
2018+
gt_mem = sum(
2019+
self.schema.columns[col].variant_chunk_nbytes
2020+
for col in encoded_together
2021+
)
2022+
return max(max_encoding_mem, gt_mem)
20092023

20102024
def encode_all_partitions(
20112025
self, *, worker_processes=1, show_progress=False, max_memory=None
20122026
):
20132027
max_memory = parse_max_memory(max_memory)
20142028
self.load_metadata()
2015-
num_partitions = len(self.metadata.partitions)
2029+
num_partitions = len(self.num_partitions)
20162030
per_worker_memory = self.get_max_encoding_memory()
20172031
logger.info(
20182032
f"Encoding Zarr over {num_partitions} partitions with "
@@ -2120,13 +2134,14 @@ def encode_init(
21202134
schema = VcfZarrSchema.fromjson(f.read())
21212135
zarr_path = pathlib.Path(zarr_path)
21222136
vzw = VcfZarrWriter(zarr_path)
2123-
return vzw.init(
2137+
vzw.init(
21242138
icf,
21252139
target_num_partitions=target_num_partitions,
21262140
schema=schema,
21272141
dimension_separator=dimension_separator,
21282142
max_variant_chunks=max_variant_chunks,
21292143
)
2144+
return vzw.num_partitions, vzw.get_max_encoding_memory()
21302145

21312146

21322147
def encode_partition(zarr_path, partition):

0 commit comments

Comments
 (0)