Skip to content

Commit bf7d56b

Browse files
Add progress for finalise step
1 parent 09fa305 commit bf7d56b

File tree

2 files changed

+24
-11
lines changed

2 files changed

+24
-11
lines changed

bio2zarr/core.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,7 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
131131
# encoder implementations.
132132
s = slice(offset, offset + np_buffer.shape[0])
133133
samples_chunk_size = zarr_array.chunks[1]
134-
# TODO use zarr chunks here to support non-uniform chunking later
135-
# and for simplicity
134+
# TODO use zarr chunks here for simplicity
136135
zarr_array_width = zarr_array.shape[1]
137136
start = 0
138137
while start < zarr_array_width:
@@ -192,7 +191,7 @@ def __init__(self, worker_processes=1, progress_config=None):
192191
self.progress_config = progress_config
193192
self.progress_bar = tqdm.tqdm(
194193
total=progress_config.total,
195-
desc=f"{progress_config.title:>7}",
194+
desc=f"{progress_config.title:>8}",
196195
unit_scale=True,
197196
unit=progress_config.units,
198197
smoothing=0.1,

bio2zarr/vcf.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1967,7 +1967,9 @@ def finalise_array(self, name):
19671967
chunk_files = [
19681968
path for path in src.iterdir() if not path.name.startswith(".")
19691969
]
1970-
# TODO check for a count of then number of files
1970+
# TODO check for a count of the number of files. If we require a
1971+
# dimension_separator of "/" then we could make stronger assertions
1972+
# here, as we'd always have num_variant_chunks
19711973
logger.debug(
19721974
f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
19731975
)
@@ -1976,11 +1978,23 @@ def finalise_array(self, name):
19761978
# Finally, once all the chunks have moved into the arrays dir,
19771979
# we move it out of wip
19781980
os.rename(self.arrays_path / name, self.path / name)
1981+
core.update_progress(1)
19791982

1980-
def finalise(self):
1983+
def finalise(self, show_progress=False):
19811984
self.load_metadata()
1982-
for name in self.metadata.schema.columns:
1983-
self.finalise_array(name)
1985+
1986+
progress_config = core.ProgressConfig(
1987+
total=len(self.metadata.schema.columns),
1988+
title="Finalise",
1989+
units="array",
1990+
show=show_progress,
1991+
)
1992+
# NOTE: it's not clear that adding more workers will make this quicker,
1993+
# as it's just going to be causing contention on the file system.
1994+
# Something to check empirically in some deployments.
1995+
with core.ParallelWorkManager(1, progress_config) as pwm:
1996+
for name in self.metadata.schema.columns:
1997+
pwm.submit(self.finalise_array, name)
19841998
zarr.consolidate_metadata(self.path)
19851999

19862000
######################
@@ -2074,7 +2088,7 @@ def encode(
20742088
show_progress=show_progress,
20752089
max_memory=max_memory,
20762090
)
2077-
encode_finalise(zarr_path)
2091+
vzw.finalise(show_progress)
20782092

20792093

20802094
def encode_init(
@@ -2124,9 +2138,9 @@ def encode_partition(zarr_path, partition, *, show_progress=False, worker_proces
21242138
)
21252139

21262140

2127-
def encode_finalise(zarr_path):
2141+
def encode_finalise(zarr_path, show_progress=False):
21282142
writer = VcfZarrWriter(zarr_path)
2129-
writer.finalise()
2143+
writer.finalise(show_progress=show_progress)
21302144

21312145

21322146
def convert(
@@ -2336,7 +2350,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
23362350
assert pos[start_index] == first_pos
23372351
vcf = cyvcf2.VCF(vcf_path)
23382352
if show_progress:
2339-
iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records) # NEEDS TEST
2353+
iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records) # NEEDS TEST
23402354
else:
23412355
iterator = vcf
23422356
for j, row in enumerate(iterator, start_index):

0 commit comments

Comments
 (0)