Skip to content

Commit 430ac5e

Browse files
Minor refactor to prepare for incremental finalise
1 parent 526cce1 commit 430ac5e

File tree

1 file changed

+29 -13 lines changed

bio2zarr/vcf.py

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,6 +1289,7 @@ class EncodingWork:
12891289
func: callable
12901290
start: int
12911291
stop: int
1292+
columns: list[str]
12921293
memory: int = 0
12931294

12941295

@@ -1525,50 +1526,65 @@ def encode(
15251526
f = functools.partial(self.encode_array_slice, col)
15261527
work.append(
15271528
EncodingWork(
1528-
f, start, stop, encoding_memory_requirements[col.name]
1529+
f,
1530+
start,
1531+
stop,
1532+
[col.name],
1533+
encoding_memory_requirements[col.name],
15291534
)
15301535
)
1531-
work.append(EncodingWork(self.encode_alleles_slice, start, stop))
1532-
work.append(EncodingWork(self.encode_id_slice, start, stop))
1536+
work.append(
1537+
EncodingWork(self.encode_alleles_slice, start, stop, ["variant_allele"])
1538+
)
1539+
work.append(EncodingWork(self.encode_id_slice, start, stop, ["variant_id"]))
15331540
work.append(
15341541
EncodingWork(
15351542
functools.partial(self.encode_filters_slice, filter_id_map),
15361543
start,
15371544
stop,
1545+
["variant_filters"],
15381546
)
15391547
)
15401548
work.append(
15411549
EncodingWork(
15421550
functools.partial(self.encode_contig_slice, contig_id_map),
15431551
start,
15441552
stop,
1553+
["variant_contig_id"],
15451554
)
15461555
)
15471556
if "call_genotype" in self.schema.columns:
1557+
variables = [
1558+
"call_genotype",
1559+
"call_genotype_phased",
1560+
"call_genotype_mask",
1561+
]
15481562
gt_memory = sum(
1549-
encoding_memory_requirements[name]
1550-
for name in [
1551-
"call_genotype",
1552-
"call_genotype_phased",
1553-
"call_genotype_mask",
1554-
]
1563+
encoding_memory_requirements[name] for name in variables
15551564
)
15561565
work.append(
1557-
EncodingWork(self.encode_genotypes_slice, start, stop, gt_memory)
1566+
EncodingWork(
1567+
self.encode_genotypes_slice, start, stop, variables, gt_memory
1568+
)
15581569
)
15591570
# Fail early if we can't fit a particular column into memory
15601571
for wp in work:
15611572
if wp.memory >= max_memory:
1562-
raise ValueError(f"Insufficient memory for {wp.func}: "
1563-
f"{display_size(wp.memory)} > {display_size(max_memory)}")
1564-
1573+
raise ValueError(
1574+
f"Insufficient memory for {wp.columns}: "
1575+
f"{display_size(wp.memory)} > {display_size(max_memory)}"
1576+
)
15651577

15661578
progress_config = core.ProgressConfig(
15671579
total=total_bytes,
15681580
title="Encode",
15691581
units="B",
15701582
show=show_progress,
15711583
)
1584+
# TODO: add a map from each column to its completed slices here, so that we
1585+
# can finalise the arrays as they get completed. We'll have to service
1586+
# the futures more often, though, not just when we exceed the memory budget
1587+
15721588
used_memory = 0
15731589
with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
15741590
future = pwm.submit(self.encode_samples)

0 commit comments

Comments
 (0)