Skip to content

Commit db103c4

Browse files
Merge pull request #187 from jeromekelleher/better-logging-mem
Add more logging for variant chunk memory
2 parents 4a8e0ff + 60d4770 commit db103c4

File tree

2 files changed

+13
-11
lines changed

2 files changed

+13
-11
lines changed

bio2zarr/core.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,6 @@ def flush(self):
128128
sync_flush_2d_array(
129129
self.buff[: self.buffer_row], self.array, self.array_offset
130130
)
131-
# FIXME the array.name doesn't seem to be working here for some reason
132131
logger.debug(
133132
f"Flushed <{self.array.name} {self.array.shape} "
134133
f"{self.array.dtype}> "

bio2zarr/vcf.py

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1392,11 +1392,15 @@ def variant_chunk_nbytes(self):
13921392
"""
13931393
Returns the nbytes for a single variant chunk of this array.
13941394
"""
1395-
# TODO WARNING IF this is a string
13961395
chunk_items = self.chunks[0]
13971396
for size in self.shape[1:]:
13981397
chunk_items *= size
13991398
dt = np.dtype(self.dtype)
1399+
if dt.kind == "O":
1400+
logger.warning(
1401+
f"Field {self.name} is a string; max memory usage may "
1402+
"be a significant underestimate"
1403+
)
14001404
return chunk_items * dt.itemsize
14011405

14021406

@@ -1890,13 +1894,15 @@ def encode_partition(self, partition_index):
18901894
os.rename(partition_path, final_path)
18911895

18921896
def init_partition_array(self, partition_index, name):
1893-
wip_path = self.wip_partition_array_path(partition_index, name)
18941897
# Create an empty array like the definition
18951898
src = self.arrays_path / name
18961899
# Overwrite any existing WIP files
1900+
wip_path = self.wip_partition_array_path(partition_index, name)
18971901
shutil.copytree(src, wip_path, dirs_exist_ok=True)
1898-
array = zarr.open(wip_path)
1899-
logger.debug(f"Opened empty array {array} @ {wip_path}")
1902+
store = zarr.DirectoryStore(self.wip_partition_path(partition_index))
1903+
wip_root = zarr.group(store=store)
1904+
array = wip_root[name]
1905+
logger.debug(f"Opened empty array {array.name} <{array.dtype}> @ {wip_path}")
19001906
return array
19011907

19021908
def finalise_partition_array(self, partition_index, name):
@@ -2109,12 +2115,9 @@ def get_max_encoding_memory(self):
21092115
"""
21102116
Return the approximate maximum memory used to encode a variant chunk.
21112117
"""
2112-
# NOTE This size number is also not quite enough, you need a bit of
2113-
# headroom with it (probably 10% or so). We should include this.
2114-
# FIXME this is actively wrong for String columns. See if we can do better.
2115-
max_encoding_mem = max(
2116-
col.variant_chunk_nbytes for col in self.schema.fields.values()
2117-
)
2118+
max_encoding_mem = 0
2119+
for col in self.schema.fields.values():
2120+
max_encoding_mem = max(max_encoding_mem, col.variant_chunk_nbytes)
21182121
gt_mem = 0
21192122
if "call_genotype" in self.schema.fields:
21202123
encoded_together = [

0 commit comments

Comments
 (0)