Logging improvements

jeromekelleher · jeromekelleher · commit 9f817a2115a8 · 2025-01-17T16:31:08.000Z
diff --git a/bio2zarr/core.py b/bio2zarr/core.py
@@ -179,7 +179,12 @@ def flush(self):
                 f"{self.array_offset}:{self.array_offset + self.buffer_row}"
                 f"{self.buff.nbytes / 2**20: .2f}Mb"
             )
-            self.max_buff_size = max(self.max_buff_size, sys.getsizeof(self.buff))
+            # Note this is inaccurate for string data as we're just reporting the
+            # size of the container. When we switch to the numpy 2 StringDtype
+            # this should improve and we can get more visibility on how memory
+            # is being used.
+            # https://github.com/sgkit-dev/bio2zarr/issues/30
+            self.max_buff_size = max(self.max_buff_size, self.buff.nbytes)
             self.array_offset += self.variants_chunk_size
             self.buffer_row = 0
 
diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py
@@ -862,13 +862,9 @@ def init_partition_array(self, partition_index, name):
 
     def finalise_partition_array(self, partition_index, buffered_array):
         buffered_array.flush()
-        # field_map = self.schema.field_map()
-        # array_spec = field_map[buffered_array.name]
-        # ba = buffered_array
-        # print(array_spec.name, "ba.max_buff_size", ba.max_buff_size,
-        # array_spec.variant_chunk_nbytes)
         logger.info(
-            f"Completed partition {partition_index} array {buffered_array.name}"
+            f"Completed partition {partition_index} array {buffered_array.name} "
+            f"max_memory={core.display_size(buffered_array.max_buff_size)}"
         )
 
     def encode_array_partition(self, array_spec, partition_index):