Merge pull request #200 from csiro-coasts/track-numpy-memory-usage

mx-moth · web-flow · commit d3ec547ecb11 · 2025-11-17T16:22:56.000+11:00
Improve peak memory allocation during polygon construction
diff --git a/docs/releases/development.rst b/docs/releases/development.rst
@@ -2,6 +2,9 @@
 Next release (in development)
 =============================
 
+* Reduce memory allocations when constructing polygons.
+  This should allow opening even larger datasets
+  (:pr:`200`).
 * Add support for Python 3.14 and drop support for Python 3.11,
   following `SPEC-0 <https://scientific-python.org/specs/spec-0000/>`_.
   (:pr:`201`).
diff --git a/pyproject.toml b/pyproject.toml
@@ -102,6 +102,7 @@ filterwarnings = [
 markers = [
 	"matplotlib: Tests that involve matplotlib and plotting",
 	"tutorial: Tests that involve the tutorial datasets",
+	"memory_usage: Regression tests for memory allocations",
 ]
 
 mpl-use-full-test-name = true
diff --git a/src/emsarray/conventions/arakawa_c.py b/src/emsarray/conventions/arakawa_c.py
@@ -261,18 +261,31 @@ def pack_index(self, grid_kind: ArakawaCGridKind, indexes: Sequence[int]) -> Ara
         return cast(ArakawaCIndex, (grid_kind, *indexes))
 
     def _make_polygons(self) -> numpy.ndarray:
-        # Make an array of shape (j, i, 2) of all the nodes
-        grid = numpy.stack([self.node.longitude.values, self.node.latitude.values], axis=-1)
-
-        # Transform this in to an array of shape (topology.size, 4, 2)
-        points = numpy.stack([
-            grid[:-1, :-1],
-            grid[:-1, +1:],
-            grid[+1:, +1:],
-            grid[+1:, :-1],
-        ], axis=2).reshape((-1, 4, 2))
-
-        return utils.make_polygons_with_holes(points)
+        j_size, i_size = self.face.shape
+        longitude = self.node.longitude.values
+        latitude = self.node.latitude.values
+
+        # Preallocate the points array. We will copy data straight in to this
+        # to save repeated memory allocations.
+        points = numpy.empty(shape=(i_size, 4, 2), dtype=self.node.longitude.dtype)
+        # Preallocate the output array so we can fill it in batches
+        out = numpy.full(shape=self.face.size, fill_value=None, dtype=object)
+        # Construct polygons row by row
+        for j in range(self.face.shape[0]):
+            points[:, 0, 0] = longitude[j + 0, :-1]
+            points[:, 1, 0] = longitude[j + 0, +1:]
+            points[:, 2, 0] = longitude[j + 1, +1:]
+            points[:, 3, 0] = longitude[j + 1, :-1]
+
+            points[:, 0, 1] = latitude[j + 0, :-1]
+            points[:, 1, 1] = latitude[j + 0, +1:]
+            points[:, 2, 1] = latitude[j + 1, +1:]
+            points[:, 3, 1] = latitude[j + 1, :-1]
+
+            j_slice = slice(j * i_size, (j + 1) * i_size)
+            utils.make_polygons_with_holes(points, out=out[j_slice])
+
+        return out
 
     @cached_property
     def face_centres(self) -> numpy.ndarray:
diff --git a/src/emsarray/conventions/grid.py b/src/emsarray/conventions/grid.py
@@ -413,39 +413,32 @@ def _make_polygons(self) -> numpy.ndarray:
         lon_bounds = self.topology.longitude_bounds.values
         lat_bounds = self.topology.latitude_bounds.values
 
-        # Make a bounds array as if this dataset had 2D coordinates.
-        # 1D bounds are (lon, 2) and (lat, 2).
-        # 2D bounds are (lat, lon, 4)
-        # where the 4 points are (j-i, i-1), (j-1, i+1), (j+1, i+1), (j+1, i-1).
-        # The bounds values are repeated as required, are given a new dimension,
-        # then repeated along that new dimension.
-        # They will come out as array with shape (y_size, x_size, 4)
-
-        lon_bounds_2d = numpy.stack([
-            lon_bounds[:, 0],
-            lon_bounds[:, 1],
-            lon_bounds[:, 1],
-            lon_bounds[:, 0],
-        ], axis=-1)
-        lon_bounds_2d = numpy.broadcast_to(numpy.expand_dims(lon_bounds_2d, 0), (y_size, x_size, 4))
-
-        lat_bounds_2d = numpy.stack([
-            lat_bounds[:, 0],
-            lat_bounds[:, 0],
-            lat_bounds[:, 1],
-            lat_bounds[:, 1],
-        ], axis=-1)
-        lat_bounds_2d = numpy.broadcast_to(numpy.expand_dims(lat_bounds_2d, 0), (x_size, y_size, 4))
-        lat_bounds_2d = numpy.transpose(lat_bounds_2d, (1, 0, 2))
-
-        assert lon_bounds_2d.shape == lat_bounds_2d.shape == (y_size, x_size, 4)
-
-        # points is a (topology.size, 4, 2) array of the corners of each cell
-        points = numpy.stack([lon_bounds_2d, lat_bounds_2d], axis=-1).reshape((-1, 4, 2))
-
-        polygons = utils.make_polygons_with_holes(points)
-
-        return polygons
+        # Create the polygons batched by row.
+        # The point array is copied by shapely before being used,
+        # so this can accidentally use a whole bunch of memory for large datasets.
+        # Creating them one by one is very slow but very memory efficient.
+        # Creating the polygons in one batch is faster but uses up a huge amount of memory.
+        # Batching them row by row is a decent compromise.
+        out = numpy.full(shape=y_size * x_size, dtype=object, fill_value=None)
+
+        # By preallocating this array, we can copy data in to it to save on a number of allocations.
+        chunk_points = numpy.empty(shape=(x_size, 4, 2), dtype=lon_bounds.dtype)
+        # By chunking by row, the longitude bounds never change between loops
+        chunk_points[:, 0, 0] = lon_bounds[:, 0]
+        chunk_points[:, 1, 0] = lon_bounds[:, 1]
+        chunk_points[:, 2, 0] = lon_bounds[:, 1]
+        chunk_points[:, 3, 0] = lon_bounds[:, 0]
+
+        for row in range(y_size):
+            chunk_points[:, 0, 1] = lat_bounds[row, 0]
+            chunk_points[:, 1, 1] = lat_bounds[row, 0]
+            chunk_points[:, 2, 1] = lat_bounds[row, 1]
+            chunk_points[:, 3, 1] = lat_bounds[row, 1]
+
+            row_slice = slice(row * x_size, (row + 1) * x_size)
+            utils.make_polygons_with_holes(chunk_points, out=out[row_slice])
+
+        return out
 
     @cached_property
     def face_centres(self) -> numpy.ndarray:
@@ -597,13 +590,22 @@ def check_dataset(cls, dataset: xarray.Dataset) -> int | None:
 
     def _make_polygons(self) -> numpy.ndarray:
         # Construct polygons from the bounds of the cells
-        lon_bounds = self.topology.longitude_bounds.values
-        lat_bounds = self.topology.latitude_bounds.values
-
-        # points is a (topology.size, 4, 2) array of the corners of each cell
-        points = numpy.stack([lon_bounds, lat_bounds], axis=-1).reshape((-1, 4, 2))
-
-        return utils.make_polygons_with_holes(points)
+        j_size, i_size = self.topology.shape
+        lon_bounds = self.topology.longitude_bounds
+        lat_bounds = self.topology.latitude_bounds
+
+        assert lon_bounds.shape == (j_size, i_size, 4)
+        assert lat_bounds.shape == (j_size, i_size, 4)
+
+        chunk_points = numpy.empty(shape=(i_size, 4, 2), dtype=lon_bounds.dtype)
+        out = numpy.full(shape=j_size * i_size, dtype=object, fill_value=None)
+        for j in range(j_size):
+            chunk_points[:, :, 0] = lon_bounds[j, :, :]
+            chunk_points[:, :, 1] = lat_bounds[j, :, :]
+            chunk_slice = slice(j * i_size, (j + 1) * i_size)
+            utils.make_polygons_with_holes(chunk_points, out=out[chunk_slice])
+
+        return out
 
     @cached_property
     def face_centres(self) -> numpy.ndarray:
diff --git a/src/emsarray/conventions/ugrid.py b/src/emsarray/conventions/ugrid.py
@@ -7,6 +7,7 @@
 """
 import enum
 import logging
+import math
 import pathlib
 import warnings
 from collections import defaultdict
@@ -1115,10 +1116,15 @@ def _make_polygons(self) -> numpy.ndarray:
         for unique_size in unique_sizes:
             # Extract the face node data for every polygon of this size
             indices = numpy.flatnonzero(polygon_sizes == unique_size)
-            nodes = numpy.ma.getdata(face_node)[indices, :unique_size]
-            coords = numpy.stack([node_x[nodes], node_y[nodes]], axis=-1)
-            # Generate the polygons directly in to their correct locations
-            shapely.polygons(coords, indices=indices, out=polygons)
+            chunk_size = 1000
+            chunk_count = math.ceil(len(indices) / chunk_size)
+            for chunk_index in range(chunk_count):
+                chunk_slice = slice(chunk_index * chunk_size, (chunk_index + 1) * chunk_size)
+                chunk_indices = indices[chunk_slice]
+                nodes = numpy.ma.getdata(face_node)[chunk_indices, :unique_size]
+                coords = numpy.stack([node_x[nodes], node_y[nodes]], axis=-1)
+                # Generate the polygons directly in to their correct locations
+                shapely.polygons(coords, indices=chunk_indices, out=polygons)
 
         return polygons
 
diff --git a/tests/conventions/test_cfgrid1d.py b/tests/conventions/test_cfgrid1d.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import pathlib
 
 import numpy
@@ -15,7 +16,11 @@
     CFGrid1D, CFGrid1DTopology, CFGridKind, CFGridTopology
 )
 from emsarray.operations import geometry
-from tests.utils import assert_property_not_cached, box, mask_from_strings
+from tests.utils import (
+    assert_property_not_cached, box, mask_from_strings, track_peak_memory_usage
+)
+
+logger = logging.getLogger(__name__)
 
 
 def make_dataset(
@@ -500,3 +505,17 @@ def test_topology():
     assert_allclose(
         latitude_bounds.values,
         0.1 * numpy.array([[i - 0.5, i + 0.5] for i in range(11)]))
+
+
+def test_make_polygon_memory_usage() -> None:
+    width, height = 2000, 1000
+    dataset = make_dataset(width=width, height=height)
+
+    with track_peak_memory_usage() as tracker:
+        assert len(dataset.ems.polygons) == width * height
+
+    logger.info("current memory usage: %d, peak memory usage: %d", tracker.current, tracker.peak)
+
+    target = 135_000_000
+    assert tracker.peak < target, "Peak memory allocation is too large"
+    assert tracker.peak > target * 0.9, "Peak memory allocation is suspiciously small - did you improve things?"
diff --git a/tests/conventions/test_cfgrid2d.py b/tests/conventions/test_cfgrid2d.py
@@ -7,6 +7,7 @@
 """
 import itertools
 import json
+import logging
 import pathlib
 
 import numpy
@@ -25,9 +26,12 @@
 from emsarray.operations import geometry
 from tests.utils import (
     AxisAlignedShocGrid, DiagonalShocGrid, ShocGridGenerator,
-    ShocLayerGenerator, assert_property_not_cached, plot_geometry
+    ShocLayerGenerator, assert_property_not_cached, plot_geometry,
+    track_peak_memory_usage
 )
 
+logger = logging.getLogger(__name__)
+
 
 def make_dataset(
     *,
@@ -497,3 +501,17 @@ def test_plot_on_figure() -> None:
     dataset.ems.plot_on_figure(figure, surface_temp)
 
     assert len(figure.axes) == 2
+
+
+def test_make_polygon_memory_usage() -> None:
+    j_size, i_size = 1000, 2000
+    dataset = make_dataset(j_size=j_size, i_size=i_size)
+
+    with track_peak_memory_usage() as tracker:
+        assert len(dataset.ems.polygons) == j_size * i_size
+
+    logger.info("current memory usage: %d, peak memory usage: %d", tracker.current, tracker.peak)
+
+    target = 300_000_000
+    assert tracker.peak < target, "Peak memory allocation is too large"
+    assert tracker.peak > target * 0.9, "Peak memory allocation is suspiciously small - did you improve things?"
diff --git a/tests/conventions/test_shoc_standard.py b/tests/conventions/test_shoc_standard.py
@@ -1,5 +1,6 @@
 import itertools
 import json
+import logging
 import pathlib
 
 import numpy
@@ -17,9 +18,12 @@
 from emsarray.conventions.shoc import ShocStandard
 from emsarray.operations import geometry
 from tests.utils import (
-    DiagonalShocGrid, ShocGridGenerator, ShocLayerGenerator, mask_from_strings
+    DiagonalShocGrid, ShocGridGenerator, ShocLayerGenerator, mask_from_strings,
+    track_peak_memory_usage
 )
 
+logger = logging.getLogger(__name__)
+
 
 def make_dataset(
     *,
@@ -657,3 +661,18 @@ def clip_values(values: numpy.ndarray) -> numpy.ndarray:
     assert clipped.ems.polygons[6] is None
     assert clipped.ems.polygons[7].equals_exact(original_polygons[7], 1e-6)
     assert clipped.ems.polygons[8] is None
+
+
+@pytest.mark.memory_usage
+def test_make_polygons_memory_usage():
+    j_size, i_size = 1000, 2000
+    dataset = make_dataset(j_size=j_size, i_size=i_size)
+
+    with track_peak_memory_usage() as tracker:
+        assert len(dataset.ems.polygons) == j_size * i_size
+
+    logger.info("current memory usage: %d, peak memory usage: %d", tracker.current, tracker.peak)
+
+    target = 134_500_000
+    assert tracker.peak < target, "Peak memory allocation is too large"
+    assert tracker.peak > target * 0.9, "Peak memory allocation is suspiciously small - did you improve things?"
diff --git a/tests/conventions/test_ugrid.py b/tests/conventions/test_ugrid.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import pathlib
 import warnings
 
@@ -20,7 +21,11 @@
     ConventionViolationError, ConventionViolationWarning
 )
 from emsarray.operations import geometry
-from tests.utils import assert_property_not_cached, filter_warning
+from tests.utils import (
+    assert_property_not_cached, filter_warning, track_peak_memory_usage
+)
+
+logger = logging.getLogger(__name__)
 
 
 def make_faces(width: int, height, fill_value: int) -> tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]:
@@ -981,3 +986,17 @@ def test_has_valid_face_edge_connectivity():
     dataset_fill_value_above_range['Mesh2_face_edges'].encoding['_FillValue'] = 89
 
     assert dataset_fill_value_above_range.ems.topology.has_valid_face_edge_connectivity is True
+
+
+@pytest.mark.memory_usage
+def test_make_polygons_memory_usage():
+    dataset = make_dataset(width=600, height=600)
+
+    with track_peak_memory_usage() as tracker:
+        assert len(dataset.ems.polygons) == dataset.ems.topology.face_count
+
+    logger.info("current memory usage: %d, peak memory usage: %d", tracker.current, tracker.peak)
+
+    target = 78_000_000
+    assert tracker.peak < target, "Peak memory allocation is too large"
+    assert tracker.peak > target * 0.9, "Peak memory allocation is suspiciously small - did you improve things?"
diff --git a/tests/utils.py b/tests/utils.py
@@ -2,10 +2,12 @@
 import contextlib
 import importlib.metadata
 import itertools
+import tracemalloc
 import warnings
 from collections.abc import Hashable
 from functools import cached_property
-from typing import Any
+from types import TracebackType
+from typing import Any, Type
 
 import matplotlib.pyplot as plt
 import numpy
@@ -462,3 +464,41 @@ def plot_geometry(
         axes.set_extent(extent)
 
     figure.savefig(out)
+
+
+class TracemallocTracker:
+    _finished = False
+    _usage = None
+
+    def __enter__(self):
+        tracemalloc.start()
+        return self
+
+    @property
+    def current(self):
+        if not self._finished:
+            raise RuntimeError("Context manager has not exited yet")
+        return self._usage[0]
+
+    @property
+    def peak(self):
+        if not self._finished:
+            raise RuntimeError("Context manager has not exited yet")
+        return self._usage[1]
+
+    def __exit__(
+        self,
+        exc_type: Type[Exception] | None,
+        exc_value: Exception | None,
+        exc_traceback: TracebackType | None,
+    ) -> bool | None:
+        self._finished = True
+        self._usage = tracemalloc.get_traced_memory()
+
+        tracemalloc.stop()
+
+        return None
+
+
+def track_peak_memory_usage():
+    return TracemallocTracker()

Original file line number	Diff line number	Diff line change
`@@ -102,6 +102,7 @@ filterwarnings = [`
`102`	`102`	`markers = [`
`103`	`103`	`"matplotlib: Tests that involve matplotlib and plotting",`
`104`	`104`	`"tutorial: Tests that involve the tutorial datasets",`
	`105`	`+ "memory_usage: Regression tests for memory allocations",`
`105`	`106`	`]`
`106`	`107`
`107`	`108`	`mpl-use-full-test-name = true`