
Commit 16a2771

Merge pull request #92 from chrishavlin/reorganize_chunking_ops
Reorganize chunking operations
2 parents b6880a8 + 7061d97 commit 16a2771

3 files changed: +150 -32 lines


yt_xarray/accessor/accessor.py

Lines changed: 8 additions & 32 deletions
@@ -10,6 +10,7 @@
 from yt_xarray.accessor import _xr_to_yt
 from yt_xarray.accessor._readers import _get_xarray_reader
 from yt_xarray.accessor._xr_to_yt import _load_full_field_from_xr
+from yt_xarray.utilities._grid_decomposition import ChunkInfo
 from yt_xarray.utilities.logging import ytxr_log
 
 
@@ -535,18 +536,14 @@ def _load_chunked_grid(
     # otherwise it is number of nodes (which are treated as new cell centers).
     # the bbox will already account for this as well.
 
-    # do some grid/chunk counting
-    n_chnk = np.asarray(data_shp) / chunksizes  # may not be int
-    n_whl_chnk = np.floor(n_chnk).astype(int)  # whole chunks in each dim
-    n_part_chnk = np.ceil(n_chnk - n_whl_chnk).astype(int)  # partial chunks
-
-    n_tots = np.prod(n_part_chnk + n_whl_chnk)
-    ytxr_log.info(f"Constructing a yt chunked grid with {n_tots} chunks.")
-
     # initialize the global starting index
     si = np.array([0, 0, 0], dtype=int)
     si = sel_info.starting_indices + si
 
+    # do some grid/chunk counting
+    chnkinfo = ChunkInfo(data_shp, chunksizes, starting_index_offset=si)
+    ytxr_log.info(f"Constructing a yt chunked grid with {chnkinfo.n_tots} chunks.")
+
     # select field for grabbing coordinate arrays -- fields should all be
     # verified by now
     fld = fields[0]
@@ -564,29 +561,8 @@ def _load_chunked_grid(
     subgrid_start = []
     subgrid_end = []
     for idim in range(sel_info.ndims):
-        si_0 = si[idim] + chunksizes[idim] * np.arange(n_whl_chnk[idim])
-        ei_0 = si_0 + chunksizes[idim]
-
-        if n_part_chnk[idim] == 1:
-            si_0_partial = ei_0[-1]
-            ei_0_partial = data_shp[idim] - si_0_partial
-            si_0 = np.concatenate(
-                [
-                    si_0,
-                    [
-                        si_0_partial,
-                    ],
-                ]
-            )
-            ei_0 = np.concatenate(
-                [
-                    ei_0,
-                    [
-                        ei_0[-1] + ei_0_partial,
-                    ],
-                ]
-            )
-
+        si_0 = chnkinfo.si[idim]
+        ei_0 = chnkinfo.ei[idim]
         c = cnames[idim]
         rev_ax = sel_info.reverse_axis[idim]
         if rev_ax is False:
@@ -608,7 +584,7 @@ def _load_chunked_grid(
         le_0 = np.concatenate([[min_val], re_0[:-1]])
 
         # sizes also already account for interp_required
-        subgrid_size = ei_0 - si_0
+        subgrid_size = chnkinfo.sizes[idim]
 
         left_edges.append(le_0)
         right_edges.append(re_0)

yt_xarray/tests/test_chunking.py

Lines changed: 39 additions & 0 deletions
@@ -6,6 +6,7 @@
 
 import yt_xarray  # noqa: F401
 from yt_xarray import sample_data
+from yt_xarray.utilities._grid_decomposition import ChunkInfo
 from yt_xarray.utilities._utilities import construct_minimal_ds
 
 
@@ -146,3 +147,41 @@ def test_chunk_bad_length():
 
     with pytest.raises(ValueError, match="The number of elements in "):
         _ = ds.yt.load_grid(length_unit="km", chunksizes=(30, 40, 20, 5))
+
+
+_chunk_tests = [
+    ((20, 30, 40), (10, 15, 20), (0,) * 3, (2, 2, 2)),
+    ((20, 30, 40), (15, 15, 20), (0,) * 3, (2, 2, 2)),
+    ((10, 15, 20), (5, 5, 5), None, (2, 3, 4)),
+    ((10, 15, 20), (5, 5, 5), (1, 2, 3), (2, 3, 4)),
+]
+
+
+@pytest.mark.parametrize("data_shape,chunksizes,si0, expected_nchunks", _chunk_tests)
+def test_chunk_info(data_shape, chunksizes, si0, expected_nchunks):
+    chunksizes = np.array(chunksizes, dtype="int")
+    if si0 is not None:
+        si0 = np.array(si0, dtype="int")
+    ch = ChunkInfo(data_shape, chunksizes, starting_index_offset=si0)
+    chunks = ch.n_whl_chnk + ch.n_part_chnk
+    assert np.all(chunks == np.asarray(expected_nchunks))
+    if si0 is not None:
+        si = np.array([ch.si[id][0] for id in range(3)])
+        assert np.all(si == si0)
+
+
+def test_chunk_info_caching():
+
+    chunksizes = np.array([5, 5, 5], dtype="int")
+    data_shape = (10, 15, 20)
+
+    def _get_ch():
+        return ChunkInfo(data_shape, chunksizes)
+
+    ch = _get_ch()
+    _ = ch.ei
+    ch = _get_ch()
+    _ = ch.sizes
+    assert ch._sizes is not None
+    assert ch._si is not None
+    assert ch._ei is not None
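The parametrized cases double as a usage reference for ChunkInfo. A sketch of the last case above (even division with a nonzero offset); the asserted values follow from the class's arithmetic rather than from any documented API:

import numpy as np

from yt_xarray.utilities._grid_decomposition import ChunkInfo

ch = ChunkInfo(
    (10, 15, 20), np.array([5, 5, 5]), starting_index_offset=np.array([1, 2, 3])
)

assert np.all(ch.n_whl_chnk + ch.n_part_chnk == [2, 3, 4])  # expected_nchunks
assert np.all(ch.si[0] == [1, 6])     # dim-0 chunk starts include the offset
assert np.all(ch.ei[0] == [6, 11])    # dim-0 chunk ends
assert np.all(ch.sizes[0] == [5, 5])  # both dim-0 chunks are whole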

yt_xarray/utilities/_grid_decomposition.py

Lines changed: 103 additions & 0 deletions
@@ -421,3 +421,106 @@ def _get_yt_ds(
         refine_by=refine_by,
         **load_kwargs,
     )
+
+
+class ChunkInfo:
+    """
+    Class for tracking info related to chunked-decomposition of a domain
+
+    Parameters
+    ----------
+    data_shp: Tuple[int,]
+        the global shape of the data to chunk
+    chunksizes: np.ndarray[int]
+        the chunksizes in each dimension of data_shp
+    starting_index_offset: np.ndarray[int]
+        global index offset. start and end indices will be offset
+        by this array. Defaults to [0,0,0].
+    """
+
+    def __init__(
+        self,
+        data_shp: Tuple[int,],
+        chunksizes: np.ndarray,
+        starting_index_offset: np.ndarray = None,
+    ):
+
+        self.chunksizes = chunksizes
+        self.data_shape = np.asarray(data_shp)
+        self.n_chnk = self.data_shape / chunksizes  # may not be int
+        self.n_whl_chnk = np.floor(self.n_chnk).astype(int)  # whole chunks in each dim
+        self.n_part_chnk = np.ceil(self.n_chnk - self.n_whl_chnk).astype(int)
+        self.n_tots = np.prod(self.n_part_chnk + self.n_whl_chnk)
+
+        self.ndim = len(data_shp)
+        if starting_index_offset is None:
+            starting_index_offset = np.zeros(self.data_shape.shape, dtype=int)
+        self.starting_index_offset = starting_index_offset
+
+    _si: List[np.ndarray] = None
+    _ei: List[np.ndarray] = None
+    _sizes: List[np.ndarray] = None
+
+    @property
+    def si(self) -> List[np.ndarray]:
+        """
+        The starting indices of individual chunks by dimension.
+        Includes any global offset.
+        """
+        if self._si is None:
+            si_list = []
+            ei_list = []
+            size_list = []
+            for idim in range(self.ndim):
+
+                # first get the starting and end points of whole chunks
+                si0 = self.starting_index_offset[idim]
+                si_0 = si0 + self.chunksizes[idim] * np.arange(self.n_whl_chnk[idim])
+                ei_0 = si_0 + self.chunksizes[idim]
+
+                # if this dim has a partial chunk at the end, add on a
+                # partial chunk.
+                if self.n_part_chnk[idim] == 1:
+                    si_0_partial = ei_0[-1]
+                    ei_0_partial = self.data_shape[idim] - si_0_partial
+                    si_0 = np.concatenate(
+                        [
+                            si_0,
+                            [
+                                si_0_partial,
+                            ],
+                        ]
+                    )
+                    ei_0 = np.concatenate(
+                        [
+                            ei_0,
+                            [
+                                ei_0[-1] + ei_0_partial,
+                            ],
+                        ]
+                    )
+                si_list.append(si_0)
+                ei_list.append(ei_0)
+                size_list.append(ei_0 - si_0)
+            self._si = si_list
+            self._ei = ei_list
+            self._sizes = size_list
+        return self._si
+
+    @property
+    def ei(self) -> List[np.ndarray]:
+        """
+        The ending indices of individual chunks by dimension.
+        Includes any global offset.
+        """
+        if self._ei is None:
+            _ = self.si
+        assert self._ei is not None
+        return self._ei
+
+    @property
+    def sizes(self) -> List[np.ndarray]:
+        if self._sizes is None:
+            _ = self.si
+        assert self._sizes is not None
+        return self._sizes
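The subtle piece of the new class is the partial-chunk branch inside ChunkInfo.si. A worked single-dimension trace of that branch, assuming no global offset (note that, despite its name, ei_0_partial holds the size of the trailing partial chunk, not an end index):

import numpy as np

# one dimension of length 12 with chunksize 5: two whole chunks plus a partial
si_0 = 0 + 5 * np.arange(2)       # whole-chunk starts: [0, 5]
ei_0 = si_0 + 5                   # whole-chunk ends:   [5, 10]

si_0_partial = ei_0[-1]           # partial chunk starts where whole chunks end: 10
ei_0_partial = 12 - si_0_partial  # its size: 2

si_0 = np.concatenate([si_0, [si_0_partial]])             # [0, 5, 10]
ei_0 = np.concatenate([ei_0, [ei_0[-1] + ei_0_partial]])  # [5, 10, 12]

sizes = ei_0 - si_0  # [5, 5, 2] -- the last chunk covers the remainder

Note also the caching design: computing si fills _si, _ei, and _sizes in a single pass, so accessing any one of the three properties populates the other two. That behavior is exactly what test_chunk_info_caching asserts.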
