Skip to content

Commit 5c757a4

Browse files
committed
None chunk sizes should work now
1 parent ba0042d commit 5c757a4

File tree

4 files changed

+175
-48
lines changed

4 files changed

+175
-48
lines changed

tests/test_api.py

Lines changed: 77 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -142,10 +142,20 @@ def test_some_slices_local_output_to_existing_dir_force_new(self):
142142
zappend(slices, target_dir=target_dir, force_new=True)
143143
self.assertEqual(False, lock_file.exists())
144144

145-
def test_some_slices_with_class_slice_source(self):
145+
def test_some_slices_with_slice_source_class(self):
146+
class DropTsm(SliceSource):
147+
def __init__(self, slice_ds):
148+
self.slice_ds = slice_ds
149+
150+
def get_dataset(self) -> xr.Dataset:
151+
return self.slice_ds.drop_vars(["tsm"])
152+
153+
def dispose(self):
154+
pass
155+
146156
target_dir = "memory://target.zarr"
147157
slices = [make_test_dataset(index=3 * i) for i in range(3)]
148-
zappend(slices, target_dir=target_dir, slice_source=MySliceSource)
158+
zappend(slices, target_dir=target_dir, slice_source=DropTsm)
149159
ds = xr.open_zarr(target_dir)
150160
self.assertEqual({"time": 9, "y": 50, "x": 100}, ds.sizes)
151161
self.assertEqual({"chl"}, set(ds.data_vars))
@@ -158,13 +168,13 @@ def test_some_slices_with_class_slice_source(self):
158168
ds.attrs,
159169
)
160170

161-
def test_some_slices_with_func_slice_source(self):
162-
def process_slice(slice_ds: xr.Dataset) -> SliceSource:
163-
return MySliceSource(slice_ds)
171+
def test_some_slices_with_slice_source_func(self):
172+
def drop_tsm(slice_ds: xr.Dataset) -> xr.Dataset:
173+
return slice_ds.drop_vars(["tsm"])
164174

165175
target_dir = "memory://target.zarr"
166176
slices = [make_test_dataset(index=3 * i) for i in range(3)]
167-
zappend(slices, target_dir=target_dir, slice_source=process_slice)
177+
zappend(slices, target_dir=target_dir, slice_source=drop_tsm)
168178
ds = xr.open_zarr(target_dir)
169179
self.assertEqual({"time": 9, "y": 50, "x": 100}, ds.sizes)
170180
self.assertEqual({"chl"}, set(ds.data_vars))
@@ -177,9 +187,67 @@ def process_slice(slice_ds: xr.Dataset) -> SliceSource:
177187
ds.attrs,
178188
)
179189

180-
def test_some_slices_with_cropping_slice_source(self):
181-
# TODO: implement me after #78
182-
pass
190+
# See https://github.com/bcdev/zappend/issues/77
191+
def test_some_slices_with_cropping_slice_source_no_chunks_spec(self):
192+
def crop_ds(slice_ds: xr.Dataset) -> xr.Dataset:
193+
w = slice_ds.x.size
194+
h = slice_ds.y.size
195+
return slice_ds.isel(x=slice(5, w - 5), y=slice(5, h - 5))
196+
197+
target_dir = "memory://target.zarr"
198+
slices = [make_test_dataset(index=3 * i) for i in range(3)]
199+
zappend(slices, target_dir=target_dir, slice_source=crop_ds)
200+
ds = xr.open_zarr(target_dir)
201+
self.assertEqual({"time": 9, "y": 40, "x": 90}, ds.sizes)
202+
self.assertEqual({"chl", "tsm"}, set(ds.data_vars))
203+
self.assertEqual({"time", "y", "x"}, set(ds.coords))
204+
self.assertEqual((90,), ds.x.encoding.get("chunks"))
205+
self.assertEqual((40,), ds.y.encoding.get("chunks"))
206+
self.assertEqual((3,), ds.time.encoding.get("chunks"))
207+
# Chunk sizes are the ones of the original array, because we have not
208+
# specified chunks in encoding.
209+
self.assertEqual((1, 25, 45), ds.chl.encoding.get("chunks"))
210+
self.assertEqual((1, 25, 45), ds.tsm.encoding.get("chunks"))
211+
212+
# See https://github.com/bcdev/zappend/issues/77
213+
def test_some_slices_with_cropping_slice_source_with_chunks_spec(self):
214+
def crop_ds(slice_ds: xr.Dataset) -> xr.Dataset:
215+
w = slice_ds.x.size
216+
h = slice_ds.y.size
217+
return slice_ds.isel(x=slice(5, w - 5), y=slice(5, h - 5))
218+
219+
variables = {
220+
"*": {
221+
"encoding": {
222+
"chunks": None,
223+
}
224+
},
225+
"chl": {
226+
"encoding": {
227+
"chunks": [1, None, None],
228+
}
229+
},
230+
"tsm": {
231+
"encoding": {
232+
"chunks": [None, 25, 50],
233+
}
234+
},
235+
}
236+
237+
target_dir = "memory://target.zarr"
238+
slices = [make_test_dataset(index=3 * i) for i in range(3)]
239+
zappend(
240+
slices, target_dir=target_dir, slice_source=crop_ds, variables=variables
241+
)
242+
ds = xr.open_zarr(target_dir)
243+
self.assertEqual({"time": 9, "y": 40, "x": 90}, ds.sizes)
244+
self.assertEqual({"chl", "tsm"}, set(ds.data_vars))
245+
self.assertEqual({"time", "y", "x"}, set(ds.coords))
246+
self.assertEqual((90,), ds.x.encoding.get("chunks"))
247+
self.assertEqual((40,), ds.y.encoding.get("chunks"))
248+
self.assertEqual((3,), ds.time.encoding.get("chunks"))
249+
self.assertEqual((1, 40, 90), ds.chl.encoding.get("chunks"))
250+
self.assertEqual((3, 25, 50), ds.tsm.encoding.get("chunks"))
183251

184252
def test_some_slices_with_inc_append_step(self):
185253
target_dir = "memory://target.zarr"
@@ -395,14 +463,3 @@ def test_some_slices_with_profiling(self):
395463
finally:
396464
if os.path.exists("prof.out"):
397465
os.remove("prof.out")
398-
399-
400-
class MySliceSource(SliceSource):
401-
def __init__(self, slice_ds):
402-
self.slice_ds = slice_ds
403-
404-
def get_dataset(self) -> xr.Dataset:
405-
return self.slice_ds.drop_vars(["tsm"])
406-
407-
def dispose(self):
408-
pass

tests/test_metadata.py

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -291,6 +291,47 @@ def test_variable_encoding_from_netcdf(self):
291291
).to_dict(),
292292
)
293293

294+
def test_variable_encoding_can_deal_with_chunk_size_none(self):
295+
# See https://github.com/bcdev/zappend/issues/77
296+
a = xr.DataArray(np.zeros((2, 3, 4)), dims=("time", "y", "x"))
297+
b = xr.DataArray(np.zeros((2, 3, 4)), dims=("time", "y", "x"))
298+
self.assertEqual(
299+
{
300+
"attrs": {},
301+
"sizes": {"time": 2, "x": 4, "y": 3},
302+
"variables": {
303+
"a": {
304+
"attrs": {},
305+
"dims": ("time", "y", "x"),
306+
"encoding": {"chunks": (1, 3, 4)},
307+
"shape": (2, 3, 4),
308+
},
309+
"b": {
310+
"attrs": {},
311+
"dims": ("time", "y", "x"),
312+
"encoding": {"chunks": (2, 2, 3)},
313+
"shape": (2, 3, 4),
314+
},
315+
},
316+
},
317+
DatasetMetadata.from_dataset(
318+
xr.Dataset(
319+
{
320+
"a": a,
321+
"b": b,
322+
}
323+
),
324+
make_config(
325+
{
326+
"variables": {
327+
"a": {"encoding": {"chunks": [1, None, None]}},
328+
"b": {"encoding": {"chunks": [None, 2, 3]}},
329+
},
330+
}
331+
),
332+
).to_dict(),
333+
)
334+
294335
def test_variable_encoding_normalisation(self):
295336
def normalize(k, v):
296337
metadata = DatasetMetadata.from_dataset(
@@ -363,6 +404,7 @@ def test_it_raises_on_unspecified_variable(self):
363404
),
364405
)
365406

407+
# noinspection PyMethodMayBeStatic
366408
def test_it_raises_on_wrong_size_found_in_ds(self):
367409
with pytest.raises(
368410
ValueError,

zappend/metadata.py

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,8 @@ class Undefined:
2222

2323
Codec = numcodecs.abc.Codec
2424

25+
NoneType = type(None)
26+
2527

2628
class VariableEncoding:
2729
"""The Zarr encoding of a dataset's variable.
@@ -346,6 +348,21 @@ def _get_effective_variables(
346348
chunk_sizes = encoding.pop("chunksizes")
347349
if "chunks" not in encoding:
348350
encoding["chunks"] = chunk_sizes
351+
352+
# Handle case where a chunk size is None to indicate
353+
# dimension is not chunked.
354+
# See https://github.com/bcdev/zappend/issues/77
355+
if (
356+
"chunks" in encoding
357+
and encoding["chunks"] is not None
358+
and None in encoding["chunks"]
359+
):
360+
chunks = encoding["chunks"]
361+
encoding["chunks"] = tuple(
362+
(dataset.sizes[dim_name] if chunk_size is None else chunk_size)
363+
for dim_name, chunk_size in zip(dims, chunks)
364+
)
365+
349366
variables[var_name] = VariableMetadata(
350367
dims=dims, shape=shape, encoding=VariableEncoding(**encoding), attrs=attrs
351368
)
@@ -364,7 +381,7 @@ def _normalize_chunks(value: Any) -> tuple[int, ...] | None:
364381
if not value:
365382
return None
366383
assert isinstance(value, (tuple, list))
367-
return tuple((v if isinstance(v, int) else v[0]) for v in value)
384+
return tuple((v if isinstance(v, (int, NoneType)) else v[0]) for v in value)
368385

369386

370387
def _normalize_number(value: Any) -> int | float | None:

zappend/tailoring.py

Lines changed: 38 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ def tailor_slice_dataset(ctx: Context, slice_ds: xr.Dataset) -> xr.Dataset:
8383

8484

8585
def _strip_dataset(dataset: xr.Dataset, target_metadata: DatasetMetadata) -> xr.Dataset:
86+
"""Remove unwanted variables from `dataset` and return a copy."""
8687
drop_var_names = set(map(str, dataset.variables.keys())) - set(
8788
target_metadata.variables.keys()
8889
)
@@ -92,36 +93,46 @@ def _strip_dataset(dataset: xr.Dataset, target_metadata: DatasetMetadata) -> xr.
9293
def _complete_dataset(
9394
dataset: xr.Dataset, target_metadata: DatasetMetadata
9495
) -> xr.Dataset:
96+
undefined = object()
97+
"""Chunk existing variables according to chunks in encoding or
98+
add missing variables to `dataset` (in-place operation) and return it.
99+
"""
95100
for var_name, var_metadata in target_metadata.variables.items():
96101
var = dataset.variables.get(var_name)
97-
if var is not None:
98-
continue
99-
logger.warning(
100-
f"Variable {var_name!r} not found in slice dataset;" f" creating it."
101-
)
102102
encoding = var_metadata.encoding.to_dict()
103-
chunks = encoding.get("chunks")
104-
if chunks is None:
105-
chunks = var_metadata.shape
106-
if encoding.get("_FillValue") is not None:
107-
# Since we have a defined fill value, the decoded in-memory
108-
# variable uses NaN where fill value will be stored.
109-
# This ia also what xarray does if decode_cf=True.
110-
memory_dtype = np.dtype("float64")
111-
memory_fill_value = float("NaN")
103+
chunks = encoding.get("chunks", undefined)
104+
if var is not None:
105+
if chunks is None:
106+
# May emit warning for large shapes
107+
chunks = var_metadata.shape
108+
if chunks is not undefined:
109+
var = var.chunk(chunks=chunks)
112110
else:
113-
# Fill value is not defined, so we use the data type
114-
# defined in the encoding, if any and fill memory with zeros.
115-
memory_dtype = encoding.get("dtype", np.dtype("float64"))
116-
memory_fill_value = 0
117-
var = xr.DataArray(
118-
dask.array.full(
119-
var_metadata.shape,
120-
memory_fill_value,
121-
chunks=chunks,
122-
dtype=np.dtype(memory_dtype),
123-
),
124-
dims=var_metadata.dims,
125-
)
111+
logger.warning(
112+
f"Variable {var_name!r} not found in slice dataset; creating it."
113+
)
114+
if chunks is None or chunks is undefined:
115+
# May emit warning for large shapes
116+
chunks = var_metadata.shape
117+
if encoding.get("_FillValue") is not None:
118+
# Since we have a defined fill value, the decoded in-memory
119+
# variable uses NaN where fill value will be stored.
120+
# This is also what xarray does if decode_cf=True.
121+
memory_dtype = np.dtype("float64")
122+
memory_fill_value = float("NaN")
123+
else:
124+
# Fill value is not defined, so we use the data type
125+
# defined in the encoding, if any and fill memory with zeros.
126+
memory_dtype = encoding.get("dtype", np.dtype("float64"))
127+
memory_fill_value = 0
128+
var = xr.DataArray(
129+
dask.array.full(
130+
var_metadata.shape,
131+
memory_fill_value,
132+
chunks=chunks,
133+
dtype=np.dtype(memory_dtype),
134+
),
135+
dims=var_metadata.dims,
136+
)
126137
dataset[var_name] = var
127138
return dataset

0 commit comments

Comments (0)