Skip to content

Commit 3057be8

Browse files
committed
Fix: mask and scale problem for non-varied data
1 parent c109e30 commit 3057be8

File tree

3 files changed

+110
-31
lines changed

3 files changed

+110
-31
lines changed

kaleidoscope/algorithms/codec.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,83 @@ def decode(
9292
@override
9393
def name(self) -> str:
9494
return "decode"
95+
96+
97+
class Encode(BlockAlgorithm):
    """
    The algorithm to encode data according to CF conventions.

    Encoding removes the add-offset, divides by the scale factor,
    clips to the valid range, and substitutes the fill value for
    missing (NaN) elements of the original data.
    """

    def __init__(self, dtype: np.dtype, m: int):
        """
        Creates a new algorithm instance.

        :param dtype: The result data type.
        :param m: The number of input data dimensions.
        """
        super().__init__(dtype, m, m)

    @override
    def chunks(self, *inputs: da.Array) -> tuple[int, ...] | None:
        return None

    @property
    @override
    def created_axes(self) -> list[int] | None:
        return None

    @property
    @override
    def dropped_axes(self) -> list[int]:
        return []

    # noinspection PyMethodMayBeStatic
    def encode(
        self,
        x: np.ndarray,
        *,
        add_offset: Any = None,
        scale_factor: Any = None,
        fill_value: Any = None,
        valid_min: Any = None,
        valid_max: Any = None,
    ) -> np.ndarray:
        """
        Encodes data.

        :param x: The data.
        :param add_offset: The add-offset.
        :param scale_factor: The scale factor.
        :param fill_value: The fill value.
        :param valid_min: The valid minimum.
        :param valid_max: The valid maximum.
        :return: The encoded data.
        """
        attrs = (add_offset, scale_factor, fill_value, valid_min, valid_max)
        if all(attr is None for attr in attrs):
            # Nothing to encode: pass the data through unchanged.
            return x
        y = x.astype(np.double)
        if add_offset is not None:
            y = y - add_offset
        if scale_factor is not None:
            y = y / scale_factor
        # Clip to the valid range. NaN propagates through minimum and
        # maximum, so missing elements are unaffected by clipping.
        if valid_max is not None:
            y = np.minimum(y, valid_max)
        if valid_min is not None:
            y = np.maximum(y, valid_min)
        if fill_value is not None:
            # Missing elements are identified in the original data,
            # since NaN survives the arithmetic above.
            y[np.isnan(x)] = fill_value
        return y

    compute_block = encode

    @property
    @override
    def name(self) -> str:
        return "encode"

kaleidoscope/operators/randomizeop.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from xarray import Dataset
1818

1919
from ..algorithms.codec import Decode
20+
from ..algorithms.codec import Encode
2021
from ..algorithms.randomize import Randomize
2122
from ..generators import DefaultGenerator
2223
from ..interface.logging import Logging
@@ -36,10 +37,21 @@ def _hash(name: str) -> int:
3637
return h
3738

3839

39-
def _decode(
40-
x: da.Array, a: dict[str:Any], dtype: np.dtype = np.double
41-
) -> da.Array:
42-
f = Decode(dtype, x.ndim)
40+
def _decode(x: da.Array, a: dict[str, Any]) -> da.Array:
    """
    Decodes data according to CF conventions.

    :param x: The encoded data.
    :param a: The variable attributes carrying the CF encoding keys
        (``add_offset``, ``scale_factor``, ``_FillValue``,
        ``valid_min``, ``valid_max``).
    :return: The decoded data.
    """
    # Fixed the attribute-mapping annotation: ``dict[str:Any]`` is a
    # slice expression, not a type; the correct form is ``dict[str, Any]``.
    # Preserve single precision; decode everything else to double.
    f = Decode(np.single if x.dtype == np.single else np.double, x.ndim)
    y = f.apply_to(
        x,
        add_offset=a.get("add_offset"),
        scale_factor=a.get("scale_factor"),
        fill_value=a.get("_FillValue"),
        valid_min=a.get("valid_min"),
        valid_max=a.get("valid_max"),
    )
    return y
51+
52+
53+
def _encode(x: da.Array, a: dict[str:Any], dtype: np.dtype) -> da.Array:
54+
f = Encode(dtype, x.ndim)
4355
y = f.apply_to(
4456
x,
4557
add_offset=a.get("add_offset", None),
@@ -79,11 +91,7 @@ def run(self, source: Dataset) -> Dataset: # noqa: D102
7991
:return: The result dataset.
8092
"""
8193
source_id = source.attrs.get(
82-
"tracking_id",
83-
source.attrs.get(
84-
"uuid",
85-
f"{uuid.uuid5(uuid.NAMESPACE_URL, self._args.source_file.stem)}",
86-
),
94+
"tracking_id", source.attrs.get("uuid", f"{self.uuid}")
8795
)
8896
target: Dataset = Dataset(
8997
data_vars=source.data_vars,
@@ -130,7 +138,10 @@ def run(self, source: Dataset) -> Dataset: # noqa: D102
130138
clip=a.get("clip", None),
131139
)
132140
target[v] = DataArray(
133-
data=z, coords=x.coords, dims=x.dims, attrs=x.attrs
141+
data=_encode(z, x.attrs, x.dtype),
142+
coords=x.coords,
143+
dims=x.dims,
144+
attrs=x.attrs,
134145
)
135146
if "actual_range" in target[v].attrs:
136147
target[v].attrs["actual_range"] = np.array(
@@ -140,7 +151,6 @@ def run(self, source: Dataset) -> Dataset: # noqa: D102
140151
],
141152
dtype=z.dtype,
142153
)
143-
target[v].attrs["dtype"] = x.dtype
144154
target[v].attrs["entropy"] = np.array(s, dtype=np.int64)
145155
if get_logger().is_enabled(Logging.DEBUG):
146156
get_logger().debug(f"entropy: {s}")
@@ -162,6 +172,7 @@ def config(self) -> dict[str : dict[str:Any]]:
162172
config = json.load(r)
163173
return config
164174

175+
# noinspection PyShadowingNames
165176
def entropy(self, name: str, uuid: str, n: int = 4) -> list[int]:
166177
"""
167178
Returns the entropy of the seed sequence used for a given variable.
@@ -179,3 +190,10 @@ def entropy(self, name: str, uuid: str, n: int = 4) -> list[int]:
179190
seed = _hash(f"{name}-{uuid}") + self._args.selector
180191
g = DefaultGenerator(Philox(seed))
181192
return [g.next() for _ in range(n)]
193+
194+
@property
def uuid(self) -> uuid.UUID:
    """
    Returns a UUID constructed from the basename of the source file.

    The result is deterministic: the same source file stem always
    yields the same UUID (version 5, URL namespace).
    """
    stem = self._args.source_file.stem
    return uuid.uuid5(uuid.NAMESPACE_URL, stem)

kaleidoscope/writer.py

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@
1010
from typing import Literal
1111

1212
import numpy as np
13-
from dask.array import Array
1413
from typing_extensions import override
15-
from xarray import DataArray
1614
from xarray import Dataset
1715

1816
from .interface.writing import Writing
@@ -113,10 +111,7 @@ def _encode(self, dataset: Dataset, to_zarr: bool = True):
113111
encodings: dict[str, dict[str, Any]] = {}
114112

115113
for name, array in dataset.data_vars.items():
116-
dtype = array.attrs.pop("dtype", array.dtype)
117-
attrs = array.attrs
118114
data = array.data
119-
120115
dims: list = list(array.dims)
121116
if array.ndim == 0: # not an array
122117
continue
@@ -138,7 +133,7 @@ def _encode(self, dataset: Dataset, to_zarr: bool = True):
138133
else:
139134
chunks.append(data.chunksize[i])
140135
encodings[name] = self._encode_compress(
141-
dtype, attrs, chunks, to_zarr
136+
data.dtype, chunks, to_zarr
142137
)
143138
return encodings
144139

@@ -180,28 +175,14 @@ def _shuffle(self) -> bool:
180175
"""This method does not belong to public API."""
181176
return self._config[_KEY_SHUFFLE] == "true"
182177

183-
@staticmethod
184-
def _encode_variable(
185-
name: str, dims: list[str], attrs: dict[str, Any], array: Array
186-
) -> DataArray:
187-
"""This method does not belong to public API."""
188-
return DataArray(data=array, dims=dims, name=name, attrs=attrs)
189-
190178
def _encode_compress(
191179
self,
192180
dtype: np.dtype,
193-
attrs: dict[str:Any],
194181
chunks: list[int],
195182
to_zarr: bool = True,
196183
) -> dict[str, Any]:
197184
"""This method does not belong to public API."""
198185
enc = {"dtype": dtype}
199-
if "_FillValue" in attrs:
200-
enc["_FillValue"] = attrs.pop("_FillValue")
201-
if "add_offset" in attrs:
202-
enc["add_offset"] = attrs.pop("add_offset")
203-
if "scale_factor" in attrs:
204-
enc["scale_factor"] = attrs.pop("scale_factor")
205186
if chunks:
206187
if to_zarr:
207188
enc["chunks"] = tuple(chunks)

0 commit comments

Comments
 (0)