implement coders, adapt tests

kmuehlbauer · kmuehlbauer · commit bcd8b6e2cafd · 2023-04-01T10:22:47.000+02:00
diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
@@ -78,6 +78,72 @@ def __repr__(self) -> str:
         )
 
 
+class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
+    """Decode arrays on the fly from non-native to native endianness
+
+    This is useful for decoding arrays from netCDF3 files (which are all
+    big endian) into native endianness, so they can be used with Cython
+    functions, such as those found in bottleneck and pandas.
+
+    >>> x = np.arange(5, dtype=">i2")
+
+    >>> x.dtype
+    dtype('>i2')
+
+    >>> NativeEndiannessArray(x).dtype
+    dtype('int16')
+
+    >>> indexer = indexing.BasicIndexer((slice(None),))
+    >>> NativeEndiannessArray(x)[indexer].dtype
+    dtype('int16')
+    """
+
+    __slots__ = ("array",)
+
+    def __init__(self, array):
+        self.array = indexing.as_indexable(array)
+
+    @property
+    def dtype(self):
+        return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize))
+
+    def __getitem__(self, key):
+        return np.asarray(self.array[key], dtype=self.dtype)
+
+
+class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
+    """Decode arrays on the fly from integer to boolean datatype
+
+    This is useful for decoding boolean arrays from integer typed netCDF
+    variables.
+
+    >>> x = np.array([1, 0, 1, 1, 0], dtype="i1")
+
+    >>> x.dtype
+    dtype('int8')
+
+    >>> BoolTypeArray(x).dtype
+    dtype('bool')
+
+    >>> indexer = indexing.BasicIndexer((slice(None),))
+    >>> BoolTypeArray(x)[indexer].dtype
+    dtype('bool')
+    """
+
+    __slots__ = ("array",)
+
+    def __init__(self, array):
+        self.array = indexing.as_indexable(array)
+
+    @property
+    def dtype(self):
+        return np.dtype("bool")
+
+    def __getitem__(self, key):
+        return np.asarray(self.array[key], dtype=self.dtype)
+
+
+
 def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike):
     """Lazily apply an element-wise function to an array.
     Parameters
@@ -349,3 +415,99 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
             return Variable(dims, data, attrs, encoding, fastpath=True)
         else:
             return variable
+
+
+class DefaultFillvalueCoder(VariableCoder):
+    """Encode default _FillValue if needed."""
+
+    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
+        dims, data, attrs, encoding = unpack_for_encoding(variable)
+        # make NaN the fill value for float types
+        if (
+            "_FillValue" not in attrs
+            and "_FillValue" not in encoding
+            and np.issubdtype(variable.dtype, np.floating)
+        ):
+            attrs["_FillValue"] = variable.dtype.type(np.nan)
+            return Variable(dims, data, attrs, encoding, fastpath=True)
+        else:
+            return variable
+    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
+        raise NotImplementedError()
+
+
+class BooleanCoder(VariableCoder):
+    """Code boolean values."""
+
+    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
+        if (
+            (variable.dtype == bool)
+            and ("dtype" not in variable.encoding)
+            and ("dtype" not in variable.attrs)
+        ):
+            dims, data, attrs, encoding = unpack_for_encoding(variable)
+            attrs["dtype"] = "bool"
+            data = duck_array_ops.astype(data, dtype="i1", copy=True)
+
+            return Variable(dims, data, attrs, encoding, fastpath=True)
+        else:
+            return variable
+
+    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
+        if variable.attrs.get("dtype", False) == "bool":
+            dims, data, attrs, encoding = unpack_for_decoding(variable)
+            del attrs["dtype"]
+            data = BoolTypeArray(data)
+            return Variable(dims, data, attrs, encoding, fastpath=True)
+        else:
+            return variable
+
+
+class EndianCoder(VariableCoder):
+    """Decode Endianness to native."""
+
+    def encode(self):
+        raise NotImplementedError()
+
+    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
+        dims, data, attrs, encoding = unpack_for_decoding(variable)
+        if not data.dtype.isnative:
+            data = NativeEndiannessArray(data)
+            return Variable(dims, data, attrs, encoding, fastpath=True)
+        else:
+            return variable
+
+
+class NonStringCoder(VariableCoder):
+    """Encode NonString variables if dtypes differ."""
+
+    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
+        if "dtype" in variable.encoding and variable.encoding["dtype"] not in (
+            "S1",
+            str,
+        ):
+            dims, data, attrs, encoding = unpack_for_encoding(variable)
+            dtype = np.dtype(encoding.pop("dtype"))
+            if dtype != variable.dtype:
+                if np.issubdtype(dtype, np.integer):
+                    if (
+                        np.issubdtype(variable.dtype, np.floating)
+                        and "_FillValue" not in variable.attrs
+                        and "missing_value" not in variable.attrs
+                    ):
+                        warnings.warn(
+                            f"saving variable {name} with floating "
+                            "point data as an integer dtype without "
+                            "any _FillValue to use for NaNs",
+                            SerializationWarning,
+                            stacklevel=10,
+                        )
+                    data = np.around(data)
+                data = data.astype(dtype=dtype)
+            return Variable(dims, data, attrs, encoding, fastpath=True)
+        else:
+            return variable
+
+    def decode(self):
+        raise NotImplementedError()
+
diff --git a/xarray/conventions.py b/xarray/conventions.py
@@ -48,123 +48,10 @@
     T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore]
 
 
-class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
-    """Decode arrays on the fly from non-native to native endianness
-
-    This is useful for decoding arrays from netCDF3 files (which are all
-    big endian) into native endianness, so they can be used with Cython
-    functions, such as those found in bottleneck and pandas.
-
-    >>> x = np.arange(5, dtype=">i2")
-
-    >>> x.dtype
-    dtype('>i2')
-
-    >>> NativeEndiannessArray(x).dtype
-    dtype('int16')
-
-    >>> indexer = indexing.BasicIndexer((slice(None),))
-    >>> NativeEndiannessArray(x)[indexer].dtype
-    dtype('int16')
-    """
-
-    __slots__ = ("array",)
-
-    def __init__(self, array):
-        self.array = indexing.as_indexable(array)
-
-    @property
-    def dtype(self):
-        return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize))
-
-    def __getitem__(self, key):
-        return np.asarray(self.array[key], dtype=self.dtype)
-
-
-class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
-    """Decode arrays on the fly from integer to boolean datatype
-
-    This is useful for decoding boolean arrays from integer typed netCDF
-    variables.
-
-    >>> x = np.array([1, 0, 1, 1, 0], dtype="i1")
-
-    >>> x.dtype
-    dtype('int8')
-
-    >>> BoolTypeArray(x).dtype
-    dtype('bool')
-
-    >>> indexer = indexing.BasicIndexer((slice(None),))
-    >>> BoolTypeArray(x)[indexer].dtype
-    dtype('bool')
-    """
-
-    __slots__ = ("array",)
-
-    def __init__(self, array):
-        self.array = indexing.as_indexable(array)
-
-    @property
-    def dtype(self):
-        return np.dtype("bool")
-
-    def __getitem__(self, key):
-        return np.asarray(self.array[key], dtype=self.dtype)
-
-
 def _var_as_tuple(var: Variable) -> T_VarTuple:
     return var.dims, var.data, var.attrs.copy(), var.encoding.copy()
 
 
-def maybe_encode_nonstring_dtype(var: Variable, name: T_Name = None) -> Variable:
-    if "dtype" in var.encoding and var.encoding["dtype"] not in ("S1", str):
-        dims, data, attrs, encoding = _var_as_tuple(var)
-        dtype = np.dtype(encoding.pop("dtype"))
-        if dtype != var.dtype:
-            if np.issubdtype(dtype, np.integer):
-                if (
-                    np.issubdtype(var.dtype, np.floating)
-                    and "_FillValue" not in var.attrs
-                    and "missing_value" not in var.attrs
-                ):
-                    warnings.warn(
-                        f"saving variable {name} with floating "
-                        "point data as an integer dtype without "
-                        "any _FillValue to use for NaNs",
-                        SerializationWarning,
-                        stacklevel=10,
-                    )
-                data = np.around(data)
-            data = data.astype(dtype=dtype)
-        var = Variable(dims, data, attrs, encoding, fastpath=True)
-    return var
-
-
-def maybe_default_fill_value(var: Variable) -> Variable:
-    # make NaN the fill value for float types:
-    if (
-        "_FillValue" not in var.attrs
-        and "_FillValue" not in var.encoding
-        and np.issubdtype(var.dtype, np.floating)
-    ):
-        var.attrs["_FillValue"] = var.dtype.type(np.nan)
-    return var
-
-
-def maybe_encode_bools(var: Variable) -> Variable:
-    if (
-        (var.dtype == bool)
-        and ("dtype" not in var.encoding)
-        and ("dtype" not in var.attrs)
-    ):
-        dims, data, attrs, encoding = _var_as_tuple(var)
-        attrs["dtype"] = "bool"
-        data = duck_array_ops.astype(data, dtype="i1", copy=True)
-        var = Variable(dims, data, attrs, encoding, fastpath=True)
-    return var
-
-
 def _infer_dtype(array, name: T_Name = None) -> np.dtype:
     """Given an object array with no missing values, infer its dtype from its
     first element
@@ -292,13 +179,13 @@ def encode_cf_variable(
         variables.CFScaleOffsetCoder(),
         variables.CFMaskCoder(),
         variables.UnsignedIntegerCoder(),
+        variables.NonStringCoder(),
+        variables.DefaultFillvalueCoder(),
+        variables.BooleanCoder(),
     ]:
         var = coder.encode(var, name=name)
 
-    # TODO(shoyer): convert all of these to use coders, too:
-    var = maybe_encode_nonstring_dtype(var, name=name)
-    var = maybe_default_fill_value(var)
-    var = maybe_encode_bools(var)
+    # TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends:
     var = ensure_dtype_not_object(var, name=name)
 
     for attr_name in CF_RELATED_DATA:
@@ -389,19 +276,15 @@ def decode_cf_variable(
     if decode_times:
         var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)
 
-    dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)
-    # TODO(shoyer): convert everything below to use coders
+    if decode_endianness and not var.dtype.isnative:
+        var = variables.EndianCoder().decode(var)
+        original_dtype = var.dtype
 
-    if decode_endianness and not data.dtype.isnative:
-        # do this last, so it's only done if we didn't already unmask/scale
-        data = NativeEndiannessArray(data)
-        original_dtype = data.dtype
+    var = variables.BooleanCoder().decode(var)
 
-    encoding.setdefault("dtype", original_dtype)
+    dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)
 
-    if "dtype" in attributes and attributes["dtype"] == "bool":
-        del attributes["dtype"]
-        data = BoolTypeArray(data)
+    encoding.setdefault("dtype", original_dtype)
 
     if not is_duck_dask_array(data):
         data = indexing.LazilyIndexedArray(data)
diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py
@@ -32,7 +32,7 @@
 class TestBoolTypeArray:
     def test_booltype_array(self) -> None:
         x = np.array([1, 0, 1, 1, 0], dtype="i1")
-        bx = conventions.BoolTypeArray(x)
+        bx = coding.variables.BoolTypeArray(x)
         assert bx.dtype == bool
         assert_array_equal(bx, np.array([True, False, True, True, False], dtype=bool))
 
@@ -41,7 +41,7 @@ class TestNativeEndiannessArray:
     def test(self) -> None:
         x = np.arange(5, dtype=">i8")
         expected = np.arange(5, dtype="int64")
-        a = conventions.NativeEndiannessArray(x)
+        a = coding.variables.NativeEndiannessArray(x)
         assert a.dtype == expected.dtype
         assert a.dtype == expected[:].dtype
         assert_array_equal(a, expected)
@@ -247,7 +247,7 @@ def test_decode_coordinates(self) -> None:
     def test_0d_int32_encoding(self) -> None:
         original = Variable((), np.int32(0), encoding={"dtype": "int64"})
         expected = Variable((), np.int64(0))
-        actual = conventions.maybe_encode_nonstring_dtype(original)
+        actual = coding.variables.NonStringCoder().encode(original)
         assert_identical(expected, actual)
 
     def test_decode_cf_with_multiple_missing_values(self) -> None: