SciTools
diff --git a/‎lib/iris/fileformats/netcdf/_bytecoding_datasets.py‎
Lines changed: 276 additions & 0 deletions b/‎lib/iris/fileformats/netcdf/_bytecoding_datasets.py‎
Lines changed: 276 additions & 0 deletions
diff --git a/‎lib/iris/fileformats/netcdf/_thread_safe_nc.py‎
Lines changed: 9 additions & 6 deletions b/‎lib/iris/fileformats/netcdf/_thread_safe_nc.py‎
Lines changed: 9 additions & 6 deletions
diff --git a/‎lib/iris/tests/integration/netcdf/test_chararrays.py‎
Lines changed: 5 additions & 2 deletions b/‎lib/iris/tests/integration/netcdf/test_chararrays.py‎
Lines changed: 5 additions & 2 deletions
@@ -0,0 +1,276 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the BSD license.
+# See LICENSE in the root of the repository for full licensing details.
+"""Module providing access to netcdf datasets with automatic character encoding.
+
+The requirement is to convert numpy fixed-width unicode arrays on writing to a variable
+which is declared as a byte (character) array with a fixed-length string dimension.
+
+Numpy unicode string arrays are ones with dtypes of the form "U<character-width>".
+NetCDF character variables have the dtype "S1", and map to a fixed-length "string
+dimension".
+
+In principle, netCDF4 already performs these translations, but in practice current
+releases are not functional for anything other than "ascii" encoding -- including UTF-8,
+which is the most obvious and desirable "general" solution.
+
+There is also the question of whether we should like to implement UTF-8 as our default.
+Current discussions on this are inconclusive and neither CF conventions nor the NetCDF
+User Guide are definite on what possible values of "_Encoding" are, or what the effective
+default is, even though they do both mention the "_Encoding" attribute as a potential
+way to handle the issue.
+
+Because of this, we interpret as follows:
+  * when reading bytes : in the absence of an "_Encoding" attribute, we will attempt to
+    decode bytes as UTF-8
+  * when writing strings : in the absence of an "_Encoding" attribute (on the Iris
+    cube or coord object), we will attempt to encode data with "ascii" : if this
+    fails, we will raise an error prompting the user to supply an "_Encoding" attribute.
+
+Where an "_Encoding" attribute is provided to Iris, we will honour it where possible,
+identifying with "codecs.lookup" :  This means we support the encodings in the Python
+Standard Library, and the name aliases which it recognises.
+
+See:
+
+* known problems https://github.com/Unidata/netcdf4-python/issues/1440
+* suggestions for how this "ought" to work, discussed in the netcdf-c library
+   * https://github.com/Unidata/netcdf-c/issues/402
+
+"""
+
+import codecs
+import contextlib
+import threading
+import warnings
+
+import numpy as np
+
+from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper
+
+
+def decode_bytesarray_to_stringarray(
+    byte_array: np.ndarray, encoding: str, string_width: int | None = None
+) -> np.ndarray:
+    """Convert an array of bytes to an array of strings, with one less dimension.
+
+    N.B. for now at least, we assume the string dim is **always the last one**.
+
+    Parameters
+    ----------
+    byte_array : numpy.ndarray
+        Array of single bytes, whose last dimension is the string dimension.
+        If it is a masked array, the mask is discarded and the raw data is used.
+    encoding : str
+        Name of the codec used to decode the bytes of each element.
+    string_width : int, optional
+        Character width of the result dtype, i.e. "U<string_width>".
+        If not given (None), it is set to the final dimension of 'byte_array'.
+
+    Returns
+    -------
+    numpy.ndarray
+        A unicode string array, shaped like 'byte_array' without its last dimension.
+
+    Raises
+    ------
+    UnicodeDecodeError
+        If the bytes of any element cannot be decoded with 'encoding'.
+    """
+    if np.ma.isMaskedArray(byte_array):
+        # netCDF4-python sees zeros as "missing" -- we don't need or want that
+        byte_array = byte_array.data
+    var_shape = byte_array.shape[:-1]
+    if string_width is None:
+        # Default : allow one character for each byte of the string dimension.
+        string_width = byte_array.shape[-1]
+    result = np.empty(var_shape, dtype=f"U{string_width}")
+    for ndindex in np.ndindex(var_shape):
+        # Empty (falsy) byte elements are put back as explicit NULs before joining,
+        # so that they still occupy one byte each in the sequence to decode.
+        element_bytes = b"".join(b if b else b"\0" for b in byte_array[ndindex])
+        result[ndindex] = element_bytes.decode(encoding)
+    return result
+
+
+#
+# TODO: remove?
+# this older version is "overly flexible", less efficient and not needed here.
+#
+def flexi_encode_stringarray_as_bytearray(
+    data: np.ndarray, encoding=None, string_dimension_length: int | None = None
+) -> np.ndarray:
+    """Encode strings as bytearray.
+
+    Parameters
+    ----------
+    data : numpy.ndarray
+        Array of strings.  If it is a masked array, the mask is discarded.
+    encoding : str, optional
+        Name of the codec used to encode each string.  If not given (None), "utf-8"
+        is used, which is the default of ``str.encode``.
+    string_dimension_length : int, optional
+        Length of the extra last dimension of the result.  If not given (None), it
+        is set to the longest encoded bytes element, **OR** the dtype string width
+        (for a numpy unicode dtype), if that is greater.
+        If specified, content strings are truncated or extended as required.
+
+    Returns
+    -------
+    numpy.ndarray
+        An "S1" array shaped like 'data', plus a final string dimension.
+    """
+    if np.ma.isMaskedArray(data):
+        # netCDF4-python sees zeros as "missing" -- we don't need or want that
+        data = data.data
+    if encoding is None:
+        # str.encode() does not accept encoding=None : apply its own default instead.
+        encoding = "utf-8"
+    element_shape = data.shape
+    # Encode all the strings + see which is longest
+    max_length = 1  # this is a MINIMUM - i.e. not zero!
+    data_elements = np.zeros(element_shape, dtype=object)
+    for index in np.ndindex(element_shape):
+        encoded = data[index].encode(encoding=encoding)
+        data_elements[index] = encoded
+        max_length = max(max_length, len(encoded))
+
+    if string_dimension_length is None:
+        # If the string length was not specified, it is the maximum encoded length
+        # (n-bytes), **or** the dtype string-length, if greater.
+        string_dimension_length = max_length
+        if data.dtype.kind == "U":
+            # A numpy unicode dtype stores 4 bytes per character.
+            array_string_length = data.dtype.itemsize // 4
+            string_dimension_length = max(string_dimension_length, array_string_length)
+
+    # We *already* encoded all the strings above, but stored them in an
+    #  object-array as we didn't yet know the fixed byte-length to convert to.
+    # Now convert to a fixed-width byte array with an extra string-length dimension
+    result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
+    right_pad = b"\0" * string_dimension_length
+    for index in np.ndindex(element_shape):
+        # Pad, then cut : this gives exactly 'string_dimension_length' bytes.
+        padded = (data_elements[index] + right_pad)[:string_dimension_length]
+        result[index] = [padded[i : i + 1] for i in range(string_dimension_length)]
+
+    return result
+
+
+def encode_stringarray_as_bytearray(
+    data: np.typing.ArrayLike, encoding: str, string_dimension_length: int
+) -> np.ndarray:
+    """Convert an array of strings to single bytes, adding a final string dimension.
+
+    Each element is encoded with 'encoding', then NUL-padded or truncated to exactly
+    'string_dimension_length' bytes, which fill the extra last dimension of the
+    "S1" result.
+    """
+    strings = np.asanyarray(data)
+    n_bytes = string_dimension_length
+    byte_array = np.zeros(strings.shape + (n_bytes,), dtype="S1")
+    padding = b"\0" * n_bytes
+    for position in np.ndindex(strings.shape):
+        encoded = strings[position].encode(encoding=encoding)
+        # Pad first, then cut : this always leaves exactly 'n_bytes' bytes.
+        fixed_width = (encoded + padding)[:n_bytes]
+        byte_array[position] = np.frombuffer(fixed_width, dtype="S1")
+
+    return byte_array
+
+
+class NetcdfStringDecodeSetting(threading.local):
+    """A thread-local boolean switch, with a context manager for temporary changes.
+
+    As a subclass of `threading.local`, the stored value is separate for each thread.
+    The truth value of an instance is its current 'perform_encoding' setting.
+    """
+
+    def __init__(self, perform_encoding: bool = True):
+        self.set(perform_encoding)
+
+    def set(self, perform_encoding: bool):
+        """Set the switch to the given state."""
+        self.perform_encoding = perform_encoding
+
+    def __bool__(self):
+        return self.perform_encoding
+
+    @contextlib.contextmanager
+    def context(self, perform_encoding: bool):
+        """Apply a setting for the duration of a 'with' block, then restore the old one."""
+        old_setting = self.perform_encoding
+        self.perform_encoding = perform_encoding
+        try:
+            yield
+        finally:
+            # Restore the previous state even if the 'with' body raised an error.
+            self.perform_encoding = old_setting
+
+
+# Thread-local switch, tested in EncodedVariable.__getitem__ : while it is true (the
+# initial state), character data read from a variable is decoded to strings.
+DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting()
+# Encoding used to decode bytes on read, when the variable does not provide one.
+DEFAULT_READ_ENCODING = "utf-8"
+# Encoding used to encode strings on write, when the variable does not provide one.
+DEFAULT_WRITE_ENCODING = "ascii"
+
+
+class EncodedVariable(VariableWrapper):
+    """A variable wrapper that translates variable data according to byte encodings.
+
+    For character (byte) variables, data is decoded to strings on read, unless
+    DECODE_TO_STRINGS_ON_READ is switched off, and string data is encoded to bytes
+    on write.  The encoding is taken from the variable "_Encoding" attribute, falling
+    back to DEFAULT_READ_ENCODING / DEFAULT_WRITE_ENCODING if that is absent or
+    unrecognised.
+    """
+
+    def __getitem__(self, keys):
+        """Fetch variable data, decoding character data to strings when enabled.
+
+        Raises
+        ------
+        ValueError
+            If the bytes cannot be decoded with the applicable encoding.
+        """
+        is_chardata = self._is_chardata()
+        if is_chardata:
+            # N.B. we never need to UNset this, as we totally control it
+            self._contained_instance.set_auto_chartostring(False)
+
+        data = super().__getitem__(keys)
+
+        if is_chardata and DECODE_TO_STRINGS_ON_READ:
+            encoding = self._get_encoding() or DEFAULT_READ_ENCODING
+            # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice
+            strlen = self._get_string_width()
+            try:
+                data = decode_bytesarray_to_stringarray(data, encoding, strlen)
+            except UnicodeDecodeError as err:
+                msg = (
+                    f"Character data in variable {self.name!r} could not be decoded "
+                    f"with the {encoding!r} encoding.  This can be fixed by setting the "
+                    "variable '_Encoding' attribute to suit the content."
+                )
+                raise ValueError(msg) from err
+
+        return data
+
+    def __setitem__(self, keys, data):
+        """Assign variable data, encoding string arrays to bytes for character data.
+
+        Raises
+        ------
+        ValueError
+            If the strings cannot be represented in the applicable encoding.
+        """
+        data = np.asanyarray(data)
+        if self._is_chardata():
+            # N.B. we never need to UNset this, as we totally control it
+            self._contained_instance.set_auto_chartostring(False)
+
+            # Only unicode string arrays are translated : other data passes unchanged.
+            if data.dtype.kind == "U":
+                # N.B. typically, write encoding default is "ascii" --> fails bad content
+                encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING
+                strlen = self._get_byte_width()
+                try:
+                    data = encode_stringarray_as_bytearray(data, encoding, strlen)
+                except UnicodeEncodeError as err:
+                    msg = (
+                        f"String data written to netcdf character variable {self.name!r} "
+                        f"could not be represented in encoding {encoding!r}.  This can be "
+                        "fixed by setting a suitable variable '_Encoding' attribute, "
+                        'e.g. <variable>._Encoding="UTF-8".'
+                    )
+                    raise ValueError(msg) from err
+
+        super().__setitem__(keys, data)
+
+    def _is_chardata(self):
+        """Return whether the variable dtype is a bytes (character) type."""
+        return np.issubdtype(self.dtype, np.bytes_)
+
+    def _get_encoding(self) -> str | None:
+        """Get the byte encoding defined for this variable (or None).
+
+        The result is the normalised codec name of the "_Encoding" attribute.
+        If that is missing the result is None.  If it is not a recognised encoding
+        name, a warning is issued and the result is also None, so that callers fall
+        back on their default encoding.
+        """
+        result = getattr(self, "_Encoding", None)
+        if result is not None:
+            try:
+                # Accept + normalise naming of encodings
+                result = codecs.lookup(result).name
+                # NOTE: if encoding does not suit data, errors can occur.
+                # For example, _Encoding = "ascii", with non-ascii content.
+            except (LookupError, TypeError):
+                # Unrecognised encoding name : handle this as just a warning
+                # (TypeError occurs if the attribute value is not a string at all).
+                msg = f"Unknown encoding for variable {self.name!r}: {result!r}"
+                warnings.warn(msg, UserWarning)
+                # Proceed as if no encoding was given : returning the bad name
+                # would only cause a LookupError later, when it is actually used.
+                result = None
+
+        return result
+
+    def _get_byte_width(self) -> int | None:
+        """Return the size of the variable's last dimension, i.e. its width in bytes."""
+        # N.B. the cache lives in the instance __dict__, so test for it there too.
+        if "_bytewidth" not in self.__dict__:
+            n_bytes = self.group().dimensions[self.dimensions[-1]].size
+            # Cache this length control on the variable -- but not as a netcdf attribute
+            self.__dict__["_bytewidth"] = n_bytes
+
+        return self.__dict__["_bytewidth"]
+
+    def _get_string_width(self):
+        """Return the string-length defined for this variable."""
+        if "_strlen" not in self.__dict__:
+            # Work out the actual byte width from the parent dataset dimensions.
+            strlen = self._get_byte_width()
+            # Convert the string dimension length (i.e. bytes) to a sufficiently-long
+            #  string width, depending on the encoding used.
+            encoding = self._get_encoding() or DEFAULT_READ_ENCODING
+            # regularise the name for comparison with recognised ones
+            encoding = codecs.lookup(encoding).name
+            # NOTE(review): the "- 1" below is also applied to BOM-less variants such
+            # as "utf-16-le" -- confirm that this is intended.
+            if "utf-16" in encoding:
+                # Each char needs at least 2 bytes -- including a terminator char
+                strlen = (strlen // 2) - 1
+            elif "utf-32" in encoding:
+                # Each char needs exactly 4 bytes -- including a terminator char
+                strlen = (strlen // 4) - 1
+            # "ELSE": assume there can be (at most) as many chars as bytes
+
+            # Cache this length control on the variable -- but not as a netcdf attribute
+            self.__dict__["_strlen"] = strlen
+
+        return self.__dict__["_strlen"]
+
+    def set_auto_chartostring(self, onoff: bool):
+        """Refuse the operation : always raises TypeError."""
+        msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type."
+        raise TypeError(msg)
+
+
+class EncodedDataset(DatasetWrapper):
+    """A DatasetWrapper variant whose variables are wrapped as EncodedVariable."""
+
+    # The wrapper class applied to variables of this dataset.
+    VAR_WRAPPER_CLS = EncodedVariable
+
+    def set_auto_chartostring(self, onoff: bool):
+        """Refuse the operation : always raises TypeError."""
+        raise TypeError(
+            "auto_chartostring is not supported by Iris 'EncodedDataset' type."
+        )
@@ -159,6 +159,9 @@ class GroupWrapper(_ThreadSafeWrapper):
     CONTAINED_CLASS = netCDF4.Group
     # Note: will also accept a whole Dataset object, but that is OK.
     _DUCKTYPE_CHECK_PROPERTIES = ["createVariable"]
+    # Class to use when creating variable wrappers (default=VariableWrapper).
+    # - needed to support _byte_encoded_data.EncodedDataset.
+    VAR_WRAPPER_CLS = VariableWrapper
 
     # All Group API that returns Dimension(s) is wrapped to instead return
     #  DimensionWrapper(s).
@@ -203,7 +206,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]:
         """
         with _GLOBAL_NETCDF4_LOCK:
             variables_ = self._contained_instance.variables
-        return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()}
+        return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()}
 
     def createVariable(self, *args, **kwargs) -> VariableWrapper:
         """Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK.
@@ -216,7 +219,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper:
         """
         with _GLOBAL_NETCDF4_LOCK:
             new_variable = self._contained_instance.createVariable(*args, **kwargs)
-        return VariableWrapper.from_existing(new_variable)
+        return self.VAR_WRAPPER_CLS.from_existing(new_variable)
 
     def get_variables_by_attributes(
         self, *args, **kwargs
@@ -234,7 +237,7 @@ def get_variables_by_attributes(
             variables_ = list(
                 self._contained_instance.get_variables_by_attributes(*args, **kwargs)
             )
-        return [VariableWrapper.from_existing(v) for v in variables_]
+        return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_]
 
     # All Group API that returns Group(s) is wrapped to instead return
     #  GroupWrapper(s).
@@ -252,7 +255,7 @@ def groups(self):
         """
         with _GLOBAL_NETCDF4_LOCK:
             groups_ = self._contained_instance.groups
-        return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()}
+        return {k: self.__class__.from_existing(v) for k, v in groups_.items()}
 
     @property
     def parent(self):
@@ -268,7 +271,7 @@ def parent(self):
         """
         with _GLOBAL_NETCDF4_LOCK:
             parent_ = self._contained_instance.parent
-        return GroupWrapper.from_existing(parent_)
+        return self.__class__.from_existing(parent_)
 
     def createGroup(self, *args, **kwargs):
         """Call createGroup() from netCDF4.Group/Dataset.
@@ -281,7 +284,7 @@ def createGroup(self, *args, **kwargs):
         """
         with _GLOBAL_NETCDF4_LOCK:
             new_group = self._contained_instance.createGroup(*args, **kwargs)
-        return GroupWrapper.from_existing(new_group)
+        return self.__class__.from_existing(new_group)
 
 
 class DatasetWrapper(GroupWrapper):
 
@@ -137,8 +137,11 @@ def make_testcube(
 
 def ncdump(nc_path: str, *args):
-    """Call ncdump to print a dump of a file."""
-    call_args = [NCDUMP_PATHSTR, nc_path] + list(*args)
-    subprocess.run(call_args, check=True)
+    """Call ncdump on a file, print its output, and return the output as a string.
+
+    Any extra 'args' are passed on to ncdump as additional command-line arguments.
+    """
+    call_args = [NCDUMP_PATHSTR, nc_path, *args]
+    # N.B. 'check_output' raises CalledProcessError if ncdump exits with an error.
+    output_bytes = subprocess.check_output(call_args)
+    text = output_bytes.decode("utf-8")
+    print(text)
+    return text
 
 
 def show_result(filepath):