pp-mo
diff --git a/‎lib/iris/fileformats/netcdf/_bytecoding_datasets.py‎
Lines changed: 113 additions & 42 deletions b/‎lib/iris/fileformats/netcdf/_bytecoding_datasets.py‎
Lines changed: 113 additions & 42 deletions
diff --git a/‎lib/iris/tests/integration/netcdf/test_chararrays.py‎
Lines changed: 5 additions & 2 deletions b/‎lib/iris/tests/integration/netcdf/test_chararrays.py‎
Lines changed: 5 additions & 2 deletions
@@ -41,6 +41,8 @@
 """
 
 import codecs
+import contextlib
+import threading
 import warnings
 
 import numpy as np
@@ -49,17 +51,18 @@
 
 
 def decode_bytesarray_to_stringarray(
-    byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None
+    byte_array: np.ndarray, encoding: str, string_width: int
 ) -> np.ndarray:
     """Convert an array of bytes to an array of strings, with one less dimension.
 
     N.B. for now at least, we assume the string dim is **always the last one**.
-    If 'string_width' is not given, it is set to the final dimension of 'byte_array'.
+    The 'string_width' is the character length of each result string (dtype "U<n>").
     """
+    if np.ma.isMaskedArray(byte_array):
+        # netCDF4-python sees zeros as "missing" -- we don't need or want that
+        byte_array = byte_array.data
     bytes_shape = byte_array.shape
     var_shape = bytes_shape[:-1]
-    if string_width is None:
-        string_width = bytes_shape[-1]
     string_dtype = f"U{string_width}"
     result = np.empty(var_shape, dtype=string_dtype)
     for ndindex in np.ndindex(var_shape):
@@ -70,16 +73,25 @@ def decode_bytesarray_to_stringarray(
     return result
 
 
-def encode_stringarray_as_bytearray(
+#
+# TODO: remove?
+# this older version is "overly flexible", less efficient and not needed here.
+#
+def flexi_encode_stringarray_as_bytearray(
     data: np.ndarray, encoding=None, string_dimension_length: int | None = None
 ) -> np.ndarray:
     """Encode strings as bytearray.
 
     Note: if 'string_dimension_length' is not given (None), it is set to the longest
-    encoded bytes element.  If 'string_dimension_length' is specified, the last array
+    encoded bytes element, **OR** the dtype size, if that is greater.
+    If 'string_dimension_length' is specified, the last array
     dimension is set to this and content strings are truncated or extended as required.
     """
+    if np.ma.isMaskedArray(data):
+        # netCDF4-python sees zeros as "missing" -- we don't need or want that
+        data = data.data
     element_shape = data.shape
+    # Encode all the strings + see which is longest
     max_length = 1  # this is a MINIMUM - i.e. not zero!
     data_elements = np.zeros(element_shape, dtype=object)
     for index in np.ndindex(element_shape):
@@ -90,10 +102,15 @@ def encode_stringarray_as_bytearray(
             max_length = element_length
 
     if string_dimension_length is None:
+        # If the string length was not specified, it is the maximum encoded length
+        # (n-bytes), **or** the dtype string-length, if greater.
         string_dimension_length = max_length
+        array_string_length = int(str(data.dtype)[2:])  # Yuck. No better public way?
+        if array_string_length > string_dimension_length:
+            string_dimension_length = array_string_length
 
-    # We already encoded all the strings, but stored them in an object-array as
-    #  we didn't yet know the fixed byte-length to convert to.
+    # We maybe *already* encoded all the strings above, but stored them in an
+    #  object-array as we didn't yet know the fixed byte-length to convert to.
     # Now convert to a fixed-width byte array with an extra string-length dimension
     result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
     right_pad = b"\0" * string_dimension_length
@@ -105,58 +122,98 @@ def encode_stringarray_as_bytearray(
     return result
 
 
-DEFAULT_ENCODING = "utf-8"
+def encode_stringarray_as_bytearray(
+    data: np.ndarray, encoding: str, string_dimension_length: int
+) -> np.ndarray:
+    """Encode an array of strings as a fixed-width array of single bytes.
+
+    The result has the shape of 'data' plus one extra, final dimension of length
+    'string_dimension_length', and dtype "S1".  Each string is encoded with
+    'encoding', then right-padded with zero bytes, or truncated, to exactly
+    'string_dimension_length' bytes.
+
+    N.B. truncation acts on the *encoded* bytes, so a multi-byte character may be
+    cut part-way through.  Strings which cannot be represented in 'encoding' raise
+    a UnicodeEncodeError.
+    """
+    element_shape = data.shape
+    result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
+    right_pad = b"\0" * string_dimension_length
+    for index in np.ndindex(element_shape):
+        # N.B. avoid calling this 'bytes', which would shadow the builtin type
+        encoded = data[index].encode(encoding=encoding)
+        # Pad or truncate to the exact fixed width ...
+        encoded = (encoded + right_pad)[:string_dimension_length]
+        # ... then split into single bytes, one per element of the last dimension
+        result[index] = [encoded[i : i + 1] for i in range(string_dimension_length)]
+
+    return result
+
+
+class NetcdfStringDecodeSetting(threading.local):
+    """A thread-local on/off switch, used to control decoding of character data.
+
+    This is a 'threading.local', so each thread holds its own independent setting.
+    The object tests as True or False according to its current setting.
+    """
+
+    def __init__(self, perform_encoding: bool = True):
+        self.set(perform_encoding)
+
+    def set(self, perform_encoding: bool):
+        """Set the current state of the switch."""
+        self.perform_encoding = perform_encoding
+
+    def __bool__(self):
+        return self.perform_encoding
+
+    @contextlib.contextmanager
+    def context(self, perform_encoding: bool):
+        """Return a context manager which applies a temporary setting.
+
+        The previous setting is restored on exit, including when the body of the
+        'with' block raises an exception.
+        """
+        old_setting = self.perform_encoding
+        self.perform_encoding = perform_encoding
+        try:
+            yield
+        finally:
+            # N.B. without this, an error in the block would leave the temporary
+            # setting permanently in place
+            self.perform_encoding = old_setting
+
+
+# Controls whether reading a character variable decodes its content to strings :
+# when it tests False, the data is returned without decoding.
+DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting()
+# Fallback encodings, applied when a variable provides no '_Encoding' of its own.
+DEFAULT_READ_ENCODING = "utf-8"
+DEFAULT_WRITE_ENCODING = "ascii"
 
 
 class EncodedVariable(VariableWrapper):
     """A variable wrapper that translates variable data according to byte encodings."""
 
     def __getitem__(self, keys):
+        """Fetch variable data, decoding character data to strings when enabled.
+
+        Raises a ValueError if character data cannot be decoded with the encoding.
+        """
-        if self.is_chardata():
-            super().set_auto_chartostring(False)
+        if self._is_chardata():
+            # N.B. we never need to UNset this, as we totally control it
+            self._contained_instance.set_auto_chartostring(False)
 
         data = super().__getitem__(keys)
 
-        if self.is_chardata():
-            encoding = self.get_byte_encoding()
-            strlen = self.get_string_length()
-            data = decode_bytesarray_to_stringarray(data, encoding, strlen)
+        if DECODE_TO_STRINGS_ON_READ and self._is_chardata():
+            encoding = self._get_encoding() or DEFAULT_READ_ENCODING
+            # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice
+            strlen = self._get_string_length()
+            try:
+                data = decode_bytesarray_to_stringarray(data, encoding, strlen)
+            except UnicodeDecodeError as err:
+                # N.B. each piece needs its trailing space, as adjacent string
+                # literals are joined with no separator
+                msg = (
+                    f"Character data in variable {self.name!r} could not be decoded "
+                    f"with the {encoding!r} encoding.  This can be fixed by setting the "
+                    "variable '_Encoding' attribute to suit the content."
+                )
+                raise ValueError(msg) from err
 
         return data
 
     def __setitem__(self, keys, data):
+        """Write variable data, first encoding any string array content to bytes.
+
+        Raises a ValueError if strings cannot be represented in the encoding.
+        """
-        if self.is_chardata():
-            encoding = self.get_byte_encoding()
-            strlen = self.get_string_length()
-            if encoding is not None:
-                data = encode_stringarray_as_bytearray(data, encoding, strlen)
-            else:
+        if self._is_chardata():
+            # N.B. we never need to UNset this, as we totally control it
+            self._contained_instance.set_auto_chartostring(False)
+
+            encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING
+            # N.B. typically, write encoding default is "ascii" --> fails bad content
+            # Only unicode ("U" kind) arrays are encoded : other data is written as-is
+            if data.dtype.kind == "U":
                 try:
-                    # Check if all characters are valid ascii
-                    data = encode_stringarray_as_bytearray(data, "ascii", strlen)
-                except UnicodeEncodeError:
-                    data = encode_stringarray_as_bytearray(
-                        data, DEFAULT_ENCODING, strlen
-                    )
-                    # As this was necessary, record the new encoding on the variable
-                    self.set_ncattr("_Encoding", DEFAULT_ENCODING)
+                    strlen = self._get_string_length()
+                    data = encode_stringarray_as_bytearray(data, encoding, strlen)
+                except UnicodeEncodeError as err:
                     msg = (
-                        f"Non-ascii data written to label variable {self.name}. "
-                        f"Applied {DEFAULT_ENCODING!r} encoding, "
-                        f"and set attribute _Encoding={DEFAULT_ENCODING!r}."
+                        f"String data written to netcdf character variable {self.name!r} "
+                        f"could not be represented in encoding {encoding!r}.  This can be "
+                        "fixed by setting a suitable variable '_Encoding' attribute, "
+                        'e.g. <variable>._Encoding="UTF-8".'
                     )
-                    warnings.warn(msg, UserWarning)
-
-            super().set_auto_chartostring(False)
+                    raise ValueError(msg) from err
 
         super().__setitem__(keys, data)
 
-    def is_chardata(self):
+    def _is_chardata(self):
+        """Return whether this variable has a bytes-type (character) dtype."""
         return np.issubdtype(self.dtype, np.bytes_)
 
-    def get_encoding(self) -> str | None:
-        """Get the effective byte encoding to be used for this variable."""
-        # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
+    def _get_encoding(self) -> str | None:
+        """Get the byte encoding defined for this variable (or None)."""
         result = getattr(self, "_Encoding", None)
         if result is not None:
             try:
@@ -165,18 +222,32 @@ def get_encoding(self) -> str | None:
                 # NOTE: if encoding does not suit data, errors can occur.
                 # For example, _Encoding = "ascii", with non-ascii content.
             except LookupError:
-                # Replace some invalid setting with "safe"(ish) fallback.
+                # Unrecognised encoding name : handle this as just a warning
                 msg = f"Unknown encoding for variable {self.name!r}: {result!r}"
                 warnings.warn(msg, UserWarning)
 
         return result
 
-    def get_string_length(self):
-        """Return the string-length defined for this variable (or None)."""
-        return getattr(self, "iris_string_length", None)
+    def _get_string_length(self):
+        """Return the string-length defined for this variable.
+
+        This is the size of the variable's last dimension, as found in its group.
+        It is looked up on first use, and then cached on this wrapper object.
+        """
+        if not hasattr(self, "_strlen"):
+            # Work out the string length from the parent dataset dimensions.
+            strlen = self.group().dimensions[self.dimensions[-1]].size
+            # Cache this on the variable -- but not as a netcdf attribute (!)
+            self.__dict__["_strlen"] = strlen
+
+        return self._strlen
+
+    def set_auto_chartostring(self, onoff: bool):
+        """Always raise a TypeError : this operation is not supported here."""
+        raise TypeError(
+            "auto_chartostring is not supported by Iris 'EncodedVariable' type."
+        )
 
 
 class EncodedDataset(DatasetWrapper):
     """A specialised DatasetWrapper whose variables perform byte encoding."""
 
     VAR_WRAPPER_CLS = EncodedVariable
+
+    def set_auto_chartostring(self, onoff: bool):
+        """Always raise a TypeError : this operation is not supported here."""
+        msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type."
+        raise TypeError(msg)
@@ -137,8 +137,11 @@ def make_testcube(
 
 def ncdump(nc_path: str, *args):
-    """Call ncdump to print a dump of a file."""
-    call_args = [NCDUMP_PATHSTR, nc_path] + list(*args)
-    subprocess.run(call_args, check=True)
+    """Call ncdump on a file : print the resulting text, and also return it.
+
+    Any extra 'args' are added to the command line, following the file path.
+    Raises a subprocess.CalledProcessError if the command fails.
+    """
+    call_args = [NCDUMP_PATHSTR, nc_path] + list(args)
+    # N.B. avoid calling this 'bytes', which would shadow the builtin type
+    output_bytes = subprocess.check_output(call_args)
+    text = output_bytes.decode("utf-8")
+    print(text)
+    return text
 
 
 def show_result(filepath):