Skip to content
Draft
18 changes: 11 additions & 7 deletions lib/iris/fileformats/_nc_load_rules/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine):
),
)
if problem is not None:
stack_notes = problem.stack_trace.__notes__
stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined]
if stack_notes is None:
stack_notes = []
stack_notes.append(
f"Skipping disallowed global attribute '{attr_name}' (see above error)"
)
problem.stack_trace.__notes__ = stack_notes
problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined]


################################################################################
Expand Down Expand Up @@ -1536,14 +1536,14 @@ def build_and_add_dimension_coordinate(
)
if problem is not None:
coord_var_name = str(cf_coord_var.cf_name)
stack_notes = problem.stack_trace.__notes__
stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined]
if stack_notes is None:
stack_notes = []
stack_notes.append(
f"Failed to create {coord_var_name} dimension coordinate:\n"
f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead."
)
problem.stack_trace.__notes__ = stack_notes
problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined]
problem.handled = True

_ = _add_or_capture(
Expand Down Expand Up @@ -1643,9 +1643,13 @@ def _add_auxiliary_coordinate(

# Determine the name of the dimension/s shared between the CF-netCDF data variable
# and the coordinate being built.
common_dims = [
dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions
]
coord_dims = cf_coord_var.dimensions
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTE: this possibly needs to be implemented for ancillary-variables too

  • which might also be strings
  • which is awkward because of DRY failure in rules code

if cf._is_str_dtype(cf_coord_var):
coord_dims = coord_dims[:-1]
datavar_dims = engine.cf_var.dimensions
if cf._is_str_dtype(engine.cf_var):
datavar_dims = datavar_dims[:-1]
common_dims = [dim for dim in coord_dims if dim in datavar_dims]
data_dims = None
if common_dims:
# Calculate the offset of each common dimension.
Expand Down
73 changes: 64 additions & 9 deletions lib/iris/fileformats/cf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"""

from abc import ABCMeta, abstractmethod
import codecs
from collections.abc import Iterable, MutableMapping
import os
import re
Expand Down Expand Up @@ -89,6 +90,11 @@ def __init__(self, name, data):

self.cf_data = data
"""NetCDF4 Variable data instance."""
# Note: *always* disable encoding/decoding translations
# To avoid current known problems
# See https://github.com/Unidata/netcdf4-python/issues/1440
data.set_auto_chartostring(False)
# ALSO NOTE: not stored. NetCDFDataProxy must re-assert when re-loading.

"""File source of the NetCDF content."""
try:
Expand Down Expand Up @@ -790,25 +796,73 @@ def cf_label_data(self, cf_data_var):

# Determine the name of the label string (or length) dimension by
# finding the dimension name that doesn't exist within the data dimensions.
str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions))
str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions))
n_nondata_dims = len(str_dim_names)

if n_nondata_dims == 0:
# *All* dims are shared with the data-variable.
# This is only ok if the data-var is *also* a string type.
dim_ok = _is_str_dtype(cf_data_var)
# In this case, we must just *assume* that the last dimension is "the"
# string dimension
str_dim_name = self.dimensions[-1]
else:
# If there is exactly one non-data dim, that is the one we want
dim_ok = len(str_dim_names) == 1
(str_dim_name,) = str_dim_names

if len(str_dim_name) != 1:
if not dim_ok:
raise ValueError(
"Invalid string dimensions for CF-netCDF label variable %r"
% self.cf_name
)

str_dim_name = str_dim_name[0]
label_data = self[:]

if ma.isMaskedArray(label_data):
label_data = label_data.filled()
label_data = label_data.filled(b"\0")

default_encoding = "utf-8"
encoding = getattr(self, "_Encoding", None)
if encoding is None:
# utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
encoding = default_encoding
else:
try:
# Accept + normalise naming of encodings
encoding = codecs.lookup(encoding).name
# NOTE: if encoding does not suit data, errors can occur.
# For example, _Encoding = "ascii", with non-ascii content.
except LookupError:
# Replace some invalid setting with "safe"(ish) fallback.
encoding = default_encoding

def string_from_1d_bytearray(array, encoding):
    r"""Join a 1-D single-byte array into one decoded, stripped string.

    Works around odd numpy bytes-array behaviour: elements which
    "should" contain a zero byte b'\0' instead appear to contain an
    *empty* byte b'', so a plain ``b"".join()`` would silently *omit*
    any zero bytes.

    Parameters
    ----------
    array : numpy.ndarray
        1-D array of single bytes (dtype kind "S", itemsize 1).
    encoding : str
        Name of the text encoding used to decode the joined bytes.

    Returns
    -------
    str
        The decoded string, with leading/trailing whitespace stripped.

    Raises
    ------
    UnicodeDecodeError
        If the byte content does not match *encoding* (for example,
        ``_Encoding = "ascii"`` with non-ascii content).
    """
    assert array.dtype.kind == "S" and array.dtype.itemsize == 1
    assert array.ndim == 1
    # Re-instate the zero bytes which numpy presents as empty bytes.
    byte_values = [b"\0" if item == b"" else item for item in array]
    # N.B. renamed from "bytes" to avoid shadowing the builtin.
    joined = b"".join(byte_values)
    assert len(joined) == array.shape[0]
    # Decode errors deliberately propagate to the caller (the previous
    # try/except here only bare-re-raised, around dead debug code).
    decoded = joined.decode(encoding=encoding)
    return decoded.strip()

# Determine whether we have a string-valued scalar label
# i.e. a character variable that only has one dimension (the length of the string).
if self.ndim == 1:
label_string = b"".join(label_data).strip()
label_string = label_string.decode("utf8")
label_string = string_from_1d_bytearray(label_data, encoding)
data = np.array([label_string])
else:
# Determine the index of the string dimension.
Expand All @@ -829,9 +883,10 @@ def cf_label_data(self, cf_data_var):
else:
label_index = index + (slice(None, None),)

label_string = b"".join(label_data[label_index]).strip()
label_string = label_string.decode("utf8")
data[index] = label_string
label_string = string_from_1d_bytearray(
label_data[label_index], encoding
)
data[index] = label_string.strip()

return data

Expand Down
52 changes: 47 additions & 5 deletions lib/iris/fileformats/netcdf/_thread_safe_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,14 +310,39 @@ def fromcdl(cls, *args, **kwargs):
class NetCDFDataProxy:
"""A reference to the data payload of a single NetCDF file variable."""

__slots__ = ("shape", "dtype", "path", "variable_name", "fill_value")

def __init__(self, shape, dtype, path, variable_name, fill_value):
__slots__ = (
"shape",
"dtype",
"path",
"variable_name",
"fill_value",
"is_bytes",
"encoding",
"string_length",
)

def __init__(
self,
shape,
dtype,
path,
variable_name,
fill_value,
encoding: str | None = None,
string_length: int = 0,
):
self.shape = shape
self.dtype = dtype
self.path = path
self.variable_name = variable_name
self.fill_value = fill_value
self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1
if self.is_bytes:
# We will be returning a different shape : the last dim is the byte-length
self.shape = self.shape[:-1]
self.dtype = np.dtype(f"U{string_length}")
self.encoding = encoding
self.string_length = string_length

@property
def ndim(self):
def __getitem__(self, keys):
    """Open the file, read the requested slice, and return it as an array.

    Byte (char) variables are decoded to fixed-length strings, so the
    returned array matches this proxy's published shape/dtype.
    """
    ds = netCDF4.Dataset(self.path)
    try:
        var = ds.variables[self.variable_name]
        # ALWAYS disable byte encoding/decoding
        # To avoid current known problems
        # See https://github.com/Unidata/netcdf4-python/issues/1440
        var.set_auto_chartostring(False)

        # Fetch + slice the raw variable data.
        result = var[keys]

        # If bytes, decode to strings
        if self.is_bytes:
            from iris.util import convert_bytesarray_to_strings

            result = convert_bytesarray_to_strings(
                result,
                encoding=self.encoding,
                string_length=self.string_length,
            )
    finally:
        ds.close()
    return np.asanyarray(result)

def __repr__(self):
fmt = (
Expand Down Expand Up @@ -388,6 +428,8 @@ def __setitem__(self, keys, array_data):
try:
dataset = netCDF4.Dataset(self.path, "r+")
var = dataset.variables[self.varname]
# **Always** disable encode/decode of bytes to strings
var.set_auto_chartostring(False)
var[keys] = array_data
finally:
try:
Expand Down
38 changes: 37 additions & 1 deletion lib/iris/fileformats/netcdf/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

"""

import codecs
from collections.abc import Iterable, Iterator, Mapping
from contextlib import contextmanager
from copy import deepcopy
Expand Down Expand Up @@ -269,10 +270,36 @@ def _get_cf_var_data(cf_var):
# Normal NCVariable type:
total_bytes = cf_var.size * cf_var.dtype.itemsize

default_encoding = "utf-8"
encoding = getattr(cf_var, "_Encoding", None)
if encoding is None:
# utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
encoding = default_encoding
else:
try:
# Accept + normalise naming of encodings
encoding = codecs.lookup(encoding).name
# NOTE: if encoding does not suit data, errors can occur.
# For example, _Encoding = "ascii", with non-ascii content.
except LookupError:
# Replace some invalid setting with "safe"(ish) fallback.
encoding = default_encoding

string_length = getattr(cf_var, "iris_string_length", None)

if total_bytes < _LAZYVAR_MIN_BYTES:
# Don't make a lazy array, as it will cost more memory AND more time to access.
result = cf_var[:]

if result.dtype.kind == "S":
from iris.util import convert_bytesarray_to_strings

result = convert_bytesarray_to_strings(
result,
encoding=encoding,
string_length=string_length,
)

# Special handling of masked scalar value; this will be returned as
# an `np.ma.masked` instance which will lose the original dtype.
# Workaround for this it return a 1-element masked array of the
Expand All @@ -295,8 +322,17 @@ def _get_cf_var_data(cf_var):
"_FillValue",
_thread_safe_nc.default_fillvals[fill_dtype],
)

# NOTE: if the data is bytes which need to be converted to strings on read,
# the data-proxy will do that (and it modifies its shape + dtype).
proxy = NetCDFDataProxy(
cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value
cf_var.shape,
dtype,
cf_var.filename,
cf_var.cf_name,
fill_value,
encoding=encoding,
string_length=string_length,
)
# Get the chunking specified for the variable : this is either a shape, or
# maybe the string "contiguous".
Expand Down
Loading
Loading