Skip to content
Draft
18 changes: 11 additions & 7 deletions lib/iris/fileformats/_nc_load_rules/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine):
),
)
if problem is not None:
stack_notes = problem.stack_trace.__notes__
stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined]
if stack_notes is None:
stack_notes = []
stack_notes.append(
f"Skipping disallowed global attribute '{attr_name}' (see above error)"
)
problem.stack_trace.__notes__ = stack_notes
problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined]


################################################################################
Expand Down Expand Up @@ -1536,14 +1536,14 @@ def build_and_add_dimension_coordinate(
)
if problem is not None:
coord_var_name = str(cf_coord_var.cf_name)
stack_notes = problem.stack_trace.__notes__
stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined]
if stack_notes is None:
stack_notes = []
stack_notes.append(
f"Failed to create {coord_var_name} dimension coordinate:\n"
f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead."
)
problem.stack_trace.__notes__ = stack_notes
problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined]
problem.handled = True

_ = _add_or_capture(
Expand Down Expand Up @@ -1643,9 +1643,13 @@ def _add_auxiliary_coordinate(

# Determine the name of the dimension/s shared between the CF-netCDF data variable
# and the coordinate being built.
common_dims = [
dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions
]
coord_dims = cf_coord_var.dimensions
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTE: this possibly needs to be implemented for ancillary-variables too

  • which might also be strings
  • which is awkward because of DRY failure in rules code

if cf._is_str_dtype(cf_coord_var):
coord_dims = coord_dims[:-1]
datavar_dims = engine.cf_var.dimensions
if cf._is_str_dtype(engine.cf_var):
datavar_dims = datavar_dims[:-1]
common_dims = [dim for dim in coord_dims if dim in datavar_dims]
data_dims = None
if common_dims:
# Calculate the offset of each common dimension.
Expand Down
73 changes: 64 additions & 9 deletions lib/iris/fileformats/cf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"""

from abc import ABCMeta, abstractmethod
import codecs
from collections.abc import Iterable, MutableMapping
import os
import re
Expand Down Expand Up @@ -89,6 +90,11 @@ def __init__(self, name, data):

self.cf_data = data
"""NetCDF4 Variable data instance."""
# Note: *always* disable encoding/decoding translations
# To avoid current known problems
# See https://github.com/Unidata/netcdf4-python/issues/1440
data.set_auto_chartostring(False)
# ALSO NOTE: not stored. NetCDFDataProxy must re-assert when re-loading.

"""File source of the NetCDF content."""
try:
Expand Down Expand Up @@ -790,25 +796,73 @@ def cf_label_data(self, cf_data_var):

# Determine the name of the label string (or length) dimension by
# finding the dimension name that doesn't exist within the data dimensions.
str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions))
str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions))
n_nondata_dims = len(str_dim_names)

if n_nondata_dims == 0:
# *All* dims are shared with the data-variable.
# This is only ok if the data-var is *also* a string type.
dim_ok = _is_str_dtype(cf_data_var)
# In this case, we must just *assume* that the last dimension is "the"
# string dimension
str_dim_name = self.dimensions[-1]
else:
# If there is exactly one non-data dim, that is the one we want
dim_ok = len(str_dim_names) == 1
(str_dim_name,) = str_dim_names

if len(str_dim_name) != 1:
if not dim_ok:
raise ValueError(
"Invalid string dimensions for CF-netCDF label variable %r"
% self.cf_name
)

str_dim_name = str_dim_name[0]
label_data = self[:]

if ma.isMaskedArray(label_data):
label_data = label_data.filled()
label_data = label_data.filled(b"\0")

default_encoding = "utf-8"
encoding = getattr(self, "_Encoding", None)
if encoding is None:
# utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
encoding = default_encoding
else:
try:
# Accept + normalise naming of encodings
encoding = codecs.lookup(encoding).name
# NOTE: if encoding does not suit data, errors can occur.
# For example, _Encoding = "ascii", with non-ascii content.
except LookupError:
# Replace some invalid setting with "safe"(ish) fallback.
encoding = default_encoding

def string_from_1d_bytearray(array, encoding):
    r"""Join a 1-D single-byte array into one decoded, stripped string.

    Works around odd numpy bytes-array behaviour: elements which
    "should" contain a zero byte b'\0' instead appear to contain an
    *empty* byte b'', so a plain ``b"".join()`` would silently *omit*
    any zero bytes.

    Parameters
    ----------
    array : numpy.ndarray
        1-D array of single bytes (dtype kind "S", itemsize 1).
    encoding : str
        Name of the text encoding used to decode the joined bytes.

    Returns
    -------
    str
        The decoded string, with leading/trailing whitespace stripped.

    Raises
    ------
    UnicodeDecodeError
        If the byte content does not match *encoding* (for example,
        ``_Encoding = "ascii"`` with non-ascii content).
    """
    assert array.dtype.kind == "S" and array.dtype.itemsize == 1
    assert array.ndim == 1
    # Re-instate the zero bytes which numpy presents as empty bytes.
    byte_values = [b"\0" if item == b"" else item for item in array]
    # N.B. renamed from "bytes" to avoid shadowing the builtin.
    joined = b"".join(byte_values)
    assert len(joined) == array.shape[0]
    # Decode errors deliberately propagate to the caller (the previous
    # try/except here only bare-re-raised, around dead debug code).
    decoded = joined.decode(encoding=encoding)
    return decoded.strip()

# Determine whether we have a string-valued scalar label
# i.e. a character variable that only has one dimension (the length of the string).
if self.ndim == 1:
label_string = b"".join(label_data).strip()
label_string = label_string.decode("utf8")
label_string = string_from_1d_bytearray(label_data, encoding)
data = np.array([label_string])
else:
# Determine the index of the string dimension.
Expand All @@ -829,9 +883,10 @@ def cf_label_data(self, cf_data_var):
else:
label_index = index + (slice(None, None),)

label_string = b"".join(label_data[label_index]).strip()
label_string = label_string.decode("utf8")
data[index] = label_string
label_string = string_from_1d_bytearray(
label_data[label_index], encoding
)
data[index] = label_string.strip()

return data

Expand Down
52 changes: 47 additions & 5 deletions lib/iris/fileformats/netcdf/_thread_safe_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,14 +310,39 @@ def fromcdl(cls, *args, **kwargs):
class NetCDFDataProxy:
"""A reference to the data payload of a single NetCDF file variable."""

__slots__ = ("shape", "dtype", "path", "variable_name", "fill_value")

def __init__(self, shape, dtype, path, variable_name, fill_value):
__slots__ = (
"shape",
"dtype",
"path",
"variable_name",
"fill_value",
"is_bytes",
"encoding",
"string_length",
)

def __init__(
self,
shape,
dtype,
path,
variable_name,
fill_value,
encoding: str | None = None,
string_length: int = 0,
):
self.shape = shape
self.dtype = dtype
self.path = path
self.variable_name = variable_name
self.fill_value = fill_value
self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1
if self.is_bytes:
# We will be returning a different shape : the last dim is the byte-length
self.shape = self.shape[:-1]
self.dtype = np.dtype(f"U{string_length}")
self.encoding = encoding
self.string_length = string_length

@property
def ndim(self):
def __getitem__(self, keys):
    """Open the file, read the requested slice, and return it as an array.

    Byte (char) variables are decoded to fixed-length strings, so the
    returned array matches this proxy's published shape/dtype.
    """
    ds = netCDF4.Dataset(self.path)
    try:
        var = ds.variables[self.variable_name]
        # ALWAYS disable byte encoding/decoding
        # To avoid current known problems
        # See https://github.com/Unidata/netcdf4-python/issues/1440
        var.set_auto_chartostring(False)

        # Fetch + slice the raw variable data.
        result = var[keys]

        # If bytes, decode to strings
        if self.is_bytes:
            from iris.util import convert_bytesarray_to_strings

            result = convert_bytesarray_to_strings(
                result,
                encoding=self.encoding,
                string_length=self.string_length,
            )
    finally:
        ds.close()
    return np.asanyarray(result)

def __repr__(self):
fmt = (
Expand Down Expand Up @@ -388,6 +428,8 @@ def __setitem__(self, keys, array_data):
try:
dataset = netCDF4.Dataset(self.path, "r+")
var = dataset.variables[self.varname]
# **Always** disable encode/decode of bytes to strings
var.set_auto_chartostring(False)
var[keys] = array_data
finally:
try:
Expand Down
38 changes: 37 additions & 1 deletion lib/iris/fileformats/netcdf/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

"""

import codecs
from collections.abc import Iterable, Iterator, Mapping
from contextlib import contextmanager
from copy import deepcopy
Expand Down Expand Up @@ -269,10 +270,36 @@ def _get_cf_var_data(cf_var):
# Normal NCVariable type:
total_bytes = cf_var.size * cf_var.dtype.itemsize

default_encoding = "utf-8"
encoding = getattr(cf_var, "_Encoding", None)
if encoding is None:
# utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
encoding = default_encoding
else:
try:
# Accept + normalise naming of encodings
encoding = codecs.lookup(encoding).name
# NOTE: if encoding does not suit data, errors can occur.
# For example, _Encoding = "ascii", with non-ascii content.
except LookupError:
# Replace some invalid setting with "safe"(ish) fallback.
encoding = default_encoding

string_length = getattr(cf_var, "iris_string_length", None)

if total_bytes < _LAZYVAR_MIN_BYTES:
# Don't make a lazy array, as it will cost more memory AND more time to access.
result = cf_var[:]

if result.dtype.kind == "S":
from iris.util import convert_bytesarray_to_strings

result = convert_bytesarray_to_strings(
result,
encoding=encoding,
string_length=string_length,
)

# Special handling of masked scalar value; this will be returned as
# an `np.ma.masked` instance which will lose the original dtype.
# Workaround for this it return a 1-element masked array of the
Expand All @@ -295,8 +322,17 @@ def _get_cf_var_data(cf_var):
"_FillValue",
_thread_safe_nc.default_fillvals[fill_dtype],
)

# NOTE: if the data is bytes which need to be converted to strings on read,
# the data-proxy will do that (and it modifies its shape + dtype).
proxy = NetCDFDataProxy(
cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value
cf_var.shape,
dtype,
cf_var.filename,
cf_var.cf_name,
fill_value,
encoding=encoding,
string_length=string_length,
)
# Get the chunking specified for the variable : this is either a shape, or
# maybe the string "contiguous".
Expand Down
Loading
Loading