Skip to content

Commit fe1e22d

Browse files
committed
Various notes, choices + changes: Beginnings of encoded-dataset testing.
1 parent ea536e6 commit fe1e22d

File tree

5 files changed

+595
-50
lines changed

5 files changed

+595
-50
lines changed

lib/iris/fileformats/netcdf/_bytecoding_datasets.py

Lines changed: 113 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
"""
4242

4343
import codecs
44+
import contextlib
45+
import threading
4446
import warnings
4547

4648
import numpy as np
@@ -49,17 +51,18 @@
4951

5052

5153
def decode_bytesarray_to_stringarray(
52-
byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None
54+
byte_array: np.ndarray, encoding: str, string_width: int
5355
) -> np.ndarray:
5456
"""Convert an array of bytes to an array of strings, with one less dimension.
5557
5658
N.B. for now at least, we assume the string dim is **always the last one**.
5759
If 'string_width' is not given, it is set to the final dimension of 'byte_array'.
5860
"""
61+
if np.ma.isMaskedArray(byte_array):
62+
# netCDF4-python sees zeros as "missing" -- we don't need or want that
63+
byte_array = byte_array.data
5964
bytes_shape = byte_array.shape
6065
var_shape = bytes_shape[:-1]
61-
if string_width is None:
62-
string_width = bytes_shape[-1]
6366
string_dtype = f"U{string_width}"
6467
result = np.empty(var_shape, dtype=string_dtype)
6568
for ndindex in np.ndindex(var_shape):
@@ -70,16 +73,25 @@ def decode_bytesarray_to_stringarray(
7073
return result
7174

7275

73-
def encode_stringarray_as_bytearray(
76+
#
77+
# TODO: remove?
78+
# this older version is "overly flexible", less efficient and not needed here.
79+
#
80+
def flexi_encode_stringarray_as_bytearray(
7481
data: np.ndarray, encoding=None, string_dimension_length: int | None = None
7582
) -> np.ndarray:
7683
"""Encode strings as bytearray.
7784
7885
Note: if 'string_dimension_length' is not given (None), it is set to the longest
79-
encoded bytes element. If 'string_dimension_length' is specified, the last array
86+
encoded bytes element, **OR** the dtype size, if that is greater.
87+
If 'string_dimension_length' is specified, the last array
8088
dimension is set to this and content strings are truncated or extended as required.
8189
"""
90+
if np.ma.isMaskedArray(data):
91+
# netCDF4-python sees zeros as "missing" -- we don't need or want that
92+
data = data.data
8293
element_shape = data.shape
94+
# Encode all the strings + see which is longest
8395
max_length = 1 # this is a MINIMUM - i.e. not zero!
8496
data_elements = np.zeros(element_shape, dtype=object)
8597
for index in np.ndindex(element_shape):
@@ -90,10 +102,15 @@ def encode_stringarray_as_bytearray(
90102
max_length = element_length
91103

92104
if string_dimension_length is None:
105+
# If the string length was not specified, it is the maximum encoded length
106+
# (n-bytes), **or** the dtype string-length, if greater.
93107
string_dimension_length = max_length
108+
array_string_length = int(str(data.dtype)[2:]) # Yuck. No better public way?
109+
if array_string_length > string_dimension_length:
110+
string_dimension_length = array_string_length
94111

95-
# We already encoded all the strings, but stored them in an object-array as
96-
# we didn't yet know the fixed byte-length to convert to.
112+
# We maybe *already* encoded all the strings above, but stored them in an
113+
# object-array as we didn't yet know the fixed byte-length to convert to.
97114
# Now convert to a fixed-width byte array with an extra string-length dimension
98115
result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
99116
right_pad = b"\0" * string_dimension_length
@@ -105,58 +122,98 @@ def encode_stringarray_as_bytearray(
105122
return result
106123

107124

108-
DEFAULT_ENCODING = "utf-8"
125+
def encode_stringarray_as_bytearray(
    data: np.ndarray, encoding: str, string_dimension_length: int
) -> np.ndarray:
    """Encode an array of strings as a fixed-width array of single bytes.

    The result has one more dimension than 'data' : a new last dimension, of
    length 'string_dimension_length', which holds the encoded bytes of each string.

    Parameters
    ----------
    data : np.ndarray
        An array of strings.  If it is a masked array, the mask is ignored and the
        underlying content is encoded.
    encoding : str
        The name of the codec used to encode each string.
    string_dimension_length : int
        The length of the added byte dimension.  Encoded strings are truncated or
        right-padded with zero bytes to exactly this length.

    Returns
    -------
    np.ndarray
        An array of dtype "S1" and shape ``data.shape + (string_dimension_length,)``.

    Raises
    ------
    UnicodeEncodeError
        If any string cannot be represented in 'encoding'.

    Notes
    -----
    Truncation is applied to the *encoded bytes*, so with a multi-byte encoding an
    over-long string can be cut part-way through a character.

    """
    if np.ma.isMaskedArray(data):
        # Work on the underlying content : indexing a masked point yields the
        # "masked" constant, which has no 'encode' method.
        data = data.data

    element_shape = data.shape
    result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
    for index in np.ndindex(element_shape):
        encoded = data[index].encode(encoding=encoding)
        # Force to exactly the required width : cut off any excess, zero-fill any gap
        fixed_width = encoded[:string_dimension_length].ljust(
            string_dimension_length, b"\0"
        )
        # View the fixed-width bytes as individual "S1" elements
        result[index] = np.frombuffer(fixed_width, dtype="S1")

    return result
139+
140+
141+
class NetcdfStringDecodeSetting(threading.local):
    """A boolean on/off setting, stored as a :class:`threading.local`.

    The object is "truthy" when the setting is on.  As a ``threading.local``
    subclass, its attribute values are held separately for each thread.

    """

    def __init__(self, perform_encoding: bool = True):
        """Create the setting, with the given initial state."""
        self.set(perform_encoding)

    def set(self, perform_encoding: bool):
        """Set the current state."""
        self.perform_encoding = perform_encoding

    def __bool__(self):
        # Force a real bool : '__bool__' must not return any other type, even if
        # some other "truthy" or "falsy" value was stored.
        return bool(self.perform_encoding)

    @contextlib.contextmanager
    def context(self, perform_encoding: bool):
        """Return a context manager which temporarily changes the setting.

        The previous state is restored on exit, including when the body of the
        'with' block raises an exception.

        """
        old_setting = self.perform_encoding
        self.perform_encoding = perform_encoding
        try:
            yield
        finally:
            # Always restore, so an error inside the block can't leak the change
            self.perform_encoding = old_setting
157+
158+
159+
DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting()
160+
DEFAULT_READ_ENCODING = "utf-8"
161+
DEFAULT_WRITE_ENCODING = "ascii"
109162

110163

111164
class EncodedVariable(VariableWrapper):
112165
"""A variable wrapper that translates variable data according to byte encodings."""
113166

114167
def __getitem__(self, keys):
    """Fetch variable data, decoding character data to strings where enabled.

    For a character-type variable, the bytes which are read get converted to a
    string array, unless DECODE_TO_STRINGS_ON_READ is switched off.  The encoding
    is the one defined for the variable, or else DEFAULT_READ_ENCODING.

    Raises
    ------
    ValueError
        If the content cannot be decoded with the chosen encoding.

    """
    is_chardata = self._is_chardata()
    if is_chardata:
        # N.B. we never need to UNset this, as we totally control it
        self._contained_instance.set_auto_chartostring(False)

    data = super().__getitem__(keys)

    if DECODE_TO_STRINGS_ON_READ and is_chardata:
        encoding = self._get_encoding() or DEFAULT_READ_ENCODING
        # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice
        strlen = self._get_string_length()
        try:
            data = decode_bytesarray_to_stringarray(data, encoding, strlen)
        except UnicodeDecodeError as err:
            # N.B. the separate parts of the message must join with spaces
            msg = (
                f"Character data in variable {self.name!r} could not be decoded "
                f"with the {encoding!r} encoding. This can be fixed by setting the "
                "variable '_Encoding' attribute to suit the content."
            )
            raise ValueError(msg) from err

    return data
126189

127190
def __setitem__(self, keys, data):
    """Assign variable data, encoding string data to bytes for character variables.

    For a character-type variable, data of a unicode ("U") dtype is converted to a
    byte array before writing.  The encoding is the one defined for the variable,
    or else DEFAULT_WRITE_ENCODING.  Any other data is passed on unchanged.

    Raises
    ------
    ValueError
        If string content cannot be represented in the chosen encoding.

    """
    if self._is_chardata():
        # N.B. we never need to UNset this, as we totally control it
        self._contained_instance.set_auto_chartostring(False)

        encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING
        # N.B. typically, write encoding default is "ascii" --> fails bad content

        if not hasattr(data, "dtype"):
            # Plain Python content (e.g. a string, or list of strings) has no dtype
            # to inspect : convert it, so it is treated the same as array content.
            data = np.asanyarray(data)

        if data.dtype.kind == "U":
            strlen = self._get_string_length()
            try:
                data = encode_stringarray_as_bytearray(data, encoding, strlen)
            except UnicodeEncodeError as err:
                msg = (
                    f"String data written to netcdf character variable {self.name!r} "
                    f"could not be represented in encoding {encoding!r}. This can be "
                    "fixed by setting a suitable variable '_Encoding' attribute, "
                    'e.g. <variable>._Encoding="UTF-8".'
                )
                raise ValueError(msg) from err

    super().__setitem__(keys, data)
153211

154-
def _is_chardata(self):
    """Report whether the dtype of this variable is a numpy bytes type."""
    own_dtype = self.dtype
    return np.issubdtype(own_dtype, np.bytes_)
156214

157-
def get_encoding(self) -> str | None:
158-
"""Get the effective byte encoding to be used for this variable."""
159-
# utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
215+
def _get_encoding(self) -> str | None:
216+
"""Get the byte encoding defined for this variable (or None)."""
160217
result = getattr(self, "_Encoding", None)
161218
if result is not None:
162219
try:
@@ -165,18 +222,32 @@ def get_encoding(self) -> str | None:
165222
# NOTE: if encoding does not suit data, errors can occur.
166223
# For example, _Encoding = "ascii", with non-ascii content.
167224
except LookupError:
168-
# Replace some invalid setting with "safe"(ish) fallback.
225+
# Unrecognised encoding name : handle this as just a warning
169226
msg = f"Unknown encoding for variable {self.name!r}: {result!r}"
170227
warnings.warn(msg, UserWarning)
171228

172229
return result
173230

174-
def _get_string_length(self):
    """Return the string length of this variable : the size of its last dimension.

    The value is found from the dimensions of the parent group on first use, and
    then stored in the instance '__dict__' under "_strlen" for later calls.

    """
    if hasattr(self, "_strlen"):
        # Already worked out by an earlier call
        return self._strlen

    # Look up the last dimension of the variable, by name, in the parent group.
    group_dims = self.group().dimensions
    last_dim_name = self.dimensions[-1]
    # Store directly in the instance dict -- so NOT as a netcdf attribute (!)
    self.__dict__["_strlen"] = group_dims[last_dim_name].size

    return self._strlen
240+
241+
def set_auto_chartostring(self, onoff: bool):
    """Refuse the request : this always raises a TypeError."""
    raise TypeError(
        "auto_chartostring is not supported by Iris 'EncodedVariable' type."
    )
177244

178245

179246
class EncodedDataset(DatasetWrapper):
    """A DatasetWrapper specialisation, whose variables apply byte encodings."""

    # The variable wrapper class for this dataset type.
    # NOTE(review): presumably DatasetWrapper wraps its variables with this class
    # -- confirm against the DatasetWrapper code.
    VAR_WRAPPER_CLS = EncodedVariable

    def set_auto_chartostring(self, onoff: bool):
        """Refuse the request : this always raises a TypeError."""
        raise TypeError(
            "auto_chartostring is not supported by Iris 'EncodedDataset' type."
        )

lib/iris/tests/integration/netcdf/test_chararrays.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,11 @@ def make_testcube(
137137

138138
def ncdump(nc_path: str, *args):
    """Call ncdump on a file, print the resulting dump and also return it.

    Parameters
    ----------
    nc_path : str
        The path of the file to dump.
    *args : str
        Additional command-line arguments, added after the file path.

    Returns
    -------
    str
        The standard output of the command, decoded as UTF-8.

    Raises
    ------
    subprocess.CalledProcessError
        If the command returns a non-zero exit status.

    """
    call_args = [NCDUMP_PATHSTR, nc_path, *args]
    # N.B. don't call this 'bytes', which would hide the builtin type
    raw_output = subprocess.check_output(call_args)
    text = raw_output.decode("utf-8")
    print(text)
    return text
142145

143146

144147
def show_result(filepath):

0 commit comments

Comments
 (0)