Skip to content

Commit 28b124c

Browse files
committed
Merge branch 'encoded_datasets' into chardata_plus_encoded_datasets
2 parents a3e1217 + e684d1d commit 28b124c

File tree

6 files changed

+983
-8
lines changed

6 files changed

+983
-8
lines changed
Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
# Copyright Iris contributors
2+
#
3+
# This file is part of Iris and is released under the BSD license.
4+
# See LICENSE in the root of the repository for full licensing details.
5+
"""Module providing to netcdf datasets with automatic character encoding.
6+
7+
The requirement is to convert numpy fixed-width unicode arrays on writing to a variable
8+
which is declared as a byte (character) array with a fixed-length string dimension.
9+
10+
Numpy unicode string arrays are ones with dtypes of the form "U<character-width>".
11+
Numpy character variables have the dtype "S1", and map to a fixed-length "string
12+
dimension".
13+
14+
In principle, netCDF4 already performs these translations, but in practice current
15+
releases are not functional for anything other than "ascii" encoding -- including UTF-8,
16+
which is the most obvious and desirable "general" solution.
17+
18+
There is also the question of whether we should like to implement UTF-8 as our default.
19+
Current discussions on this are inconclusive and neither CF conventions nor the NetCDF
20+
User Guide are definite on what possible values of "_Encoding" are, or what the effective
21+
default is, even though they do both mention the "_Encoding" attribute as a potential
22+
way to handle the issue.
23+
24+
Because of this, we interpret as follows:
25+
* when reading bytes : in the absence of an "_Encoding" attribute, we will attempt to
26+
decode bytes as UTF-8
27+
* when writing strings : in the absence of an "_Encoding" attribute (on the Iris
28+
cube or coord object), we will attempt to encode data with "ascii" : If this fails,
29+
it raises an error prompting the user to supply an "_Encoding" attribute.
30+
31+
Where an "_Encoding" attribute is provided to Iris, we will honour it where possible,
32+
identifying with "codecs.lookup" : This means we support the encodings in the Python
33+
Standard Library, and the name aliases which it recognises.
34+
35+
See:
36+
37+
* known problems https://github.com/Unidata/netcdf4-python/issues/1440
38+
* suggestions for how this "ought" to work, discussed in the netcdf-c library
39+
* https://github.com/Unidata/netcdf-c/issues/402
40+
41+
"""
42+
43+
import codecs
44+
import contextlib
45+
import threading
46+
import warnings
47+
48+
import numpy as np
49+
50+
from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper
51+
52+
53+
def decode_bytesarray_to_stringarray(
54+
byte_array: np.ndarray, encoding: str, string_width: int
55+
) -> np.ndarray:
56+
"""Convert an array of bytes to an array of strings, with one less dimension.
57+
58+
N.B. for now at least, we assume the string dim is **always the last one**.
59+
If 'string_width' is not given, it is set to the final dimension of 'byte_array'.
60+
"""
61+
if np.ma.isMaskedArray(byte_array):
62+
# netCDF4-python sees zeros as "missing" -- we don't need or want that
63+
byte_array = byte_array.data
64+
bytes_shape = byte_array.shape
65+
var_shape = bytes_shape[:-1]
66+
string_dtype = f"U{string_width}"
67+
result = np.empty(var_shape, dtype=string_dtype)
68+
for ndindex in np.ndindex(var_shape):
69+
element_bytes = byte_array[ndindex]
70+
bytes = b"".join([b if b else b"\0" for b in element_bytes])
71+
string = bytes.decode(encoding)
72+
result[ndindex] = string
73+
return result
74+
75+
76+
#
77+
# TODO: remove?
78+
# this older version is "overly flexible", less efficient and not needed here.
79+
#
80+
def flexi_encode_stringarray_as_bytearray(
81+
data: np.ndarray, encoding=None, string_dimension_length: int | None = None
82+
) -> np.ndarray:
83+
"""Encode strings as bytearray.
84+
85+
Note: if 'string_dimension_length' is not given (None), it is set to the longest
86+
encoded bytes element, **OR** the dtype size, if that is greater.
87+
If 'string_dimension_length' is specified, the last array
88+
dimension is set to this and content strings are truncated or extended as required.
89+
"""
90+
if np.ma.isMaskedArray(data):
91+
# netCDF4-python sees zeros as "missing" -- we don't need or want that
92+
data = data.data
93+
element_shape = data.shape
94+
# Encode all the strings + see which is longest
95+
max_length = 1 # this is a MINIMUM - i.e. not zero!
96+
data_elements = np.zeros(element_shape, dtype=object)
97+
for index in np.ndindex(element_shape):
98+
data_element = data[index].encode(encoding=encoding)
99+
element_length = len(data_element)
100+
data_elements[index] = data_element
101+
if element_length > max_length:
102+
max_length = element_length
103+
104+
if string_dimension_length is None:
105+
# If the string length was not specified, it is the maximum encoded length
106+
# (n-bytes), **or** the dtype string-length, if greater.
107+
string_dimension_length = max_length
108+
array_string_length = int(str(data.dtype)[2:]) # Yuck. No better public way?
109+
if array_string_length > string_dimension_length:
110+
string_dimension_length = array_string_length
111+
112+
# We maybe *already* encoded all the strings above, but stored them in an
113+
# object-array as we didn't yet know the fixed byte-length to convert to.
114+
# Now convert to a fixed-width byte array with an extra string-length dimension
115+
result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
116+
right_pad = b"\0" * string_dimension_length
117+
for index in np.ndindex(element_shape):
118+
bytes = data_elements[index]
119+
bytes = (bytes + right_pad)[:string_dimension_length]
120+
result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)]
121+
122+
return result
123+
124+
125+
def encode_stringarray_as_bytearray(
126+
data: np.typing.ArrayLike, encoding: str, string_dimension_length: int
127+
) -> np.ndarray:
128+
"""Encode strings as a bytes array."""
129+
data = np.asanyarray(data)
130+
element_shape = data.shape
131+
result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
132+
right_pad = b"\0" * string_dimension_length
133+
for index in np.ndindex(element_shape):
134+
bytes = data[index].encode(encoding=encoding)
135+
# It's all a bit nasty ...
136+
bytes = (bytes + right_pad)[:string_dimension_length]
137+
result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)]
138+
139+
return result
140+
141+
142+
class NetcdfStringDecodeSetting(threading.local):
    """A thread-local, context-managed boolean setting.

    Used to control whether character data read from an EncodedVariable is
    automatically decoded to a string array.
    """

    def __init__(self, perform_encoding: bool = True):
        self.set(perform_encoding)

    def set(self, perform_encoding: bool):
        """Set the current (thread-local) state."""
        self.perform_encoding = perform_encoding

    def __bool__(self):
        return self.perform_encoding

    @contextlib.contextmanager
    def context(self, perform_encoding: bool):
        """Temporarily apply a state, restoring the previous one afterwards."""
        old_setting = self.perform_encoding
        self.perform_encoding = perform_encoding
        try:
            yield
        finally:
            # Restore even if the managed block raises : the old code skipped
            # restoration on error, leaving the setting permanently changed.
            self.perform_encoding = old_setting
158+
159+
160+
# Thread-local switch : when truthy (the default), EncodedVariable.__getitem__
# decodes character ("S1") data into string arrays.
DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting()
# Encoding assumed on *read* when no "_Encoding" attribute is present
# (UTF-8 : a "usually safe" choice -- see module docstring).
DEFAULT_READ_ENCODING = "utf-8"
# Encoding assumed on *write* when no "_Encoding" attribute is present
# (ascii : fails loudly on non-ascii content, prompting the user to choose).
DEFAULT_WRITE_ENCODING = "ascii"
163+
164+
165+
class EncodedVariable(VariableWrapper):
    """A variable wrapper that translates variable data according to byte encodings.

    Character ("S1") variables are read back as string arrays (decoded with the
    variable's "_Encoding" attribute, else DEFAULT_READ_ENCODING), and string
    arrays written to them are encoded to fixed-width bytes (with "_Encoding",
    else DEFAULT_WRITE_ENCODING).
    """

    def __getitem__(self, keys):
        """Read data, decoding character variables to string arrays.

        Decoding only happens when the thread-local DECODE_TO_STRINGS_ON_READ
        setting is enabled and the variable holds character data.

        Raises
        ------
        ValueError
            When the raw bytes cannot be decoded with the selected encoding.
        """
        if self._is_chardata():
            # N.B. we never need to UNset this, as we totally control it
            self._contained_instance.set_auto_chartostring(False)

        data = super().__getitem__(keys)

        if DECODE_TO_STRINGS_ON_READ and self._is_chardata():
            encoding = self._get_encoding() or DEFAULT_READ_ENCODING
            # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice
            strlen = self._get_string_width()
            try:
                data = decode_bytesarray_to_stringarray(data, encoding, strlen)
            except UnicodeDecodeError as err:
                msg = (
                    f"Character data in variable {self.name!r} could not be decoded "
                    f"with the {encoding!r} encoding. This can be fixed by setting the "
                    "variable '_Encoding' attribute to suit the content."
                )
                raise ValueError(msg) from err

        return data

    def __setitem__(self, keys, data):
        """Write data, encoding string arrays into character variables.

        Raises
        ------
        ValueError
            When string data cannot be represented in the selected encoding.
        """
        data = np.asanyarray(data)
        if self._is_chardata():
            # N.B. we never need to UNset this, as we totally control it
            self._contained_instance.set_auto_chartostring(False)

            # N.B. typically, write encoding default is "ascii" --> fails bad content
            if data.dtype.kind == "U":
                try:
                    encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING
                    strlen = self._get_byte_width()
                    data = encode_stringarray_as_bytearray(data, encoding, strlen)
                except UnicodeEncodeError as err:
                    msg = (
                        f"String data written to netcdf character variable {self.name!r} "
                        f"could not be represented in encoding {encoding!r}. This can be "
                        "fixed by setting a suitable variable '_Encoding' attribute, "
                        'e.g. <variable>._Encoding="UTF-8".'
                    )
                    raise ValueError(msg) from err

        super().__setitem__(keys, data)

    def _is_chardata(self) -> bool:
        # True when the underlying variable stores fixed bytes ("S1" chars).
        return np.issubdtype(self.dtype, np.bytes_)

    def _get_encoding(self) -> str | None:
        """Get the byte encoding defined for this variable (or None)."""
        result = getattr(self, "_Encoding", None)
        if result is not None:
            try:
                # Accept + normalise naming of encodings
                result = codecs.lookup(result).name
                # NOTE: if encoding does not suit data, errors can occur.
                # For example, _Encoding = "ascii", with non-ascii content.
            except LookupError:
                # Unrecognised encoding name : handle this as just a warning
                # NOTE(review): the un-normalised name is still returned here,
                # and _get_string_width later calls codecs.lookup on it
                # *unguarded*, so a LookupError can escape there -- confirm
                # whether that is intended.
                msg = f"Unknown encoding for variable {self.name!r}: {result!r}"
                warnings.warn(msg, UserWarning)

        return result

    def _get_byte_width(self) -> int | None:
        # Byte width = size of the variable's last dimension (the string dim),
        # looked up from the parent group's dimensions.
        if not hasattr(self, "_bytewidth"):
            n_bytes = self.group().dimensions[self.dimensions[-1]].size
            # Cache this length control on the variable -- but not as a netcdf attribute
            self.__dict__["_bytewidth"] = n_bytes

        return self.__dict__["_bytewidth"]

    def _get_string_width(self):
        """Return the string-length defined for this variable."""
        if not hasattr(self, "_strlen"):
            # Work out the actual byte width from the parent dataset dimensions.
            strlen = self._get_byte_width()
            # Convert the string dimension length (i.e. bytes) to a sufficiently-long
            # string width, depending on the encoding used.
            encoding = self._get_encoding() or DEFAULT_READ_ENCODING
            # regularise the name for comparison with recognised ones
            encoding = codecs.lookup(encoding).name
            if "utf-16" in encoding:
                # Each char needs at least 2 bytes -- including a terminator char
                strlen = (strlen // 2) - 1
            elif "utf-32" in encoding:
                # Each char needs exactly 4 bytes -- including a terminator char
                strlen = (strlen // 4) - 1
            # "ELSE": assume there can be (at most) as many chars as bytes

            # Cache this length control on the variable -- but not as a netcdf attribute
            self.__dict__["_strlen"] = strlen

        # NOTE(review): plain attribute access here (vs the __dict__ lookup used
        # in _get_byte_width) -- presumably VariableWrapper attribute lookup
        # finds the cached __dict__ entry first; verify against that class.
        return self._strlen

    def set_auto_chartostring(self, onoff: bool):
        # Deliberately disabled : this wrapper owns the chartostring control.
        msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type."
        raise TypeError(msg)
267+
268+
269+
class EncodedDataset(DatasetWrapper):
    """A specialised DatasetWrapper whose variables perform byte encoding."""

    # Have all variable-returning APIs wrap variables as EncodedVariable.
    VAR_WRAPPER_CLS = EncodedVariable

    def set_auto_chartostring(self, onoff: bool):
        """Always refuse : the chartostring control is owned by the wrappers."""
        raise TypeError(
            "auto_chartostring is not supported by Iris 'EncodedDataset' type."
        )

lib/iris/fileformats/netcdf/_thread_safe_nc.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,9 @@ class GroupWrapper(_ThreadSafeWrapper):
159159
CONTAINED_CLASS = netCDF4.Group
160160
# Note: will also accept a whole Dataset object, but that is OK.
161161
_DUCKTYPE_CHECK_PROPERTIES = ["createVariable"]
162+
# Class to use when creating variable wrappers (default=VariableWrapper).
163+
# - needed to support _byte_encoded_data.EncodedDataset.
164+
VAR_WRAPPER_CLS = VariableWrapper
162165

163166
# All Group API that returns Dimension(s) is wrapped to instead return
164167
# DimensionWrapper(s).
@@ -203,7 +206,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]:
203206
"""
204207
with _GLOBAL_NETCDF4_LOCK:
205208
variables_ = self._contained_instance.variables
206-
return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()}
209+
return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()}
207210

208211
def createVariable(self, *args, **kwargs) -> VariableWrapper:
209212
"""Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK.
@@ -216,7 +219,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper:
216219
"""
217220
with _GLOBAL_NETCDF4_LOCK:
218221
new_variable = self._contained_instance.createVariable(*args, **kwargs)
219-
return VariableWrapper.from_existing(new_variable)
222+
return self.VAR_WRAPPER_CLS.from_existing(new_variable)
220223

221224
def get_variables_by_attributes(
222225
self, *args, **kwargs
@@ -234,7 +237,7 @@ def get_variables_by_attributes(
234237
variables_ = list(
235238
self._contained_instance.get_variables_by_attributes(*args, **kwargs)
236239
)
237-
return [VariableWrapper.from_existing(v) for v in variables_]
240+
return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_]
238241

239242
# All Group API that returns Group(s) is wrapped to instead return
240243
# GroupWrapper(s).
@@ -252,7 +255,7 @@ def groups(self):
252255
"""
253256
with _GLOBAL_NETCDF4_LOCK:
254257
groups_ = self._contained_instance.groups
255-
return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()}
258+
return {k: self.__class__.from_existing(v) for k, v in groups_.items()}
256259

257260
@property
258261
def parent(self):
@@ -268,7 +271,7 @@ def parent(self):
268271
"""
269272
with _GLOBAL_NETCDF4_LOCK:
270273
parent_ = self._contained_instance.parent
271-
return GroupWrapper.from_existing(parent_)
274+
return self.__class__.from_existing(parent_)
272275

273276
def createGroup(self, *args, **kwargs):
274277
"""Call createGroup() from netCDF4.Group/Dataset.
@@ -281,7 +284,7 @@ def createGroup(self, *args, **kwargs):
281284
"""
282285
with _GLOBAL_NETCDF4_LOCK:
283286
new_group = self._contained_instance.createGroup(*args, **kwargs)
284-
return GroupWrapper.from_existing(new_group)
287+
return self.__class__.from_existing(new_group)
285288

286289

287290
class DatasetWrapper(GroupWrapper):

lib/iris/tests/integration/netcdf/test_chararrays.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,11 @@ def make_testcube(
137137

138138
def ncdump(nc_path: str, *args):
139139
"""Call ncdump to print a dump of a file."""
140-
call_args = [NCDUMP_PATHSTR, nc_path] + list(*args)
141-
subprocess.run(call_args, check=True)
140+
call_args = [NCDUMP_PATHSTR, nc_path] + list(args)
141+
bytes = subprocess.check_output(call_args)
142+
text = bytes.decode("utf-8")
143+
print(text)
144+
return text
142145

143146

144147
def show_result(filepath):

0 commit comments

Comments
 (0)