Skip to content

Commit ea536e6

Browse files
committed
Initial dataset wrappers.
Rename; add in parts of old investigation; add temporary notes.
1 parent 9bd0970 commit ea536e6

File tree

5 files changed

+457
-6
lines changed

5 files changed

+457
-6
lines changed
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
# Copyright Iris contributors
2+
#
3+
# This file is part of Iris and is released under the BSD license.
4+
# See LICENSE in the root of the repository for full licensing details.
5+
"""Module providing to netcdf datasets with automatic character encoding.
6+
7+
The requirement is to convert numpy fixed-width unicode arrays on writing to a variable
8+
which is declared as a byte (character) array with a fixed-length string dimension.
9+
10+
Numpy unicode string arrays are ones with dtypes of the form "U<character-width>".
11+
Numpy character variables have the dtype "S1", and map to a fixed-length "string
12+
dimension".
13+
14+
In principle, netCDF4 already performs these translations, but in practice current
15+
releases are not functional for anything other than "ascii" encoding -- including UTF-8,
16+
which is the most obvious and desirable "general" solution.
17+
18+
There is also the question of whether we should like to implement UTF-8 as our default.
19+
Current discussions on this are inconclusive and neither CF conventions nor the NetCDF
20+
User Guide are definite on what possible values of "_Encoding" are, or what the effective
21+
default is, even though they do both mention the "_Encoding" attribute as a potential
22+
way to handle the issue.
23+
24+
Because of this, we interpret as follows:
25+
* when reading bytes : in the absence of an "_Encoding" attribute, we will attempt to
26+
decode bytes as UTF-8
27+
* when writing strings : in the absence of an "_Encoding" attribute (on the Iris
28+
cube or coord object), we will attempt to encode data with "ascii" : If this fails,
29+
it raises an error prompting the user to supply an "_Encoding" attribute.
30+
31+
Where an "_Encoding" attribute is provided to Iris, we will honour it where possible,
32+
identifying with "codecs.lookup" : This means we support the encodings in the Python
33+
Standard Library, and the name aliases which it recognises.
34+
35+
See:
36+
37+
* known problems https://github.com/Unidata/netcdf4-python/issues/1440
38+
* suggestions for how this "ought" to work, discussed in the netcdf-c library
39+
* https://github.com/Unidata/netcdf-c/issues/402
40+
41+
"""
42+
43+
import codecs
44+
import warnings
45+
46+
import numpy as np
47+
48+
from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper
49+
50+
51+
def decode_bytesarray_to_stringarray(
52+
byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None
53+
) -> np.ndarray:
54+
"""Convert an array of bytes to an array of strings, with one less dimension.
55+
56+
N.B. for now at least, we assume the string dim is **always the last one**.
57+
If 'string_width' is not given, it is set to the final dimension of 'byte_array'.
58+
"""
59+
bytes_shape = byte_array.shape
60+
var_shape = bytes_shape[:-1]
61+
if string_width is None:
62+
string_width = bytes_shape[-1]
63+
string_dtype = f"U{string_width}"
64+
result = np.empty(var_shape, dtype=string_dtype)
65+
for ndindex in np.ndindex(var_shape):
66+
element_bytes = byte_array[ndindex]
67+
bytes = b"".join([b if b else b"\0" for b in element_bytes])
68+
string = bytes.decode(encoding)
69+
result[ndindex] = string
70+
return result
71+
72+
73+
def encode_stringarray_as_bytearray(
74+
data: np.ndarray, encoding=None, string_dimension_length: int | None = None
75+
) -> np.ndarray:
76+
"""Encode strings as bytearray.
77+
78+
Note: if 'string_dimension_length' is not given (None), it is set to the longest
79+
encoded bytes element. If 'string_dimension_length' is specified, the last array
80+
dimension is set to this and content strings are truncated or extended as required.
81+
"""
82+
element_shape = data.shape
83+
max_length = 1 # this is a MINIMUM - i.e. not zero!
84+
data_elements = np.zeros(element_shape, dtype=object)
85+
for index in np.ndindex(element_shape):
86+
data_element = data[index].encode(encoding=encoding)
87+
element_length = len(data_element)
88+
data_elements[index] = data_element
89+
if element_length > max_length:
90+
max_length = element_length
91+
92+
if string_dimension_length is None:
93+
string_dimension_length = max_length
94+
95+
# We already encoded all the strings, but stored them in an object-array as
96+
# we didn't yet know the fixed byte-length to convert to.
97+
# Now convert to a fixed-width byte array with an extra string-length dimension
98+
result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
99+
right_pad = b"\0" * string_dimension_length
100+
for index in np.ndindex(element_shape):
101+
bytes = data_elements[index]
102+
bytes = (bytes + right_pad)[:string_dimension_length]
103+
result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)]
104+
105+
return result
106+
107+
108+
DEFAULT_ENCODING = "utf-8"
109+
110+
111+
class EncodedVariable(VariableWrapper):
    """A variable wrapper that translates variable data according to byte encodings.

    Reads decode fixed-width byte ("S1") data into numpy unicode arrays; writes
    encode unicode arrays back to bytes, honouring the "_Encoding" attribute
    (see module docstring for the policy when "_Encoding" is absent).
    """

    def __getitem__(self, keys):
        # Fetch raw bytes ourselves: netCDF4's own char<->string conversion is
        # unreliable for non-ascii encodings (see module docstring).
        if self.is_chardata():
            super().set_auto_chartostring(False)

        data = super().__getitem__(keys)

        if self.is_chardata():
            # When no "_Encoding" is present, attempt to decode as the default
            # (utf-8), per the module reading policy.
            # N.B. call the method actually defined below -- the original code
            # called a nonexistent "get_byte_encoding", an AttributeError.
            encoding = self.get_encoding() or DEFAULT_ENCODING
            strlen = self.get_string_length()
            data = decode_bytesarray_to_stringarray(data, encoding, strlen)

        return data

    def __setitem__(self, keys, data):
        if self.is_chardata():
            encoding = self.get_encoding()
            strlen = self.get_string_length()
            if encoding is not None:
                data = encode_stringarray_as_bytearray(data, encoding, strlen)
            else:
                try:
                    # Check if all characters are valid ascii
                    data = encode_stringarray_as_bytearray(data, "ascii", strlen)
                except UnicodeEncodeError:
                    data = encode_stringarray_as_bytearray(
                        data, DEFAULT_ENCODING, strlen
                    )
                    # As this was necessary, record the new encoding on the variable.
                    # N.B. the netCDF4 API spells this "setncattr".
                    self.setncattr("_Encoding", DEFAULT_ENCODING)
                    msg = (
                        f"Non-ascii data written to label variable {self.name}. "
                        f"Applied {DEFAULT_ENCODING!r} encoding, "
                        f"and set attribute _Encoding={DEFAULT_ENCODING!r}."
                    )
                    warnings.warn(msg, UserWarning)

            # Write raw bytes: we have already performed the string encoding.
            super().set_auto_chartostring(False)

        super().__setitem__(keys, data)

    def is_chardata(self):
        """Whether this variable holds fixed-width character (byte) data."""
        return np.issubdtype(self.dtype, np.bytes_)

    def get_encoding(self) -> str | None:
        """Get the effective byte encoding to be used for this variable.

        Returns the normalised codec name from the "_Encoding" attribute, the
        default (utf-8) for an unrecognised attribute value, or None when the
        attribute is absent.
        """
        result = getattr(self, "_Encoding", None)
        if result is not None:
            try:
                # Accept + normalise naming of encodings
                result = codecs.lookup(result).name
                # NOTE: if encoding does not suit data, errors can occur.
                # For example, _Encoding = "ascii", with non-ascii content.
            except LookupError:
                # Replace an invalid setting with a "safe"(ish) fallback:
                # utf-8 is a reasonable default, equivalent to 'ascii' for
                # ascii data.  (The original warned but forgot to replace.)
                msg = f"Unknown encoding for variable {self.name!r}: {result!r}"
                warnings.warn(msg, UserWarning)
                result = DEFAULT_ENCODING

        return result

    def get_string_length(self):
        """Return the string-length defined for this variable (or None)."""
        return getattr(self, "iris_string_length", None)
177+
178+
179+
class EncodedDataset(DatasetWrapper):
    """A specialised DatasetWrapper whose variables perform byte encoding."""

    # Hook recognised by the GroupWrapper/DatasetWrapper machinery: all variable
    # accessors (variables, createVariable, get_variables_by_attributes) return
    # instances of this class instead of plain VariableWrapper.
    VAR_WRAPPER_CLS = EncodedVariable

lib/iris/fileformats/netcdf/_thread_safe_nc.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,9 @@ class GroupWrapper(_ThreadSafeWrapper):
159159
CONTAINED_CLASS = netCDF4.Group
160160
# Note: will also accept a whole Dataset object, but that is OK.
161161
_DUCKTYPE_CHECK_PROPERTIES = ["createVariable"]
162+
# Class to use when creating variable wrappers (default=VariableWrapper).
163+
# - needed to support _byte_encoded_data.EncodedDataset.
164+
VAR_WRAPPER_CLS = VariableWrapper
162165

163166
# All Group API that returns Dimension(s) is wrapped to instead return
164167
# DimensionWrapper(s).
@@ -203,7 +206,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]:
203206
"""
204207
with _GLOBAL_NETCDF4_LOCK:
205208
variables_ = self._contained_instance.variables
206-
return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()}
209+
return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()}
207210

208211
def createVariable(self, *args, **kwargs) -> VariableWrapper:
209212
"""Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK.
@@ -216,7 +219,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper:
216219
"""
217220
with _GLOBAL_NETCDF4_LOCK:
218221
new_variable = self._contained_instance.createVariable(*args, **kwargs)
219-
return VariableWrapper.from_existing(new_variable)
222+
return self.VAR_WRAPPER_CLS.from_existing(new_variable)
220223

221224
def get_variables_by_attributes(
222225
self, *args, **kwargs
@@ -234,7 +237,7 @@ def get_variables_by_attributes(
234237
variables_ = list(
235238
self._contained_instance.get_variables_by_attributes(*args, **kwargs)
236239
)
237-
return [VariableWrapper.from_existing(v) for v in variables_]
240+
return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_]
238241

239242
# All Group API that returns Group(s) is wrapped to instead return
240243
# GroupWrapper(s).
@@ -252,7 +255,7 @@ def groups(self):
252255
"""
253256
with _GLOBAL_NETCDF4_LOCK:
254257
groups_ = self._contained_instance.groups
255-
return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()}
258+
return {k: self.__class__.from_existing(v) for k, v in groups_.items()}
256259

257260
@property
258261
def parent(self):
@@ -268,7 +271,7 @@ def parent(self):
268271
"""
269272
with _GLOBAL_NETCDF4_LOCK:
270273
parent_ = self._contained_instance.parent
271-
return GroupWrapper.from_existing(parent_)
274+
return self.__class__.from_existing(parent_)
272275

273276
def createGroup(self, *args, **kwargs):
274277
"""Call createGroup() from netCDF4.Group/Dataset.
@@ -281,7 +284,7 @@ def createGroup(self, *args, **kwargs):
281284
"""
282285
with _GLOBAL_NETCDF4_LOCK:
283286
new_group = self._contained_instance.createGroup(*args, **kwargs)
284-
return GroupWrapper.from_existing(new_group)
287+
return self.__class__.from_existing(new_group)
285288

286289

287290
class DatasetWrapper(GroupWrapper):

0 commit comments

Comments
 (0)