Skip to content

Commit 9d668db

Browse files
committed
Initial dataset wrappers.
1 parent 729d0b5 commit 9d668db

File tree

2 files changed

+169
-6
lines changed

2 files changed

+169
-6
lines changed
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# Copyright Iris contributors
2+
#
3+
# This file is part of Iris and is released under the BSD license.
4+
# See LICENSE in the root of the repository for full licensing details.
5+
"""Module providing to netcdf datasets with automatic character encoding.
6+
7+
The requirement is to convert numpy fixed-width unicode arrays on writing to a variable
8+
which is declared as a byte (character) array with a fixed-length string dimension.
9+
10+
Numpy unicode string arrays are ones with dtypes of the form "U<character-width>".
11+
Numpy character variables have the dtype "S1", and map to a fixed-length "string
12+
dimension".
13+
14+
In principle, netCDF4 already performs these translations, but in practice current
15+
releases are not functional for anything other than "ascii" encoding -- including UTF-8,
16+
which is the most obvious and desirable "general" solution.
17+
18+
There is also the question of whether we should like to implement UTF-8 as our default.
19+
Current discussions on this are inconclusive and neither CF conventions nor the NetCDF
20+
User Guide are definite on what possible values of "_Encoding" are, or what the effective
21+
default is, even though they do both mention the "_Encoding" attribute as a potential
22+
way to handle the issue.
23+
24+
Because of this, we interpret as follows:
25+
* in the absence of an "_Encoding" attribute, we will attempt to decode bytes as UTF-8
26+
* when writing string data, in the absence of an "_Encoding" attribute (on the Iris
27+
cube or coord object), we will attempt to encode data with "ascii" : If this succeeds,
28+
we will save as is (with no "_Encoding" attribute), but if it fails we will encode
29+
as UTF-8 **and** add an "_Encoding='UTF-8'" attribute.
30+
31+
Where an "_Encoding" attribute is provided to Iris, we will honour it where possible,
32+
identifying with "codecs.lookup" : This means we support the encodings in the Python
33+
Standard Library, and name aliases which it recognises.
34+
35+
See:
36+
37+
* known problems https://github.com/Unidata/netcdf4-python/issues/1440
38+
* suggestions for how this "ought" to work, discussed in the netcdf-c library
39+
* https://github.com/Unidata/netcdf-c/issues/402
40+
41+
"""
42+
43+
import codecs
44+
45+
import numpy as np
46+
47+
from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper
48+
49+
50+
def decode_bytesarray_to_stringarray(
51+
byte_array, encoding="utf-8", string_width: int | None = None
52+
):
53+
"""Convert an array of bytes to an array of strings, with one less dimension.
54+
55+
N.B. for now at least, we assume the string dim is **always the last one**.
56+
If 'string_width' is not given, it is set to the final dimension of 'byte_array'.
57+
"""
58+
bytes_shape = byte_array.shape
59+
var_shape = bytes_shape[:-1]
60+
if string_width is None:
61+
string_width = bytes_shape[-1]
62+
string_dtype = f"U{string_width}"
63+
result = np.empty(var_shape, dtype=string_dtype)
64+
for ndindex in np.ndindex(var_shape):
65+
element_bytes = byte_array[ndindex]
66+
bytes = b"".join([b if b else b"\0" for b in element_bytes])
67+
string = bytes.decode(encoding)
68+
result[ndindex] = string
69+
return result
70+
71+
72+
def encode_stringarray_as_bytearray(
73+
data: np.ndarray, encoding=None, string_dimension_length: int | None = None
74+
) -> np.ndarray:
75+
"""Encode strings as bytearray.
76+
77+
Note: if 'string_dimension_length' is not given (None), it is set to the longest
78+
encoded bytes element. If 'string_dimension_length' is specified, the last array
79+
dimension is set to this and content strings are truncated or extended as required.
80+
"""
81+
element_shape = data.shape
82+
max_length = 1 # this is a MINIMUM - i.e. not zero!
83+
data_elements = np.zeros(element_shape, dtype=object)
84+
for index in np.ndindex(element_shape):
85+
data_element = data[index].encode(encoding=encoding)
86+
element_length = len(data_element)
87+
data_elements[index] = data_element
88+
if element_length > max_length:
89+
max_length = element_length
90+
91+
if string_dimension_length is None:
92+
string_dimension_length = max_length
93+
94+
# We already encoded all the strings, but stored them in an object-array as
95+
# we didn't yet know the fixed byte-length to convert to.
96+
# Now convert to a fixed-width byte array with an extra string-length dimension
97+
result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
98+
right_pad = b"\0" * string_dimension_length
99+
for index in np.ndindex(element_shape):
100+
bytes = data_elements[index]
101+
bytes = (bytes + right_pad)[:string_dimension_length]
102+
result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)]
103+
104+
return result
105+
106+
107+
class EncodedVariable(VariableWrapper):
    """A variable wrapper that translates variable data according to byte encodings.

    Character variables (dtype "S1") are presented as numpy unicode string
    arrays : reads decode bytes to strings, and writes encode strings to
    bytes, using the variable's effective "_Encoding".
    """

    def __getitem__(self, keys):
        """Read data, decoding character variables into string arrays."""
        if self.is_chardata():
            # Disable netCDF4's own char<->string translation, which is not
            # functional for non-ascii encodings (see module docstring).
            super().set_auto_chartostring(False)

        data = super().__getitem__(keys)

        if self.is_chardata():
            encoding = self.get_byte_encoding()
            strlen = self.get_string_length()
            data = decode_bytesarray_to_stringarray(data, encoding, strlen)

        return data

    def __setitem__(self, keys, data):
        """Write data, encoding string arrays for character variables."""
        if self.is_chardata():
            encoding = self.get_byte_encoding()
            strlen = self.get_string_length()
            data = encode_stringarray_as_bytearray(data, encoding, strlen)
            # Disable netCDF4's own char<->string translation, which is not
            # functional for non-ascii encodings (see module docstring).
            super().set_auto_chartostring(False)

        super().__setitem__(keys, data)

    def is_chardata(self):
        """Return whether this variable holds character data (dtype "S1")."""
        return np.issubdtype(self.dtype, np.bytes_)

    def get_byte_encoding(self):
        """Get the effective byte encoding to be used for this variable.

        Returns the normalised name of the codec named by the "_Encoding"
        attribute, or "utf-8" when that attribute is absent or unrecognised.

        N.B. renamed from "get_encoding", which is what __getitem__ and
        __setitem__ actually call - the old name is kept as an alias.
        """
        # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
        default_encoding = "utf-8"
        encoding = getattr(self, "_Encoding", None)
        if encoding is None:
            encoding = default_encoding
        else:
            try:
                # Accept + normalise naming of encodings
                encoding = codecs.lookup(encoding).name
                # NOTE: if encoding does not suit data, errors can occur.
                # For example, _Encoding = "ascii", with non-ascii content.
            except LookupError:
                # Replace some invalid setting with "safe"(ish) fallback.
                encoding = default_encoding
        # Return was previously missing : callers always received None.
        return encoding

    # Backwards-compatible alias for the original method name.
    get_encoding = get_byte_encoding

    def get_string_length(self):
        """Return the string-length defined for this variable (or None)."""
        return getattr(self, "iris_string_length", None)
155+
156+
157+
class EncodedDataset(DatasetWrapper):
    """A specialised DatasetWrapper whose variables perform byte encodings.

    Variables obtained from this dataset are wrapped as
    :class:`EncodedVariable`, so string/byte translation is automatic.
    """

    # Variable-wrapper class hook recognised by the GroupWrapper machinery.
    VAR_WRAPPER_CLS = EncodedVariable

lib/iris/fileformats/netcdf/_thread_safe_nc.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,9 @@ class GroupWrapper(_ThreadSafeWrapper):
158158
CONTAINED_CLASS = netCDF4.Group
159159
# Note: will also accept a whole Dataset object, but that is OK.
160160
_DUCKTYPE_CHECK_PROPERTIES = ["createVariable"]
161+
# Class to use when creating variable wrappers (default=VariableWrapper).
162+
# - needed to support _byte_encoded_data.EncodedDataset.
163+
VAR_WRAPPER_CLS = VariableWrapper
161164

162165
# All Group API that returns Dimension(s) is wrapped to instead return
163166
# DimensionWrapper(s).
@@ -202,7 +205,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]:
202205
"""
203206
with _GLOBAL_NETCDF4_LOCK:
204207
variables_ = self._contained_instance.variables
205-
return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()}
208+
return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()}
206209

207210
def createVariable(self, *args, **kwargs) -> VariableWrapper:
208211
"""Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK.
@@ -215,7 +218,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper:
215218
"""
216219
with _GLOBAL_NETCDF4_LOCK:
217220
new_variable = self._contained_instance.createVariable(*args, **kwargs)
218-
return VariableWrapper.from_existing(new_variable)
221+
return self.VAR_WRAPPER_CLS.from_existing(new_variable)
219222

220223
def get_variables_by_attributes(
221224
self, *args, **kwargs
@@ -233,7 +236,7 @@ def get_variables_by_attributes(
233236
variables_ = list(
234237
self._contained_instance.get_variables_by_attributes(*args, **kwargs)
235238
)
236-
return [VariableWrapper.from_existing(v) for v in variables_]
239+
return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_]
237240

238241
# All Group API that returns Group(s) is wrapped to instead return
239242
# GroupWrapper(s).
@@ -251,7 +254,7 @@ def groups(self):
251254
"""
252255
with _GLOBAL_NETCDF4_LOCK:
253256
groups_ = self._contained_instance.groups
254-
return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()}
257+
return {k: self.__class__.from_existing(v) for k, v in groups_.items()}
255258

256259
@property
257260
def parent(self):
@@ -267,7 +270,7 @@ def parent(self):
267270
"""
268271
with _GLOBAL_NETCDF4_LOCK:
269272
parent_ = self._contained_instance.parent
270-
return GroupWrapper.from_existing(parent_)
273+
return self.__class__.from_existing(parent_)
271274

272275
def createGroup(self, *args, **kwargs):
273276
"""Call createGroup() from netCDF4.Group/Dataset.
@@ -280,7 +283,7 @@ def createGroup(self, *args, **kwargs):
280283
"""
281284
with _GLOBAL_NETCDF4_LOCK:
282285
new_group = self._contained_instance.createGroup(*args, **kwargs)
283-
return GroupWrapper.from_existing(new_group)
286+
return self.__class__.from_existing(new_group)
284287

285288

286289
class DatasetWrapper(GroupWrapper):

0 commit comments

Comments
 (0)