4141"""
4242
4343import codecs
44+ import contextlib
45+ import threading
4446import warnings
4547
4648import numpy as np
4951
5052
5153def decode_bytesarray_to_stringarray (
52- byte_array : np .ndarray , encoding = "utf-8" , string_width : int | None = None
54+ byte_array : np .ndarray , encoding : str , string_width : int
5355) -> np .ndarray :
5456 """Convert an array of bytes to an array of strings, with one less dimension.
5557
5658 N.B. for now at least, we assume the string dim is **always the last one**.
5759 If 'string_width' is not given, it is set to the final dimension of 'byte_array'.
5860 """
61+ if np .ma .isMaskedArray (byte_array ):
62+ # netCDF4-python sees zeros as "missing" -- we don't need or want that
63+ byte_array = byte_array .data
5964 bytes_shape = byte_array .shape
6065 var_shape = bytes_shape [:- 1 ]
61- if string_width is None :
62- string_width = bytes_shape [- 1 ]
6366 string_dtype = f"U{ string_width } "
6467 result = np .empty (var_shape , dtype = string_dtype )
6568 for ndindex in np .ndindex (var_shape ):
@@ -70,16 +73,25 @@ def decode_bytesarray_to_stringarray(
7073 return result
7174
7275
73- def encode_stringarray_as_bytearray (
76+ #
77+ # TODO: remove?
78+ # this older version is "overly flexible", less efficient and not needed here.
79+ #
80+ def flexi_encode_stringarray_as_bytearray (
7481 data : np .ndarray , encoding = None , string_dimension_length : int | None = None
7582) -> np .ndarray :
7683 """Encode strings as bytearray.
7784
7885 Note: if 'string_dimension_length' is not given (None), it is set to the longest
79- encoded bytes element. If 'string_dimension_length' is specified, the last array
86+ encoded bytes element, **OR** the dtype size, if that is greater.
87+ If 'string_dimension_length' is specified, the last array
8088 dimension is set to this and content strings are truncated or extended as required.
8189 """
90+ if np .ma .isMaskedArray (data ):
91+ # netCDF4-python sees zeros as "missing" -- we don't need or want that
92+ data = data .data
8293 element_shape = data .shape
94+ # Encode all the strings + see which is longest
8395 max_length = 1 # this is a MINIMUM - i.e. not zero!
8496 data_elements = np .zeros (element_shape , dtype = object )
8597 for index in np .ndindex (element_shape ):
@@ -90,10 +102,15 @@ def encode_stringarray_as_bytearray(
90102 max_length = element_length
91103
92104 if string_dimension_length is None :
105+ # If the string length was not specified, it is the maximum encoded length
106+ # (n-bytes), **or** the dtype string-length, if greater.
93107 string_dimension_length = max_length
108+ array_string_length = int (str (data .dtype )[2 :]) # Yuck. No better public way?
109+ if array_string_length > string_dimension_length :
110+ string_dimension_length = array_string_length
94111
95- # We already encoded all the strings, but stored them in an object-array as
96- # we didn't yet know the fixed byte-length to convert to.
112+ # We maybe * already* encoded all the strings above , but stored them in an
113+ # object-array as we didn't yet know the fixed byte-length to convert to.
97114 # Now convert to a fixed-width byte array with an extra string-length dimension
98115 result = np .zeros (element_shape + (string_dimension_length ,), dtype = "S1" )
99116 right_pad = b"\0 " * string_dimension_length
@@ -105,58 +122,98 @@ def encode_stringarray_as_bytearray(
105122 return result
106123
107124
108- DEFAULT_ENCODING = "utf-8"
def encode_stringarray_as_bytearray(
    data: np.ndarray, encoding: str, string_dimension_length: int
) -> np.ndarray:
    """Encode an array of strings as a fixed-width array of single bytes.

    Parameters
    ----------
    data : np.ndarray
        Array of strings, of any shape (including 0-d).  A masked array is
        accepted: its mask is ignored and the underlying data is encoded.
    encoding : str
        Name of the codec used to encode each string element.
    string_dimension_length : int
        Size of the extra, trailing "string" dimension of the result.  Encoded
        elements are NUL-padded or truncated **in bytes** to exactly this length.

    Returns
    -------
    np.ndarray
        An "S1" array of shape ``data.shape + (string_dimension_length,)``.

    Raises
    ------
    UnicodeEncodeError
        If any element cannot be represented in 'encoding'.

    Notes
    -----
    Truncation operates on the encoded bytes, so with a multi-byte encoding it
    can cut a character in half.
    """
    if np.ma.isMaskedArray(data):
        # Same treatment as the other converters in this module : the mask is
        # not wanted, and masked elements would otherwise index as the "masked"
        # constant, which has no 'encode' method.
        data = data.data

    element_shape = data.shape
    result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
    for index in np.ndindex(element_shape):
        encoded = data[index].encode(encoding=encoding)
        # Force to exactly the required width : right-pad with NULs, then cut.
        fixed_width = encoded.ljust(string_dimension_length, b"\0")[
            :string_dimension_length
        ]
        # View the bytes as individual 1-byte elements to fill the string dim.
        result[index] = np.frombuffer(fixed_width, dtype="S1")

    return result
139+
140+
class NetcdfStringDecodeSetting(threading.local):
    """A thread-local on/off switch, usable as a boolean.

    The state is held in the 'perform_encoding' attribute.  Because this derives
    from `threading.local`, each thread has its own independent value, which
    starts from the value passed to the constructor.

    The state can be changed permanently with :meth:`set`, or temporarily with
    the :meth:`context` context manager.
    """

    def __init__(self, perform_encoding: bool = True):
        """Create the setting, with the given initial state."""
        self.set(perform_encoding)

    def set(self, perform_encoding: bool):
        """Set the state for the current thread."""
        self.perform_encoding = perform_encoding

    def __bool__(self):
        # Coerce, since __bool__ must return a real bool, whatever was stored.
        return bool(self.perform_encoding)

    @contextlib.contextmanager
    def context(self, perform_encoding: bool):
        """Temporarily set the state, restoring the previous value on exit.

        The previous value is restored even if the 'with' block raises.
        """
        old_setting = self.perform_encoding
        self.perform_encoding = perform_encoding
        try:
            yield
        finally:
            # Always restore, or an error inside the block would leave the
            # temporary state in force for the rest of the thread's life.
            self.perform_encoding = old_setting
157+
158+
159+ DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting ()
160+ DEFAULT_READ_ENCODING = "utf-8"
161+ DEFAULT_WRITE_ENCODING = "ascii"
109162
110163
111164class EncodedVariable (VariableWrapper ):
112165 """A variable wrapper that translates variable data according to byte encodings."""
113166
114167 def __getitem__ (self , keys ):
115- if self .is_chardata ():
116- super ().set_auto_chartostring (False )
168+ if self ._is_chardata ():
169+ # N.B. we never need to UNset this, as we totally control it
170+ self ._contained_instance .set_auto_chartostring (False )
117171
118172 data = super ().__getitem__ (keys )
119173
120- if self .is_chardata ():
121- encoding = self .get_byte_encoding ()
122- strlen = self .get_string_length ()
123- data = decode_bytesarray_to_stringarray (data , encoding , strlen )
174+ if DECODE_TO_STRINGS_ON_READ and self ._is_chardata ():
175+ encoding = self ._get_encoding () or DEFAULT_READ_ENCODING
176+ # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice
177+ strlen = self ._get_string_length ()
178+ try :
179+ data = decode_bytesarray_to_stringarray (data , encoding , strlen )
180+ except UnicodeDecodeError as err :
181+ msg = (
182+ f"Character data in variable { self .name !r} could not be decoded"
183+ f"with the { encoding !r} encoding. This can be fixed by setting the "
184+ "variable '_Encoding' attribute to suit the content."
185+ )
186+ raise ValueError (msg ) from err
124187
125188 return data
126189
127190 def __setitem__ (self , keys , data ):
128- if self .is_chardata ():
129- encoding = self .get_byte_encoding ()
130- strlen = self .get_string_length ()
131- if encoding is not None :
132- data = encode_stringarray_as_bytearray (data , encoding , strlen )
133- else :
191+ if self ._is_chardata ():
192+ # N.B. we never need to UNset this, as we totally control it
193+ self ._contained_instance .set_auto_chartostring (False )
194+
195+ encoding = self ._get_encoding () or DEFAULT_WRITE_ENCODING
196+ # N.B. typically, write encoding default is "ascii" --> fails bad content
197+ if data .dtype .kind == "U" :
134198 try :
135- # Check if all characters are valid ascii
136- data = encode_stringarray_as_bytearray (data , "ascii" , strlen )
137- except UnicodeEncodeError :
138- data = encode_stringarray_as_bytearray (
139- data , DEFAULT_ENCODING , strlen
140- )
141- # As this was necessary, record the new encoding on the variable
142- self .set_ncattr ("_Encoding" , DEFAULT_ENCODING )
199+ strlen = self ._get_string_length ()
200+ data = encode_stringarray_as_bytearray (data , encoding , strlen )
201+ except UnicodeEncodeError as err :
143202 msg = (
144- f"Non-ascii data written to label variable { self .name } . "
145- f"Applied { DEFAULT_ENCODING !r} encoding, "
146- f"and set attribute _Encoding={ DEFAULT_ENCODING !r} ."
203+ f"String data written to netcdf character variable { self .name !r} "
204+ f"could not be represented in encoding { encoding !r} . This can be "
205+ "fixed by setting a suitable variable '_Encoding' attribute, "
206+ 'e.g. <variable>._Encoding="UTF-8".'
147207 )
148- warnings .warn (msg , UserWarning )
149-
150- super ().set_auto_chartostring (False )
208+ raise ValueError (msg ) from err
151209
152210 super ().__setitem__ (keys , data )
153211
154- def is_chardata (self ):
212+ def _is_chardata (self ):
155213 return np .issubdtype (self .dtype , np .bytes_ )
156214
157- def get_encoding (self ) -> str | None :
158- """Get the effective byte encoding to be used for this variable."""
159- # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
215+ def _get_encoding (self ) -> str | None :
216+ """Get the byte encoding defined for this variable (or None)."""
160217 result = getattr (self , "_Encoding" , None )
161218 if result is not None :
162219 try :
@@ -165,18 +222,32 @@ def get_encoding(self) -> str | None:
165222 # NOTE: if encoding does not suit data, errors can occur.
166223 # For example, _Encoding = "ascii", with non-ascii content.
167224 except LookupError :
168- # Replace some invalid setting with "safe"(ish) fallback.
225+ # Unrecognised encoding name : handle this as just a warning
169226 msg = f"Unknown encoding for variable { self .name !r} : { result !r} "
170227 warnings .warn (msg , UserWarning )
171228
172229 return result
173230
174- def get_string_length (self ):
175- """Return the string-length defined for this variable (or None)."""
176- return getattr (self , "iris_string_length" , None )
231+ def _get_string_length (self ):
232+ """Return the string-length defined for this variable."""
233+ if not hasattr (self , "_strlen" ):
234+ # Work out the string length from the parent dataset dimensions.
235+ strlen = self .group ().dimensions [self .dimensions [- 1 ]].size
236+ # Cache this on the variable -- but not as a netcdf attribute (!)
237+ self .__dict__ ["_strlen" ] = strlen
238+
239+ return self ._strlen
240+
241+ def set_auto_chartostring (self , onoff : bool ):
242+ msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type."
243+ raise TypeError (msg )
177244
178245
class EncodedDataset(DatasetWrapper):
    """A DatasetWrapper specialisation, with variables that apply byte encodings."""

    # Presumably the class which DatasetWrapper uses to wrap its variables --
    # TODO confirm against DatasetWrapper.
    VAR_WRAPPER_CLS = EncodedVariable

    def set_auto_chartostring(self, onoff: bool):
        """Refuse the request : this always raises a TypeError."""
        raise TypeError(
            "auto_chartostring is not supported by Iris 'EncodedDataset' type."
        )
0 commit comments