Skip to content

Commit 54d7743

Browse files
committed
Test writing of string data: various encodings, from strings or bytes.
1 parent 9bdeb5d commit 54d7743

File tree

1 file changed

+155
-10
lines changed

1 file changed

+155
-10
lines changed

lib/iris/tests/integration/netcdf/test_stringdata.py

Lines changed: 155 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,17 @@
88
data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures.
99
"""
1010

11-
from contextlib import contextmanager
1211
from dataclasses import dataclass
1312
from pathlib import Path
13+
from typing import Iterable
1414

1515
import numpy as np
16+
from numpy.typing import ArrayLike
1617
import pytest
1718

1819
import iris
20+
from iris.coords import AuxCoord, DimCoord
21+
from iris.cube import Cube
1922
from iris.fileformats.netcdf import _thread_safe_nc
2023

2124

@@ -49,8 +52,8 @@ def all_lazy_auxcoords():
4952
# Independently defined here, to avoid relying on any code we are testing.
5053
#
5154
def convert_strings_to_chararray(
52-
string_array_1d: np.ndarray, maxlen: int, encoding: str | None = None
53-
):
55+
string_array_1d: ArrayLike, maxlen: int, encoding: str | None = None
56+
) -> np.ndarray:
5457
# Note: this is limited to 1-D arrays of strings.
5558
# Could generalise that if needed, but for now this makes it simpler.
5659
if encoding is None:
@@ -63,12 +66,13 @@ def convert_strings_to_chararray(
6366

6467

6568
def convert_bytearray_to_strings(
66-
byte_array, encoding="utf-8", string_length: int | None = None
67-
):
69+
byte_array: ArrayLike, encoding: str = "utf-8", string_length: int | None = None
70+
) -> np.ndarray:
6871
"""Convert bytes to strings.
6972
7073
N.B. for now at least, we assume the string dim is **always the last one**.
7174
"""
75+
byte_array = np.asanyarray(byte_array)
7276
bytes_shape = byte_array.shape
7377
var_shape = bytes_shape[:-1]
7478
if string_length is None:
@@ -88,9 +92,9 @@ class SamplefileDetails:
8892
"""Convenience container for information about a sample file."""
8993

9094
filepath: Path
91-
datavar_data: np.ndarray
92-
stringcoord_data: np.ndarray
93-
numericcoord_data: np.ndarray
95+
datavar_data: ArrayLike
96+
stringcoord_data: ArrayLike
97+
numericcoord_data: ArrayLike
9498

9599

96100
def make_testfile(
@@ -200,7 +204,7 @@ def testdata(
200204
encoding,
201205
tmp_path,
202206
use_separate_dims,
203-
):
207+
) -> Iterable[SamplefileDetails]:
204208
"""Create a suitable valid testfile, and return expected string content."""
205209
if PERSIST_TESTFILES:
206210
tmp_path = Path(PERSIST_TESTFILES).expanduser()
@@ -218,7 +222,7 @@ def testdata(
218222
from iris.tests.integration.netcdf.test_chararrays import ncdump
219223

220224
# TODO: temporary for debug -- TO REMOVE
221-
ncdump(tempfile_path)
225+
ncdump(str(tempfile_path))
222226
yield testdata
223227

224228
def test_valid_encodings(self, encoding, testdata: SamplefileDetails):
@@ -246,3 +250,144 @@ def test_valid_encodings(self, encoding, testdata: SamplefileDetails):
246250
coord_var_2 = cube.coord("v_numeric")
247251
assert coord_var_2.dtype == np.float64
248252
assert np.all(coord_var_2.points == numeric_data)
253+
254+
255+
@pytest.fixture(params=["stringdata", "bytedata"])
def as_bytes(request):
    """Parametrised switch: True when the test data should be bytes, not strings."""
    use_bytes = request.param == "bytedata"
    yield use_bytes
258+
259+
260+
@dataclass
class SampleCubeDetails:
    """Convenience container for a constructed test cube and its expected content."""

    # The constructed test cube (a data variable plus a string aux coord).
    cube: Cube
    # Expected array content of the cube's data variable.
    datavar_data: np.ndarray
    # Expected array content of the string auxiliary coordinate.
    stringcoord_data: np.ndarray
    # Where the cube was saved, once it has been (filled in by the 'testdata' fixture).
    save_path: str | Path | None = None
266+
267+
268+
def make_testcube(
    encoding_str: str | None = None,
    byte_data: bool = False,
) -> SampleCubeDetails:
    """Build a test cube with string (or byte) data and a string aux coord.

    Both the data variable and the coordinate get an "_Encoding" attribute,
    except when 'encoding_str' is the no-encoding marker.
    """
    # Choose sample strings: pure-ASCII for ascii/no-encoding cases, otherwise
    # include non-ASCII characters to exercise the encoding.
    if encoding_str in (NO_ENCODING_STR, "ascii"):
        coord_strings = ["mOnster", "London", "Amsterdam"]
        data_strings = ["bun", "Eclair", "sandwich"]
    else:
        coord_strings = ["Münster", "London", "Amsterdam"]
        data_strings = ["bun", "éclair", "sandwich"]

    if byte_data:
        # Pre-encode the strings into fixed-width character (byte) arrays.
        encode_with = encoding_str
        if encode_with == NO_ENCODING_STR:
            encode_with = "ascii"
        coord_array = convert_strings_to_chararray(
            coord_strings, maxlen=N_CHARS_DIM, encoding=encode_with
        )
        data_array = convert_strings_to_chararray(
            data_strings, maxlen=N_CHARS_DIM, encoding=encode_with
        )
    else:
        # Plain unicode string arrays, width-limited to fit the character dim.
        n_chars = N_CHARS_DIM
        if encoding_str == "utf-32":
            # utf-32 needs 4 bytes per character (the extra -1 presumably
            # allows for a BOM -- TODO confirm).
            n_chars = n_chars // 4 - 1
        strings_dtype = np.dtype(f"U{n_chars}")
        coord_array = np.array(coord_strings, dtype=strings_dtype)
        data_array = np.array(data_strings, dtype=strings_dtype)

    cube = Cube(data_array, var_name="v")
    cube.add_dim_coord(DimCoord(np.arange(N_XDIM), var_name="x"), 0)
    co_x = AuxCoord(coord_array, var_name="v_co")
    if encoding_str != NO_ENCODING_STR:
        cube.attributes["_Encoding"] = encoding_str
        co_x.attributes["_Encoding"] = encoding_str
    # Byte arrays carry an extra (character) dimension, mapped to cube dim 1.
    co_dims = (0, 1) if byte_data else (0,)
    cube.add_aux_coord(co_x, co_dims)

    return SampleCubeDetails(
        cube=cube,
        datavar_data=data_array,
        stringcoord_data=coord_array,
    )
316+
317+
318+
class TestWriteEncodings:
    """Test saving of testfiles with encoded string data.

    To avoid circularity, we generate and save *cube* data.
    """

    @pytest.fixture(params=["dataAsStrings", "dataAsBytes"])
    def write_bytes(self, request):
        # True when the cube content should be built as byte (char) arrays.
        yield request.param == "dataAsBytes"

    @pytest.fixture()
    def testpath(self, encoding, write_bytes, tmp_path):
        """Return a target filepath for the given encoding + data-form combination."""
        if PERSIST_TESTFILES:
            tmp_path = Path(PERSIST_TESTFILES).expanduser()
        # Use the shared no-encoding marker constant, for consistency with the
        # rest of this module (was a duplicated "<noencoding>" literal).
        if encoding == NO_ENCODING_STR:
            filetag = "noencoding"
        else:
            filetag = encoding
        datatag = "writebytes" if write_bytes else "writestrings"
        tempfile_path = tmp_path / f"sample_write_{filetag}_{datatag}.nc"
        yield tempfile_path

    @pytest.fixture()
    def testdata(self, testpath, encoding, write_bytes):
        """Create a suitable test cube + save to a file.

        Apply the given encoding to both coord and cube data.
        Form the data as bytes, or as strings, depending on 'write_bytes'.
        """
        cube_info = make_testcube(encoding_str=encoding, byte_data=write_bytes)
        cube_info.save_path = testpath
        cube = cube_info.cube
        iris.save(cube, testpath)
        yield cube_info

    def test_valid_encodings(self, encoding, testdata, write_bytes):
        cube_info = testdata
        cube, path = cube_info.cube, cube_info.save_path
        # TODO: not testing the "byte read/write" yet
        # Make a quick check for cube equality : but the presentation depends on the read mode
        # with DECODE_TO_STRINGS_ON_READ.context(not write_bytes):
        #     read_cube = iris.load_cube(path)
        #     assert read_cube == cube

        # N.B. file content should not depend on whether bytes or strings were written
        vararray, coordarray = cube_info.datavar_data, cube_info.stringcoord_data
        ds = _thread_safe_nc.DatasetWrapper(path)
        try:
            # Fetch raw character data: suppress automatic bytes->string conversion.
            ds.set_auto_chartostring(False)
            v_main = ds.variables["v"]
            v_co = ds.variables["v_co"]
            assert v_main.shape == (N_XDIM, N_CHARS_DIM)
            assert v_co.shape == (N_XDIM, N_CHARS_DIM)
            assert v_main.dtype == "<S1"
            assert v_co.dtype == "<S1"
            if encoding == NO_ENCODING_STR:
                assert "_Encoding" not in v_main.ncattrs()
                assert "_Encoding" not in v_co.ncattrs()
            else:
                assert v_main.getncattr("_Encoding") == encoding
                assert v_co.getncattr("_Encoding") == encoding
            # Read the variable data out before closing the dataset.
            data_main = v_main[:]
            data_co = v_co[:]
        finally:
            # Always release the file handle (was previously left open).
            ds.close()
        if not write_bytes:
            # convert to strings, to compare with originals
            # ("ELSE": vararray/coordarray are bytes anyway)
            if encoding == NO_ENCODING_STR:
                encoding = "ascii"
            data_main = convert_bytearray_to_strings(
                data_main, encoding, string_length=N_CHARS_DIM
            )
            data_co = convert_bytearray_to_strings(
                data_co, encoding, string_length=N_CHARS_DIM
            )
        assert np.all(data_main == vararray)
        assert np.all(data_co == coordarray)

0 commit comments

Comments
 (0)