From 48c1e9ce57136d5c01b6aab0230fe56ac4f7fef9 Mon Sep 17 00:00:00 2001 From: Martin Yeo Date: Thu, 23 Oct 2025 13:57:09 +0100 Subject: [PATCH 1/3] Protect NetCDF saving from bad Python-vs-HDF file lock timing. --- docs/src/whatsnew/latest.rst | 4 + .../fileformats/netcdf/_thread_safe_nc.py | 20 ++++- .../netcdf/_thread_safe_nc/__init__.py | 8 ++ .../_thread_safe_nc/test_NetCDFWriteProxy.py | 76 +++++++++++++++++++ 4 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/__init__.py create mode 100644 lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/test_NetCDFWriteProxy.py diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index f4c5f66716..c2c79758ee 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -65,6 +65,10 @@ This document explains the changes made to Iris for this release using :meth:`~iris.cube.Cube.aggregated_by` or :meth:`~iris.cube.Cube.collapsed`. (:issue:`6473`, :pull:`6706`, :pull:`6719`) +#. `@trexfeathers`_ protected the NetCDF saving code from a transient I/O + error, caused by bad synchronisation between Python-layer and HDF-layer + file locking on certain filesystems. (:pull:`6760`). + 💣 Incompatible Changes ======================= diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 35588eb2c4..33183ef0fa 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -10,6 +10,7 @@ from abc import ABC from threading import Lock +from time import sleep import typing import netCDF4 @@ -386,7 +387,24 @@ def __setitem__(self, keys, array_data): with _GLOBAL_NETCDF4_LOCK: dataset = None try: - dataset = netCDF4.Dataset(self.path, "r+") + # Even when fully serialised - no parallelism - HDF still + # occasionally fails to acquire the file. This is despite all + # Python locks being available at expected moments, and the + # file reporting as closed. During testing, 2nd retry always + # succeeded. This is likely caused by HDF-level locking + # running on a different timescale to Python-level locking - + # i.e. sometimes Python has released its locks but HDF still + # has not. Thought to be filesystem-dependent; further + # investigation needed. + for attempt in range(5): + try: + dataset = netCDF4.Dataset(self.path, "r+") + break + except OSError: + if attempt < 4: + sleep(0.1) + else: + raise var = dataset.variables[self.varname] var[keys] = array_data finally: diff --git a/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/__init__.py b/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/__init__.py new file mode 100644 index 0000000000..be9cff14ec --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/__init__.py @@ -0,0 +1,8 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Unit tests for the :mod:`iris.fileformats.netcdf._thread_safe_nc` module. + +Not required for a private module, but useful for specific checks. +""" diff --git a/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/test_NetCDFWriteProxy.py b/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/test_NetCDFWriteProxy.py new file mode 100644 index 0000000000..0a281036ed --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/test_NetCDFWriteProxy.py @@ -0,0 +1,76 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Unit tests for :class:`iris.fileformats.netcdf._thread_safe_nc.NetCDFWriteProxy`.""" + +from threading import Lock + +import netCDF4 as nc +from netCDF4 import Dataset as DatasetOriginal +import pytest + +from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, NetCDFWriteProxy + + +@pytest.fixture +def dataset_path(tmp_path): + return tmp_path / "test.nc" + + +@pytest.fixture +def netcdf_variable(dataset_path): + dataset = DatasetWrapper(dataset_path, "w") + _ = dataset.createDimension("dim1", 1) + variable = dataset.createVariable( + "test_var", + "f4", + ("dim1",), + ) + return variable + + +@pytest.fixture +def write_proxy(netcdf_variable) -> NetCDFWriteProxy: + dataset = netcdf_variable.group() + proxy = NetCDFWriteProxy( + filepath=dataset.filepath(), + cf_var=netcdf_variable, + file_write_lock=Lock(), + ) + dataset.close() + return proxy + + +class UnreliableDatasetMaker: + """A mock operation that returns a Dataset, but fails the first time it is called. + + This simulates non-deterministic HDF locking errors which are difficult to + debug at the Python layer - pending further investigation. + """ + + def __init__(self): + self.call_count = 0 + + def __call__(self, *args, **kwargs) -> nc.Dataset: + self.call_count += 1 + if self.call_count < 2: + raise OSError("Simulated non-deterministic HDF locking error") + else: + return DatasetOriginal(*args, **kwargs) + + +def test_handle_hdf_locking_error(dataset_path, monkeypatch, write_proxy): + """Test that NetCDFWriteProxy can handle non-deterministic HDF locking errors.""" + monkeypatch.setattr(nc, "Dataset", UnreliableDatasetMaker()) + with pytest.raises(OSError, match="Simulated non-deterministic HDF locking error"): + dataset = nc.Dataset(write_proxy.path, "r+") + var = dataset.variables[write_proxy.varname] + var[0] = 1.0 + + # Reset. + monkeypatch.setattr(nc, "Dataset", UnreliableDatasetMaker()) + try: + write_proxy[0] = 1.0 + except OSError: + pytest.fail("NetCDFWriteProxy failed to handle HDF locking error") From 380d0eb1ecb81cb83bf1f4a905c015ea947437a9 Mon Sep 17 00:00:00 2001 From: Martin Yeo Date: Thu, 23 Oct 2025 14:14:28 +0100 Subject: [PATCH 2/3] Add test_NetCDFWriteProxy to list of files importing netCDF4. --- lib/iris/tests/test_coding_standards.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/iris/tests/test_coding_standards.py b/lib/iris/tests/test_coding_standards.py index d44ccadff6..f51a531721 100644 --- a/lib/iris/tests/test_coding_standards.py +++ b/lib/iris/tests/test_coding_standards.py @@ -15,7 +15,6 @@ import pytest import iris -from iris.fileformats.netcdf import _thread_safe_nc from iris.tests import system_test LICENSE_TEMPLATE = """# Copyright Iris contributors @@ -44,6 +43,9 @@ def test_netcdf4_import(): # Please avoid including these phrases in any comments/strings throughout # Iris (e.g. use "from the netCDF4 library" instead) - this allows the # below search to remain quick and simple. + from iris.fileformats.netcdf import _thread_safe_nc + from iris.tests.unit.fileformats.netcdf._thread_safe_nc import test_NetCDFWriteProxy + import_strings = ("import netCDF4", "from netCDF4") files_including_import = [] @@ -55,6 +57,7 @@ def test_netcdf4_import(): expected = [ Path(_thread_safe_nc.__file__), + Path(test_NetCDFWriteProxy.__file__), Path(system_test.__file__), Path(__file__), ] From b89f40cde8f7529cf5f2488bbbc6e8828dfadd76 Mon Sep 17 00:00:00 2001 From: Martin Yeo Date: Wed, 29 Oct 2025 12:18:42 +0000 Subject: [PATCH 3/3] Add test_abandon_many_failures. --- .../_thread_safe_nc/test_NetCDFWriteProxy.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/test_NetCDFWriteProxy.py b/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/test_NetCDFWriteProxy.py index 0a281036ed..b33cb515a2 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/test_NetCDFWriteProxy.py +++ b/lib/iris/tests/unit/fileformats/netcdf/_thread_safe_nc/test_NetCDFWriteProxy.py @@ -49,12 +49,13 @@ class UnreliableDatasetMaker: debug at the Python layer - pending further investigation. """ - def __init__(self): + def __init__(self, attempts_before_success=1): + self.attempts_before_success = attempts_before_success self.call_count = 0 def __call__(self, *args, **kwargs) -> nc.Dataset: self.call_count += 1 - if self.call_count < 2: + if self.call_count <= self.attempts_before_success: raise OSError("Simulated non-deterministic HDF locking error") else: return DatasetOriginal(*args, **kwargs) @@ -74,3 +75,12 @@ def test_handle_hdf_locking_error(dataset_path, monkeypatch, write_proxy): write_proxy[0] = 1.0 except OSError: pytest.fail("NetCDFWriteProxy failed to handle HDF locking error") + + +def test_abandon_many_failures(dataset_path, monkeypatch, write_proxy): + """Test that NetCDFWriteProxy gives up after many failed attempts.""" + monkeypatch.setattr( + nc, "Dataset", UnreliableDatasetMaker(attempts_before_success=10) + ) + with pytest.raises(OSError, match="Simulated non-deterministic HDF locking error"): + write_proxy[0] = 1.0