Skip to content

Commit 295db4d

Browse files
authored
Protect NetCDF saving from bad Python-vs-HDF file lock timing (#6760)
* Protect NetCDF saving from bad Python-vs-HDF file lock timing. * Add test_NetCDFWriteProxy to list of files importing netCDF4. * Add test_abandon_many_failures.
1 parent e31fdec commit 295db4d

File tree

5 files changed

+121
-2
lines changed

5 files changed

+121
-2
lines changed

docs/src/whatsnew/latest.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ This document explains the changes made to Iris for this release
7777
using :meth:`~iris.cube.Cube.aggregated_by` or :meth:`~iris.cube.Cube.collapsed`.
7878
(:issue:`6473`, :pull:`6706`, :pull:`6719`)
7979

80+
#. `@trexfeathers`_ protected the NetCDF saving code from a transient I/O
81+
error, caused by bad synchronisation between Python-layer and HDF-layer
82+
file locking on certain filesystems. (:pull:`6760`)
83+
8084

8185
💣 Incompatible Changes
8286
=======================

lib/iris/fileformats/netcdf/_thread_safe_nc.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from abc import ABC
1212
from threading import Lock
13+
from time import sleep
1314
import typing
1415

1516
import netCDF4
@@ -386,7 +387,24 @@ def __setitem__(self, keys, array_data):
386387
with _GLOBAL_NETCDF4_LOCK:
387388
dataset = None
388389
try:
389-
dataset = netCDF4.Dataset(self.path, "r+")
390+
# Even when fully serialised - no parallelism - HDF still
391+
# occasionally fails to acquire the file. This is despite all
392+
# Python locks being available at expected moments, and the
393+
# file reporting as closed. During testing, 2nd retry always
394+
# succeeded. This is likely caused by HDF-level locking
395+
# running on a different timescale to Python-level locking -
396+
# i.e. sometimes Python has released its locks but HDF still
397+
# has not. Thought to be filesystem-dependent; further
398+
# investigation needed.
399+
for attempt in range(5):
400+
try:
401+
dataset = netCDF4.Dataset(self.path, "r+")
402+
break
403+
except OSError:
404+
if attempt < 4:
405+
sleep(0.1)
406+
else:
407+
raise
390408
var = dataset.variables[self.varname]
391409
var[keys] = array_data
392410
finally:

lib/iris/tests/test_coding_standards.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
import pytest
1616

1717
import iris
18-
from iris.fileformats.netcdf import _thread_safe_nc
1918
from iris.tests import system_test
2019

2120
LICENSE_TEMPLATE = """# Copyright Iris contributors
@@ -44,6 +43,9 @@ def test_netcdf4_import():
4443
# Please avoid including these phrases in any comments/strings throughout
4544
# Iris (e.g. use "from the netCDF4 library" instead) - this allows the
4645
# below search to remain quick and simple.
46+
from iris.fileformats.netcdf import _thread_safe_nc
47+
from iris.tests.unit.fileformats.netcdf._thread_safe_nc import test_NetCDFWriteProxy
48+
4749
import_strings = ("import netCDF4", "from netCDF4")
4850

4951
files_including_import = []
@@ -55,6 +57,7 @@ def test_netcdf4_import():
5557

5658
expected = [
5759
Path(_thread_safe_nc.__file__),
60+
Path(test_NetCDFWriteProxy.__file__),
5861
Path(system_test.__file__),
5962
Path(__file__),
6063
]
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Copyright Iris contributors
2+
#
3+
# This file is part of Iris and is released under the BSD license.
4+
# See LICENSE in the root of the repository for full licensing details.
5+
"""Unit tests for the :mod:`iris.fileformats.netcdf._thread_safe_nc` module.
6+
7+
Not required for a private module, but useful for specific checks.
8+
"""
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# Copyright Iris contributors
2+
#
3+
# This file is part of Iris and is released under the BSD license.
4+
# See LICENSE in the root of the repository for full licensing details.
5+
"""Unit tests for :class:`iris.fileformats.netcdf._thread_safe_nc.NetCDFWriteProxy`."""
6+
7+
from threading import Lock
8+
9+
import netCDF4 as nc
10+
from netCDF4 import Dataset as DatasetOriginal
11+
import pytest
12+
13+
from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, NetCDFWriteProxy
14+
15+
16+
@pytest.fixture
def dataset_path(tmp_path):
    """Provide the path of a scratch NetCDF file inside pytest's tmp dir."""
    return tmp_path.joinpath("test.nc")
19+
20+
21+
@pytest.fixture
def netcdf_variable(dataset_path):
    """Create a writable dataset holding a single 1-element float variable."""
    ds = DatasetWrapper(dataset_path, "w")
    ds.createDimension("dim1", 1)
    return ds.createVariable("test_var", "f4", ("dim1",))
31+
32+
33+
@pytest.fixture
def write_proxy(netcdf_variable) -> NetCDFWriteProxy:
    """Build a NetCDFWriteProxy for the fixture variable, closing its dataset.

    The dataset is closed before the proxy is handed to the test, so the
    proxy must re-open the file itself when written to.
    """
    parent = netcdf_variable.group()
    try:
        return NetCDFWriteProxy(
            filepath=parent.filepath(),
            cf_var=netcdf_variable,
            file_write_lock=Lock(),
        )
    finally:
        # Runs after the return value is computed, matching the original
        # create-then-close ordering.
        parent.close()
43+
44+
45+
class UnreliableDatasetMaker:
    """Stand-in for opening a Dataset that fails on its first call(s).

    Raises OSError for the first ``attempts_before_success`` calls, then
    delegates to the real ``netCDF4.Dataset``. This simulates the
    non-deterministic HDF locking errors which are difficult to debug at
    the Python layer - pending further investigation.
    """

    def __init__(self, attempts_before_success=1):
        # Number of calls that will raise before a real Dataset is returned.
        self.attempts_before_success = attempts_before_success
        self.call_count = 0

    def __call__(self, *args, **kwargs) -> nc.Dataset:
        self.call_count += 1
        if self.call_count > self.attempts_before_success:
            return DatasetOriginal(*args, **kwargs)
        raise OSError("Simulated non-deterministic HDF locking error")
62+
63+
64+
def test_handle_hdf_locking_error(dataset_path, monkeypatch, write_proxy):
    """Check NetCDFWriteProxy retries past a single simulated HDF locking error."""
    # First, demonstrate that a direct write (no proxy) propagates the error.
    monkeypatch.setattr(nc, "Dataset", UnreliableDatasetMaker())
    with pytest.raises(OSError, match="Simulated non-deterministic HDF locking error"):
        ds = nc.Dataset(write_proxy.path, "r+")
        ds.variables[write_proxy.varname][0] = 1.0

    # Re-arm the single failure, then write via the proxy, which should
    # retry internally and therefore succeed.
    monkeypatch.setattr(nc, "Dataset", UnreliableDatasetMaker())
    try:
        write_proxy[0] = 1.0
    except OSError:
        pytest.fail("NetCDFWriteProxy failed to handle HDF locking error")
78+
79+
80+
def test_abandon_many_failures(dataset_path, monkeypatch, write_proxy):
    """Check the proxy re-raises once its retry budget is exhausted."""
    # 10 consecutive failures exceeds the proxy's internal retry budget,
    # so the OSError must escape.
    always_failing = UnreliableDatasetMaker(attempts_before_success=10)
    monkeypatch.setattr(nc, "Dataset", always_failing)
    with pytest.raises(OSError, match="Simulated non-deterministic HDF locking error"):
        write_proxy[0] = 1.0

0 commit comments

Comments
 (0)