Skip to content

Commit 4c02c2d

Browse files
committed
retry write_sys call on device busy
This change adds a retry_if_busy decorator to the read_sys and write_sys functions in the filesystem module that will retry reads and writes up to 5 times with an linear backoff. This allows nova to tolerate short periods of time where sysfs retruns device busy. If the reties are exausted and offlineing a core fails a warning is log and the failure is ignored. onling a core is always treated as a hard error if retries are exausted. Closes-Bug: #2065927 Change-Id: I2a6a9f243cb403167620405e167a8dd2bbf3fa79 (cherry picked from commit 44c1b48) (cherry picked from commit 1581f66) (cherry picked from commit 6a475ac)
1 parent 68b9934 commit 4c02c2d

File tree

5 files changed

+123
-13
lines changed

5 files changed

+123
-13
lines changed

nova/exception.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1131,6 +1131,10 @@ class FileNotFound(NotFound):
11311131
msg_fmt = _("File %(file_path)s could not be found.")
11321132

11331133

1134+
class DeviceBusy(NovaException):
1135+
msg_fmt = _("device %(file_path)s is busy.")
1136+
1137+
11341138
class ClassNotFound(NotFound):
11351139
msg_fmt = _("Class %(class_name)s could not be found: %(exception)s")
11361140

nova/filesystem.py

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,48 +12,97 @@
1212

1313
"""Functions to address filesystem calls, particularly sysfs."""
1414

15+
import functools
1516
import os
17+
import time
1618

1719
from oslo_log import log as logging
1820

1921
from nova import exception
2022

2123
LOG = logging.getLogger(__name__)
24+
SYS = '/sys'
25+
RETRY_LIMIT = 5
2226

2327

24-
SYS = '/sys'
28+
# a retry decorator to handle EBUSY
29+
def retry_if_busy(func):
30+
"""Decorator to retry a function if it raises DeviceBusy.
31+
32+
This decorator will retry the function RETRY_LIMIT=5 times if it raises
33+
DeviceBusy. It will sleep for 1 second on the first retry, 2 seconds on
34+
the second retry, and so on, up to RETRY_LIMIT seconds. If the function
35+
still raises DeviceBusy after RETRY_LIMIT retries, the exception will be
36+
raised.
37+
"""
2538

39+
@functools.wraps(func)
40+
def wrapper(*args, **kwargs):
41+
for i in range(RETRY_LIMIT):
42+
try:
43+
return func(*args, **kwargs)
44+
except exception.DeviceBusy as e:
45+
# if we have retried RETRY_LIMIT times, raise the exception
46+
# otherwise, sleep and retry, i is 0-based so we need
47+
# to add 1 to it
48+
count = i + 1
49+
if count < RETRY_LIMIT:
50+
LOG.debug(
51+
f"File {e.kwargs['file_path']} is busy, "
52+
f"sleeping {count} seconds before retrying")
53+
time.sleep(count)
54+
continue
55+
raise
56+
return wrapper
2657

2758
# NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
59+
60+
61+
@retry_if_busy
2862
def read_sys(path: str) -> str:
2963
"""Reads the content of a file in the sys filesystem.
3064
3165
:param path: relative or absolute. If relative, will be prefixed by /sys.
3266
:returns: contents of that file.
3367
:raises: nova.exception.FileNotFound if we can't read that file.
68+
:raises: nova.exception.DeviceBusy if the file is busy.
3469
"""
3570
try:
3671
# The path can be absolute with a /sys prefix but that's fine.
3772
with open(os.path.join(SYS, path), mode='r') as data:
3873
return data.read()
39-
except (OSError, ValueError) as exc:
74+
except OSError as exc:
75+
# errno 16 is EBUSY, which means the file is busy.
76+
if exc.errno == 16:
77+
raise exception.DeviceBusy(file_path=path) from exc
78+
# convert permission denied to file not found
79+
raise exception.FileNotFound(file_path=path) from exc
80+
except ValueError as exc:
4081
raise exception.FileNotFound(file_path=path) from exc
4182

4283

4384
# NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
4485
# In order to correctly use it, you need to decorate the caller with a specific
4586
# privsep entrypoint.
87+
@retry_if_busy
4688
def write_sys(path: str, data: str) -> None:
4789
"""Writes the content of a file in the sys filesystem with data.
4890
4991
:param path: relative or absolute. If relative, will be prefixed by /sys.
5092
:param data: the data to write.
5193
:returns: contents of that file.
5294
:raises: nova.exception.FileNotFound if we can't write that file.
95+
:raises: nova.exception.DeviceBusy if the file is busy.
5396
"""
5497
try:
5598
# The path can be absolute with a /sys prefix but that's fine.
5699
with open(os.path.join(SYS, path), mode='w') as fd:
57100
fd.write(data)
58-
except (OSError, ValueError) as exc:
101+
except OSError as exc:
102+
# errno 16 is EBUSY, which means the file is busy.
103+
if exc.errno == 16:
104+
raise exception.DeviceBusy(file_path=path) from exc
105+
# convert permission denied to file not found
106+
raise exception.FileNotFound(file_path=path) from exc
107+
except ValueError as exc:
59108
raise exception.FileNotFound(file_path=path) from exc

nova/tests/functional/libvirt/test_power_manage.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
from nova import objects
2020
from nova.tests import fixtures as nova_fixtures
2121
from nova.tests.fixtures import libvirt as fakelibvirt
22-
from nova.tests.functional.api import client
2322
from nova.tests.functional.libvirt import base
2423
from nova.virt import hardware
2524
from nova.virt.libvirt.cpu import api as cpu_api
@@ -276,23 +275,24 @@ def test_delete_server(self):
276275
self._assert_cpu_set_state(cpu_dedicated_set, expected='offline')
277276

278277
def test_delete_server_device_busy(self):
278+
# This test verifies bug 2065927 is resolved.
279279
server = self.test_create_server()
280-
281280
inst = objects.Instance.get_by_uuid(self.ctxt, server['id'])
282281
instance_pcpus = inst.numa_topology.cpu_pinning
283282
self._assert_cpu_set_state(instance_pcpus, expected='online')
284283
with mock.patch(
285-
'nova.filesystem.write_sys',
286-
side_effect=exception.FileNotFound(file_path='fake')):
287-
# This is bug 2065927
288-
self.assertRaises(
289-
client.OpenStackApiException, self._delete_server, server)
284+
'nova.filesystem.write_sys.__wrapped__',
285+
side_effect=[
286+
exception.DeviceBusy(file_path='fake'),
287+
None]):
288+
289+
self._delete_server(server)
290290
cpu_dedicated_set = hardware.get_cpu_dedicated_set()
291291
# Verify that the unused CPUs are still offline
292292
unused_cpus = cpu_dedicated_set - instance_pcpus
293293
self._assert_cpu_set_state(unused_cpus, expected='offline')
294-
# but the instance cpus are still online
295-
self._assert_cpu_set_state(instance_pcpus, expected='online')
294+
# and the pinned CPUs are offline
295+
self._assert_cpu_set_state(instance_pcpus, expected='offline')
296296

297297
def test_create_server_with_emulator_threads_isolate(self):
298298
server = self._create_server(

nova/tests/unit/test_filesystem.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# License for the specific language governing permissions and limitations
1111
# under the License.
1212

13+
import io
1314
import os
1415
from unittest import mock
1516

@@ -35,6 +36,29 @@ def test_read_sys_error(self):
3536
expected_path = os.path.join(filesystem.SYS, 'foo')
3637
m_open.assert_called_once_with(expected_path, mode='r')
3738

39+
def test_read_sys_retry(self):
40+
open_mock = mock.mock_open()
41+
with mock.patch('builtins.open', open_mock) as m_open:
42+
m_open.side_effect = [
43+
OSError(16, 'Device or resource busy'),
44+
io.StringIO('bar'),
45+
]
46+
self.assertEqual('bar', filesystem.read_sys('foo'))
47+
expected_path = os.path.join(filesystem.SYS, 'foo')
48+
m_open.assert_has_calls([
49+
mock.call(expected_path, mode='r'),
50+
mock.call(expected_path, mode='r'),
51+
])
52+
53+
def test_read_sys_retry_limit(self):
54+
open_mock = mock.mock_open()
55+
with mock.patch('builtins.open', open_mock) as m_open:
56+
m_open.side_effect = (
57+
[OSError(16, 'Device or resource busy')] *
58+
(filesystem.RETRY_LIMIT + 1))
59+
self.assertRaises(
60+
exception.DeviceBusy, filesystem.read_sys, 'foo')
61+
3862
def test_write_sys(self):
3963
open_mock = mock.mock_open()
4064
with mock.patch('builtins.open', open_mock) as m_open:
@@ -50,3 +74,26 @@ def test_write_sys_error(self):
5074
filesystem.write_sys, 'foo', 'bar')
5175
expected_path = os.path.join(filesystem.SYS, 'foo')
5276
m_open.assert_called_once_with(expected_path, mode='w')
77+
78+
def test_write_sys_retry(self):
79+
open_mock = mock.mock_open()
80+
with mock.patch('builtins.open', open_mock) as m_open:
81+
m_open.side_effect = [
82+
OSError(16, 'Device or resource busy'),
83+
io.StringIO(),
84+
]
85+
self.assertIsNone(filesystem.write_sys('foo', 'bar'))
86+
expected_path = os.path.join(filesystem.SYS, 'foo')
87+
m_open.assert_has_calls([
88+
mock.call(expected_path, mode='w'),
89+
mock.call(expected_path, mode='w'),
90+
])
91+
92+
def test_write_sys_retry_limit(self):
93+
open_mock = mock.mock_open()
94+
with mock.patch('builtins.open', open_mock) as m_open:
95+
m_open.side_effect = (
96+
[OSError(16, 'Device or resource busy')] *
97+
(filesystem.RETRY_LIMIT + 1))
98+
self.assertRaises(
99+
exception.DeviceBusy, filesystem.write_sys, 'foo', 'bar')

nova/virt/libvirt/cpu/core.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,23 @@ def get_online(core: int) -> bool:
5656

5757
@nova.privsep.sys_admin_pctxt.entrypoint
5858
def set_online(core: int) -> bool:
59+
# failure to online a core should be considered a failure
60+
# so we don't catch any exception here.
5961
filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='1')
6062
return get_online(core)
6163

6264

6365
@nova.privsep.sys_admin_pctxt.entrypoint
6466
def set_offline(core: int) -> bool:
65-
filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='0')
67+
try:
68+
filesystem.write_sys(os.path.join(
69+
gen_cpu_path(core), 'online'), data='0')
70+
except exception.DeviceBusy:
71+
# if nova is not able to offline a core it should not break anything
72+
# so we just log a warning and return False to indicate that the core
73+
# is not offline.
74+
LOG.warning('Unable to offline CPU: %s', core)
75+
return False
6676
return not get_online(core)
6777

6878

0 commit comments

Comments
 (0)