Skip to content

Commit 30317e6

Browse files
author
Balazs Gibizer
committed
Replace blind retry with libvirt event waiting in detach
Nova so far applied a retry loop that tried to periodically detach the device from libvirt while the device was visible in the domain xml. This could lead to an issue where an already progressing detach on the libvirt side is interrupted by nova re-sending the detach request for the same device. See bug #1882521 for more information. Also if there was both a persistent and a live domain the nova tried the detach from both at the same call. This lead to confusion about the result when such call failed. Was the detach failed partially? We can do better, at least for the live detach case. Based on the libvirt developers detaching from the persistent domain always succeeds and it is a synchronous process. Detaching from the live domain can be both synchronous or asynchronous depending on the guest OS and the load on the hypervisor. But for live detach libvirt always sends an event [1] nova can wait for. So this patch does two things. 1) Separates the detach from the persistent domain from the detach from the live domain to make the error cases clearer. 2) Changes the retry mechanism. Detaching from the persistent domain is not retried. If libvirt reports device not found, while both persistent and live detach is needed, the error is ignored, and the process continues with the live detach. In any other case the error considered as fatal. Detaching from the live domain is changed to always wait for the libvirt event. In case of timeout, the live detach is retried. But a failure event from libvirt considered fatal, based on the information from the libvirt developers, so in this case the detach is not retried. Related-Bug: #1882521 [1]https://libvirt.org/html/libvirt-libvirt-domain.html#virConnectDomainEventDeviceRemovedCallback Change-Id: I7f2b6330decb92e2838aa7cee47fb228f00f47da (cherry picked from commit e56cc4f)
1 parent a5ce4d8 commit 30317e6

File tree

8 files changed

+1144
-385
lines changed

8 files changed

+1144
-385
lines changed

nova/conf/libvirt.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -875,6 +875,30 @@
875875
* It's recommended to consider including ``x86_64=q35`` in
876876
:oslo.config:option:`libvirt.hw_machine_type`; see
877877
:ref:`deploying-sev-capable-infrastructure` for more on this.
878+
"""),
879+
cfg.IntOpt('device_detach_attempts',
880+
default=8,
881+
min=1,
882+
help="""
883+
Maximum number of attempts the driver tries to detach a device in libvirt.
884+
885+
Related options:
886+
887+
* :oslo.config:option:`libvirt.device_detach_timeout`
888+
889+
"""),
890+
cfg.IntOpt('device_detach_timeout',
891+
default=20,
892+
min=1,
893+
help="""
894+
Maximum number of seconds the driver waits for the success or the failure
895+
event from libvirt for a given device detach attempt before it re-trigger the
896+
detach.
897+
898+
Related options:
899+
900+
* :oslo.config:option:`libvirt.device_detach_attempts`
901+
878902
"""),
879903
]
880904

nova/tests/unit/virt/libvirt/fakelibvirt.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1278,9 +1278,17 @@ def attachDeviceFlags(self, xml, flags):
12781278
self.attachDevice(xml)
12791279

12801280
def detachDevice(self, xml):
1281+
# TODO(gibi): this should handle nics similarly to attachDevice()
12811282
disk_info = _parse_disk_info(etree.fromstring(xml))
1282-
disk_info['_attached'] = True
1283-
return disk_info in self._def['devices']['disks']
1283+
attached_disk_info = None
1284+
for attached_disk in self._def['devices']['disks']:
1285+
if attached_disk['target_dev'] == disk_info.get('target_dev'):
1286+
attached_disk_info = attached_disk
1287+
break
1288+
1289+
if attached_disk_info:
1290+
self._def['devices']['disks'].remove(attached_disk_info)
1291+
return attached_disk_info is not None
12841292

12851293
def detachDeviceFlags(self, xml, flags):
12861294
self.detachDevice(xml)

nova/tests/unit/virt/libvirt/test_driver.py

Lines changed: 745 additions & 60 deletions
Large diffs are not rendered by default.

nova/tests/unit/virt/libvirt/test_guest.py

Lines changed: 0 additions & 207 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from oslo_utils import encodeutils
2222

2323
from nova import context
24-
from nova import exception
2524
from nova import test
2625
from nova.tests.unit.virt.libvirt import fakelibvirt
2726
from nova.virt.libvirt import config as vconfig
@@ -213,212 +212,6 @@ def test_detach_device_persistent_live(self):
213212
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
214213
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
215214

216-
def test_detach_device_with_retry_from_transient_domain(self):
217-
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
218-
conf.to_xml.return_value = "</xml>"
219-
get_config = mock.Mock()
220-
get_config.side_effect = [conf, conf, conf, None, None]
221-
dev_path = "/dev/vdb"
222-
self.domain.isPersistent.return_value = False
223-
retry_detach = self.guest.detach_device_with_retry(
224-
get_config, dev_path, live=True, inc_sleep_time=.01)
225-
self.domain.detachDeviceFlags.assert_called_once_with(
226-
"</xml>", flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)
227-
self.domain.detachDeviceFlags.reset_mock()
228-
retry_detach()
229-
self.assertEqual(1, self.domain.detachDeviceFlags.call_count)
230-
231-
def test_detach_device_with_retry_detach_success(self):
232-
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
233-
conf.to_xml.return_value = "</xml>"
234-
get_config = mock.Mock()
235-
# Force multiple retries of detach
236-
get_config.side_effect = [conf, conf, conf, conf, conf, None, None]
237-
dev_path = "/dev/vdb"
238-
self.domain.isPersistent.return_value = True
239-
240-
retry_detach = self.guest.detach_device_with_retry(
241-
get_config, dev_path, live=True, inc_sleep_time=.01)
242-
# Ensure we've only done the initial detach call
243-
self.domain.detachDeviceFlags.assert_called_once_with(
244-
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
245-
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
246-
247-
get_config.assert_called_with(dev_path)
248-
249-
# Some time later, we can do the wait/retry to ensure detach succeeds
250-
self.domain.detachDeviceFlags.reset_mock()
251-
retry_detach()
252-
# Should have two retries before we pretend device is detached
253-
self.assertEqual(2, self.domain.detachDeviceFlags.call_count)
254-
255-
def test_detach_device_with_retry_detach_failure(self):
256-
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
257-
conf.to_xml.return_value = "</xml>"
258-
# Continue to return some value for the disk config
259-
get_config = mock.Mock(return_value=conf)
260-
self.domain.isPersistent.return_value = True
261-
262-
retry_detach = self.guest.detach_device_with_retry(
263-
get_config, "/dev/vdb", live=True, inc_sleep_time=.01,
264-
max_retry_count=3)
265-
# Ensure we've only done the initial detach call
266-
self.domain.detachDeviceFlags.assert_called_once_with(
267-
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
268-
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
269-
270-
# Some time later, we can do the wait/retry to ensure detach
271-
self.domain.detachDeviceFlags.reset_mock()
272-
# Should hit max # of retries
273-
self.assertRaises(exception.DeviceDetachFailed, retry_detach)
274-
self.assertEqual(4, self.domain.detachDeviceFlags.call_count)
275-
276-
def test_detach_device_with_retry_device_not_found(self):
277-
get_config = mock.Mock(return_value=None)
278-
self.domain.isPersistent.return_value = True
279-
ex = self.assertRaises(
280-
exception.DeviceNotFound, self.guest.detach_device_with_retry,
281-
get_config, "/dev/vdb", live=True)
282-
self.assertIn("/dev/vdb", str(ex))
283-
284-
def test_detach_device_with_retry_device_not_found_alt_name(self):
285-
"""Tests to make sure we use the alternative name in errors."""
286-
get_config = mock.Mock(return_value=None)
287-
self.domain.isPersistent.return_value = True
288-
ex = self.assertRaises(
289-
exception.DeviceNotFound, self.guest.detach_device_with_retry,
290-
get_config, mock.sentinel.device, live=True,
291-
alternative_device_name='foo')
292-
self.assertIn('foo', str(ex))
293-
294-
@mock.patch.object(libvirt_guest.Guest, "detach_device")
295-
def _test_detach_device_with_retry_second_detach_failure(
296-
self, mock_detach, error_code=None, error_message=None,
297-
supports_device_missing=False):
298-
# This simulates a retry of the transient/live domain detach
299-
# failing because the device is not found
300-
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
301-
conf.to_xml.return_value = "</xml>"
302-
self.domain.isPersistent.return_value = True
303-
304-
get_config = mock.Mock(return_value=conf)
305-
fake_device = "vdb"
306-
fake_exc = fakelibvirt.make_libvirtError(
307-
fakelibvirt.libvirtError, "",
308-
error_message=error_message,
309-
error_code=error_code,
310-
error_domain=fakelibvirt.VIR_FROM_DOMAIN)
311-
mock_detach.side_effect = [None, fake_exc]
312-
retry_detach = self.guest.detach_device_with_retry(
313-
get_config, fake_device, live=True,
314-
inc_sleep_time=.01, max_retry_count=3)
315-
# Some time later, we can do the wait/retry to ensure detach
316-
self.assertRaises(exception.DeviceNotFound, retry_detach)
317-
# Check that the save_and_reraise_exception context manager didn't log
318-
# a traceback when the libvirtError was caught and DeviceNotFound was
319-
# raised.
320-
self.assertNotIn('Original exception being dropped',
321-
self.stdlog.logger.output)
322-
323-
def test_detach_device_with_retry_second_detach_device_missing(self):
324-
self._test_detach_device_with_retry_second_detach_failure(
325-
error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING,
326-
error_message="device not found: disk vdb not found",
327-
supports_device_missing=True)
328-
329-
def _test_detach_device_with_retry_first_detach_failure(
330-
self, error_code=None, error_message=None,
331-
supports_device_missing=False):
332-
# This simulates a persistent or live domain detach failing because the
333-
# device is not found during the first attempt to detach the device.
334-
# We should still attempt to detach the device from the live config if
335-
# the detach from persistent failed OR we should retry the detach from
336-
# the live config if the first detach from live config failed.
337-
# Note that the side effects in this test case [fake_exc, None] could
338-
# not happen in real life if the first detach failed because the detach
339-
# from live raised not found. In real life, the second attempt to
340-
# detach from live would raise not found again because the device is
341-
# not present. The purpose of this test is to verify that we try to
342-
# detach a second time if the first detach fails, so we are OK with the
343-
# unrealistic side effects for detach from live failing the first time.
344-
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
345-
conf.to_xml.return_value = "</xml>"
346-
self.domain.isPersistent.return_value = True
347-
348-
get_config = mock.Mock()
349-
# Simulate an inactive or live detach attempt which fails (not found)
350-
# followed by a live config detach attempt that is successful
351-
get_config.side_effect = [conf, conf, conf, None, None]
352-
fake_device = "vdb"
353-
fake_exc = fakelibvirt.make_libvirtError(
354-
fakelibvirt.libvirtError, "",
355-
error_message=error_message,
356-
error_code=error_code,
357-
error_domain=fakelibvirt.VIR_FROM_DOMAIN)
358-
# Detach from persistent or live raises not found, detach from live
359-
# succeeds afterward
360-
self.domain.detachDeviceFlags.side_effect = [fake_exc, None]
361-
retry_detach = self.guest.detach_device_with_retry(get_config,
362-
fake_device, live=True, inc_sleep_time=.01, max_retry_count=3)
363-
# We should have tried to detach from the persistent domain
364-
self.domain.detachDeviceFlags.assert_called_once_with(
365-
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
366-
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
367-
# During the retry detach, should detach from the live domain
368-
self.domain.detachDeviceFlags.reset_mock()
369-
retry_detach()
370-
# We should have tried to detach from the live domain
371-
self.domain.detachDeviceFlags.assert_called_once_with(
372-
"</xml>", flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)
373-
374-
def test_detach_device_with_retry_first_detach_device_missing(self):
375-
self._test_detach_device_with_retry_first_detach_failure(
376-
error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING,
377-
error_message="device not found: disk vdb not found",
378-
supports_device_missing=True)
379-
380-
def test_detach_device_with_already_in_process_of_unplug_error(self):
381-
# Assert that DeviceNotFound is raised when encountering
382-
# https://bugzilla.redhat.com/show_bug.cgi?id=1878659
383-
# This is raised as QEMU returns a VIR_ERR_INTERNAL_ERROR when
384-
# a request to device_del is made while another is about to complete.
385-
386-
self.domain.isPersistent.return_value = True
387-
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
388-
conf.to_xml.return_value = "</xml>"
389-
390-
existing_unplug_exc = fakelibvirt.make_libvirtError(
391-
fakelibvirt.libvirtError, "",
392-
error_message='device vdb is already in the process of unplug',
393-
error_code=fakelibvirt.VIR_ERR_INTERNAL_ERROR,
394-
error_domain=fakelibvirt.VIR_FROM_DOMAIN
395-
)
396-
device_missing_exc = fakelibvirt.make_libvirtError(
397-
fakelibvirt.libvirtError, "",
398-
error_message='device not found: disk vdb not found',
399-
error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING,
400-
error_domain=fakelibvirt.VIR_FROM_DOMAIN
401-
)
402-
403-
# Raise VIR_ERR_INTERNAL_ERROR on the second call before raising
404-
# VIR_ERR_DEVICE_MISSING to mock the first call successfully detaching
405-
# the device asynchronously.
406-
self.domain.detachDeviceFlags.side_effect = [
407-
None,
408-
existing_unplug_exc,
409-
device_missing_exc
410-
]
411-
412-
retry_detach = self.guest.detach_device_with_retry(
413-
mock.Mock(return_value=conf),
414-
'vdb',
415-
live=True,
416-
inc_sleep_time=.01
417-
)
418-
419-
# Assert that we raise exception.DeviceNotFound
420-
self.assertRaises(exception.DeviceNotFound, retry_detach)
421-
422215
def test_get_xml_desc(self):
423216
self.guest.get_xml_desc()
424217
self.domain.XMLDesc.assert_called_once_with(flags=0)

nova/tests/unit/virt/test_virt_drivers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,7 @@ def test_get_volume_connector_storage_ip(self):
416416
self.assertEqual(storage_ip, result['ip'])
417417

418418
@catch_notimplementederror
419+
@mock.patch('threading.Event.wait', new=mock.Mock())
419420
@mock.patch.object(libvirt.driver.LibvirtDriver, '_build_device_metadata',
420421
return_value=objects.InstanceDeviceMetadata())
421422
def test_attach_detach_volume(self, _):
@@ -452,6 +453,7 @@ def test_swap_volume(self, _):
452453
'/dev/sda', 2))
453454

454455
@catch_notimplementederror
456+
@mock.patch('threading.Event.wait', new=mock.Mock())
455457
@mock.patch.object(libvirt.driver.LibvirtDriver, '_build_device_metadata',
456458
return_value=objects.InstanceDeviceMetadata())
457459
def test_attach_detach_different_power_states(self, _):

0 commit comments

Comments
 (0)