Skip to content

Commit 76ea8ee

Browse files
Alexandre Arents authored and david-hill committed
libvirt: Abort live-migration job when monitoring fails
During the live-migration process, a _live_migration_monitor thread checks the progress of the migration on the source host. If for any reason we hit an infrastructure issue involving a DB, RPC, or libvirt-timeout failure, an exception is raised to the nova-compute service and the instance/migration is set to the ERROR state.

The issue is that the live-migration job may be left running outside of nova's control. When the job finishes, the guest is resumed on the target host while nova still reports it on the source host; this may lead to a split-brain situation if the instance is restarted.

This change aborts the live-migration job if an issue occurs during _live_migration_monitor.

Change-Id: Ia593b500425c81e54eb401e38264db5cc5fc1f93
Closes-Bug: #1905944
(cherry picked from commit 39f0af5)
1 parent 9609ae0 commit 76ea8ee

File tree

2 files changed

+33
-6
lines changed

2 files changed

+33
-6
lines changed

nova/tests/unit/virt/libvirt/test_driver.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13338,9 +13338,11 @@ def test_live_migration_monitor_force_complete_postcopy(self,
1333813338
@mock.patch.object(fakelibvirt.Connection, "_mark_running")
1333913339
@mock.patch.object(libvirt_driver.LibvirtDriver,
1334013340
"_live_migration_copy_disk_paths")
13341-
def test_live_migration_main(self, mock_copy_disk_path, mock_running,
13342-
mock_guest, mock_monitor, mock_thread,
13343-
mock_conn):
13341+
@mock.patch.object(libvirt_driver.LibvirtDriver, "live_migration_abort")
13342+
def _test_live_migration_main(self, mock_abort, mock_copy_disk_path,
13343+
mock_running, mock_guest, mock_monitor,
13344+
mock_thread, mock_conn,
13345+
mon_side_effect=None):
1334413346
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
1334513347
instance = objects.Instance(**self.test_instance)
1334613348

@@ -13353,16 +13355,23 @@ def test_live_migration_main(self, mock_copy_disk_path, mock_running,
1335313355
mock_copy_disk_path.return_value = disks_to_copy
1335413356

1335513357
mock_guest.return_value = guest
13358+
mock_monitor.side_effect = mon_side_effect
1335613359

1335713360
def fake_post():
1335813361
pass
1335913362

1336013363
def fake_recover():
1336113364
pass
1336213365

13363-
drvr._live_migration(self.context, instance, "fakehost",
13364-
fake_post, fake_recover, True,
13365-
migrate_data)
13366+
if mon_side_effect:
13367+
self.assertRaises(mon_side_effect, drvr._live_migration,
13368+
self.context, instance, "fakehost", fake_post,
13369+
fake_recover, True, migrate_data)
13370+
mock_abort.assert_called_once_with(instance)
13371+
else:
13372+
drvr._live_migration(self.context, instance, "fakehost", fake_post,
13373+
fake_recover, True, migrate_data)
13374+
1336613375
mock_copy_disk_path.assert_called_once_with(self.context, instance,
1336713376
guest)
1336813377

@@ -13379,6 +13388,12 @@ def __eq__(self, other):
1337913388
fake_post, fake_recover, True,
1338013389
migrate_data, AnyEventletEvent(), disks_to_copy[0])
1338113390

13391+
def test_live_migration_main(self):
13392+
self._test_live_migration_main()
13393+
13394+
def test_live_migration_main_monitoring_failed(self):
13395+
self._test_live_migration_main(mon_side_effect=Exception)
13396+
1338213397
@mock.patch('os.path.exists', return_value=False)
1338313398
@mock.patch('nova.virt.libvirt.utils.create_image')
1338413399
@mock.patch.object(libvirt_driver.LibvirtDriver,

nova/virt/libvirt/driver.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9967,6 +9967,18 @@ def thread_finished(thread, event):
99679967
except Exception as ex:
99689968
LOG.warning("Error monitoring migration: %(ex)s",
99699969
{"ex": ex}, instance=instance, exc_info=True)
9970+
# NOTE(aarents): Ensure job is aborted if still running before
9971+
# raising the exception so this would avoid the migration to be
9972+
# done and the libvirt guest to be resumed on the target while
9973+
# the instance record would still be related to the source host.
9974+
try:
9975+
# If migration is running in post-copy mode and guest
9976+
# already running on dest host, libvirt will refuse to
9977+
# cancel migration job.
9978+
self.live_migration_abort(instance)
9979+
except libvirt.libvirtError:
9980+
LOG.warning("Error occured when trying to abort live ",
9981+
"migration job, ignoring it.", instance=instance)
99709982
raise
99719983
finally:
99729984
LOG.debug("Live migration monitoring is all done",

0 commit comments

Comments
 (0)