
Commit 94810a8

Zuul authored and openstack-gerrit committed
Merge "Track error migrations in resource tracker"
2 parents: 509c01e + 255b3f2


16 files changed: +236 −86 lines


nova/compute/manager.py

Lines changed: 34 additions & 24 deletions

@@ -8847,7 +8847,7 @@ def _rollback_volume_bdms(self, context, bdms, original_bdms, instance):
    @wrap_instance_fault
    def _rollback_live_migration(self, context, instance,
                                 dest, migrate_data=None,
-                                migration_status='error',
+                                migration_status='failed',
                                 source_bdms=None):
        """Recovers Instance/volume state from migrating -> running.

@@ -8988,9 +8988,8 @@ def _rollback_live_migration(self, context, instance,
                phase=fields.NotificationPhase.END,
                bdms=bdms)

-        # TODO(luyao): set migration status to 'failed' but not 'error'
-        # which means rollback_live_migration is done, we have successfully
-        # cleaned up and returned instance back to normal status.
+        # NOTE(luyao): we have cleanup everything and get instance
+        # back to normal status, now set migration status to 'failed'
        self._set_migration_status(migration, migration_status)

    @wrap_exception()

@@ -10508,11 +10507,14 @@ def _run_pending_deletes(self, context):

    @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
    def _cleanup_incomplete_migrations(self, context):
-        """Delete instance files on failed resize/revert-resize operation
-
-        During resize/revert-resize operation, if that instance gets deleted
-        in-between then instance files might remain either on source or
-        destination compute node because of race condition.
+        """Cleanup on failed resize/revert-resize operation and
+        failed rollback live migration operation.
+
+        During resize/revert-resize operation, or after a failed rollback
+        live migration operation, if that instance gets deleted then instance
+        files might remain either on source or destination compute node and
+        other specific resources might not be cleaned up because of the race
+        condition.
        """
        LOG.debug('Cleaning up deleted instances with incomplete migration ')
        migration_filters = {'host': CONF.host,

@@ -10534,21 +10536,29 @@ def _cleanup_incomplete_migrations(self, context):
            context, inst_filters, expected_attrs=attrs, use_slave=True)

        for instance in instances:
-            if instance.host != CONF.host:
-                for migration in migrations:
-                    if instance.uuid == migration.instance_uuid:
-                        # Delete instance files if not cleanup properly either
-                        # from the source or destination compute nodes when
-                        # the instance is deleted during resizing.
-                        self.driver.delete_instance_files(instance)
-                        try:
-                            migration.status = 'failed'
-                            migration.save()
-                        except exception.MigrationNotFound:
-                            LOG.warning("Migration %s is not found.",
-                                        migration.id,
-                                        instance=instance)
-                        break
+            if instance.host == CONF.host:
+                continue
+            for migration in migrations:
+                if instance.uuid != migration.instance_uuid:
+                    continue
+                self.driver.delete_instance_files(instance)
+                # we are not sure whether the migration_context is applied
+                # during incompleted migrating, we need to apply/revert
+                # migration_context to get instance object content matching
+                # current host.
+                revert = (True if migration.source_compute == CONF.host
+                          else False)
+                with instance.mutated_migration_context(revert=revert):
+                    self.driver.cleanup_lingering_instance_resources(instance)
+
+                try:
+                    migration.status = 'failed'
+                    migration.save()
+                except exception.MigrationNotFound:
+                    LOG.warning("Migration %s is not found.",
+                                migration.id,
+                                instance=instance)
+                break

    @messaging.expected_exceptions(exception.InstanceQuiesceNotSupported,
                                   exception.QemuGuestAgentNotEnabled,
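
Note (not part of the commit): the key change above is that _cleanup_incomplete_migrations now reverts the migration context on the migration's source host and applies it on the destination before asking the driver to clean up lingering resources. A minimal sketch of that decision in isolation; `driver`, `instance`, `migration`, and `current_host` are placeholder names, and cleanup_lingering_instance_resources() is the driver hook this series relies on:

    def cleanup_with_matching_context(driver, instance, migration, current_host):
        # On the source host the instance must be reverted to its
        # pre-migration resources; on the destination the migration
        # context must be applied, so the driver sees resource data
        # that matches what actually exists on this host.
        revert = migration.source_compute == current_host
        with instance.mutated_migration_context(revert=revert):
            driver.cleanup_lingering_instance_resources(instance)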

nova/compute/resource_tracker.py

Lines changed: 8 additions & 3 deletions

@@ -919,9 +919,9 @@ def _update_available_resource(self, context, resources, startup=False):
        instance_by_uuid = self._update_usage_from_instances(
            context, instances, nodename)

-        # Grab all in-progress migrations:
-        migrations = objects.MigrationList.get_in_progress_by_host_and_node(
-            context, self.host, nodename)
+        # Grab all in-progress migrations and error migrations:
+        migrations = objects.MigrationList.get_in_progress_and_error(
+            context, self.host, nodename)

        self._pair_instances_to_migrations(migrations, instance_by_uuid)
        self._update_usage_from_migrations(context, migrations, nodename)

@@ -1360,6 +1360,11 @@ def _update_usage_from_migrations(self, context, migrations, nodename):

            try:
                if uuid not in instances:
+                    # Track migrating instance even if it is deleted but still
+                    # has database record. This kind of instance might be
+                    # deleted during unfinished migrating but exist in the
+                    # hypervisor.
+                    migration._context = context.elevated(read_deleted='yes')
                    instances[uuid] = migration.instance
            except exception.InstanceNotFound as e:
                # migration referencing deleted instance
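
Note (not part of the commit): lazy-loading migration.instance normally raises InstanceNotFound once the instance row is soft-deleted (see the except clause above), which is why the tracker swaps in an elevated context with read_deleted='yes' first. A small sketch of the pattern, with `ctxt` and `migration` as placeholders:

    # Allow the lazy 'instance' load to return a soft-deleted row so the
    # migration's resource usage can still be tracked on this host.
    migration._context = ctxt.elevated(read_deleted='yes')
    instance = migration.instance  # succeeds even if the instance is deleted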

nova/db/api.py

Lines changed: 8 additions & 0 deletions

@@ -470,6 +470,14 @@ def migration_get_by_sort_filters(context, sort_keys, sort_dirs, values):
                                              values)


+def migration_get_in_progress_and_error_by_host_and_node(context, host, node):
+    """Finds all in progress migrations and error migrations for the given
+    host + node.
+    """
+    return IMPL.migration_get_in_progress_and_error_by_host_and_node(
+        context, host, node)
+
+
####################

nova/db/sqlalchemy/api.py

Lines changed: 18 additions & 0 deletions

@@ -3204,6 +3204,10 @@ def migration_get_all_by_filters(context, filters,
    elif "source_compute" in filters:
        host = filters['source_compute']
        query = query.filter(models.Migration.source_compute == host)
+    if "node" in filters:
+        node = filters['node']
+        query = query.filter(or_(models.Migration.source_node == node,
+                                 models.Migration.dest_node == node))
    if "migration_type" in filters:
        migtype = filters["migration_type"]
        query = query.filter(models.Migration.migration_type == migtype)

@@ -3272,6 +3276,20 @@ def migration_migrate_to_uuid(context, count):
    return done, done


+@pick_context_manager_reader
+def migration_get_in_progress_and_error_by_host_and_node(context, host, node):
+    return model_query(context, models.Migration).\
+            filter(or_(and_(models.Migration.source_compute == host,
+                            models.Migration.source_node == node),
+                       and_(models.Migration.dest_compute == host,
+                            models.Migration.dest_node == node))).\
+            filter(~models.Migration.status.in_(['confirmed', 'reverted',
+                                                 'failed', 'completed',
+                                                 'cancelled', 'done'])).\
+            options(_joinedload_all('instance.system_metadata')).\
+            all()
+
+
########################
# User-provided metadata
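
Note (not part of the commit): the new reader returns every migration touching the host/node whose status is not in a terminal set, which is what pulls 'error' migrations back into the resource tracker. An equivalent per-row check in plain Python, for illustration only:

    TERMINAL_STATUSES = ('confirmed', 'reverted', 'failed',
                         'completed', 'cancelled', 'done')

    def is_returned_by_new_query(migration, host, node):
        """Mirror of the SQL filter above for a single Migration row."""
        on_source = (migration.source_compute == host and
                     migration.source_node == node)
        on_dest = (migration.dest_compute == host and
                   migration.dest_node == node)
        return ((on_source or on_dest) and
                migration.status not in TERMINAL_STATUSES)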

nova/objects/instance.py

Lines changed: 6 additions & 3 deletions

@@ -1047,8 +1047,8 @@ def _set_migration_context_to_instance(self, prefix):
            setattr(self, inst_attr_name, attr_value)

    @contextlib.contextmanager
-    def mutated_migration_context(self):
-        """Context manager to temporarily apply the migration context.
+    def mutated_migration_context(self, revert=False):
+        """Context manager to temporarily apply/revert the migration context.

        Calling .save() from within the context manager means that the mutated
        context will be saved which can cause incorrect resource tracking, and

@@ -1063,7 +1063,10 @@ def mutated_migration_context(self):
        current_values = {}
        for attr_name in _MIGRATION_CONTEXT_ATTRS:
            current_values[attr_name] = getattr(self, attr_name)
-        self.apply_migration_context()
+        if revert:
+            self.revert_migration_context()
+        else:
+            self.apply_migration_context()
        try:
            yield
        finally:
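
Note (not part of the commit): a usage sketch of the extended context manager; `do_cleanup` is a placeholder:

    with instance.mutated_migration_context(revert=True):
        # Inside the block the migration-context attributes are temporarily
        # swapped to the migration context's old_* values instead of new_*.
        do_cleanup(instance)
    # On exit the previous attribute values are restored. Per the docstring,
    # avoid calling instance.save() inside the block.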

nova/objects/migration.py

Lines changed: 11 additions & 1 deletion

@@ -222,7 +222,9 @@ class MigrationList(base.ObjectListBase, base.NovaObject):
    # for an instance.
    # Version 1.4: Added sort_keys, sort_dirs, limit, marker kwargs to
    #              get_by_filters for migrations pagination support.
-    VERSION = '1.4'
+    # Version 1.5: Added a new function to get in progress migrations
+    #              and error migrations for a given host + node.
+    VERSION = '1.5'

    fields = {
        'objects': fields.ListOfObjectsField('Migration'),

@@ -266,3 +268,11 @@ def get_in_progress_by_instance(cls, context, instance_uuid,
            context, instance_uuid, migration_type)
        return base.obj_make_list(context, cls(context), objects.Migration,
                                  db_migrations)
+
+    @base.remotable_classmethod
+    def get_in_progress_and_error(cls, context, host, node):
+        db_migrations = \
+            db.migration_get_in_progress_and_error_by_host_and_node(
+                context, host, node)
+        return base.obj_make_list(context, cls(context), objects.Migration,
+                                  db_migrations)
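
Note (not part of the commit): how a caller such as the resource tracker above consumes the new remotable classmethod; `ctxt`, `host`, and `nodename` are placeholders:

    from nova import objects

    migrations = objects.MigrationList.get_in_progress_and_error(
        ctxt, host, nodename)
    # Unlike the previous in-progress-only query, this list also contains
    # migrations in 'error' status, which is the point of this series.
    for migration in migrations:
        print(migration.instance_uuid, migration.status)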

nova/tests/functional/libvirt/test_numa_live_migration.py

Lines changed: 2 additions & 2 deletions

@@ -354,7 +354,7 @@ def _test(self, pin_dest=False):
        # rollback test, so server_a is expected to remain on host_a.
        if pin_dest:
            self._rpc_pin_host('host_b')
-        self._live_migrate(self.server_a, 'error')
+        self._live_migrate(self.server_a, 'failed')
        self.assertEqual('host_a', self.get_host(self.server_a['id']))
        self.assertIsNone(self._get_migration_context(self.server_a['id']))

@@ -448,7 +448,7 @@ def _test(self, pin_source, pin_cond, expect_success=True):
                                            ['completed'])
        else:
            final_host = 'source'
-            self._wait_for_migration_status(self.migrating_server, ['error'])
+            self._wait_for_migration_status(self.migrating_server, ['failed'])
        self.assertEqual(final_host,
                         self.get_host(self.migrating_server['id']))
        self.assertTrue(self.migrate_stub_ran)

nova/tests/functional/regressions/test_bug_1889108.py

Lines changed: 1 addition & 1 deletion

@@ -82,7 +82,7 @@ def test_vol_attachments_during_driver_pre_live_mig_failure(
        # Migrate the instance and wait until the migration errors out thanks
        # to our mocked version of pre_live_migration raising
        # test.TestingException
-        self._live_migrate(server, 'error')
+        self._live_migrate(server, 'failed')

        # Assert that we called the fake pre_live_migration method
        mock_plm.assert_called_once()

nova/tests/functional/test_servers.py

Lines changed: 4 additions & 2 deletions

@@ -2870,8 +2870,10 @@ def stub_pre_live_migration(context, instance, block_device_info,
        }
        self.api.post_server_action(server['id'], post)
        # The compute manager will put the migration record into error status
-        # when pre_live_migration fails, so wait for that to happen.
-        migration = self._wait_for_migration_status(server, ['error'])
+        # when start _cleanup_pre_live_migration, then set it to failed status
+        # at the end of _rollback_live_migration
+        migration = self._wait_for_migration_status(
+            server, ['error', 'failed'])
        # The _rollback_live_migration method in the compute manager will reset
        # the task_state on the instance, so wait for that to happen.
        server = self._wait_for_server_parameter(

nova/tests/unit/compute/test_compute.py

Lines changed: 2 additions & 2 deletions

@@ -6258,7 +6258,7 @@ def do_it(mock_client, mock_setup):
            self.assertEqual('src_host', instance.host)
            self.assertEqual(vm_states.ACTIVE, instance.vm_state)
            self.assertIsNone(instance.task_state)
-            self.assertEqual('error', migration.status)
+            self.assertEqual('failed', migration.status)
            mock_get_disk.assert_called()
            mock_pre.assert_called_once_with(c,
                instance, True, mock_get_disk.return_value, dest_host,

@@ -6658,7 +6658,7 @@ def _test(mock_drop_claim, mock_nw_api, mock_lmcf, mock_ra,

        _test()

-        self.assertEqual(migration_status or 'error', migration.status)
+        self.assertEqual(migration_status or 'failed', migration.status)
        self.assertEqual(0, instance.progress)

    def test_rollback_live_migration_set_migration_status(self):
