Skip to content

Commit e4601c7

Browse files
mriedemlyarwood
authored andcommitted
Ensure source compute is up when confirming a resize
When _poll_unconfirmed_resizes runs or a user tries to confirm a resize in the API, if the source compute service is down the migration status will be stuck in "confirming" status if it never reached the source compute. Subsequent runs of _poll_unconfirmed_resizes will not be able to auto-confirm the resize nor will the user be able to manually confirm the resize. An admin could reset the status on the server to ACTIVE or ERROR but that means the source compute never gets cleaned up since you can only confirm or revert a resize on a server with VERIFY_RESIZE status. This adds a check in the API before updating the migration record such that if the source compute service is down the API returns a 409 response as an indication to try again later. SingleCellSimple._fake_target_cell is updated so that tests using it can assert when a context was targeted without having to stub nova.context.target_cell. As a result some HostManager unit tests needed to be updated. Change-Id: I33aa5e32cb321e5a16da51e227af2f67ed9e6713 Closes-Bug: #1855927
1 parent 8ca9b72 commit e4601c7

File tree

7 files changed

+129
-35
lines changed

7 files changed

+129
-35
lines changed

nova/api/openstack/compute/servers.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -885,7 +885,10 @@ def _action_confirm_resize(self, req, id, body):
885885
except exception.MigrationNotFound:
886886
msg = _("Instance has not been resized.")
887887
raise exc.HTTPBadRequest(explanation=msg)
888-
except exception.InstanceIsLocked as e:
888+
except (
889+
exception.InstanceIsLocked,
890+
exception.ServiceUnavailable,
891+
) as e:
889892
raise exc.HTTPConflict(explanation=e.format_message())
890893
except exception.InstanceInvalidState as state_error:
891894
common.raise_http_conflict_for_instance_invalid_state(state_error,

nova/compute/api.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3775,10 +3775,40 @@ def revert_resize(self, context, instance):
37753775
migration.dest_compute,
37763776
reqspec)
37773777

3778+
@staticmethod
3779+
def _get_source_compute_service(context, migration):
3780+
"""Find the source compute Service object given the Migration.
3781+
3782+
:param context: nova auth RequestContext target at the destination
3783+
compute cell
3784+
:param migration: Migration object for the move operation
3785+
:return: Service object representing the source host nova-compute
3786+
"""
3787+
if migration.cross_cell_move:
3788+
# The source compute could be in another cell so look up the
3789+
# HostMapping to determine the source cell.
3790+
hm = objects.HostMapping.get_by_host(
3791+
context, migration.source_compute)
3792+
with nova_context.target_cell(context, hm.cell_mapping) as cctxt:
3793+
return objects.Service.get_by_compute_host(
3794+
cctxt, migration.source_compute)
3795+
# Same-cell migration so just use the context we have.
3796+
return objects.Service.get_by_compute_host(
3797+
context, migration.source_compute)
3798+
37783799
@check_instance_lock
37793800
@check_instance_state(vm_state=[vm_states.RESIZED])
37803801
def confirm_resize(self, context, instance, migration=None):
3781-
"""Confirms a migration/resize and deletes the 'old' instance."""
3802+
"""Confirms a migration/resize and deletes the 'old' instance.
3803+
3804+
:param context: nova auth RequestContext
3805+
:param instance: Instance object to confirm the resize
3806+
:param migration: Migration object; provided if called from the
3807+
_poll_unconfirmed_resizes periodic task on the dest compute.
3808+
:raises: MigrationNotFound if migration is not provided and a migration
3809+
cannot be found for the instance with status "finished".
3810+
:raises: ServiceUnavailable if the source compute service is down.
3811+
"""
37823812
elevated = context.elevated()
37833813
# NOTE(melwitt): We're not checking quota here because there isn't a
37843814
# change in resource usage when confirming a resize. Resource
@@ -3789,6 +3819,13 @@ def confirm_resize(self, context, instance, migration=None):
37893819
migration = objects.Migration.get_by_instance_and_status(
37903820
elevated, instance.uuid, 'finished')
37913821

3822+
# Check if the source compute service is up before modifying the
3823+
# migration record because once we do we cannot come back through this
3824+
# method since it will be looking for a "finished" status migration.
3825+
source_svc = self._get_source_compute_service(context, migration)
3826+
if not self.servicegroup_api.service_is_up(source_svc):
3827+
raise exception.ServiceUnavailable()
3828+
37923829
migration.status = 'confirming'
37933830
migration.save()
37943831

nova/tests/fixtures.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,12 @@ def _fake_cell_list(self, *args):
324324

325325
@contextmanager
326326
def _fake_target_cell(self, context, target_cell):
327-
# NOTE(danms): Just pass through the context without actually
328-
# targeting anything.
327+
# Just do something simple and set/unset the cell_uuid on the context.
328+
if target_cell:
329+
context.cell_uuid = getattr(target_cell, 'uuid',
330+
uuidsentinel.cell1)
331+
else:
332+
context.cell_uuid = None
329333
yield context
330334

331335
def _fake_set_target_cell(self, context, cell_mapping):

nova/tests/functional/test_cross_cell_migrate.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,10 @@ def test_anti_affinity_group(self):
963963
server2['id'], {'migrate': {'host': 'host2'}})
964964
self._wait_for_migration_status(server2, ['error'])
965965

966+
@mock.patch('nova.compute.api.API._get_source_compute_service',
967+
new=mock.Mock())
968+
@mock.patch('nova.servicegroup.api.API.service_is_up',
969+
new=mock.Mock(return_value=True))
966970
def test_poll_unconfirmed_resizes_with_upcall(self):
967971
"""Tests the _poll_unconfirmed_resizes periodic task with a cross-cell
968972
resize once the instance is in VERIFY_RESIZE status on the dest host.
@@ -993,6 +997,10 @@ def test_poll_unconfirmed_resizes_with_upcall(self):
993997
self._assert_confirm(
994998
server, source_rp_uuid, target_rp_uuid, new_flavor)
995999

1000+
@mock.patch('nova.compute.api.API._get_source_compute_service',
1001+
new=mock.Mock())
1002+
@mock.patch('nova.servicegroup.api.API.service_is_up',
1003+
new=mock.Mock(return_value=True))
9961004
def test_poll_unconfirmed_resizes_with_no_upcall(self):
9971005
"""Tests the _poll_unconfirmed_resizes periodic task with a cross-cell
9981006
resize once the instance is in VERIFY_RESIZE status on the dest host.

nova/tests/functional/test_servers.py

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@
4646
from nova.tests.functional.api import client
4747
from nova.tests.functional import integrated_helpers
4848
from nova.tests.unit.api.openstack import fakes
49-
from nova.tests.unit import cast_as_call
5049
from nova.tests.unit import fake_block_device
5150
from nova.tests.unit import fake_notifier
5251
from nova.tests.unit import fake_requests
@@ -3359,34 +3358,37 @@ def test_source_host_down_during_confirm(self):
33593358
self.api.put_service(
33603359
source_service['id'], {'status': 'disabled', 'forced_down': True})
33613360
# Now configure auto-confirm and call the method on the target compute
3362-
# so we do not have to wait for the periodic to run. Also configure
3363-
# the RPC call from the API to the source compute to timeout after 1
3364-
# second.
3365-
self.flags(resize_confirm_window=1, rpc_response_timeout=1)
3361+
# so we do not have to wait for the periodic to run.
3362+
self.flags(resize_confirm_window=1)
33663363
# Stub timeutils so the DB API query finds the unconfirmed migration.
33673364
future = timeutils.utcnow() + datetime.timedelta(hours=1)
33683365
ctxt = context.get_admin_context()
33693366
with osloutils_fixture.TimeFixture(future):
3370-
# Use the CastAsCall fixture since the fake rpc is not going to
3371-
# simulate a failure from the service being down.
3372-
with cast_as_call.CastAsCall(self):
3373-
self.computes['host2'].manager._poll_unconfirmed_resizes(ctxt)
3367+
self.computes['host2'].manager._poll_unconfirmed_resizes(ctxt)
33743368
self.assertIn('Error auto-confirming resize',
33753369
self.stdlog.logger.output)
3376-
self.assertIn('No reply on topic compute', self.stdlog.logger.output)
3377-
# FIXME(mriedem): This is bug 1855927 where the migration status is
3378-
# left in "confirming" so the _poll_unconfirmed_resizes task will not
3379-
# process it again nor can the user confirm the resize in the API since
3380-
# the migration status is not "finished".
3381-
self._wait_for_migration_status(server, ['confirming'])
3370+
self.assertIn('Service is unavailable at this time',
3371+
self.stdlog.logger.output)
3372+
# The source compute service check should have been done before the
3373+
# migration status was updated so it should still be "finished".
3374+
self._wait_for_migration_status(server, ['finished'])
3375+
# Try to confirm in the API while the source compute service is still
3376+
# down to assert the 409 (rather than a 500) error.
33823377
ex = self.assertRaises(client.OpenStackApiException,
33833378
self.api.post_server_action,
33843379
server['id'], {'confirmResize': None})
3385-
self.assertEqual(400, ex.response.status_code)
3386-
self.assertIn('Instance has not been resized', six.text_type(ex))
3387-
# That error message is bogus because the server is resized.
3388-
server = self.api.get_server(server['id'])
3389-
self.assertEqual('VERIFY_RESIZE', server['status'])
3380+
self.assertEqual(409, ex.response.status_code)
3381+
self.assertIn('Service is unavailable at this time', six.text_type(ex))
3382+
# Bring the source compute back up and try to confirm the resize which
3383+
# should work since the migration status is still "finished".
3384+
self.restart_compute_service(self.computes['host1'])
3385+
self.api.put_service(
3386+
source_service['id'], {'status': 'enabled', 'forced_down': False})
3387+
# Use the API to confirm the resize because _poll_unconfirmed_resizes
3388+
# requires mucking with the current time which causes problems with
3389+
# the service_is_up check in the API.
3390+
self.api.post_server_action(server['id'], {'confirmResize': None})
3391+
self._wait_for_state_change(server, 'ACTIVE')
33903392

33913393

33923394
class ServerLiveMigrateForceAndAbort(

nova/tests/unit/compute/test_api.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1644,10 +1644,13 @@ def test(mock_inst_get, mock_map_get):
16441644

16451645
test()
16461646

1647+
@mock.patch('nova.compute.api.API._get_source_compute_service')
1648+
@mock.patch('nova.servicegroup.api.API.service_is_up', return_value=True)
16471649
@mock.patch.object(objects.Migration, 'save')
16481650
@mock.patch.object(objects.Migration, 'get_by_instance_and_status')
16491651
@mock.patch.object(context.RequestContext, 'elevated')
16501652
def _test_confirm_resize(self, mock_elevated, mock_get, mock_save,
1653+
mock_service_is_up, mock_get_service,
16511654
mig_ref_passed=False):
16521655
params = dict(vm_state=vm_states.RESIZED)
16531656
fake_inst = self._create_instance_obj(params=params)
@@ -1678,6 +1681,8 @@ def _check_mig(expected_task_state=None):
16781681
self.compute_api.confirm_resize(self.context, fake_inst)
16791682

16801683
mock_elevated.assert_called_once_with()
1684+
mock_service_is_up.assert_called_once_with(
1685+
mock_get_service.return_value)
16811686
mock_save.assert_called_once_with()
16821687
mock_record.assert_called_once_with(self.context, fake_inst,
16831688
'confirmResize')
@@ -1694,6 +1699,34 @@ def test_confirm_resize(self):
16941699
def test_confirm_resize_with_migration_ref(self):
16951700
self._test_confirm_resize(mig_ref_passed=True)
16961701

1702+
@mock.patch('nova.objects.HostMapping.get_by_host',
1703+
return_value=objects.HostMapping(
1704+
cell_mapping=objects.CellMapping(
1705+
database_connection='fake://', transport_url='none://',
1706+
uuid=uuids.cell_uuid)))
1707+
@mock.patch('nova.objects.Service.get_by_compute_host')
1708+
def test_get_source_compute_service(self, mock_service_get, mock_hm_get):
1709+
# First start with a same-cell migration.
1710+
migration = objects.Migration(source_compute='source.host',
1711+
cross_cell_move=False)
1712+
self.compute_api._get_source_compute_service(self.context, migration)
1713+
mock_hm_get.assert_not_called()
1714+
mock_service_get.assert_called_once_with(self.context, 'source.host')
1715+
# Make sure the context was not targeted.
1716+
ctxt = mock_service_get.call_args[0][0]
1717+
self.assertIsNone(ctxt.cell_uuid)
1718+
1719+
# Now test with a cross-cell migration.
1720+
mock_service_get.reset_mock()
1721+
migration.cross_cell_move = True
1722+
self.compute_api._get_source_compute_service(self.context, migration)
1723+
mock_hm_get.assert_called_once_with(self.context, 'source.host')
1724+
mock_service_get.assert_called_once_with(
1725+
test.MatchType(context.RequestContext), 'source.host')
1726+
# Make sure the context was targeted.
1727+
ctxt = mock_service_get.call_args[0][0]
1728+
self.assertEqual(uuids.cell_uuid, ctxt.cell_uuid)
1729+
16971730
@mock.patch('nova.virt.hardware.numa_get_constraints')
16981731
@mock.patch('nova.network.neutron.API.get_requested_resource_for_instance',
16991732
return_value=[])
@@ -7460,8 +7493,10 @@ def test_get_migrations_sorted_filter_duplicates_using_created_at(self):
74607493
self._test_get_migrations_sorted_filter_duplicates(
74617494
[older, newer], newer)
74627495

7496+
@mock.patch('nova.servicegroup.api.API.service_is_up', return_value=True)
74637497
@mock.patch('nova.objects.Migration.get_by_instance_and_status')
7464-
def test_confirm_resize_cross_cell_move_true(self, mock_migration_get):
7498+
def test_confirm_resize_cross_cell_move_true(self, mock_migration_get,
7499+
mock_service_is_up):
74657500
"""Tests confirm_resize where Migration.cross_cell_move is True"""
74667501
instance = fake_instance.fake_instance_obj(
74677502
self.context, vm_state=vm_states.RESIZED, task_state=None,
@@ -7475,12 +7510,15 @@ def test_confirm_resize_cross_cell_move_true(self, mock_migration_get):
74757510
mock.patch.object(self.compute_api, '_record_action_start'),
74767511
mock.patch.object(self.compute_api.compute_task_api,
74777512
'confirm_snapshot_based_resize'),
7513+
mock.patch.object(self.compute_api, '_get_source_compute_service'),
74787514
) as (
74797515
mock_elevated, mock_migration_save, mock_record_action,
7480-
mock_conductor_confirm
7516+
mock_conductor_confirm, mock_get_service
74817517
):
74827518
self.compute_api.confirm_resize(self.context, instance)
74837519
mock_elevated.assert_called_once_with()
7520+
mock_service_is_up.assert_called_once_with(
7521+
mock_get_service.return_value)
74847522
mock_migration_save.assert_called_once_with()
74857523
self.assertEqual('confirming', migration.status)
74867524
mock_record_action.assert_called_once_with(

nova/tests/unit/scheduler/test_host_manager.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,8 @@ def test_init_instance_info_compute_nodes(self, mock_get_all,
219219
inst2 = objects.Instance(host='host1', uuid=uuids.instance_2)
220220
inst3 = objects.Instance(host='host2', uuid=uuids.instance_3)
221221
cell = objects.CellMapping(database_connection='',
222-
target_url='')
222+
target_url='',
223+
uuid=uuids.cell_uuid)
223224
mock_get_by_filters.return_value = objects.InstanceList(
224225
objects=[inst1, inst2, inst3])
225226
hm = self.host_manager
@@ -589,7 +590,7 @@ def test_get_host_states(self, mock_get_by_host, mock_get_all,
589590
mock_get_by_host.return_value = []
590591
mock_get_all.return_value = fakes.COMPUTE_NODES
591592
mock_get_by_binary.return_value = fakes.SERVICES
592-
context = 'fake_context'
593+
context = nova_context.get_admin_context()
593594
compute_nodes, services = self.host_manager._get_computes_for_cells(
594595
context, self.host_manager.enabled_cells)
595596

@@ -786,7 +787,7 @@ def test_host_state_update(self, mock_get_by_host):
786787

787788
@mock.patch('nova.objects.InstanceList.get_uuids_by_host')
788789
def test_host_state_not_updated(self, mock_get_by_host):
789-
context = 'fake_context'
790+
context = nova_context.get_admin_context()
790791
hm = self.host_manager
791792
inst1 = objects.Instance(uuid=uuids.instance)
792793
cn1 = objects.ComputeNode(host='host1')
@@ -806,10 +807,11 @@ def test_host_state_not_updated(self, mock_get_by_host):
806807

807808
@mock.patch('nova.objects.InstanceList.get_uuids_by_host')
808809
def test_recreate_instance_info(self, mock_get_by_host):
810+
context = nova_context.get_admin_context()
809811
host_name = 'fake_host'
810-
inst1 = fake_instance.fake_instance_obj('fake_context',
812+
inst1 = fake_instance.fake_instance_obj(context,
811813
uuid=uuids.instance_1)
812-
inst2 = fake_instance.fake_instance_obj('fake_context',
814+
inst2 = fake_instance.fake_instance_obj(context,
813815
uuid=uuids.instance_2)
814816
orig_inst_dict = {inst1.uuid: inst1, inst2.uuid: inst2}
815817
mock_get_by_host.return_value = [uuids.instance_1, uuids.instance_2]
@@ -818,7 +820,7 @@ def test_recreate_instance_info(self, mock_get_by_host):
818820
'instances': orig_inst_dict,
819821
'updated': True,
820822
}}
821-
self.host_manager._recreate_instance_info('fake_context', host_name)
823+
self.host_manager._recreate_instance_info(context, host_name)
822824
new_info = self.host_manager._instance_info[host_name]
823825
self.assertEqual(len(new_info['instances']),
824826
len(mock_get_by_host.return_value))
@@ -1231,7 +1233,7 @@ def test_get_host_states(self, mock_get_by_host, mock_get_all,
12311233
mock_get_by_host.return_value = []
12321234
mock_get_all.return_value = fakes.COMPUTE_NODES
12331235
mock_get_by_binary.return_value = fakes.SERVICES
1234-
context = 'fake_context'
1236+
context = nova_context.get_admin_context()
12351237

12361238
compute_nodes, services = self.host_manager._get_computes_for_cells(
12371239
context, self.host_manager.enabled_cells)
@@ -1256,7 +1258,7 @@ def test_get_host_states_after_delete_one(self, mock_get_by_host,
12561258
mock_get_by_host.return_value = []
12571259
mock_get_all.side_effect = [fakes.COMPUTE_NODES, running_nodes]
12581260
mock_get_by_binary.side_effect = [fakes.SERVICES, fakes.SERVICES]
1259-
context = 'fake_context'
1261+
context = nova_context.get_admin_context()
12601262

12611263
# first call: all nodes
12621264
compute_nodes, services = self.host_manager._get_computes_for_cells(
@@ -1286,7 +1288,7 @@ def test_get_host_states_after_delete_all(self, mock_get_by_host,
12861288
mock_get_by_host.return_value = []
12871289
mock_get_all.side_effect = [fakes.COMPUTE_NODES, []]
12881290
mock_get_by_binary.side_effect = [fakes.SERVICES, fakes.SERVICES]
1289-
context = 'fake_context'
1291+
context = nova_context.get_admin_context()
12901292

12911293
# first call: all nodes
12921294
compute_nodes, services = self.host_manager._get_computes_for_cells(

0 commit comments

Comments
 (0)