Skip to content

Commit d71e9f6

Browse files
JohnGarbuttrloo
authored andcommitted
Ironic: retry when node not available
After a baremetal instance is deleted, and its allocation is removed in placement, the ironic node might start cleaning. Eventually nova will notice and update the inventory to be reserved. During this window, a new instance may have already picked this ironic node. When that race happens today the build fails with an error: "Failed to reserve node ..." This change tries to ensure the remaining alternative hosts are attempted before aborting the build. Clearly the race is still there, but this makes it less painful. Related-Bug: #1974070 Change-Id: Ie5cdc17219c86927ab3769605808cb9d9fa9fa4d (cherry picked from commit 8a47606)
1 parent d92d093 commit d71e9f6

File tree

4 files changed

+70
-3
lines changed

4 files changed

+70
-3
lines changed

nova/compute/manager.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2736,7 +2736,8 @@ def _build_resources(self, context, instance, requested_networks,
27362736
block_device_mapping)
27372737
resources['block_device_info'] = block_device_info
27382738
except (exception.InstanceNotFound,
2739-
exception.UnexpectedDeletingTaskStateError):
2739+
exception.UnexpectedDeletingTaskStateError,
2740+
exception.ComputeResourcesUnavailable):
27402741
with excutils.save_and_reraise_exception():
27412742
self._build_resources_cleanup(instance, network_info)
27422743
except (exception.UnexpectedTaskStateError,

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7925,6 +7925,42 @@ def test_failed_bdm_prep_from_delete_raises_unexpected(self, mock_clean,
79257925
mock_prepspawn.assert_called_once_with(self.instance)
79267926
mock_failedspawn.assert_called_once_with(self.instance)
79277927

7928+
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
7929+
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
7930+
@mock.patch.object(virt_driver.ComputeDriver,
7931+
'prepare_networks_before_block_device_mapping')
7932+
@mock.patch.object(virt_driver.ComputeDriver,
7933+
'clean_networks_preparation')
7934+
def test_failed_prepare_for_spawn(self, mock_clean, mock_prepnet,
7935+
mock_prepspawn, mock_failedspawn):
7936+
mock_prepspawn.side_effect = exception.ComputeResourcesUnavailable(
7937+
reason="asdf")
7938+
with mock.patch.object(self.compute,
7939+
'_build_networks_for_instance',
7940+
return_value=self.network_info
7941+
) as _build_networks_for_instance:
7942+
7943+
try:
7944+
with self.compute._build_resources(self.context, self.instance,
7945+
self.requested_networks, self.security_groups,
7946+
self.image, self.block_device_mapping,
7947+
self.resource_provider_mapping, self.accel_uuids):
7948+
pass
7949+
except Exception as e:
7950+
self.assertIsInstance(e,
7951+
exception.ComputeResourcesUnavailable)
7952+
7953+
_build_networks_for_instance.assert_has_calls(
7954+
[mock.call(self.context, self.instance,
7955+
self.requested_networks, self.security_groups,
7956+
self.resource_provider_mapping,
7957+
self.network_arqs)])
7958+
7959+
mock_prepnet.assert_not_called()
7960+
mock_clean.assert_called_once_with(self.instance, self.network_info)
7961+
mock_prepspawn.assert_called_once_with(self.instance)
7962+
mock_failedspawn.assert_called_once_with(self.instance)
7963+
79287964
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
79297965
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
79307966
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')

nova/tests/unit/virt/ironic/test_driver.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2500,7 +2500,10 @@ def test_ironicclient_bad_response(self, mock_error):
25002500

25012501
@mock.patch.object(cw.IronicClientWrapper, 'call')
25022502
def test_prepare_for_spawn(self, mock_call):
2503-
node = ironic_utils.get_test_node(driver='fake')
2503+
node = ironic_utils.get_test_node(
2504+
driver='fake', instance_uuid=None,
2505+
provision_state=ironic_states.AVAILABLE,
2506+
power_state=ironic_states.POWER_OFF)
25042507
self.mock_conn.get_node.return_value = node
25052508
instance = fake_instance.fake_instance_obj(self.ctx,
25062509
node=node.uuid)
@@ -2532,14 +2535,29 @@ def test_prepare_for_spawn_invalid_instance(self):
25322535
instance)
25332536

25342537
def test_prepare_for_spawn_conflict(self):
2535-
node = ironic_utils.get_test_node(driver='fake')
2538+
node = ironic_utils.get_test_node(
2539+
driver='fake', instance_uuid=None,
2540+
provision_state=ironic_states.AVAILABLE,
2541+
power_state=ironic_states.POWER_OFF)
25362542
self.mock_conn.get_node.return_value = node
25372543
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
25382544
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
25392545
self.assertRaises(exception.InstanceDeployFailure,
25402546
self.driver.prepare_for_spawn,
25412547
instance)
25422548

2549+
def test_prepare_for_spawn_not_available(self):
2550+
node = ironic_utils.get_test_node(
2551+
driver='fake', instance_uuid=None,
2552+
provision_state=ironic_states.CLEANWAIT,
2553+
power_state=ironic_states.POWER_OFF)
2554+
self.mock_conn.get_node.return_value = node
2555+
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
2556+
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
2557+
self.assertRaises(exception.ComputeResourcesUnavailable,
2558+
self.driver.prepare_for_spawn,
2559+
instance)
2560+
25432561
@mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy')
25442562
def test_failed_spawn_cleanup(self, mock_cleanup):
25452563
node = ironic_utils.get_test_node(driver='fake')

nova/virt/ironic/driver.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,18 @@ def prepare_for_spawn(self, instance):
397397
_("Ironic node uuid not supplied to "
398398
"driver for instance %s.") % instance.uuid)
399399
node = self._get_node(node_uuid)
400+
401+
# Its possible this node has just moved from deleting
402+
# to cleaning. Placement will update the inventory
403+
# as all reserved, but this instance might have got here
404+
# before that happened, but after the previous allocation
405+
# got deleted. We trigger a re-schedule to another node.
406+
if (self._node_resources_used(node) or
407+
self._node_resources_unavailable(node)):
408+
msg = "Chosen ironic node %s is not available" % node_uuid
409+
LOG.info(msg, instance=instance)
410+
raise exception.ComputeResourcesUnavailable(reason=msg)
411+
400412
self._set_instance_id(node, instance)
401413

402414
def failed_spawn_cleanup(self, instance):

0 commit comments

Comments
 (0)