Skip to content

Commit 8a47606

Browse files
committed
Ironic: retry when node not available
After a baremetal instance is deleted, and its allocation is removed in placement, the ironic node might start cleaning. Eventually nova will notice and update the inventory to be reserved. During this window, a new instance may have already picked this ironic node. When that race happens today the build fails with an error: "Failed to reserve node ..." This change tries to ensure the remaining alternative hosts are attempted before aborting the build. Clearly the race is still there, but this makes it less painful. Related-Bug: #1974070 Change-Id: Ie5cdc17219c86927ab3769605808cb9d9fa9fa4d
1 parent 2f4feea commit 8a47606

File tree

4 files changed

+70
-3
lines changed

4 files changed

+70
-3
lines changed

nova/compute/manager.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2736,7 +2736,8 @@ def _build_resources(self, context, instance, requested_networks,
27362736
block_device_mapping)
27372737
resources['block_device_info'] = block_device_info
27382738
except (exception.InstanceNotFound,
2739-
exception.UnexpectedDeletingTaskStateError):
2739+
exception.UnexpectedDeletingTaskStateError,
2740+
exception.ComputeResourcesUnavailable):
27402741
with excutils.save_and_reraise_exception():
27412742
self._build_resources_cleanup(instance, network_info)
27422743
except (exception.UnexpectedTaskStateError,

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7925,6 +7925,42 @@ def test_failed_bdm_prep_from_delete_raises_unexpected(self, mock_clean,
79257925
mock_prepspawn.assert_called_once_with(self.instance)
79267926
mock_failedspawn.assert_called_once_with(self.instance)
79277927

7928+
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
7929+
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
7930+
@mock.patch.object(virt_driver.ComputeDriver,
7931+
'prepare_networks_before_block_device_mapping')
7932+
@mock.patch.object(virt_driver.ComputeDriver,
7933+
'clean_networks_preparation')
7934+
def test_failed_prepare_for_spawn(self, mock_clean, mock_prepnet,
7935+
mock_prepspawn, mock_failedspawn):
7936+
mock_prepspawn.side_effect = exception.ComputeResourcesUnavailable(
7937+
reason="asdf")
7938+
with mock.patch.object(self.compute,
7939+
'_build_networks_for_instance',
7940+
return_value=self.network_info
7941+
) as _build_networks_for_instance:
7942+
7943+
try:
7944+
with self.compute._build_resources(self.context, self.instance,
7945+
self.requested_networks, self.security_groups,
7946+
self.image, self.block_device_mapping,
7947+
self.resource_provider_mapping, self.accel_uuids):
7948+
pass
7949+
except Exception as e:
7950+
self.assertIsInstance(e,
7951+
exception.ComputeResourcesUnavailable)
7952+
7953+
_build_networks_for_instance.assert_has_calls(
7954+
[mock.call(self.context, self.instance,
7955+
self.requested_networks, self.security_groups,
7956+
self.resource_provider_mapping,
7957+
self.network_arqs)])
7958+
7959+
mock_prepnet.assert_not_called()
7960+
mock_clean.assert_called_once_with(self.instance, self.network_info)
7961+
mock_prepspawn.assert_called_once_with(self.instance)
7962+
mock_failedspawn.assert_called_once_with(self.instance)
7963+
79287964
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
79297965
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
79307966
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')

nova/tests/unit/virt/ironic/test_driver.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2542,7 +2542,10 @@ def test_ironicclient_bad_response(self, mock_error):
25422542

25432543
@mock.patch.object(cw.IronicClientWrapper, 'call')
25442544
def test_prepare_for_spawn(self, mock_call):
2545-
node = ironic_utils.get_test_node(driver='fake')
2545+
node = ironic_utils.get_test_node(
2546+
driver='fake', instance_uuid=None,
2547+
provision_state=ironic_states.AVAILABLE,
2548+
power_state=ironic_states.POWER_OFF)
25462549
self.mock_conn.get_node.return_value = node
25472550
instance = fake_instance.fake_instance_obj(self.ctx,
25482551
node=node.uuid)
@@ -2574,14 +2577,29 @@ def test_prepare_for_spawn_invalid_instance(self):
25742577
instance)
25752578

25762579
def test_prepare_for_spawn_conflict(self):
2577-
node = ironic_utils.get_test_node(driver='fake')
2580+
node = ironic_utils.get_test_node(
2581+
driver='fake', instance_uuid=None,
2582+
provision_state=ironic_states.AVAILABLE,
2583+
power_state=ironic_states.POWER_OFF)
25782584
self.mock_conn.get_node.return_value = node
25792585
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
25802586
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
25812587
self.assertRaises(exception.InstanceDeployFailure,
25822588
self.driver.prepare_for_spawn,
25832589
instance)
25842590

2591+
def test_prepare_for_spawn_not_available(self):
2592+
node = ironic_utils.get_test_node(
2593+
driver='fake', instance_uuid=None,
2594+
provision_state=ironic_states.CLEANWAIT,
2595+
power_state=ironic_states.POWER_OFF)
2596+
self.mock_conn.get_node.return_value = node
2597+
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
2598+
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
2599+
self.assertRaises(exception.ComputeResourcesUnavailable,
2600+
self.driver.prepare_for_spawn,
2601+
instance)
2602+
25852603
@mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy')
25862604
def test_failed_spawn_cleanup(self, mock_cleanup):
25872605
node = ironic_utils.get_test_node(driver='fake')

nova/virt/ironic/driver.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,18 @@ def prepare_for_spawn(self, instance):
397397
_("Ironic node uuid not supplied to "
398398
"driver for instance %s.") % instance.uuid)
399399
node = self._get_node(node_uuid)
400+
401+
# Its possible this node has just moved from deleting
402+
# to cleaning. Placement will update the inventory
403+
# as all reserved, but this instance might have got here
404+
# before that happened, but after the previous allocation
405+
# got deleted. We trigger a re-schedule to another node.
406+
if (self._node_resources_used(node) or
407+
self._node_resources_unavailable(node)):
408+
msg = "Chosen ironic node %s is not available" % node_uuid
409+
LOG.info(msg, instance=instance)
410+
raise exception.ComputeResourcesUnavailable(reason=msg)
411+
400412
self._set_instance_id(node, instance)
401413

402414
def failed_spawn_cleanup(self, instance):

0 commit comments

Comments
 (0)