Commit 772f5a1

Make compute node rebalance safer

Many bugs around nova-compute rebalancing centre on problems that arise when the compute node and placement resources are deleted and, in some cases, never re-created. To limit this class of bugs, we add a check so that a compute node is only ever deleted when it is known to have been deleted in Ironic.

There is a risk this might leave orphaned compute nodes and resource providers that need manual clean-up, because users removing a node from nova management may not want to delete it in Ironic. On balance, it seems safer to leave these cases for the operator to resolve manually, and to collect feedback on how to better help those users.

blueprint ironic-shards

Change-Id: I7cd9e5ab878cea05462cac24de581dca6d50b3c3
1 parent: 9068db0
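In outline, the change gates the orphan-cleanup path of the update_available_resource periodic task on a new virt driver hook; a condensed sketch of that decision (names match the manager.py diff below):

    for cn in compute_nodes_in_db:
        if cn.hypervisor_hostname not in nodenames:
            if not self.driver.is_node_deleted(cn.hypervisor_hostname):
                # Probably a rebalance to another host: keep the DB and
                # placement records rather than risk losing them.
                continue
            cn.destroy()  # only reached once the driver confirms deletion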

File tree

6 files changed: 88 additions, 4 deletions

nova/compute/manager.py

Lines changed: 13 additions & 0 deletions
@@ -10575,6 +10575,19 @@ def update_available_resource(self, context, startup=False):
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
+                # if the node could be migrated, we don't delete
+                # the compute node database records
+                if not self.driver.is_node_deleted(cn.hypervisor_hostname):
+                    LOG.warning(
+                        "Found orphan compute node %(id)s "
+                        "hypervisor host is %(hh)s, "
+                        "nodes are %(nodes)s. "
+                        "We are not deleting this as the driver "
+                        "says this node has not been deleted.",
+                        {'id': cn.id, 'hh': cn.hypervisor_hostname,
+                         'nodes': nodenames})
+                    continue
                 LOG.info("Deleting orphan compute node %(id)s "
                          "hypervisor host is %(hh)s, "
                          "nodes are %(nodes)s",

nova/tests/functional/regressions/test_bug_1839560.py

Lines changed: 6 additions & 1 deletion
@@ -11,6 +11,7 @@
 # under the License.

 from oslo_log import log as logging
+from unittest import mock

 from nova import context
 from nova.db.main import api as db_api
@@ -20,6 +21,7 @@
 from nova.tests.functional import fixtures as func_fixtures
 from nova.tests.functional import integrated_helpers
 from nova import utils
+from nova.virt import fake as fake_driver

 LOG = logging.getLogger(__name__)

@@ -56,7 +58,10 @@ def setUp(self):
         # for each node.
         self.flags(compute_driver='fake.PredictableNodeUUIDDriver')

-    def test_update_available_resource_node_recreate(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_update_available_resource_node_recreate(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # First we create a compute service to manage a couple of fake nodes.
         compute = self.start_service('compute', 'node1')
         # When start_service runs, it will create the node1 ComputeNode.

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 5 additions & 1 deletion
@@ -15,6 +15,7 @@
 from nova import context
 from nova import objects
 from nova.tests.functional import integrated_helpers
+from nova.virt import fake as fake_driver


 class NodeRebalanceDeletedComputeNodeRaceTestCase(
@@ -54,7 +55,10 @@ def setUp(self):
         self.nodename = 'fake-node'
         self.ctxt = context.get_admin_context()

-    def test_node_rebalance_deleted_compute_node_race(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_node_rebalance_deleted_compute_node_race(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # Simulate a service running and then stopping. host_a runs, creates
         # fake-node, then is stopped. The fake-node compute node is destroyed.
         # This leaves a soft-deleted node in the DB.

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 36 additions & 2 deletions
@@ -396,13 +396,14 @@ def test_update_available_resource_for_node_pci_placement_failed_later(
             {'node': mock.sentinel.node}
         )

+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager, 'LOG')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
-                                       update_mock, mock_log):
+                                       update_mock, mock_log, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -415,6 +416,7 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,

         get_db_nodes.return_value = db_nodes
         get_avail_nodes.return_value = avail_nodes
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context, startup=True)
         get_db_nodes.assert_called_once_with(self.context, avail_nodes,
                                              use_slave=True, startup=True)
@@ -460,12 +462,13 @@ def test_update_available_resource_not_ready(self, get_db_nodes,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()

+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource_destroy_rebalance(
-            self, get_db_nodes, get_avail_nodes, update_mock):
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -476,6 +479,7 @@ def test_update_available_resource_destroy_rebalance(
         db_nodes[0].destroy.side_effect = exception.ObjectActionError(
             action='destroy', reason='host changed')
         get_avail_nodes.return_value = set()
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context)
         get_db_nodes.assert_called_once_with(self.context, set(),
                                              use_slave=True, startup=False)
@@ -486,6 +490,36 @@ def test_update_available_resource_destroy_rebalance(
         mock_rt.remove_node.assert_called_once_with('node1')
         rc_mock.invalidate_resource_provider.assert_called_once_with(
             db_nodes[0].uuid)
+        mock_deleted.assert_called_once_with('node1')
+
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_no_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        mock_deleted.return_value = False
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_not_called()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_not_called()
+        rc_mock.invalidate_resource_provider.assert_not_called()
+        mock_deleted.assert_called_once_with('node1')

     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):

nova/virt/driver.py

Lines changed: 20 additions & 0 deletions
@@ -1160,6 +1160,26 @@ def get_available_resource(self, nodename):
         """
         raise NotImplementedError()

+    def is_node_deleted(self, nodename):
+        """Check if this compute node has been deleted.
+
+        This method is called when the compute manager notices that a
+        node that was previously reported as available is no longer
+        available.
+        In this case, we need to know if the node has actually been
+        deleted, or if it is simply no longer managed by this
+        nova-compute service.
+        If True is returned, the database and placement records will
+        be removed.
+
+        :param nodename:
+            node which the caller wants to check if it is deleted
+        :returns: True if the node is safe to delete
+        """
+        # For most drivers, compute nodes should never get deleted
+        # during a periodic task; they are only deleted via the API.
+        return False
+
     def pre_live_migration(self, context, instance, block_device_info,
                            network_info, disk_info, migrate_data):
         """Prepare an instance for live migration

nova/virt/ironic/driver.py

Lines changed: 8 additions & 0 deletions
@@ -700,6 +700,14 @@ def node_is_available(self, nodename):
         except sdk_exc.ResourceNotFound:
             return False

+    def is_node_deleted(self, nodename):
+        # check if the node is missing in Ironic
+        try:
+            self._get_node(nodename)
+            return False
+        except sdk_exc.ResourceNotFound:
+            return True
+
     def _refresh_hash_ring(self, ctxt):
         peer_list = None
         if CONF.ironic.shard is not None:
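To sanity-check the Ironic implementation in isolation, _get_node can be stubbed the same way the unit tests above stub the fake driver; a hedged sketch, assuming an already constructed IronicDriver instance named driver:

    from unittest import mock

    from openstack import exceptions as sdk_exc

    # Node still present in Ironic: keep the compute node record.
    with mock.patch.object(driver, '_get_node', return_value=mock.Mock()):
        assert driver.is_node_deleted('fake-node') is False

    # Node missing in Ironic: safe to delete the record.
    with mock.patch.object(driver, '_get_node',
                           side_effect=sdk_exc.ResourceNotFound):
        assert driver.is_node_deleted('fake-node') is True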
