Skip to content

Commit eee5b39

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "Make compute node rebalance safter"
2 parents 0318016 + 772f5a1 commit eee5b39

File tree

6 files changed

+88
-4
lines changed

6 files changed

+88
-4
lines changed

nova/compute/manager.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10638,6 +10638,19 @@ def update_available_resource(self, context, startup=False):
1063810638
# Delete orphan compute node not reported by driver but still in db
1063910639
for cn in compute_nodes_in_db:
1064010640
if cn.hypervisor_hostname not in nodenames:
10641+
# if the node could be migrated, we don't delete
10642+
# the compute node database records
10643+
if not self.driver.is_node_deleted(cn.hypervisor_hostname):
10644+
LOG.warning(
10645+
"Found orphan compute node %(id)s "
10646+
"hypervisor host is %(hh)s, "
10647+
"nodes are %(nodes)s. "
10648+
"We are not deleting this as the driver "
10649+
"says this node has not been deleted.",
10650+
{'id': cn.id, 'hh': cn.hypervisor_hostname,
10651+
'nodes': nodenames})
10652+
continue
10653+
1064110654
LOG.info("Deleting orphan compute node %(id)s "
1064210655
"hypervisor host is %(hh)s, "
1064310656
"nodes are %(nodes)s",

nova/tests/functional/regressions/test_bug_1839560.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# under the License.
1212

1313
from oslo_log import log as logging
14+
from unittest import mock
1415

1516
from nova import context
1617
from nova.db.main import api as db_api
@@ -20,6 +21,7 @@
2021
from nova.tests.functional import fixtures as func_fixtures
2122
from nova.tests.functional import integrated_helpers
2223
from nova import utils
24+
from nova.virt import fake as fake_driver
2325

2426
LOG = logging.getLogger(__name__)
2527

@@ -56,7 +58,10 @@ def setUp(self):
5658
# for each node.
5759
self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
5860

59-
def test_update_available_resource_node_recreate(self):
61+
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
62+
def test_update_available_resource_node_recreate(self, mock_delete):
63+
# make the fake driver allow nodes to be delete, like ironic driver
64+
mock_delete.return_value = True
6065
# First we create a compute service to manage a couple of fake nodes.
6166
compute = self.start_service('compute', 'node1')
6267
# When start_service runs, it will create the node1 ComputeNode.

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from nova import context
1616
from nova import objects
1717
from nova.tests.functional import integrated_helpers
18+
from nova.virt import fake as fake_driver
1819

1920

2021
class NodeRebalanceDeletedComputeNodeRaceTestCase(
@@ -54,7 +55,10 @@ def setUp(self):
5455
self.nodename = 'fake-node'
5556
self.ctxt = context.get_admin_context()
5657

57-
def test_node_rebalance_deleted_compute_node_race(self):
58+
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
59+
def test_node_rebalance_deleted_compute_node_race(self, mock_delete):
60+
# make the fake driver allow nodes to be delete, like ironic driver
61+
mock_delete.return_value = True
5862
# Simulate a service running and then stopping. host_a runs, creates
5963
# fake-node, then is stopped. The fake-node compute node is destroyed.
6064
# This leaves a soft-deleted node in the DB.

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,13 +396,14 @@ def test_update_available_resource_for_node_pci_placement_failed_later(
396396
{'node': mock.sentinel.node}
397397
)
398398

399+
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
399400
@mock.patch.object(manager, 'LOG')
400401
@mock.patch.object(manager.ComputeManager,
401402
'_update_available_resource_for_node')
402403
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
403404
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
404405
def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
405-
update_mock, mock_log):
406+
update_mock, mock_log, mock_deleted):
406407
mock_rt = self._mock_rt()
407408
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
408409
self.compute, 'reportclient')).mock
@@ -415,6 +416,7 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
415416

416417
get_db_nodes.return_value = db_nodes
417418
get_avail_nodes.return_value = avail_nodes
419+
mock_deleted.return_value = True
418420
self.compute.update_available_resource(self.context, startup=True)
419421
get_db_nodes.assert_called_once_with(self.context, avail_nodes,
420422
use_slave=True, startup=True)
@@ -460,12 +462,13 @@ def test_update_available_resource_not_ready(self, get_db_nodes,
460462
update_mock.assert_not_called()
461463
del_rp_mock.assert_not_called()
462464

465+
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
463466
@mock.patch.object(manager.ComputeManager,
464467
'_update_available_resource_for_node')
465468
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
466469
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
467470
def test_update_available_resource_destroy_rebalance(
468-
self, get_db_nodes, get_avail_nodes, update_mock):
471+
self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
469472
mock_rt = self._mock_rt()
470473
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
471474
self.compute, 'reportclient')).mock
@@ -476,6 +479,7 @@ def test_update_available_resource_destroy_rebalance(
476479
db_nodes[0].destroy.side_effect = exception.ObjectActionError(
477480
action='destroy', reason='host changed')
478481
get_avail_nodes.return_value = set()
482+
mock_deleted.return_value = True
479483
self.compute.update_available_resource(self.context)
480484
get_db_nodes.assert_called_once_with(self.context, set(),
481485
use_slave=True, startup=False)
@@ -486,6 +490,36 @@ def test_update_available_resource_destroy_rebalance(
486490
mock_rt.remove_node.assert_called_once_with('node1')
487491
rc_mock.invalidate_resource_provider.assert_called_once_with(
488492
db_nodes[0].uuid)
493+
mock_deleted.assert_called_once_with('node1')
494+
495+
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
496+
@mock.patch.object(manager.ComputeManager,
497+
'_update_available_resource_for_node')
498+
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
499+
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
500+
def test_update_available_resource_no_destroy_rebalance(
501+
self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
502+
mock_rt = self._mock_rt()
503+
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
504+
self.compute, 'reportclient')).mock
505+
db_nodes = [self._make_compute_node('node1', 1)]
506+
get_db_nodes.return_value = db_nodes
507+
# Destroy can fail if nodes were rebalanced between getting the node
508+
# list and calling destroy.
509+
db_nodes[0].destroy.side_effect = exception.ObjectActionError(
510+
action='destroy', reason='host changed')
511+
get_avail_nodes.return_value = set()
512+
mock_deleted.return_value = False
513+
self.compute.update_available_resource(self.context)
514+
get_db_nodes.assert_called_once_with(self.context, set(),
515+
use_slave=True, startup=False)
516+
self.assertEqual(0, update_mock.call_count)
517+
518+
db_nodes[0].destroy.assert_not_called()
519+
self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
520+
mock_rt.remove_node.assert_not_called()
521+
rc_mock.invalidate_resource_provider.assert_not_called()
522+
mock_deleted.assert_called_once_with('node1')
489523

490524
@mock.patch('nova.context.get_admin_context')
491525
def test_pre_start_hook(self, get_admin_context):

nova/virt/driver.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,6 +1160,26 @@ def get_available_resource(self, nodename):
11601160
"""
11611161
raise NotImplementedError()
11621162

1163+
def is_node_deleted(self, nodename):
1164+
"""Check this compute node has been deleted.
1165+
1166+
This method is called when the compute manager notices that a
1167+
node that was previously reported as available is no longer
1168+
available.
1169+
In this case, we need to know if the node has actually been
1170+
deleted, or if it is simply no longer managed by this
1171+
nova-compute service.
1172+
If True is returned, the database and placement records will
1173+
be removed.
1174+
1175+
:param nodename:
1176+
node which the caller wants to check if its deleted
1177+
:returns: True if the node is safe to delete
1178+
"""
1179+
# For most driver, compute nodes should never get deleted
1180+
# during a periodic task, they are only deleted via the API
1181+
return False
1182+
11631183
def pre_live_migration(self, context, instance, block_device_info,
11641184
network_info, disk_info, migrate_data):
11651185
"""Prepare an instance for live migration

nova/virt/ironic/driver.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -700,6 +700,14 @@ def node_is_available(self, nodename):
700700
except sdk_exc.ResourceNotFound:
701701
return False
702702

703+
def is_node_deleted(self, nodename):
704+
# check if the node is missing in Ironic
705+
try:
706+
self._get_node(nodename)
707+
return False
708+
except sdk_exc.ResourceNotFound:
709+
return True
710+
703711
def _refresh_hash_ring(self, ctxt):
704712
peer_list = None
705713
if CONF.ironic.shard is not None:

0 commit comments

Comments
 (0)