
Commit 054b08e

markgoddard authored and jovial committed
Prevent deletion of a compute node belonging to another host
There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed in the DB and not repopulated. Ultimately this prevents these nodes from being scheduled to.

The main race condition involved is in update_available_resource in the compute manager. When the list of compute nodes is queried, there is a compute node belonging to the host that it does not expect to be managing, i.e. it is an orphan. Between that time and deleting the orphan, the real owner of the compute node takes ownership of it (in the resource tracker). However, the node is still deleted, as the first host is unaware of the ownership change.

This change prevents this from occurring by filtering on the host when deleting a compute node. If another compute host has taken ownership of a node, it will have updated the host field, and this will prevent the deletion from occurring. The first host sees this has happened via the ComputeHostNotFound exception, and avoids deleting its resource provider.

Closes-Bug: #1853009
Related-Bug: #1841481
Change-Id: I260c1fded79a85d4899e94df4d9036a1ee437f02
(cherry picked from commit 65a6756fa17ebfd644c76953c943ba8a4ad3787d)
1 parent 4f1e4b3 commit 054b08e
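To make the mechanism concrete, here is a minimal standalone sketch (not nova code; the dict-backed "table" and helper names are illustrative assumptions) of the pattern this change introduces: deletion is filtered on both the node ID and the owning host, and a failed match surfaces as ComputeHostNotFound so the caller skips cleaning up the node's resource provider.

class ComputeHostNotFound(Exception):
    pass


# Toy in-memory stand-in for the compute_nodes table: node id -> owning host.
compute_nodes = {1: 'host_b'}  # node 1 was just rebalanced to host_b


def compute_node_delete(compute_id, compute_host):
    # Only delete if the caller still owns the node (filter on id AND host).
    if compute_nodes.get(compute_id) == compute_host:
        del compute_nodes[compute_id]
    else:
        raise ComputeHostNotFound(compute_host)


def delete_orphan(compute_id, my_host):
    try:
        compute_node_delete(compute_id, my_host)
    except ComputeHostNotFound:
        # Another host took ownership since we listed our nodes; leave the
        # record (and its placement resource provider) alone.
        return False
    # Genuinely orphaned: safe to clean up placement as well.
    return True


assert delete_orphan(1, 'host_a') is False  # host_b owns it now; nothing deleted
assert compute_nodes == {1: 'host_b'}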

File tree: 10 files changed, +96 -26 lines


nova/compute/manager.py

Lines changed: 23 additions & 10 deletions

@@ -9908,17 +9908,30 @@ def update_available_resource(self, context, startup=False):
                          "nodes are %(nodes)s",
                          {'id': cn.id, 'hh': cn.hypervisor_hostname,
                           'nodes': nodenames})
-                cn.destroy()
-                self.rt.remove_node(cn.hypervisor_hostname)
-                # Delete the corresponding resource provider in placement,
-                # along with any associated allocations.
                 try:
-                    self.reportclient.delete_resource_provider(context, cn,
-                                                               cascade=True)
-                except keystone_exception.ClientException as e:
-                    LOG.error(
-                        "Failed to delete compute node resource provider "
-                        "for compute node %s: %s", cn.uuid, six.text_type(e))
+                    cn.destroy()
+                except exception.ComputeHostNotFound:
+                    # NOTE(mgoddard): it's possible that another compute
+                    # service took ownership of this compute node since we
+                    # queried it due to a rebalance, and this will cause the
+                    # deletion to fail. Ignore the error in that case.
+                    LOG.info("Ignoring failure to delete orphan compute node "
+                             "%(id)s on hypervisor host %(hh)s due to "
+                             "possible node rebalance",
+                             {'id': cn.id, 'hh': cn.hypervisor_hostname})
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    self.reportclient.invalidate_resource_provider(cn.uuid)
+                else:
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    # Delete the corresponding resource provider in placement,
+                    # along with any associated allocations.
+                    try:
+                        self.reportclient.delete_resource_provider(
+                            context, cn, cascade=True)
+                    except keystone_exception.ClientException as e:
+                        LOG.error(
+                            "Failed to delete compute node resource provider "
+                            "for compute node %s: %s", cn.uuid, str(e))
 
                 for nodename in nodenames:
                     self._update_available_resource_for_node(context, nodename,

nova/db/api.py

Lines changed: 3 additions & 2 deletions

@@ -344,15 +344,16 @@ def compute_node_update(context, compute_id, values):
     return IMPL.compute_node_update(context, compute_id, values)
 
 
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, compute_host):
     """Delete a compute node from the database.
 
     :param context: The security context
     :param compute_id: ID of the compute node
+    :param compute_host: Hostname of the compute service
 
    Raises ComputeHostNotFound if compute node with the given ID doesn't exist.
     """
-    return IMPL.compute_node_delete(context, compute_id)
+    return IMPL.compute_node_delete(context, compute_id, compute_host)
 
 
 def compute_node_statistics(context):

nova/db/sqlalchemy/api.py

Lines changed: 2 additions & 2 deletions

@@ -763,10 +763,10 @@ def compute_node_update(context, compute_id, values):
 
 
 @pick_context_manager_writer
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, compute_host):
     """Delete a ComputeNode record."""
     result = model_query(context, models.ComputeNode).\
-            filter_by(id=compute_id).\
+            filter_by(id=compute_id, host=compute_host).\
             soft_delete(synchronize_session=False)
 
     if not result:
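For reference, a self-contained sketch of why the extra filter_by argument is sufficient (assumed toy schema and a hard DELETE for brevity; nova's real table has many more columns and uses soft deletion): when another host has already updated the row's host column, the filtered delete matches zero rows, and that zero row count is what the `if not result` check reports as ComputeHostNotFound.

import sqlalchemy as sa

engine = sa.create_engine('sqlite://')
metadata = sa.MetaData()
compute_nodes = sa.Table(
    'compute_nodes', metadata,
    sa.Column('id', sa.Integer, primary_key=True),
    sa.Column('host', sa.String(255)),
)
metadata.create_all(engine)

with engine.begin() as conn:
    # Node 1 has been rebalanced: host_b is now its owner.
    conn.execute(compute_nodes.insert().values(id=1, host='host_b'))

    # host_a attempts to delete what it believes is its orphan. Filtering on
    # host as well as id makes this a no-op, so host_a learns (via the zero
    # row count) that ownership changed and backs off.
    result = conn.execute(
        compute_nodes.delete().where(
            (compute_nodes.c.id == 1) & (compute_nodes.c.host == 'host_a')))
    print(result.rowcount)  # 0 -> the DB layer reports ComputeHostNotFound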

nova/objects/compute_node.py

Lines changed: 1 addition & 1 deletion

@@ -360,7 +360,7 @@ def save(self, prune_stats=False):
 
     @base.remotable
     def destroy(self):
-        db.compute_node_delete(self._context, self.id)
+        db.compute_node_delete(self._context, self.id, self.host)
 
     def update_from_virt_driver(self, resources):
         # NOTE(pmurray): the virt driver provides a dict of values that

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 18 additions & 7 deletions

@@ -133,24 +133,35 @@ def test_node_rebalance_deleted_compute_node_race(self):
         # Mock out the compute node query to simulate a race condition where
         # the list includes an orphan compute node that is newly owned by
         # host_b by the time host_a attempts to delete it.
-        # FIXME(mgoddard): Ideally host_a would not delete a node that does not
-        # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
         with mock.patch(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
         ) as mock_get:
             mock_get.return_value = [cn]
             host_a.manager.update_available_resource(self.ctxt)
 
-        # Verify that the node was deleted.
+        # Verify that the node was almost deleted, but was saved by the host
+        # check
         self.assertIn(
             'Deleting orphan compute node %s hypervisor host '
             'is fake-node, nodes are' % cn.id,
             self.stdlog.logger.output)
-        hypervisors = self.api.api_get(
-            '/os-hypervisors/detail').body['hypervisors']
-        self.assertEqual(0, len(hypervisors), hypervisors)
+        self.assertIn(
+            'Ignoring failure to delete orphan compute node %s on '
+            'hypervisor host fake-node' % cn.id,
+            self.stdlog.logger.output)
+        self._assert_hypervisor_api(self.nodename, 'host_b')
         rps = self._get_all_providers()
-        self.assertEqual(0, len(rps), rps)
+        self.assertEqual(1, len(rps), rps)
+        self.assertEqual(self.nodename, rps[0]['name'])
+
+        # Simulate deletion of an orphan by host_a. It shouldn't happen
+        # anymore, but let's assume it already did.
+        cn = objects.ComputeNode.get_by_host_and_nodename(
+            self.ctxt, 'host_b', self.nodename)
+        cn.destroy()
+        host_a.manager.rt.remove_node(cn.hypervisor_hostname)
+        host_a.manager.reportclient.delete_resource_provider(
+            self.ctxt, cn, cascade=True)
 
         # host_b[3]: Should recreate compute node and resource provider.
         # FIXME(mgoddard): Resource provider not recreated here, due to

nova/tests/unit/compute/test_compute.py

Lines changed: 3 additions & 1 deletion

@@ -202,8 +202,10 @@ def fake_get_compute_nodes_in_db(self, context, *args, **kwargs):
                 context, objects.ComputeNode(), cn)
             for cn in fake_compute_nodes]
 
-        def fake_compute_node_delete(context, compute_node_id):
+        def fake_compute_node_delete(context, compute_node_id,
+                                     compute_node_host):
             self.assertEqual(2, compute_node_id)
+            self.assertEqual('fake_phyp1', compute_node_host)
 
         self.stub_out(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 27 additions & 0 deletions

@@ -411,6 +411,33 @@ def test_update_available_resource_not_ready(self, get_db_nodes,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()
 
+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ComputeHostNotFound(
+            'node1')
+        get_avail_nodes.return_value = set()
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_called_once_with()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_called_once_with('node1')
+        rc_mock.invalidate_resource_provider.assert_called_once_with(
+            db_nodes[0].uuid)
+
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
         """Very simple test just to make sure update_available_resource is

nova/tests/unit/db/test_db_api.py

Lines changed: 10 additions & 2 deletions

@@ -5393,7 +5393,7 @@ def test_compute_node_get_all_deleted_compute_node(self):
 
         # Now delete the newly-created compute node to ensure the related
        # compute node stats are wiped in a cascaded fashion
-        db.compute_node_delete(self.ctxt, node['id'])
+        db.compute_node_delete(self.ctxt, node['id'], node['host'])
 
         # Clean up the service
         db.service_destroy(self.ctxt, service['id'])
@@ -5567,10 +5567,18 @@ def test_compute_node_update(self):
 
     def test_compute_node_delete(self):
         compute_node_id = self.item['id']
-        db.compute_node_delete(self.ctxt, compute_node_id)
+        compute_node_host = self.item['host']
+        db.compute_node_delete(self.ctxt, compute_node_id, compute_node_host)
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(len(nodes), 0)
 
+    def test_compute_node_delete_different_host(self):
+        compute_node_id = self.item['id']
+        compute_node_host = 'invalid-host'
+        self.assertRaises(exception.ComputeHostNotFound,
+                          db.compute_node_delete,
+                          self.ctxt, compute_node_id, compute_node_host)
+
     def test_compute_node_search_by_hypervisor(self):
         nodes_created = []
         new_service = copy.copy(self.service_dict)

nova/tests/unit/objects/test_compute_node.py

Lines changed: 2 additions & 1 deletion

@@ -414,8 +414,9 @@ def test_set_id_failure(self, mock_get, db_mock):
     def test_destroy(self, mock_delete):
         compute = compute_node.ComputeNode(context=self.context)
         compute.id = 123
+        compute.host = 'fake'
         compute.destroy()
-        mock_delete.assert_called_once_with(self.context, 123)
+        mock_delete.assert_called_once_with(self.context, 123, 'fake')
 
     @mock.patch.object(db, 'compute_node_get_all')
     def test_get_all(self, mock_get_all):

Lines changed: 7 additions & 0 deletions (new release note file)

@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue with multiple ``nova-compute`` services used with Ironic,
+    where a rebalance operation could result in a compute node being deleted
+    from the database and not recreated. See `bug 1853009
+    <https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.
