Skip to content

Commit 68cad8f

Browse files
Zuul (openstack-gerrit)
authored and committed
Merge "Prevent deletion of a compute node belonging to another host" into stable/wallaby
2 parents 4cf6323 + cbbca58 commit 68cad8f

File tree

9 files changed

+127
-27
lines changed

9 files changed

+127
-27
lines changed

nova/compute/manager.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10014,17 +10014,30 @@ def update_available_resource(self, context, startup=False):
1001410014
"nodes are %(nodes)s",
1001510015
{'id': cn.id, 'hh': cn.hypervisor_hostname,
1001610016
'nodes': nodenames})
10017-
cn.destroy()
10018-
self.rt.remove_node(cn.hypervisor_hostname)
10019-
# Delete the corresponding resource provider in placement,
10020-
# along with any associated allocations.
1002110017
try:
10022-
self.reportclient.delete_resource_provider(context, cn,
10023-
cascade=True)
10024-
except keystone_exception.ClientException as e:
10025-
LOG.error(
10026-
"Failed to delete compute node resource provider "
10027-
"for compute node %s: %s", cn.uuid, str(e))
10018+
cn.destroy()
10019+
except exception.ObjectActionError:
10020+
# NOTE(mgoddard): it's possible that another compute
10021+
# service took ownership of this compute node since we
10022+
# queried it due to a rebalance, and this will cause the
10023+
# deletion to fail. Ignore the error in that case.
10024+
LOG.info("Ignoring failure to delete orphan compute node "
10025+
"%(id)s on hypervisor host %(hh)s due to "
10026+
"possible node rebalance",
10027+
{'id': cn.id, 'hh': cn.hypervisor_hostname})
10028+
self.rt.remove_node(cn.hypervisor_hostname)
10029+
self.reportclient.invalidate_resource_provider(cn.uuid)
10030+
else:
10031+
self.rt.remove_node(cn.hypervisor_hostname)
10032+
# Delete the corresponding resource provider in placement,
10033+
# along with any associated allocations.
10034+
try:
10035+
self.reportclient.delete_resource_provider(
10036+
context, cn, cascade=True)
10037+
except keystone_exception.ClientException as e:
10038+
LOG.error(
10039+
"Failed to delete compute node resource provider "
10040+
"for compute node %s: %s", cn.uuid, str(e))
1002810041

1002910042
for nodename in nodenames:
1003010043
self._update_available_resource_for_node(context, nodename,

nova/db/sqlalchemy/api.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -761,14 +761,24 @@ def compute_node_update(context, compute_id, values):
761761

762762

763763
@pick_context_manager_writer
764-
def compute_node_delete(context, compute_id):
764+
def compute_node_delete(context, compute_id, constraint=None):
765765
"""Delete a ComputeNode record."""
766-
result = model_query(context, models.ComputeNode).\
767-
filter_by(id=compute_id).\
768-
soft_delete(synchronize_session=False)
766+
query = model_query(context, models.ComputeNode).filter_by(id=compute_id)
767+
768+
if constraint is not None:
769+
query = constraint.apply(models.ComputeNode, query)
770+
771+
result = query.soft_delete(synchronize_session=False)
769772

770773
if not result:
771-
raise exception.ComputeHostNotFound(host=compute_id)
774+
# The soft_delete could fail for one of two reasons:
775+
# 1) The compute node no longer exists
776+
# 2) The constraint, if specified, was not met
777+
# Try to read the compute node and let it raise ComputeHostNotFound if
778+
# 1) happened.
779+
compute_node_get(context, compute_id)
780+
# Else, raise ConstraintNotMet if 2) happened.
781+
raise exception.ConstraintNotMet()
772782

773783

774784
@pick_context_manager_reader

nova/objects/compute_node.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,20 @@ def save(self, prune_stats=False):
360360

361361
@base.remotable
362362
def destroy(self):
363-
db.compute_node_delete(self._context, self.id)
363+
if self.obj_attr_is_set('host') and self.host:
364+
# NOTE(melwitt): If our host is set, avoid a race between
365+
# nova-computes during ironic driver node rebalances which can
366+
# change node ownership.
367+
constraint = db.constraint(host=db.equal_any(self.host))
368+
else:
369+
constraint = None
370+
371+
try:
372+
sa_api.compute_node_delete(
373+
self._context, self.id, constraint=constraint)
374+
except exception.ConstraintNotMet:
375+
raise exception.ObjectActionError(action='destroy',
376+
reason='host changed')
364377

365378
def update_from_virt_driver(self, resources):
366379
# NOTE(pmurray): the virt driver provides a dict of values that

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -133,24 +133,35 @@ def test_node_rebalance_deleted_compute_node_race(self):
133133
# Mock out the compute node query to simulate a race condition where
134134
# the list includes an orphan compute node that is newly owned by
135135
# host_b by the time host_a attempts to delete it.
136-
# FIXME(mgoddard): Ideally host_a would not delete a node that does not
137-
# belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
138136
with mock.patch(
139137
'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
140138
) as mock_get:
141139
mock_get.return_value = [cn]
142140
host_a.manager.update_available_resource(self.ctxt)
143141

144-
# Verify that the node was deleted.
142+
# Verify that the node was almost deleted, but was saved by the host
143+
# check
145144
self.assertIn(
146145
'Deleting orphan compute node %s hypervisor host '
147146
'is fake-node, nodes are' % cn.id,
148147
self.stdlog.logger.output)
149-
hypervisors = self.api.api_get(
150-
'/os-hypervisors/detail').body['hypervisors']
151-
self.assertEqual(0, len(hypervisors), hypervisors)
148+
self.assertIn(
149+
'Ignoring failure to delete orphan compute node %s on '
150+
'hypervisor host fake-node' % cn.id,
151+
self.stdlog.logger.output)
152+
self._assert_hypervisor_api(self.nodename, 'host_b')
152153
rps = self._get_all_providers()
153-
self.assertEqual(0, len(rps), rps)
154+
self.assertEqual(1, len(rps), rps)
155+
self.assertEqual(self.nodename, rps[0]['name'])
156+
157+
# Simulate deletion of an orphan by host_a. It shouldn't happen
158+
# anymore, but let's assume it already did.
159+
cn = objects.ComputeNode.get_by_host_and_nodename(
160+
self.ctxt, 'host_b', self.nodename)
161+
cn.destroy()
162+
host_a.manager.rt.remove_node(cn.hypervisor_hostname)
163+
host_a.manager.reportclient.delete_resource_provider(
164+
self.ctxt, cn, cascade=True)
154165

155166
# host_b[3]: Should recreate compute node and resource provider.
156167
# FIXME(mgoddard): Resource provider not recreated here, due to

nova/tests/unit/compute/test_compute.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,13 +200,14 @@ def fake_get_compute_nodes_in_db(self, context, *args, **kwargs):
200200
context, objects.ComputeNode(), cn)
201201
for cn in fake_compute_nodes]
202202

203-
def fake_compute_node_delete(context, compute_node_id):
203+
def fake_compute_node_delete(context, compute_node_id,
204+
constraint=None):
204205
self.assertEqual(2, compute_node_id)
205206

206207
self.stub_out(
207208
'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',
208209
fake_get_compute_nodes_in_db)
209-
self.stub_out('nova.db.api.compute_node_delete',
210+
self.stub_out('nova.db.sqlalchemy.api.compute_node_delete',
210211
fake_compute_node_delete)
211212

212213
self.compute.update_available_resource(

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,33 @@ def test_update_available_resource_not_ready(self, get_db_nodes,
409409
update_mock.assert_not_called()
410410
del_rp_mock.assert_not_called()
411411

412+
@mock.patch.object(manager.ComputeManager,
413+
'_update_available_resource_for_node')
414+
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
415+
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
416+
def test_update_available_resource_destroy_rebalance(
417+
self, get_db_nodes, get_avail_nodes, update_mock):
418+
mock_rt = self._mock_rt()
419+
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
420+
self.compute, 'reportclient')).mock
421+
db_nodes = [self._make_compute_node('node1', 1)]
422+
get_db_nodes.return_value = db_nodes
423+
# Destroy can fail if nodes were rebalanced between getting the node
424+
# list and calling destroy.
425+
db_nodes[0].destroy.side_effect = exception.ObjectActionError(
426+
action='destroy', reason='host changed')
427+
get_avail_nodes.return_value = set()
428+
self.compute.update_available_resource(self.context)
429+
get_db_nodes.assert_called_once_with(self.context, set(),
430+
use_slave=True, startup=False)
431+
self.assertEqual(0, update_mock.call_count)
432+
433+
db_nodes[0].destroy.assert_called_once_with()
434+
self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
435+
mock_rt.remove_node.assert_called_once_with('node1')
436+
rc_mock.invalidate_resource_provider.assert_called_once_with(
437+
db_nodes[0].uuid)
438+
412439
@mock.patch('nova.context.get_admin_context')
413440
def test_pre_start_hook(self, get_admin_context):
414441
"""Very simple test just to make sure update_available_resource is

nova/tests/unit/db/test_db_api.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5545,6 +5545,13 @@ def test_compute_node_delete(self):
55455545
nodes = db.compute_node_get_all(self.ctxt)
55465546
self.assertEqual(len(nodes), 0)
55475547

5548+
def test_compute_node_delete_different_host(self):
5549+
compute_node_id = self.item['id']
5550+
constraint = db.constraint(host=db.equal_any('different-host'))
5551+
self.assertRaises(exception.ConstraintNotMet,
5552+
sqlalchemy_api.compute_node_delete,
5553+
self.ctxt, compute_node_id, constraint=constraint)
5554+
55485555
def test_compute_node_search_by_hypervisor(self):
55495556
nodes_created = []
55505557
new_service = copy.copy(self.service_dict)

nova/tests/unit/objects/test_compute_node.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
from nova import conf
2626
from nova.db import api as db
27+
from nova.db.sqlalchemy import api as sa_api
2728
from nova import exception
2829
from nova import objects
2930
from nova.objects import base
@@ -410,12 +411,22 @@ def test_set_id_failure(self, mock_get, db_mock):
410411
self.assertRaises(ovo_exc.ReadOnlyFieldError, setattr,
411412
compute, 'id', 124)
412413

413-
@mock.patch.object(db, 'compute_node_delete')
414+
@mock.patch.object(sa_api, 'compute_node_delete')
414415
def test_destroy(self, mock_delete):
415416
compute = compute_node.ComputeNode(context=self.context)
416417
compute.id = 123
417418
compute.destroy()
418-
mock_delete.assert_called_once_with(self.context, 123)
419+
mock_delete.assert_called_once_with(self.context, 123, constraint=None)
420+
421+
def test_destroy_host_constraint(self):
422+
# Create compute node with host='fake'
423+
compute = fake_compute_with_resources.obj_clone()
424+
compute._context = self.context
425+
compute.host = 'fake'
426+
compute.create()
427+
# Simulate a compute node ownership change due to a node rebalance
428+
compute.host = 'different'
429+
self.assertRaises(exception.ObjectActionError, compute.destroy)
419430

420431
@mock.patch.object(db, 'compute_node_get_all')
421432
def test_get_all(self, mock_get_all):
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
fixes:
3+
- |
4+
Fixes an issue with multiple ``nova-compute`` services used with Ironic,
5+
where a rebalance operation could result in a compute node being deleted
6+
from the database and not recreated. See `bug 1853009
7+
<https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.

0 commit comments

Comments (0)