Skip to content

Commit 1edabab

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "Add functional regression test for bug 1853009" into stable/wallaby
2 parents 89c4ff5 + c260e75 commit 1edabab

File tree

1 file changed

+190
-0
lines changed

1 file changed

+190
-0
lines changed
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
2+
# not use this file except in compliance with the License. You may obtain
3+
# a copy of the License at
4+
#
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
#
7+
# Unless required by applicable law or agreed to in writing, software
8+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
9+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
10+
# License for the specific language governing permissions and limitations
11+
# under the License.
12+
13+
import mock
14+
15+
from nova import context
16+
from nova import objects
17+
from nova.tests.functional import integrated_helpers
18+
19+
20+
class NodeRebalanceDeletedComputeNodeRaceTestCase(
21+
integrated_helpers.ProviderUsageBaseTestCase,
22+
):
23+
"""Regression test for bug 1853009 observed in Rocky & later.
24+
25+
When an ironic node re-balances from one host to another, there can be a
26+
race where the old host deletes the orphan compute node after the new host
27+
has taken ownership of it which results in the new host failing to create
28+
the compute node and resource provider because the ResourceTracker does not
29+
detect a change.
30+
"""
31+
# Make sure we're using the fake driver that has predictable uuids
32+
# for each node.
33+
compute_driver = 'fake.PredictableNodeUUIDDriver'
34+
35+
def _assert_hypervisor_api(self, nodename, expected_host):
36+
# We should have one compute node shown by the API.
37+
hypervisors = self.api.api_get(
38+
'/os-hypervisors/detail'
39+
).body['hypervisors']
40+
self.assertEqual(1, len(hypervisors), hypervisors)
41+
hypervisor = hypervisors[0]
42+
self.assertEqual(nodename, hypervisor['hypervisor_hostname'])
43+
self.assertEqual(expected_host, hypervisor['service']['host'])
44+
45+
def _start_compute(self, host):
46+
host = self.start_service('compute', host)
47+
# Ironic compute driver has rebalances_nodes = True.
48+
host.manager.driver.rebalances_nodes = True
49+
return host
50+
51+
def setUp(self):
52+
super(NodeRebalanceDeletedComputeNodeRaceTestCase, self).setUp()
53+
54+
self.nodename = 'fake-node'
55+
self.ctxt = context.get_admin_context()
56+
57+
def test_node_rebalance_deleted_compute_node_race(self):
58+
# Simulate a service running and then stopping. host_a runs, creates
59+
# fake-node, then is stopped. The fake-node compute node is destroyed.
60+
# This leaves a soft-deleted node in the DB.
61+
host_a = self._start_compute('host_a')
62+
host_a.manager.driver._set_nodes([self.nodename])
63+
host_a.manager.update_available_resource(self.ctxt)
64+
host_a.stop()
65+
cn = objects.ComputeNode.get_by_host_and_nodename(
66+
self.ctxt, 'host_a', self.nodename,
67+
)
68+
cn.destroy()
69+
70+
self.assertEqual(0, len(objects.ComputeNodeList.get_all(self.ctxt)))
71+
72+
# Now we create a new compute service to manage our node.
73+
host_b = self._start_compute('host_b')
74+
host_b.manager.driver._set_nodes([self.nodename])
75+
76+
# When start_service runs, it will create a host_b ComputeNode. We want
77+
# to delete that and inject our fake node into the driver which will
78+
# be re-balanced to another host later. First assert this actually
79+
# exists.
80+
self._assert_hypervisor_api('host_b', expected_host='host_b')
81+
82+
# Now run the update_available_resource periodic to register fake-node
83+
# and have it managed by host_b. This will also detect the "host_b"
84+
# node as orphaned and delete it along with its resource provider.
85+
cn_host_b_node = objects.ComputeNode.get_by_host_and_nodename(
86+
self.ctxt, 'host_b', 'host_b',
87+
)
88+
89+
# host_b[1]: Finds no compute record in RT. Tries to create one
90+
# (_init_compute_node).
91+
# FIXME(mgoddard): This shows a traceback with SQL rollback due to
92+
# soft-deleted node. The create seems to succeed but breaks the RT
93+
# update for this node. See
94+
# https://bugs.launchpad.net/nova/+bug/1853159.
95+
host_b.manager.update_available_resource(self.ctxt)
96+
self.assertIn(
97+
'Deleting orphan compute node %s hypervisor host '
98+
'is host_b, nodes are' % cn_host_b_node.id,
99+
self.stdlog.logger.output)
100+
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
101+
# There should only be one resource provider (fake-node).
102+
original_rps = self._get_all_providers()
103+
self.assertEqual(1, len(original_rps), original_rps)
104+
self.assertEqual(self.nodename, original_rps[0]['name'])
105+
106+
# Simulate a re-balance by restarting host_a and make it manage
107+
# fake-node. At this point both host_b and host_a think they own
108+
# fake-node.
109+
host_a = self._start_compute('host_a')
110+
host_a.manager.driver._set_nodes([self.nodename])
111+
112+
# host_a[1]: Finds no compute record in RT, 'moves' existing node from
113+
# host_b
114+
host_a.manager.update_available_resource(self.ctxt)
115+
# Assert that fake-node was re-balanced from host_b to host_a.
116+
self.assertIn(
117+
'ComputeNode fake-node moving from host_b to host_a',
118+
self.stdlog.logger.output)
119+
self._assert_hypervisor_api(self.nodename, expected_host='host_a')
120+
121+
# host_a[2]: Begins periodic update, queries compute nodes for this
122+
# host, finds the fake-node.
123+
cn = objects.ComputeNode.get_by_host_and_nodename(
124+
self.ctxt, 'host_a', self.nodename,
125+
)
126+
127+
# host_b[2]: Finds no compute record in RT, 'moves' existing node from
128+
# host_a
129+
host_b.manager.update_available_resource(self.ctxt)
130+
# Assert that fake-node was re-balanced from host_a to host_b.
131+
self.assertIn(
132+
'ComputeNode fake-node moving from host_a to host_b',
133+
self.stdlog.logger.output)
134+
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
135+
136+
# Complete rebalance, as host_a realises it does not own fake-node.
137+
host_a.manager.driver._set_nodes([])
138+
139+
# host_a[2]: Deletes orphan compute node.
140+
# Mock out the compute node query to simulate a race condition where
141+
# the list includes an orphan compute node that is newly owned by
142+
# host_b by the time host_a attempts to delete it.
143+
# FIXME(mgoddard): Ideally host_a would not delete a node that does not
144+
# belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
145+
with mock.patch(
146+
'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
147+
) as mock_get:
148+
mock_get.return_value = [cn]
149+
host_a.manager.update_available_resource(self.ctxt)
150+
151+
# Verify that the node was deleted.
152+
self.assertIn(
153+
'Deleting orphan compute node %s hypervisor host '
154+
'is fake-node, nodes are' % cn.id,
155+
self.stdlog.logger.output)
156+
hypervisors = self.api.api_get(
157+
'/os-hypervisors/detail').body['hypervisors']
158+
self.assertEqual(0, len(hypervisors), hypervisors)
159+
rps = self._get_all_providers()
160+
self.assertEqual(0, len(rps), rps)
161+
162+
# host_b[3]: Should recreate compute node and resource provider.
163+
# FIXME(mgoddard): Compute node not recreated here, because it is
164+
# already in RT.compute_nodes. See
165+
# https://bugs.launchpad.net/nova/+bug/1853009.
166+
# FIXME(mgoddard): Resource provider not recreated here, because it
167+
# exists in the provider tree. See
168+
# https://bugs.launchpad.net/nova/+bug/1841481.
169+
host_b.manager.update_available_resource(self.ctxt)
170+
171+
# Verify that the node was not recreated.
172+
hypervisors = self.api.api_get(
173+
'/os-hypervisors/detail').body['hypervisors']
174+
self.assertEqual(0, len(hypervisors), hypervisors)
175+
176+
# But the compute node exists in the RT.
177+
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
178+
179+
# There is no RP.
180+
rps = self._get_all_providers()
181+
self.assertEqual(0, len(rps), rps)
182+
183+
# But the RP exists in the provider tree.
184+
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
185+
self.nodename))
186+
187+
# This fails due to the lack of a resource provider.
188+
self.assertIn(
189+
'Skipping removal of allocations for deleted instances',
190+
self.stdlog.logger.output)

0 commit comments

Comments
 (0)