
Commit 4b9eba6

Zuul authored and openstack-gerrit committed
Merge "[ironic] Minimize window for a resource provider to be lost" into stable/victoria
2 parents 8097c2b + bc5fc2b

File tree: 3 files changed, +40 -1 lines changed


nova/tests/unit/virt/ironic/test_driver.py

Lines changed: 12 additions & 0 deletions
@@ -3441,6 +3441,9 @@ def _test__refresh_cache(self, instances, nodes, hosts, mock_instances,
         mock_instances.return_value = instances
         mock_nodes.return_value = nodes
         mock_hosts.side_effect = hosts
+        parent_mock = mock.MagicMock()
+        parent_mock.attach_mock(mock_nodes, 'get_node_list')
+        parent_mock.attach_mock(mock_instances, 'get_uuids_by_host')
         if not can_send_146:
             mock_can_send.side_effect = (
                 exception.IronicAPIVersionNotAvailable(version='1.46'))
@@ -3453,6 +3456,15 @@ def _test__refresh_cache(self, instances, nodes, hosts, mock_instances,

         self.driver._refresh_cache()

+        # assert that get_node_list() is called before get_uuids_by_host()
+        parent_mock.assert_has_calls(
+            [
+                mock.call.get_node_list(fields=ironic_driver._NODE_FIELDS,
+                                        **kwargs),
+                mock.call.get_uuids_by_host(mock.ANY, self.host)
+            ]
+        )
+
         mock_hash_ring.assert_called_once_with(mock.ANY)
         mock_instances.assert_called_once_with(mock.ANY, self.host)
         mock_nodes.assert_called_once_with(fields=ironic_driver._NODE_FIELDS,
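The ordering assertion above leans on a unittest.mock idiom that is easy to miss: attaching several independent mocks to one parent makes every call land in the parent's mock_calls list in chronological order, so assert_has_calls can verify sequencing across mocks. Below is a minimal, self-contained sketch of that idiom; it is not part of the commit, and the test class and mock names are invented for illustration.

    import unittest
    from unittest import mock


    class TestCallOrdering(unittest.TestCase):
        def test_nodes_listed_before_instances(self):
            # Two independent mocks standing in for the patched calls.
            mock_nodes = mock.Mock(return_value=['node-1'])
            mock_instances = mock.Mock(return_value=['inst-1'])

            # attach_mock() re-parents each mock, so every later call is
            # also recorded, in order, in parent.mock_calls.
            parent = mock.MagicMock()
            parent.attach_mock(mock_nodes, 'get_node_list')
            parent.attach_mock(mock_instances, 'get_uuids_by_host')

            # Simulate the code under test calling nodes first.
            mock_nodes()
            mock_instances()

            # Passes only if the calls appear in this order.
            parent.assert_has_calls([
                mock.call.get_node_list(),
                mock.call.get_uuids_by_host(),
            ])


    if __name__ == '__main__':
        unittest.main()

assert_has_calls checks for a consecutive matching run inside mock_calls, which is what makes it usable as an ordering assertion here; assert_called_once_with alone cannot express "A ran before B".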

nova/virt/ironic/driver.py

Lines changed: 15 additions & 1 deletion
@@ -788,10 +788,15 @@ def _refresh_hash_ring(self, ctxt):
     def _refresh_cache(self):
         ctxt = nova_context.get_admin_context()
         self._refresh_hash_ring(ctxt)
-        instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
         node_cache = {}

         def _get_node_list(**kwargs):
+            # NOTE(TheJulia): This call can take a substantial amount
+            # of time as it may be attempting to retrieve thousands of
+            # baremetal nodes. Depending on the version of Ironic,
+            # this can be as long as 2-10 seconds per every thousand
+            # nodes, and this call may retrieve all nodes in a deployment,
+            # depending on whether any filter parameters are applied.
             return self._get_node_list(fields=_NODE_FIELDS, **kwargs)

         # NOTE(jroll) if partition_key is set, we need to limit nodes that
@@ -815,6 +820,15 @@ def _get_node_list(**kwargs):
         else:
             nodes = _get_node_list()

+        # NOTE(saga): As _get_node_list() can take a long time to return
+        # in large clusters, we need to call it before get_uuids_by_host().
+        # Otherwise the instance list we get from get_uuids_by_host() may
+        # become stale. A stale instance list can cause a node that is
+        # managed by this compute host to be excluded in error, orphaning
+        # the compute node and deleting its associated resource provider.
+        instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
+
         for node in nodes:
             # NOTE(jroll): we always manage the nodes for instances we manage
             if node.instance_uuid in instances:
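To see why the reordering matters, consider a simplified, hypothetical sketch of the timing (this is not the driver code; the function names and the artificial delay are invented for illustration):

    import time

    def slow_node_listing():
        # Stands in for Ironic's node listing, which the in-code NOTE
        # says can take seconds per thousand nodes.
        time.sleep(5)
        return [{'uuid': 'node-a', 'instance_uuid': 'inst-1'}]

    def instance_uuids_on_this_host():
        # Stands in for InstanceList.get_uuids_by_host(): a cheap query.
        return ['inst-1']

    # Old order: instances fetched first were ~5 seconds stale by the
    # time the node list arrived; an instance assigned to this host in
    # that window would be missing, so its node would be dropped from
    # the cache, orphaning the compute node and its resource provider.
    # New order: do the slow call first and the cheap call last, so the
    # instance list is as fresh as possible when the two are compared.
    nodes = slow_node_listing()
    instances = instance_uuids_on_this_host()
    node_cache = {n['uuid']: n for n in nodes
                  if n['instance_uuid'] in instances}
    print(node_cache)

This shrinks, but does not close, the race window, which matches the hedged wording of the release note below.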
Lines changed: 13 additions & 0 deletions (new release-note file; path not shown in this view)
@@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    Minimizes a race condition window when using the ``ironic`` virt driver
+    where the data generated for the Resource Tracker may attempt to compare
+    potentially stale instance information with the latest known baremetal
+    node information. While this doesn't completely prevent or resolve the
+    underlying race condition identified in
+    `bug 1841481 <https://bugs.launchpad.net/nova/+bug/1841481>`_,
+    this change allows Nova to work with the latest state information, as
+    opposed to state information that may be out of date due to the time it
+    can take to retrieve node status from Ironic. The issue was most
+    observable on baremetal clusters with several thousand physical nodes.
