Commit 23c5f3d

Make resource tracker use UUIDs instead of names
This makes the resource tracker look up and create ComputeNode objects by
uuid instead of nodename. For drivers like ironic that already provide
'uuid' in the resources dict, we can use that. For those that do not, we
force the uuid to be the locally-persisted node uuid, and use that to
find/create the ComputeNode object.

A (happy) side-effect of this is that if we find a deleted compute node
object that matches that of our hypervisor, we undelete it instead of
re-creating one with a new uuid, which may clash with our old one. This
means we remove some of the special-casing of ironic rebalance, although
the tests for that still largely stay the same.

Change-Id: I6a582a38c302fd1554a49abc38cfeda7c324d911
1 parent 0efdbb3 commit 23c5f3d
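Context for the change below: drivers that do not report a per-node uuid now
get a node uuid that is generated once and persisted locally on the host
(nova.virt.node.get_local_node_uuid in the diffs). The snippet here is a
minimal, self-contained sketch of that read-or-generate-and-persist idea; the
file path and function body are illustrative assumptions, not the module's
actual implementation.

# Minimal sketch of a "locally-persisted node uuid" helper.
# COMPUTE_ID_FILE is a hypothetical path chosen for illustration only.
import os
import uuid

COMPUTE_ID_FILE = '/var/lib/nova/compute_id'  # hypothetical location


def get_local_node_uuid(path=COMPUTE_ID_FILE):
    """Return the UUID persisted on this host, generating it on first use."""
    if os.path.exists(path):
        with open(path) as f:
            return f.read().strip()
    node_uuid = str(uuid.uuid4())
    with open(path, 'w') as f:
        f.write(node_uuid)
    return node_uuid

Because the uuid is persisted, restarting the service (or undeleting an old
ComputeNode record) keeps the node identity stable instead of minting a new
uuid each time.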

9 files changed: +195 −169 lines changed

nova/compute/manager.py

Lines changed: 16 additions & 15 deletions
@@ -97,6 +97,7 @@
 from nova.virt import driver
 from nova.virt import event as virtevent
 from nova.virt import hardware
+import nova.virt.node
 from nova.virt import storage_users
 from nova.virt import virtapi
 from nova.volume import cinder
@@ -1471,27 +1472,27 @@ def _get_nodes(self, context):
         :return: a dict of ComputeNode objects keyed by the UUID of the given
             node.
         """
-        nodes_by_uuid = {}
         try:
-            node_names = self.driver.get_available_nodes()
+            node_uuids = self.driver.get_available_node_uuids()
         except exception.VirtDriverNotReady:
             LOG.warning(
                 "Virt driver is not ready. If this is the first time this "
-                "service is starting on this host, then you can ignore this "
-                "warning.")
+                "service is starting on this host, then you can ignore "
+                "this warning.")
             return {}

-        for node_name in node_names:
-            try:
-                node = objects.ComputeNode.get_by_host_and_nodename(
-                    context, self.host, node_name)
-                nodes_by_uuid[node.uuid] = node
-            except exception.ComputeHostNotFound:
-                LOG.warning(
-                    "Compute node %s not found in the database. If this is "
-                    "the first time this service is starting on this host, "
-                    "then you can ignore this warning.", node_name)
-        return nodes_by_uuid
+        nodes = objects.ComputeNodeList.get_all_by_uuids(context, node_uuids)
+        if not nodes:
+            # NOTE(danms): This should only happen if the compute_id is
+            # pre-provisioned on a host that has never started.
+            LOG.warning('Compute nodes %s for host %s were not found in the '
+                        'database. If this is the first time this service is '
+                        'starting on this host, then you can ignore this '
+                        'warning.',
+                        node_uuids, self.host)
+            return {}
+
+        return {n.uuid: n for n in nodes}

     def _ensure_existing_node_identity(self, service_ref):
         """If we are upgrading from an older service version, we need

nova/compute/resource_tracker.py

Lines changed: 31 additions & 52 deletions
@@ -49,6 +49,7 @@
 from nova.scheduler.client import report
 from nova import utils
 from nova.virt import hardware
+from nova.virt import node


 CONF = nova.conf.CONF
@@ -668,50 +669,6 @@ def disabled(self, nodename):
         return (nodename not in self.compute_nodes or
                 not self.driver.node_is_available(nodename))

-    def _check_for_nodes_rebalance(self, context, resources, nodename):
-        """Check if nodes rebalance has happened.
-
-        The ironic driver maintains a hash ring mapping bare metal nodes
-        to compute nodes. If a compute dies, the hash ring is rebuilt, and
-        some of its bare metal nodes (more precisely, those not in ACTIVE
-        state) are assigned to other computes.
-
-        This method checks for this condition and adjusts the database
-        accordingly.
-
-        :param context: security context
-        :param resources: initial values
-        :param nodename: node name
-        :returns: True if a suitable compute node record was found, else False
-        """
-        if not self.driver.rebalances_nodes:
-            return False
-
-        # Its possible ironic just did a node re-balance, so let's
-        # check if there is a compute node that already has the correct
-        # hypervisor_hostname. We can re-use that rather than create a
-        # new one and have to move existing placement allocations
-        cn_candidates = objects.ComputeNodeList.get_by_hypervisor(
-            context, nodename)
-
-        if len(cn_candidates) == 1:
-            cn = cn_candidates[0]
-            LOG.info("ComputeNode %(name)s moving from %(old)s to %(new)s",
-                     {"name": nodename, "old": cn.host, "new": self.host})
-            cn.host = self.host
-            self.compute_nodes[nodename] = cn
-            self._copy_resources(cn, resources)
-            self._setup_pci_tracker(context, cn, resources)
-            self._update(context, cn)
-            return True
-        elif len(cn_candidates) > 1:
-            LOG.error(
-                "Found more than one ComputeNode for nodename %s. "
-                "Please clean up the orphaned ComputeNode records in your DB.",
-                nodename)
-
-        return False
-
     def _init_compute_node(self, context, resources):
         """Initialize the compute node if it does not already exist.
@@ -729,6 +686,7 @@ def _init_compute_node(self, context, resources):
             False otherwise
         """
         nodename = resources['hypervisor_hostname']
+        node_uuid = resources['uuid']

         # if there is already a compute node just use resources
         # to initialize
@@ -740,16 +698,30 @@ def _init_compute_node(self, context, resources):

         # now try to get the compute node record from the
         # database. If we get one we use resources to initialize
-        cn = self._get_compute_node(context, nodename)
+
+        # We use read_deleted=True so that we will find and recover a deleted
+        # node object, if necessary.
+        with utils.temporary_mutation(context, read_deleted='yes'):
+            cn = self._get_compute_node(context, node_uuid)
+            if cn and cn.deleted:
+                # Undelete and save this right now so that everything below
+                # can continue without read_deleted=yes
+                LOG.info('Undeleting compute node %s', cn.uuid)
+                cn.deleted = False
+                cn.deleted_at = None
+                cn.save()
         if cn:
+            if cn.host != self.host:
+                LOG.info("ComputeNode %(name)s moving from %(old)s to %(new)s",
+                         {"name": nodename, "old": cn.host, "new": self.host})
+                cn.host = self.host
+                self._update(context, cn)
+
             self.compute_nodes[nodename] = cn
             self._copy_resources(cn, resources)
             self._setup_pci_tracker(context, cn, resources)
             return False

-        if self._check_for_nodes_rebalance(context, resources, nodename):
-            return False
-
         # there was no local copy and none in the database
         # so we need to create a new compute node. This needs
         # to be initialized with resource values.
@@ -889,6 +861,14 @@ def update_available_resource(self, context, nodename, startup=False):
         # contains a non-None value, even for non-Ironic nova-compute hosts. It
         # is this value that will be populated in the compute_nodes table.
         resources['host_ip'] = CONF.my_ip
+        if 'uuid' not in resources:
+            # NOTE(danms): Any driver that does not provide a uuid per
+            # node gets the locally-persistent compute_id. Only ironic
+            # should be setting the per-node uuid (and returning
+            # multiple nodes in general). If this is the first time we
+            # are creating a compute node on this host, we will
+            # generate and persist this uuid for the future.
+            resources['uuid'] = node.get_local_node_uuid()

         # We want the 'cpu_info' to be None from the POV of the
         # virt driver, but the DB requires it to be non-null so
@@ -1014,14 +994,13 @@ def _update_available_resource(self, context, resources, startup=False):
         if startup:
             self._check_resources(context)

-    def _get_compute_node(self, context, nodename):
+    def _get_compute_node(self, context, node_uuid):
         """Returns compute node for the host and nodename."""
         try:
-            return objects.ComputeNode.get_by_host_and_nodename(
-                context, self.host, nodename)
+            return objects.ComputeNode.get_by_uuid(context, node_uuid)
         except exception.NotFound:
             LOG.warning("No compute node record for %(host)s:%(node)s",
-                        {'host': self.host, 'node': nodename})
+                        {'host': self.host, 'node': node_uuid})

     def _report_hypervisor_resource_view(self, resources):
         """Log the hypervisor's view of free resources.

nova/objects/compute_node.py

Lines changed: 5 additions & 2 deletions
@@ -388,8 +388,11 @@ def update_from_virt_driver(self, resources):
             # The uuid field is read-only so it should only be set when
             # creating the compute node record for the first time. Ignore
             # it otherwise.
-            if key == 'uuid' and 'uuid' in self:
-                continue
+            if (key == 'uuid' and 'uuid' in self and
+                    resources[key] != self.uuid):
+                raise exception.InvalidNodeConfiguration(
+                    reason='Attempt to overwrite node %s with %s!' % (
+                        self.uuid, resources[key]))
             setattr(self, key, resources[key])

         # supported_instances has a different name in compute_node

nova/tests/functional/libvirt/base.py

Lines changed: 3 additions & 0 deletions
@@ -181,6 +181,9 @@ def _start_compute(hostname, host_info):
         with mock.patch('nova.virt.node.get_local_node_uuid') as m:
             m.return_value = str(getattr(uuids, 'node_%s' % hostname))
             self.computes[hostname] = _start_compute(hostname, host_info)
+            # We need to trigger libvirt.Host() to capture the node-local
+            # uuid while we have it mocked out.
+            self.computes[hostname].driver._host.get_node_uuid()

         self.compute_rp_uuids[hostname] = self.placement.get(
             '/resource_providers?name=%s' % hostname).body[

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 17 additions & 22 deletions
@@ -1160,26 +1160,25 @@ def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
             self.context, {active_instance.uuid, evacuating_instance.uuid},
             mock_get_nodes.return_value.keys())

-    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(objects.ComputeNodeList, 'get_all_by_uuids')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
-    def test_get_nodes(self, mock_driver_get_nodes, mock_get_by_host_and_node):
+    def test_get_nodes(self, mock_driver_get_nodes, mock_get_by_uuid):
         mock_driver_get_nodes.return_value = ['fake_node1', 'fake_node2']
         # NOTE(danms): The fake driver, by default, uses
         # uuidsentinel.$node_name, so we can predict the uuids it will
         # return here.
         cn1 = objects.ComputeNode(uuid=uuids.fake_node1)
         cn2 = objects.ComputeNode(uuid=uuids.fake_node2)
-        mock_get_by_host_and_node.side_effect = [cn1, cn2]
+        mock_get_by_uuid.return_value = [cn1, cn2]

         nodes = self.compute._get_nodes(self.context)

         self.assertEqual({uuids.fake_node1: cn1, uuids.fake_node2: cn2}, nodes)

         mock_driver_get_nodes.assert_called_once_with()
-        mock_get_by_host_and_node.assert_has_calls([
-            mock.call(self.context, self.compute.host, 'fake_node1'),
-            mock.call(self.context, self.compute.host, 'fake_node2')
-        ])
+        mock_get_by_uuid.assert_called_once_with(self.context,
+                                                 [uuids.fake_node1,
+                                                  uuids.fake_node2])

     @mock.patch.object(manager.LOG, 'warning')
     @mock.patch.object(
@@ -1201,29 +1200,25 @@ def test_get_nodes_driver_not_ready(
             "is starting on this host, then you can ignore this warning.")

     @mock.patch.object(manager.LOG, 'warning')
-    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
-    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(objects.ComputeNodeList, 'get_all_by_uuids')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_node_uuids')
     def test_get_nodes_node_not_found(
-            self, mock_driver_get_nodes, mock_get_by_host_and_node,
+            self, mock_driver_get_nodes, mock_get_all_by_uuids,
             mock_log_warning):
-        mock_driver_get_nodes.return_value = ['fake-node1', 'fake-node2']
-        cn2 = objects.ComputeNode(uuid=uuids.cn2)
-        mock_get_by_host_and_node.side_effect = [
-            exception.ComputeHostNotFound(host='fake-node1'), cn2]
+        mock_driver_get_nodes.return_value = ['fake-node1']
+        mock_get_all_by_uuids.return_value = []

         nodes = self.compute._get_nodes(self.context)

-        self.assertEqual({uuids.cn2: cn2}, nodes)
+        self.assertEqual({}, nodes)

         mock_driver_get_nodes.assert_called_once_with()
-        mock_get_by_host_and_node.assert_has_calls([
-            mock.call(self.context, self.compute.host, 'fake-node1'),
-            mock.call(self.context, self.compute.host, 'fake-node2')
-        ])
+        mock_get_all_by_uuids.assert_called_once_with(self.context,
+                                                      ['fake-node1'])
         mock_log_warning.assert_called_once_with(
-            "Compute node %s not found in the database. If this is the first "
-            "time this service is starting on this host, then you can ignore "
-            "this warning.", 'fake-node1')
+            "Compute nodes %s for host %s were not found in the database. "
+            "If this is the first time this service is starting on this host, "
+            "then you can ignore this warning.", ['fake-node1'], 'fake-mini')

     def test_init_host_disk_devices_configuration_failure(self):
         self.flags(max_disk_devices_to_attach=0, group='compute')
