Commit b54beee

Zuul authored and openstack-gerrit committed
Merge "Make allocation candidates available for scheduler filters"
2 parents 8a47606 + 3d818c3 commit b54beee

4 files changed: 872 additions & 128 deletions

nova/scheduler/filters/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -28,6 +28,9 @@ class BaseHostFilter(filters.BaseFilter):
     # other parameters. We care about running policy filters (i.e.
     # ImagePropertiesFilter) but not things that check usage on the
     # existing compute node, etc.
+    # This also means that filters marked with RUN_ON_REBUILD = True cannot
+    # filter on allocation candidates or need to handle the rebuild case
+    # specially.
     RUN_ON_REBUILD = False
 
     def _filter_one(self, obj, spec):
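
With this change a filter can inspect, and prune, host_state.allocation_candidates directly. Below is a minimal sketch of such a filter; ExampleCandidateFilter and its acceptance predicate are illustrative only and are not part of this commit. As the comment above notes, such a filter has to keep RUN_ON_REBUILD = False because the candidates are never populated during rebuild.

# Illustrative sketch only; not part of this commit.
from nova.scheduler import filters


class ExampleCandidateFilter(filters.BaseHostFilter):
    """Hypothetical filter that selects hosts based on their candidates."""

    # Candidates are not available during rebuild, so this must stay False.
    RUN_ON_REBUILD = False

    def host_passes(self, host_state, spec_obj):
        # Keep only the candidates this filter considers acceptable; the
        # predicate below is a placeholder for real logic.
        host_state.allocation_candidates = [
            candidate for candidate in host_state.allocation_candidates
            if self._is_acceptable(candidate, spec_obj)
        ]
        # With no candidates left the host cannot serve the request.
        return bool(host_state.allocation_candidates)

    @staticmethod
    def _is_acceptable(candidate, spec_obj):
        # Placeholder: a real filter would look at the candidate's
        # 'allocations' and 'mappings' content.
        return True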

nova/scheduler/host_manager.py

Lines changed: 17 additions & 7 deletions
@@ -153,6 +153,8 @@ def __init__(self, host, node, cell_uuid):
 
         self.updated = None
 
+        self.allocation_candidates = []
+
     def update(self, compute=None, service=None, aggregates=None,
                inst_dict=None):
        """Update all information about a host."""
@@ -314,13 +316,21 @@ def _locked_consume_from_request(self, spec_obj):
         self.num_io_ops += 1
 
     def __repr__(self):
-        return ("(%(host)s, %(node)s) ram: %(free_ram)sMB "
-                "disk: %(free_disk)sMB io_ops: %(num_io_ops)s "
-                "instances: %(num_instances)s" %
-                {'host': self.host, 'node': self.nodename,
-                 'free_ram': self.free_ram_mb, 'free_disk': self.free_disk_mb,
-                 'num_io_ops': self.num_io_ops,
-                 'num_instances': self.num_instances})
+        return (
+            "(%(host)s, %(node)s) ram: %(free_ram)sMB "
+            "disk: %(free_disk)sMB io_ops: %(num_io_ops)s "
+            "instances: %(num_instances)s, "
+            "allocation_candidates: %(num_a_c)s"
+            % {
+                "host": self.host,
+                "node": self.nodename,
+                "free_ram": self.free_ram_mb,
+                "free_disk": self.free_disk_mb,
+                "num_io_ops": self.num_io_ops,
+                "num_instances": self.num_instances,
+                "num_a_c": len(self.allocation_candidates),
+            }
+        )
 
 
 class HostManager(object):
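
For illustration, with invented field values the extended __repr__ renders along these lines:

(compute1, node1) ram: 2048MB disk: 102400MB io_ops: 0 instances: 1, allocation_candidates: 3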

nova/scheduler/manager.py

Lines changed: 85 additions & 20 deletions
@@ -20,6 +20,7 @@
 """
 
 import collections
+import copy
 import random
 
 from oslo_log import log as logging
@@ -299,12 +300,29 @@ def _schedule(
         # host, we virtually consume resources on it so subsequent
         # selections can adjust accordingly.
 
+        def hosts_with_alloc_reqs(hosts_gen):
+            """Extend the HostState objects returned by the generator with
+            the allocation requests of that host
+            """
+            for host in hosts_gen:
+                host.allocation_candidates = copy.deepcopy(
+                    alloc_reqs_by_rp_uuid[host.uuid])
+                yield host
+
         # Note: remember, we are using a generator-iterator here. So only
         # traverse this list once. This can bite you if the hosts
         # are being scanned in a filter or weighing function.
         hosts = self._get_all_host_states(
             elevated, spec_obj, provider_summaries)
 
+        # alloc_reqs_by_rp_uuid is None during rebuild, so this mean we cannot
+        # run filters that are using allocation candidates during rebuild
+        if alloc_reqs_by_rp_uuid is not None:
+            # wrap the generator to extend the HostState objects with the
+            # allocation requests for that given host. This is needed to
+            # support scheduler filters filtering on allocation candidates.
+            hosts = hosts_with_alloc_reqs(hosts)
+
         # NOTE(sbauza): The RequestSpec.num_instances field contains the number
         # of instances created when the RequestSpec was used to first boot some
         # instances. This is incorrect when doing a move or resize operation,
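
The wrapper above deep-copies each host's allocation requests so that a filter can mutate host.allocation_candidates without affecting the shared alloc_reqs_by_rp_uuid mapping or other hosts. A standalone sketch of that pattern, with made-up host and candidate data rather than the real scheduler objects:

# Standalone sketch of the wrap-and-copy pattern; FakeHost and the data
# below are invented for illustration.
import copy


class FakeHost:
    def __init__(self, uuid):
        self.uuid = uuid
        self.allocation_candidates = []


alloc_reqs_by_rp_uuid = {
    "rp-1": [{"allocations": {"rp-1": {"resources": {"VCPU": 2}}}}],
    "rp-2": [{"allocations": {"rp-2": {"resources": {"VCPU": 2}}}}],
}


def hosts_with_alloc_reqs(hosts_gen):
    # Attach a private copy of the candidates to each host as it is yielded.
    for host in hosts_gen:
        host.allocation_candidates = copy.deepcopy(
            alloc_reqs_by_rp_uuid[host.uuid])
        yield host


hosts = hosts_with_alloc_reqs(iter([FakeHost("rp-1"), FakeHost("rp-2")]))
first = next(hosts)
first.allocation_candidates.clear()   # e.g. a filter rejects every candidate
assert alloc_reqs_by_rp_uuid["rp-1"]  # the shared mapping is untouched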
@@ -332,6 +350,13 @@ def _schedule(
             # the older dict format representing HostState objects.
             # TODO(stephenfin): Remove this when we bump scheduler the RPC API
             # version to 5.0
+            # NOTE(gibi): We cannot remove this branch as it is actively used
+            # when nova calls the scheduler during rebuild (not evacuate) to
+            # check if the current host is still good for the new image used
+            # for the rebuild. In this case placement cannot be used to
+            # generate candidates as that would require space on the current
+            # compute for double allocation. So no allocation candidates for
+            # rebuild and therefore alloc_reqs_by_rp_uuid is None
             return self._legacy_find_hosts(
                 context, num_instances, spec_obj, hosts, num_alts,
                 instance_uuids=instance_uuids)
@@ -345,6 +370,9 @@ def _schedule(
         # The list of hosts that have been selected (and claimed).
         claimed_hosts = []
 
+        # The allocation request allocated on the given claimed host
+        claimed_alloc_reqs = []
+
         for num, instance_uuid in enumerate(instance_uuids):
             # In a multi-create request, the first request spec from the list
             # is passed to the scheduler and that request spec's instance_uuid
@@ -371,21 +399,20 @@ def _schedule(
             # resource provider UUID
             claimed_host = None
             for host in hosts:
-                cn_uuid = host.uuid
-                if cn_uuid not in alloc_reqs_by_rp_uuid:
-                    msg = ("A host state with uuid = '%s' that did not have a "
-                           "matching allocation_request was encountered while "
-                           "scheduling. This host was skipped.")
-                    LOG.debug(msg, cn_uuid)
+                if not host.allocation_candidates:
+                    LOG.debug(
+                        "The nova scheduler removed every allocation candidate"
+                        "for host %s so this host was skipped.",
+                        host
+                    )
                     continue
 
-                alloc_reqs = alloc_reqs_by_rp_uuid[cn_uuid]
                 # TODO(jaypipes): Loop through all allocation_requests instead
                 # of just trying the first one. For now, since we'll likely
                 # want to order the allocation_requests in the future based on
                 # information in the provider summaries, we'll just try to
                 # claim resources using the first allocation_request
-                alloc_req = alloc_reqs[0]
+                alloc_req = host.allocation_candidates[0]
                 if utils.claim_resources(
                     elevated, self.placement_client, spec_obj, instance_uuid,
                     alloc_req,
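
In this loop the scheduler now skips any host whose candidate list was emptied by a filter and claims the first remaining candidate. A compact, self-contained sketch of that control flow; fake_claim and the SimpleNamespace hosts are invented stand-ins for utils.claim_resources() and HostState:

# Standalone sketch of the "skip empty, claim first candidate" flow.
from types import SimpleNamespace


def fake_claim(alloc_req):
    # Stand-in for utils.claim_resources(); pretend every claim succeeds.
    return True


hosts = [
    SimpleNamespace(nodename="h1", allocation_candidates=[]),
    SimpleNamespace(nodename="h2", allocation_candidates=[{"allocations": {}}]),
]

claimed_host = None
for host in hosts:
    if not host.allocation_candidates:
        # A filter removed every candidate for this host; skip it.
        continue
    alloc_req = host.allocation_candidates[0]
    if fake_claim(alloc_req):
        claimed_host = host
        break

assert claimed_host.nodename == "h2"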
@@ -405,6 +432,15 @@ def _schedule(
 
             claimed_instance_uuids.append(instance_uuid)
             claimed_hosts.append(claimed_host)
+            claimed_alloc_reqs.append(alloc_req)
+
+            # update the provider mapping in the request spec based
+            # on the allocated candidate as the _consume_selected_host depends
+            # on this information to temporally consume PCI devices tracked in
+            # placement
+            for request_group in spec_obj.requested_resources:
+                request_group.provider_uuids = alloc_req[
+                    'mappings'][request_group.requester_id]
 
             # Now consume the resources so the filter/weights will change for
             # the next instance.
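
The 'mappings' key used above comes from the placement allocation candidate and maps each request group's requester_id to the resource providers that satisfy it. A simplified, self-contained illustration of the copy performed by the loop; the alloc_req structure, UUID strings, and FakeRequestGroup are invented for the example:

# Simplified illustration of copying provider mappings into request groups.
alloc_req = {
    "allocations": {"compute-rp-uuid": {"resources": {"VCPU": 2}}},
    "mappings": {
        # requester_id of a request group -> providers satisfying that group
        "pci-group-1": ["pci-pf-rp-uuid"],
    },
}


class FakeRequestGroup:
    def __init__(self, requester_id):
        self.requester_id = requester_id
        self.provider_uuids = []


requested_resources = [FakeRequestGroup("pci-group-1")]

for request_group in requested_resources:
    request_group.provider_uuids = alloc_req[
        'mappings'][request_group.requester_id]

assert requested_resources[0].provider_uuids == ["pci-pf-rp-uuid"]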
@@ -416,11 +452,19 @@ def _schedule(
         self._ensure_sufficient_hosts(
             context, claimed_hosts, num_instances, claimed_instance_uuids)
 
-        # We have selected and claimed hosts for each instance. Now we need to
-        # find alternates for each host.
+        # We have selected and claimed hosts for each instance along with a
+        # claimed allocation request. Now we need to find alternates for each
+        # host.
         return self._get_alternate_hosts(
-            claimed_hosts, spec_obj, hosts, num, num_alts,
-            alloc_reqs_by_rp_uuid, allocation_request_version)
+            claimed_hosts,
+            spec_obj,
+            hosts,
+            num,
+            num_alts,
+            alloc_reqs_by_rp_uuid,
+            allocation_request_version,
+            claimed_alloc_reqs,
+        )
 
     def _ensure_sufficient_hosts(
         self, context, hosts, required_count, claimed_uuids=None,
@@ -532,7 +576,21 @@ def _consume_selected_host(selected_host, spec_obj, instance_uuid=None):
     def _get_alternate_hosts(
         self, selected_hosts, spec_obj, hosts, index, num_alts,
         alloc_reqs_by_rp_uuid=None, allocation_request_version=None,
+        selected_alloc_reqs=None,
     ):
+        """Generate the main Selection and possible alternate Selection
+        objects for each "instance".
+
+        :param selected_hosts: This is a list of HostState objects. Each
+            HostState represents the main selection for a given instance being
+            scheduled (we can have multiple instances during multi create).
+        :param selected_alloc_reqs: This is a list of allocation requests that
+            are already allocated in placement for the main Selection for each
+            instance. This list is matching with selected_hosts by index. So
+            for the first instance the selected host is selected_host[0] and
+            the already allocated placement candidate is
+            selected_alloc_reqs[0].
+        """
         # We only need to filter/weigh the hosts again if we're dealing with
         # more than one instance and are going to be picking alternates.
         if index > 0 and num_alts > 0:
@@ -546,11 +604,10 @@ def _get_alternate_hosts(
         # representing the selected host along with alternates from the same
         # cell.
         selections_to_return = []
-        for selected_host in selected_hosts:
+        for i, selected_host in enumerate(selected_hosts):
             # This is the list of hosts for one particular instance.
             if alloc_reqs_by_rp_uuid:
-                selected_alloc_req = alloc_reqs_by_rp_uuid.get(
-                    selected_host.uuid)[0]
+                selected_alloc_req = selected_alloc_reqs[i]
             else:
                 selected_alloc_req = None
 
@@ -571,15 +628,17 @@ def _get_alternate_hosts(
                 if len(selected_plus_alts) >= num_alts + 1:
                     break
 
+                # TODO(gibi): In theory we could generate alternatives on the
+                # same host if that host has different possible allocation
+                # candidates for the request. But we don't do that today
                 if host.cell_uuid == cell_uuid and host not in selected_hosts:
                     if alloc_reqs_by_rp_uuid is not None:
-                        alt_uuid = host.uuid
-                        if alt_uuid not in alloc_reqs_by_rp_uuid:
+                        if not host.allocation_candidates:
                             msg = ("A host state with uuid = '%s' that did "
-                                   "not have a matching allocation_request "
+                                   "not have any remaining allocation_request "
                                    "was encountered while scheduling. This "
                                    "host was skipped.")
-                            LOG.debug(msg, alt_uuid)
+                            LOG.debug(msg, host.uuid)
                             continue
 
                         # TODO(jaypipes): Loop through all allocation_requests
@@ -588,7 +647,13 @@ def _get_alternate_hosts(
                         # the future based on information in the provider
                         # summaries, we'll just try to claim resources using
                         # the first allocation_request
-                        alloc_req = alloc_reqs_by_rp_uuid[alt_uuid][0]
+                        # NOTE(gibi): we are using, and re-using, allocation
+                        # candidates for alternatives here. This is OK as
+                        # these candidates are not yet allocated in placement
+                        # and we don't know if an alternate will ever be used.
+                        # To increase our success we could try to use different
+                        # candidate for different alternative though.
+                        alloc_req = host.allocation_candidates[0]
                         alt_selection = objects.Selection.from_host_state(
                             host, alloc_req, allocation_request_version)
                     else:
