
Commit 3d818c3

Make allocation candidates available for scheduler filters
This patch extends the HostState object with an allocation_candidates list populated by the scheduler manager, and changes the generic scheduler logic to allocate one of the candidates stored in the host state of the selected host. After this patch, scheduler filters can be extended to filter the allocation_candidates list of the HostState object while processing a host, restricting which candidate can be allocated if the host passes all the filters. Multiple consecutive filters can potentially remove every candidate, making the host a non-viable scheduling target.

blueprint: pci-device-tracking-in-placement
Change-Id: Id0afff271d345a94aa83fc886e9c3231c3ff2570
1 parent e96601c commit 3d818c3
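
To illustrate what this enables, here is a minimal sketch (not part of this commit) of a host filter that prunes the new HostState.allocation_candidates list. The filter name and the pruning rule are made up for illustration; the assumed candidate shape ("allocations" keyed by resource provider UUID) follows the placement allocation request format.

from nova.scheduler import filters


class ExampleCandidateFilter(filters.BaseHostFilter):
    """Hypothetical filter that removes unwanted allocation candidates."""

    # Candidates are not attached during rebuild, so this filter must not
    # run there (see the comment added to BaseHostFilter below).
    RUN_ON_REBUILD = False

    def host_passes(self, host_state, spec_obj):
        def _acceptable(candidate):
            # Made-up rule: keep candidates that allocate from at most
            # three resource providers.
            return len(candidate.get('allocations', {})) <= 3

        host_state.allocation_candidates = [
            c for c in host_state.allocation_candidates if _acceptable(c)]

        # If every candidate was removed, the host is no longer a viable
        # scheduling target for this request.
        return bool(host_state.allocation_candidates)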

File tree: 4 files changed, +872 -128 lines


nova/scheduler/filters/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -28,6 +28,9 @@ class BaseHostFilter(filters.BaseFilter):
     # other parameters. We care about running policy filters (i.e.
     # ImagePropertiesFilter) but not things that check usage on the
     # existing compute node, etc.
+    # This also means that filters marked with RUN_ON_REBUILD = True cannot
+    # filter on allocation candidates or need to handle the rebuild case
+    # specially.
     RUN_ON_REBUILD = False

     def _filter_one(self, obj, spec):
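
A hedged sketch of what the new comment implies for a filter that keeps RUN_ON_REBUILD = True: during rebuild no candidates are attached to the HostState, so such a filter either has to ignore allocation candidates entirely or special-case the empty list. The filter name and logic below are hypothetical.

from nova.scheduler import filters


class ExampleRebuildAwareFilter(filters.BaseHostFilter):
    """Hypothetical filter that still runs on rebuild."""

    RUN_ON_REBUILD = True

    def host_passes(self, host_state, spec_obj):
        if not host_state.allocation_candidates:
            # Rebuild case: no candidates were attached by the scheduler
            # manager, so decide without them (here: simply pass the host).
            # Note: a prior filter that emptied the list should also have
            # returned False, so an empty list here normally means rebuild.
            return True
        # Boot/move case: candidate-based filtering is possible.
        host_state.allocation_candidates = [
            c for c in host_state.allocation_candidates
            if 'allocations' in c]  # made-up predicate
        return bool(host_state.allocation_candidates)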

nova/scheduler/host_manager.py

Lines changed: 17 additions & 7 deletions

@@ -153,6 +153,8 @@ def __init__(self, host, node, cell_uuid):

         self.updated = None

+        self.allocation_candidates = []
+
     def update(self, compute=None, service=None, aggregates=None,
                inst_dict=None):
         """Update all information about a host."""
@@ -314,13 +316,21 @@ def _locked_consume_from_request(self, spec_obj):
         self.num_io_ops += 1

     def __repr__(self):
-        return ("(%(host)s, %(node)s) ram: %(free_ram)sMB "
-                "disk: %(free_disk)sMB io_ops: %(num_io_ops)s "
-                "instances: %(num_instances)s" %
-                {'host': self.host, 'node': self.nodename,
-                 'free_ram': self.free_ram_mb, 'free_disk': self.free_disk_mb,
-                 'num_io_ops': self.num_io_ops,
-                 'num_instances': self.num_instances})
+        return (
+            "(%(host)s, %(node)s) ram: %(free_ram)sMB "
+            "disk: %(free_disk)sMB io_ops: %(num_io_ops)s "
+            "instances: %(num_instances)s, "
+            "allocation_candidates: %(num_a_c)s"
+            % {
+                "host": self.host,
+                "node": self.nodename,
+                "free_ram": self.free_ram_mb,
+                "free_disk": self.free_disk_mb,
+                "num_io_ops": self.num_io_ops,
+                "num_instances": self.num_instances,
+                "num_a_c": len(self.allocation_candidates),
+            }
+        )


 class HostManager(object):

nova/scheduler/manager.py

Lines changed: 85 additions & 20 deletions

@@ -20,6 +20,7 @@
 """

 import collections
+import copy
 import random

 from oslo_log import log as logging
@@ -299,12 +300,29 @@ def _schedule(
         # host, we virtually consume resources on it so subsequent
         # selections can adjust accordingly.

+        def hosts_with_alloc_reqs(hosts_gen):
+            """Extend the HostState objects returned by the generator with
+            the allocation requests of that host.
+            """
+            for host in hosts_gen:
+                host.allocation_candidates = copy.deepcopy(
+                    alloc_reqs_by_rp_uuid[host.uuid])
+                yield host
+
         # Note: remember, we are using a generator-iterator here. So only
         # traverse this list once. This can bite you if the hosts
         # are being scanned in a filter or weighing function.
         hosts = self._get_all_host_states(
             elevated, spec_obj, provider_summaries)

+        # alloc_reqs_by_rp_uuid is None during rebuild, so this means we
+        # cannot run filters that use allocation candidates during rebuild.
+        if alloc_reqs_by_rp_uuid is not None:
+            # Wrap the generator to extend the HostState objects with the
+            # allocation requests for that given host. This is needed to
+            # support scheduler filters filtering on allocation candidates.
+            hosts = hosts_with_alloc_reqs(hosts)
+
         # NOTE(sbauza): The RequestSpec.num_instances field contains the number
         # of instances created when the RequestSpec was used to first boot some
         # instances. This is incorrect when doing a move or resize operation,
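
A self-contained toy version of the wrapping done above, useful to see why the deepcopy matters: every HostState gets its own copy of the candidates, so a filter that prunes one host's list cannot corrupt the shared alloc_reqs_by_rp_uuid map or another host's candidates. FakeHostState and the data below are stand-ins, not nova objects.

import copy


class FakeHostState:
    def __init__(self, uuid):
        self.uuid = uuid
        self.allocation_candidates = []


def hosts_with_alloc_reqs(hosts_gen, alloc_reqs_by_rp_uuid):
    # Same shape as the nested generator in the diff above.
    for host in hosts_gen:
        host.allocation_candidates = copy.deepcopy(
            alloc_reqs_by_rp_uuid[host.uuid])
        yield host


alloc_reqs_by_rp_uuid = {
    'rp-1': [{'allocations': {'rp-1': {'resources': {'VCPU': 1}}}}],
}
hosts = hosts_with_alloc_reqs(
    iter([FakeHostState('rp-1')]), alloc_reqs_by_rp_uuid)

host = next(hosts)
host.allocation_candidates.clear()    # a filter rejects every candidate
assert alloc_reqs_by_rp_uuid['rp-1']  # the shared map is left intact
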
@@ -332,6 +350,13 @@ def _schedule(
             # the older dict format representing HostState objects.
             # TODO(stephenfin): Remove this when we bump scheduler the RPC API
             # version to 5.0
+            # NOTE(gibi): We cannot remove this branch as it is actively used
+            # when nova calls the scheduler during rebuild (not evacuate) to
+            # check if the current host is still good for the new image used
+            # for the rebuild. In this case placement cannot be used to
+            # generate candidates as that would require space on the current
+            # compute for double allocation. So no allocation candidates for
+            # rebuild and therefore alloc_reqs_by_rp_uuid is None.
             return self._legacy_find_hosts(
                 context, num_instances, spec_obj, hosts, num_alts,
                 instance_uuids=instance_uuids)
@@ -345,6 +370,9 @@ def _schedule(
         # The list of hosts that have been selected (and claimed).
         claimed_hosts = []

+        # The allocation request allocated on the given claimed host
+        claimed_alloc_reqs = []
+
         for num, instance_uuid in enumerate(instance_uuids):
             # In a multi-create request, the first request spec from the list
             # is passed to the scheduler and that request spec's instance_uuid
@@ -371,21 +399,20 @@ def _schedule(
             # resource provider UUID
             claimed_host = None
             for host in hosts:
-                cn_uuid = host.uuid
-                if cn_uuid not in alloc_reqs_by_rp_uuid:
-                    msg = ("A host state with uuid = '%s' that did not have a "
-                           "matching allocation_request was encountered while "
-                           "scheduling. This host was skipped.")
-                    LOG.debug(msg, cn_uuid)
+                if not host.allocation_candidates:
+                    LOG.debug(
+                        "The nova scheduler removed every allocation "
+                        "candidate for host %s so this host was skipped.",
+                        host
+                    )
                     continue

-                alloc_reqs = alloc_reqs_by_rp_uuid[cn_uuid]
                 # TODO(jaypipes): Loop through all allocation_requests instead
                 # of just trying the first one. For now, since we'll likely
                 # want to order the allocation_requests in the future based on
                 # information in the provider summaries, we'll just try to
                 # claim resources using the first allocation_request
-                alloc_req = alloc_reqs[0]
+                alloc_req = host.allocation_candidates[0]
                 if utils.claim_resources(
                     elevated, self.placement_client, spec_obj, instance_uuid,
                     alloc_req,
@@ -405,6 +432,15 @@ def _schedule(

             claimed_instance_uuids.append(instance_uuid)
             claimed_hosts.append(claimed_host)
+            claimed_alloc_reqs.append(alloc_req)
+
+            # update the provider mapping in the request spec based
+            # on the allocated candidate as _consume_selected_host depends
+            # on this information to temporarily consume PCI devices tracked
+            # in placement
+            for request_group in spec_obj.requested_resources:
+                request_group.provider_uuids = alloc_req[
+                    'mappings'][request_group.requester_id]

             # Now consume the resources so the filter/weights will change for
             # the next instance.
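
For context on the new loop above: an allocation candidate returned by placement (microversion 1.34 or later) carries a 'mappings' dict from request group suffix to the resource provider UUIDs that satisfy that group, which is what gets copied into request_group.provider_uuids. The UUIDs and the group suffix below are made up for illustration.

# Illustrative candidate shape; real candidates come from the placement
# GET /allocation_candidates API.
alloc_req = {
    'allocations': {
        'compute-rp-uuid': {'resources': {'VCPU': 2, 'MEMORY_MB': 2048}},
        'pci-rp-uuid': {'resources': {'CUSTOM_PCI_DEVICE': 1}},
    },
    'mappings': {
        '': ['compute-rp-uuid'],   # the unsuffixed request group
        'pci-1': ['pci-rp-uuid'],  # a made-up suffixed group
    },
}

# What the loop in the diff does for one request group:
requester_id = 'pci-1'
provider_uuids = alloc_req['mappings'][requester_id]  # ['pci-rp-uuid']
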
@@ -416,11 +452,19 @@ def _schedule(
         self._ensure_sufficient_hosts(
             context, claimed_hosts, num_instances, claimed_instance_uuids)

-        # We have selected and claimed hosts for each instance. Now we need to
-        # find alternates for each host.
+        # We have selected and claimed hosts for each instance along with a
+        # claimed allocation request. Now we need to find alternates for each
+        # host.
         return self._get_alternate_hosts(
-            claimed_hosts, spec_obj, hosts, num, num_alts,
-            alloc_reqs_by_rp_uuid, allocation_request_version)
+            claimed_hosts,
+            spec_obj,
+            hosts,
+            num,
+            num_alts,
+            alloc_reqs_by_rp_uuid,
+            allocation_request_version,
+            claimed_alloc_reqs,
+        )

     def _ensure_sufficient_hosts(
         self, context, hosts, required_count, claimed_uuids=None,
@@ -532,7 +576,21 @@ def _consume_selected_host(selected_host, spec_obj, instance_uuid=None):
     def _get_alternate_hosts(
         self, selected_hosts, spec_obj, hosts, index, num_alts,
         alloc_reqs_by_rp_uuid=None, allocation_request_version=None,
+        selected_alloc_reqs=None,
     ):
+        """Generate the main Selection and possible alternate Selection
+        objects for each "instance".
+
+        :param selected_hosts: This is a list of HostState objects. Each
+            HostState represents the main selection for a given instance being
+            scheduled (we can have multiple instances during multi create).
+        :param selected_alloc_reqs: This is a list of allocation requests that
+            are already allocated in placement for the main Selection for each
+            instance. This list matches selected_hosts by index, so for the
+            first instance the selected host is selected_hosts[0] and the
+            already allocated placement candidate is selected_alloc_reqs[0].
+        """
         # We only need to filter/weigh the hosts again if we're dealing with
         # more than one instance and are going to be picking alternates.
         if index > 0 and num_alts > 0:
@@ -546,11 +604,10 @@ def _get_alternate_hosts(
         # representing the selected host along with alternates from the same
         # cell.
         selections_to_return = []
-        for selected_host in selected_hosts:
+        for i, selected_host in enumerate(selected_hosts):
             # This is the list of hosts for one particular instance.
             if alloc_reqs_by_rp_uuid:
-                selected_alloc_req = alloc_reqs_by_rp_uuid.get(
-                    selected_host.uuid)[0]
+                selected_alloc_req = selected_alloc_reqs[i]
             else:
                 selected_alloc_req = None

@@ -571,15 +628,17 @@ def _get_alternate_hosts(
                 if len(selected_plus_alts) >= num_alts + 1:
                     break

+                # TODO(gibi): In theory we could generate alternatives on the
+                # same host if that host has different possible allocation
+                # candidates for the request. But we don't do that today.
                 if host.cell_uuid == cell_uuid and host not in selected_hosts:
                     if alloc_reqs_by_rp_uuid is not None:
-                        alt_uuid = host.uuid
-                        if alt_uuid not in alloc_reqs_by_rp_uuid:
+                        if not host.allocation_candidates:
                             msg = ("A host state with uuid = '%s' that did "
-                                   "not have a matching allocation_request "
+                                   "not have any remaining allocation_request "
                                    "was encountered while scheduling. This "
                                    "host was skipped.")
-                            LOG.debug(msg, alt_uuid)
+                            LOG.debug(msg, host.uuid)
                             continue

                         # TODO(jaypipes): Loop through all allocation_requests
@@ -588,7 +647,13 @@
                         # the future based on information in the provider
                         # summaries, we'll just try to claim resources using
                         # the first allocation_request
-                        alloc_req = alloc_reqs_by_rp_uuid[alt_uuid][0]
+                        # NOTE(gibi): we are using, and re-using, allocation
+                        # candidates for alternatives here. This is OK as
+                        # these candidates are not yet allocated in placement
+                        # and we don't know if an alternate will ever be used.
+                        # To increase our success we could try to use a
+                        # different candidate for each alternative though.
+                        alloc_req = host.allocation_candidates[0]
                         alt_selection = objects.Selection.from_host_state(
                             host, alloc_req, allocation_request_version)
                     else: