Skip to content

Commit 2f1d657

Browse files
yusuke-okada authored and gibizer committed
Fix failed count for anti-affinity check
The late anti-affinity check runs in the compute manager to avoid parallel scheduling requests to invalidate the anti-affinity server group policy. When the check fails the instance is re-scheduled. However this failure counted as a real instance boot failure of the compute host and can lead to de-prioritization of the compute host in the scheduler via BuildFailureWeigher. As the late anti-affinity check does not indicate any fault of the compute host itself it should not be counted towards the build failure counter. This patch adds new build results to handle this case. Closes-Bug: #1996732 Change-Id: I2ba035c09ace20e9835d9d12a5c5bee17d616718 Signed-off-by: Yusuke Okada <[email protected]> (cherry picked from commit 56d320a) (cherry picked from commit 1b56714)
1 parent 77db642 commit 2f1d657

File tree

5 files changed

+265
-14
lines changed

5 files changed

+265
-14
lines changed

nova/compute/build_results.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,11 @@
2424
ACTIVE = 'active' # Instance is running
2525
FAILED = 'failed' # Instance failed to build and was not rescheduled
2626
RESCHEDULED = 'rescheduled' # Instance failed to build, but was rescheduled
27+
# Instance failed by policy violation (such as affinity or anti-affinity)
28+
# and was not rescheduled. In this case, the node's failed count won't be
29+
# increased.
30+
FAILED_BY_POLICY = 'failed_by_policy'
31+
# Instance failed by policy violation (such as affinity or anti-affinity)
32+
# but was rescheduled. In this case, the node's failed count won't be
33+
# increased.
34+
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

nova/compute/manager.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1803,11 +1803,8 @@ def _do_validation(context, instance, group):
18031803
else:
18041804
max_server = 1
18051805
if len(members_on_host) >= max_server:
1806-
msg = _("Anti-affinity instance group policy "
1807-
"was violated.")
1808-
raise exception.RescheduledException(
1809-
instance_uuid=instance.uuid,
1810-
reason=msg)
1806+
raise exception.GroupAffinityViolation(
1807+
instance_uuid=instance.uuid, policy='Anti-affinity')
18111808

18121809
# NOTE(ganso): The check for affinity below does not work and it
18131810
# can easily be violated because the lock happens in different
@@ -1817,10 +1814,8 @@ def _do_validation(context, instance, group):
18171814
elif group.policy and 'affinity' == group.policy:
18181815
group_hosts = group.get_hosts(exclude=[instance.uuid])
18191816
if group_hosts and self.host not in group_hosts:
1820-
msg = _("Affinity instance group policy was violated.")
1821-
raise exception.RescheduledException(
1822-
instance_uuid=instance.uuid,
1823-
reason=msg)
1817+
raise exception.GroupAffinityViolation(
1818+
instance_uuid=instance.uuid, policy='Affinity')
18241819

18251820
_do_validation(context, instance, group)
18261821

@@ -2260,6 +2255,9 @@ def _locked_do_build_and_run_instance(*args, **kwargs):
22602255
self.reportclient.delete_allocation_for_instance(
22612256
context, instance.uuid, force=True)
22622257

2258+
if result in (build_results.FAILED_BY_POLICY,
2259+
build_results.RESCHEDULED_BY_POLICY):
2260+
return
22632261
if result in (build_results.FAILED,
22642262
build_results.RESCHEDULED):
22652263
self._build_failed(node)
@@ -2358,6 +2356,8 @@ def _do_build_and_run_instance(self, context, instance, image,
23582356
self._nil_out_instance_obj_host_and_node(instance)
23592357
self._set_instance_obj_error_state(instance,
23602358
clean_task_state=True)
2359+
if isinstance(e, exception.RescheduledByPolicyException):
2360+
return build_results.FAILED_BY_POLICY
23612361
return build_results.FAILED
23622362
LOG.debug(e.format_message(), instance=instance)
23632363
# This will be used for logging the exception
@@ -2384,6 +2384,10 @@ def _do_build_and_run_instance(self, context, instance, image,
23842384
injected_files, requested_networks, security_groups,
23852385
block_device_mapping, request_spec=request_spec,
23862386
host_lists=[host_list])
2387+
2388+
if isinstance(e, exception.RescheduledByPolicyException):
2389+
return build_results.RESCHEDULED_BY_POLICY
2390+
23872391
return build_results.RESCHEDULED
23882392
except (exception.InstanceNotFound,
23892393
exception.UnexpectedDeletingTaskStateError):
@@ -2601,6 +2605,17 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
26012605
bdms=block_device_mapping)
26022606
raise exception.BuildAbortException(instance_uuid=instance.uuid,
26032607
reason=e.format_message())
2608+
except exception.GroupAffinityViolation as e:
2609+
LOG.exception('Failed to build and run instance',
2610+
instance=instance)
2611+
self._notify_about_instance_usage(context, instance,
2612+
'create.error', fault=e)
2613+
compute_utils.notify_about_instance_create(
2614+
context, instance, self.host,
2615+
phase=fields.NotificationPhase.ERROR, exception=e,
2616+
bdms=block_device_mapping)
2617+
raise exception.RescheduledByPolicyException(
2618+
instance_uuid=instance.uuid, reason=str(e))
26042619
except Exception as e:
26052620
LOG.exception('Failed to build and run instance',
26062621
instance=instance)

nova/exception.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1487,6 +1487,15 @@ class RescheduledException(NovaException):
14871487
"%(reason)s")
14881488

14891489

1490+
class RescheduledByPolicyException(RescheduledException):
1491+
msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
1492+
"%(reason)s")
1493+
1494+
1495+
class GroupAffinityViolation(NovaException):
1496+
msg_fmt = _("%(policy)s instance group policy was violated")
1497+
1498+
14901499
class InstanceFaultRollback(NovaException):
14911500
def __init__(self, inner_exception=None):
14921501
message = _("Instance rollback performed due to: %s")

nova/tests/functional/test_server_group.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from nova.compute import instance_actions
2121
from nova import context
2222
from nova.db.main import api as db
23+
from nova import objects
2324
from nova import test
2425
from nova.tests import fixtures as nova_fixtures
2526
from nova.tests.functional.api import client
@@ -495,6 +496,85 @@ def test_soft_affinity_not_supported(self):
495496
self.assertIn('Invalid input', ex.response.text)
496497
self.assertIn('soft-affinity', ex.response.text)
497498

499+
@mock.patch('nova.scheduler.filters.affinity_filter.'
500+
'ServerGroupAffinityFilter.host_passes', return_value=True)
501+
def test_failed_count_with_affinity_violation(self, mock_host_passes):
502+
"""Check failed count not incremented after violation of the late
503+
affinity check. https://bugs.launchpad.net/nova/+bug/1996732
504+
"""
505+
506+
created_group = self.api.post_server_groups(self.affinity)
507+
flavor = self.api.get_flavors()[2]
508+
509+
# Ensure the first instance is on compute1
510+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
511+
compute2_service_id = self.admin_api.get_services(
512+
host=self.compute2.host, binary='nova-compute')[0]['id']
513+
self.admin_api.put_service(compute2_service_id,
514+
{'status': 'disabled'})
515+
516+
self._boot_a_server_to_group(created_group, flavor=flavor)
517+
518+
# Ensure the second instance is on compute2
519+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
520+
self.admin_api.put_service(compute2_service_id,
521+
{'status': 'enabled'})
522+
compute1_service_id = self.admin_api.get_services(
523+
host=self.compute.host, binary='nova-compute')[0]['id']
524+
self.admin_api.put_service(compute1_service_id,
525+
{'status': 'disabled'})
526+
527+
# Expects GroupAffinityViolation exception
528+
failed_server = self._boot_a_server_to_group(created_group,
529+
flavor=flavor,
530+
expected_status='ERROR')
531+
532+
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
533+
'hosts available for retrying build failures for '
534+
'instance %s.' % failed_server['id'],
535+
failed_server['fault']['message'])
536+
537+
ctxt = context.get_admin_context()
538+
computes = objects.ComputeNodeList.get_all(ctxt)
539+
540+
for node in computes:
541+
self.assertEqual(node.stats.get('failed_builds'), '0')
542+
543+
@mock.patch('nova.scheduler.filters.affinity_filter.'
544+
'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
545+
def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
546+
"""Check failed count after violation of the late affinity check.
547+
https://bugs.launchpad.net/nova/+bug/1996732
548+
"""
549+
550+
created_group = self.api.post_server_groups(self.anti_affinity)
551+
flavor = self.api.get_flavors()[2]
552+
553+
# Ensure two instances are scheduled on the same host
554+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
555+
compute2_service_id = self.admin_api.get_services(
556+
host=self.compute2.host, binary='nova-compute')[0]['id']
557+
self.admin_api.put_service(compute2_service_id,
558+
{'status': 'disabled'})
559+
560+
self._boot_a_server_to_group(created_group, flavor=flavor)
561+
562+
# Expects GroupAffinityViolation exception
563+
failed_server = self._boot_a_server_to_group(created_group,
564+
flavor=flavor,
565+
expected_status='ERROR')
566+
567+
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
568+
'hosts available for retrying build failures for '
569+
'instance %s.' % failed_server['id'],
570+
failed_server['fault']['message'])
571+
572+
ctxt = context.get_admin_context()
573+
computes = objects.ComputeNodeList.get_all(ctxt)
574+
575+
for node in computes:
576+
self.assertEqual(node.stats.get('failed_builds'), '0')
577+
498578

499579
class ServerGroupAffinityConfTest(ServerGroupTestBase):
500580
api_major_version = 'v2.1'

0 commit comments

Comments (0)