Skip to content

Commit 1b56714

Browse files
yusuke-okada authored and gibizer committed
Fix failed count for anti-affinity check
The late anti-affinity check runs in the compute manager to prevent parallel scheduling requests from invalidating the anti-affinity server group policy. When the check fails, the instance is re-scheduled. However, this failure is counted as a real instance boot failure of the compute host and can lead to de-prioritization of the compute host in the scheduler via BuildFailureWeigher. As the late anti-affinity check does not indicate any fault of the compute host itself, it should not be counted towards the build failure counter. This patch adds new build results to handle this case. Closes-Bug: #1996732 Change-Id: I2ba035c09ace20e9835d9d12a5c5bee17d616718 Signed-off-by: Yusuke Okada <[email protected]> (cherry picked from commit 56d320a)
1 parent b9089ac commit 1b56714

File tree

5 files changed

+265
-14
lines changed

5 files changed

+265
-14
lines changed

nova/compute/build_results.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,11 @@
2424
ACTIVE = 'active' # Instance is running
2525
FAILED = 'failed' # Instance failed to build and was not rescheduled
2626
RESCHEDULED = 'rescheduled' # Instance failed to build, but was rescheduled
27+
# Instance failed by policy violation (such as affinity or anti-affinity)
28+
# and was not rescheduled. In this case, the node's failed count won't be
29+
# increased.
30+
FAILED_BY_POLICY = 'failed_by_policy'
31+
# Instance failed by policy violation (such as affinity or anti-affinity)
32+
# but was rescheduled. In this case, the node's failed count won't be
33+
# increased.
34+
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

nova/compute/manager.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1891,11 +1891,8 @@ def _do_validation(context, instance, group):
18911891
else:
18921892
max_server = 1
18931893
if len(members_on_host) >= max_server:
1894-
msg = _("Anti-affinity instance group policy "
1895-
"was violated.")
1896-
raise exception.RescheduledException(
1897-
instance_uuid=instance.uuid,
1898-
reason=msg)
1894+
raise exception.GroupAffinityViolation(
1895+
instance_uuid=instance.uuid, policy='Anti-affinity')
18991896

19001897
# NOTE(ganso): The check for affinity below does not work and it
19011898
# can easily be violated because the lock happens in different
@@ -1905,10 +1902,8 @@ def _do_validation(context, instance, group):
19051902
elif group.policy and 'affinity' == group.policy:
19061903
group_hosts = group.get_hosts(exclude=[instance.uuid])
19071904
if group_hosts and self.host not in group_hosts:
1908-
msg = _("Affinity instance group policy was violated.")
1909-
raise exception.RescheduledException(
1910-
instance_uuid=instance.uuid,
1911-
reason=msg)
1905+
raise exception.GroupAffinityViolation(
1906+
instance_uuid=instance.uuid, policy='Affinity')
19121907

19131908
_do_validation(context, instance, group)
19141909

@@ -2348,6 +2343,9 @@ def _locked_do_build_and_run_instance(*args, **kwargs):
23482343
self.reportclient.delete_allocation_for_instance(
23492344
context, instance.uuid, force=True)
23502345

2346+
if result in (build_results.FAILED_BY_POLICY,
2347+
build_results.RESCHEDULED_BY_POLICY):
2348+
return
23512349
if result in (build_results.FAILED,
23522350
build_results.RESCHEDULED):
23532351
self._build_failed(node)
@@ -2446,6 +2444,8 @@ def _do_build_and_run_instance(self, context, instance, image,
24462444
self._nil_out_instance_obj_host_and_node(instance)
24472445
self._set_instance_obj_error_state(instance,
24482446
clean_task_state=True)
2447+
if isinstance(e, exception.RescheduledByPolicyException):
2448+
return build_results.FAILED_BY_POLICY
24492449
return build_results.FAILED
24502450
LOG.debug(e.format_message(), instance=instance)
24512451
# This will be used for logging the exception
@@ -2472,6 +2472,10 @@ def _do_build_and_run_instance(self, context, instance, image,
24722472
injected_files, requested_networks, security_groups,
24732473
block_device_mapping, request_spec=request_spec,
24742474
host_lists=[host_list])
2475+
2476+
if isinstance(e, exception.RescheduledByPolicyException):
2477+
return build_results.RESCHEDULED_BY_POLICY
2478+
24752479
return build_results.RESCHEDULED
24762480
except (exception.InstanceNotFound,
24772481
exception.UnexpectedDeletingTaskStateError):
@@ -2691,6 +2695,17 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
26912695
bdms=block_device_mapping)
26922696
raise exception.BuildAbortException(instance_uuid=instance.uuid,
26932697
reason=e.format_message())
2698+
except exception.GroupAffinityViolation as e:
2699+
LOG.exception('Failed to build and run instance',
2700+
instance=instance)
2701+
self._notify_about_instance_usage(context, instance,
2702+
'create.error', fault=e)
2703+
compute_utils.notify_about_instance_create(
2704+
context, instance, self.host,
2705+
phase=fields.NotificationPhase.ERROR, exception=e,
2706+
bdms=block_device_mapping)
2707+
raise exception.RescheduledByPolicyException(
2708+
instance_uuid=instance.uuid, reason=str(e))
26942709
except Exception as e:
26952710
LOG.exception('Failed to build and run instance',
26962711
instance=instance)

nova/exception.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,6 +1502,15 @@ class RescheduledException(NovaException):
15021502
"%(reason)s")
15031503

15041504

1505+
class RescheduledByPolicyException(RescheduledException):
1506+
msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
1507+
"%(reason)s")
1508+
1509+
1510+
class GroupAffinityViolation(NovaException):
1511+
msg_fmt = _("%(policy)s instance group policy was violated")
1512+
1513+
15051514
class InstanceFaultRollback(NovaException):
15061515
def __init__(self, inner_exception=None):
15071516
message = _("Instance rollback performed due to: %s")

nova/tests/functional/test_server_group.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from nova.compute import instance_actions
2121
from nova import context
2222
from nova.db.main import api as db
23+
from nova import objects
2324
from nova import test
2425
from nova.tests import fixtures as nova_fixtures
2526
from nova.tests.functional.api import client
@@ -499,6 +500,85 @@ def test_soft_affinity_not_supported(self):
499500
self.assertIn('Invalid input', ex.response.text)
500501
self.assertIn('soft-affinity', ex.response.text)
501502

503+
@mock.patch('nova.scheduler.filters.affinity_filter.'
504+
'ServerGroupAffinityFilter.host_passes', return_value=True)
505+
def test_failed_count_with_affinity_violation(self, mock_host_passes):
506+
"""Check failed count not incremented after violation of the late
507+
affinity check. https://bugs.launchpad.net/nova/+bug/1996732
508+
"""
509+
510+
created_group = self.api.post_server_groups(self.affinity)
511+
flavor = self.api.get_flavors()[2]
512+
513+
# Ensure the first instance is on compute1
514+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
515+
compute2_service_id = self.admin_api.get_services(
516+
host=self.compute2.host, binary='nova-compute')[0]['id']
517+
self.admin_api.put_service(compute2_service_id,
518+
{'status': 'disabled'})
519+
520+
self._boot_a_server_to_group(created_group, flavor=flavor)
521+
522+
# Ensure the second instance is on compute2
523+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
524+
self.admin_api.put_service(compute2_service_id,
525+
{'status': 'enabled'})
526+
compute1_service_id = self.admin_api.get_services(
527+
host=self.compute.host, binary='nova-compute')[0]['id']
528+
self.admin_api.put_service(compute1_service_id,
529+
{'status': 'disabled'})
530+
531+
# Expects GroupAffinityViolation exception
532+
failed_server = self._boot_a_server_to_group(created_group,
533+
flavor=flavor,
534+
expected_status='ERROR')
535+
536+
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
537+
'hosts available for retrying build failures for '
538+
'instance %s.' % failed_server['id'],
539+
failed_server['fault']['message'])
540+
541+
ctxt = context.get_admin_context()
542+
computes = objects.ComputeNodeList.get_all(ctxt)
543+
544+
for node in computes:
545+
self.assertEqual(node.stats.get('failed_builds'), '0')
546+
547+
@mock.patch('nova.scheduler.filters.affinity_filter.'
548+
'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
549+
def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
550+
"""Check failed count after violation of the late affinity check.
551+
https://bugs.launchpad.net/nova/+bug/1996732
552+
"""
553+
554+
created_group = self.api.post_server_groups(self.anti_affinity)
555+
flavor = self.api.get_flavors()[2]
556+
557+
# Ensure two instances are scheduled on the same host
558+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
559+
compute2_service_id = self.admin_api.get_services(
560+
host=self.compute2.host, binary='nova-compute')[0]['id']
561+
self.admin_api.put_service(compute2_service_id,
562+
{'status': 'disabled'})
563+
564+
self._boot_a_server_to_group(created_group, flavor=flavor)
565+
566+
# Expects GroupAffinityViolation exception
567+
failed_server = self._boot_a_server_to_group(created_group,
568+
flavor=flavor,
569+
expected_status='ERROR')
570+
571+
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
572+
'hosts available for retrying build failures for '
573+
'instance %s.' % failed_server['id'],
574+
failed_server['fault']['message'])
575+
576+
ctxt = context.get_admin_context()
577+
computes = objects.ComputeNodeList.get_all(ctxt)
578+
579+
for node in computes:
580+
self.assertEqual(node.stats.get('failed_builds'), '0')
581+
502582

503583
class ServerGroupAffinityConfTest(ServerGroupTestBase):
504584
api_major_version = 'v2.1'

0 commit comments

Comments (0)