Skip to content

Commit 38ac797

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "Fix failed count for anti-affinity check" into stable/2023.1
2 parents 5fda99a + 1b56714 commit 38ac797

File tree

5 files changed

+265
-14
lines changed

5 files changed

+265
-14
lines changed

nova/compute/build_results.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,11 @@
2424
ACTIVE = 'active' # Instance is running
2525
FAILED = 'failed' # Instance failed to build and was not rescheduled
2626
RESCHEDULED = 'rescheduled' # Instance failed to build, but was rescheduled
27+
# Instance failed by policy violation (such as affinity or anti-affinity)
28+
# and was not rescheduled. In this case, the node's failed count won't be
29+
# increased.
30+
FAILED_BY_POLICY = 'failed_by_policy'
31+
# Instance failed by policy violation (such as affinity or anti-affinity)
32+
# but was rescheduled. In this case, the node's failed count won't be
33+
# increased.
34+
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

nova/compute/manager.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1891,11 +1891,8 @@ def _do_validation(context, instance, group):
18911891
else:
18921892
max_server = 1
18931893
if len(members_on_host) >= max_server:
1894-
msg = _("Anti-affinity instance group policy "
1895-
"was violated.")
1896-
raise exception.RescheduledException(
1897-
instance_uuid=instance.uuid,
1898-
reason=msg)
1894+
raise exception.GroupAffinityViolation(
1895+
instance_uuid=instance.uuid, policy='Anti-affinity')
18991896

19001897
# NOTE(ganso): The check for affinity below does not work and it
19011898
# can easily be violated because the lock happens in different
@@ -1905,10 +1902,8 @@ def _do_validation(context, instance, group):
19051902
elif group.policy and 'affinity' == group.policy:
19061903
group_hosts = group.get_hosts(exclude=[instance.uuid])
19071904
if group_hosts and self.host not in group_hosts:
1908-
msg = _("Affinity instance group policy was violated.")
1909-
raise exception.RescheduledException(
1910-
instance_uuid=instance.uuid,
1911-
reason=msg)
1905+
raise exception.GroupAffinityViolation(
1906+
instance_uuid=instance.uuid, policy='Affinity')
19121907

19131908
_do_validation(context, instance, group)
19141909

@@ -2348,6 +2343,9 @@ def _locked_do_build_and_run_instance(*args, **kwargs):
23482343
self.reportclient.delete_allocation_for_instance(
23492344
context, instance.uuid, force=True)
23502345

2346+
if result in (build_results.FAILED_BY_POLICY,
2347+
build_results.RESCHEDULED_BY_POLICY):
2348+
return
23512349
if result in (build_results.FAILED,
23522350
build_results.RESCHEDULED):
23532351
self._build_failed(node)
@@ -2446,6 +2444,8 @@ def _do_build_and_run_instance(self, context, instance, image,
24462444
self._nil_out_instance_obj_host_and_node(instance)
24472445
self._set_instance_obj_error_state(instance,
24482446
clean_task_state=True)
2447+
if isinstance(e, exception.RescheduledByPolicyException):
2448+
return build_results.FAILED_BY_POLICY
24492449
return build_results.FAILED
24502450
LOG.debug(e.format_message(), instance=instance)
24512451
# This will be used for logging the exception
@@ -2472,6 +2472,10 @@ def _do_build_and_run_instance(self, context, instance, image,
24722472
injected_files, requested_networks, security_groups,
24732473
block_device_mapping, request_spec=request_spec,
24742474
host_lists=[host_list])
2475+
2476+
if isinstance(e, exception.RescheduledByPolicyException):
2477+
return build_results.RESCHEDULED_BY_POLICY
2478+
24752479
return build_results.RESCHEDULED
24762480
except (exception.InstanceNotFound,
24772481
exception.UnexpectedDeletingTaskStateError):
@@ -2691,6 +2695,17 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
26912695
bdms=block_device_mapping)
26922696
raise exception.BuildAbortException(instance_uuid=instance.uuid,
26932697
reason=e.format_message())
2698+
except exception.GroupAffinityViolation as e:
2699+
LOG.exception('Failed to build and run instance',
2700+
instance=instance)
2701+
self._notify_about_instance_usage(context, instance,
2702+
'create.error', fault=e)
2703+
compute_utils.notify_about_instance_create(
2704+
context, instance, self.host,
2705+
phase=fields.NotificationPhase.ERROR, exception=e,
2706+
bdms=block_device_mapping)
2707+
raise exception.RescheduledByPolicyException(
2708+
instance_uuid=instance.uuid, reason=str(e))
26942709
except Exception as e:
26952710
LOG.exception('Failed to build and run instance',
26962711
instance=instance)

nova/exception.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,6 +1502,15 @@ class RescheduledException(NovaException):
15021502
"%(reason)s")
15031503

15041504

1505+
class RescheduledByPolicyException(RescheduledException):
1506+
msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
1507+
"%(reason)s")
1508+
1509+
1510+
class GroupAffinityViolation(NovaException):
1511+
msg_fmt = _("%(policy)s instance group policy was violated")
1512+
1513+
15051514
class InstanceFaultRollback(NovaException):
15061515
def __init__(self, inner_exception=None):
15071516
message = _("Instance rollback performed due to: %s")

nova/tests/functional/test_server_group.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from nova.compute import instance_actions
2121
from nova import context
2222
from nova.db.main import api as db
23+
from nova import objects
2324
from nova import test
2425
from nova.tests import fixtures as nova_fixtures
2526
from nova.tests.functional.api import client
@@ -499,6 +500,85 @@ def test_soft_affinity_not_supported(self):
499500
self.assertIn('Invalid input', ex.response.text)
500501
self.assertIn('soft-affinity', ex.response.text)
501502

503+
@mock.patch('nova.scheduler.filters.affinity_filter.'
504+
'ServerGroupAffinityFilter.host_passes', return_value=True)
505+
def test_failed_count_with_affinity_violation(self, mock_host_passes):
506+
"""Check failed count not incremented after violation of the late
507+
affinity check. https://bugs.launchpad.net/nova/+bug/1996732
508+
"""
509+
510+
created_group = self.api.post_server_groups(self.affinity)
511+
flavor = self.api.get_flavors()[2]
512+
513+
# Ensure the first instance is on compute1
514+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
515+
compute2_service_id = self.admin_api.get_services(
516+
host=self.compute2.host, binary='nova-compute')[0]['id']
517+
self.admin_api.put_service(compute2_service_id,
518+
{'status': 'disabled'})
519+
520+
self._boot_a_server_to_group(created_group, flavor=flavor)
521+
522+
# Ensure the second instance is on compute2
523+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
524+
self.admin_api.put_service(compute2_service_id,
525+
{'status': 'enabled'})
526+
compute1_service_id = self.admin_api.get_services(
527+
host=self.compute.host, binary='nova-compute')[0]['id']
528+
self.admin_api.put_service(compute1_service_id,
529+
{'status': 'disabled'})
530+
531+
# Expects GroupAffinityViolation exception
532+
failed_server = self._boot_a_server_to_group(created_group,
533+
flavor=flavor,
534+
expected_status='ERROR')
535+
536+
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
537+
'hosts available for retrying build failures for '
538+
'instance %s.' % failed_server['id'],
539+
failed_server['fault']['message'])
540+
541+
ctxt = context.get_admin_context()
542+
computes = objects.ComputeNodeList.get_all(ctxt)
543+
544+
for node in computes:
545+
self.assertEqual(node.stats.get('failed_builds'), '0')
546+
547+
@mock.patch('nova.scheduler.filters.affinity_filter.'
548+
'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
549+
def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
550+
"""Check failed count after violation of the late affinity check.
551+
https://bugs.launchpad.net/nova/+bug/1996732
552+
"""
553+
554+
created_group = self.api.post_server_groups(self.anti_affinity)
555+
flavor = self.api.get_flavors()[2]
556+
557+
# Ensure two instances are scheduled on the same host
558+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
559+
compute2_service_id = self.admin_api.get_services(
560+
host=self.compute2.host, binary='nova-compute')[0]['id']
561+
self.admin_api.put_service(compute2_service_id,
562+
{'status': 'disabled'})
563+
564+
self._boot_a_server_to_group(created_group, flavor=flavor)
565+
566+
# Expects GroupAffinityViolation exception
567+
failed_server = self._boot_a_server_to_group(created_group,
568+
flavor=flavor,
569+
expected_status='ERROR')
570+
571+
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
572+
'hosts available for retrying build failures for '
573+
'instance %s.' % failed_server['id'],
574+
failed_server['fault']['message'])
575+
576+
ctxt = context.get_admin_context()
577+
computes = objects.ComputeNodeList.get_all(ctxt)
578+
579+
for node in computes:
580+
self.assertEqual(node.stats.get('failed_builds'), '0')
581+
502582

503583
class ServerGroupAffinityConfTest(ServerGroupTestBase):
504584
api_major_version = 'v2.1'

0 commit comments

Comments
 (0)