Skip to content

Commit ad15e18

Browse files
Zuul authored and openstack-gerrit committed
Merge "Fix failed count for anti-affinity check" into stable/zed
2 parents db0f754 + 2f1d657 commit ad15e18

File tree

5 files changed

+265
-14
lines changed

5 files changed

+265
-14
lines changed

nova/compute/build_results.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,3 +24,11 @@
2424
ACTIVE = 'active' # Instance is running
2525
FAILED = 'failed' # Instance failed to build and was not rescheduled
2626
RESCHEDULED = 'rescheduled' # Instance failed to build, but was rescheduled
27+
# Instance failed by policy violation (such as affinity or anti-affinity)
28+
# and was not rescheduled. In this case, the node's failed count won't be
29+
# increased.
30+
FAILED_BY_POLICY = 'failed_by_policy'
31+
# Instance failed by policy violation (such as affinity or anti-affinity)
32+
# but was rescheduled. In this case, the node's failed count won't be
33+
# increased.
34+
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

nova/compute/manager.py

Lines changed: 24 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1803,11 +1803,8 @@ def _do_validation(context, instance, group):
18031803
else:
18041804
max_server = 1
18051805
if len(members_on_host) >= max_server:
1806-
msg = _("Anti-affinity instance group policy "
1807-
"was violated.")
1808-
raise exception.RescheduledException(
1809-
instance_uuid=instance.uuid,
1810-
reason=msg)
1806+
raise exception.GroupAffinityViolation(
1807+
instance_uuid=instance.uuid, policy='Anti-affinity')
18111808

18121809
# NOTE(ganso): The check for affinity below does not work and it
18131810
# can easily be violated because the lock happens in different
@@ -1817,10 +1814,8 @@ def _do_validation(context, instance, group):
18171814
elif group.policy and 'affinity' == group.policy:
18181815
group_hosts = group.get_hosts(exclude=[instance.uuid])
18191816
if group_hosts and self.host not in group_hosts:
1820-
msg = _("Affinity instance group policy was violated.")
1821-
raise exception.RescheduledException(
1822-
instance_uuid=instance.uuid,
1823-
reason=msg)
1817+
raise exception.GroupAffinityViolation(
1818+
instance_uuid=instance.uuid, policy='Affinity')
18241819

18251820
_do_validation(context, instance, group)
18261821

@@ -2260,6 +2255,9 @@ def _locked_do_build_and_run_instance(*args, **kwargs):
22602255
self.reportclient.delete_allocation_for_instance(
22612256
context, instance.uuid, force=True)
22622257

2258+
if result in (build_results.FAILED_BY_POLICY,
2259+
build_results.RESCHEDULED_BY_POLICY):
2260+
return
22632261
if result in (build_results.FAILED,
22642262
build_results.RESCHEDULED):
22652263
self._build_failed(node)
@@ -2358,6 +2356,8 @@ def _do_build_and_run_instance(self, context, instance, image,
23582356
self._nil_out_instance_obj_host_and_node(instance)
23592357
self._set_instance_obj_error_state(instance,
23602358
clean_task_state=True)
2359+
if isinstance(e, exception.RescheduledByPolicyException):
2360+
return build_results.FAILED_BY_POLICY
23612361
return build_results.FAILED
23622362
LOG.debug(e.format_message(), instance=instance)
23632363
# This will be used for logging the exception
@@ -2384,6 +2384,10 @@ def _do_build_and_run_instance(self, context, instance, image,
23842384
injected_files, requested_networks, security_groups,
23852385
block_device_mapping, request_spec=request_spec,
23862386
host_lists=[host_list])
2387+
2388+
if isinstance(e, exception.RescheduledByPolicyException):
2389+
return build_results.RESCHEDULED_BY_POLICY
2390+
23872391
return build_results.RESCHEDULED
23882392
except (exception.InstanceNotFound,
23892393
exception.UnexpectedDeletingTaskStateError):
@@ -2601,6 +2605,17 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
26012605
bdms=block_device_mapping)
26022606
raise exception.BuildAbortException(instance_uuid=instance.uuid,
26032607
reason=e.format_message())
2608+
except exception.GroupAffinityViolation as e:
2609+
LOG.exception('Failed to build and run instance',
2610+
instance=instance)
2611+
self._notify_about_instance_usage(context, instance,
2612+
'create.error', fault=e)
2613+
compute_utils.notify_about_instance_create(
2614+
context, instance, self.host,
2615+
phase=fields.NotificationPhase.ERROR, exception=e,
2616+
bdms=block_device_mapping)
2617+
raise exception.RescheduledByPolicyException(
2618+
instance_uuid=instance.uuid, reason=str(e))
26042619
except Exception as e:
26052620
LOG.exception('Failed to build and run instance',
26062621
instance=instance)

nova/exception.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1487,6 +1487,15 @@ class RescheduledException(NovaException):
14871487
"%(reason)s")
14881488

14891489

1490+
class RescheduledByPolicyException(RescheduledException):
1491+
msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
1492+
"%(reason)s")
1493+
1494+
1495+
class GroupAffinityViolation(NovaException):
1496+
msg_fmt = _("%(policy)s instance group policy was violated")
1497+
1498+
14901499
class InstanceFaultRollback(NovaException):
14911500
def __init__(self, inner_exception=None):
14921501
message = _("Instance rollback performed due to: %s")

nova/tests/functional/test_server_group.py

Lines changed: 80 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
from nova.compute import instance_actions
2121
from nova import context
2222
from nova.db.main import api as db
23+
from nova import objects
2324
from nova import test
2425
from nova.tests import fixtures as nova_fixtures
2526
from nova.tests.functional.api import client
@@ -495,6 +496,85 @@ def test_soft_affinity_not_supported(self):
495496
self.assertIn('Invalid input', ex.response.text)
496497
self.assertIn('soft-affinity', ex.response.text)
497498

499+
@mock.patch('nova.scheduler.filters.affinity_filter.'
500+
'ServerGroupAffinityFilter.host_passes', return_value=True)
501+
def test_failed_count_with_affinity_violation(self, mock_host_passes):
502+
"""Check failed count not incremented after violation of the late
503+
affinity check. https://bugs.launchpad.net/nova/+bug/1996732
504+
"""
505+
506+
created_group = self.api.post_server_groups(self.affinity)
507+
flavor = self.api.get_flavors()[2]
508+
509+
# Ensure the first instance is on compute1
510+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
511+
compute2_service_id = self.admin_api.get_services(
512+
host=self.compute2.host, binary='nova-compute')[0]['id']
513+
self.admin_api.put_service(compute2_service_id,
514+
{'status': 'disabled'})
515+
516+
self._boot_a_server_to_group(created_group, flavor=flavor)
517+
518+
# Ensure the second instance is on compute2
519+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
520+
self.admin_api.put_service(compute2_service_id,
521+
{'status': 'enabled'})
522+
compute1_service_id = self.admin_api.get_services(
523+
host=self.compute.host, binary='nova-compute')[0]['id']
524+
self.admin_api.put_service(compute1_service_id,
525+
{'status': 'disabled'})
526+
527+
# Expects GroupAffinityViolation exception
528+
failed_server = self._boot_a_server_to_group(created_group,
529+
flavor=flavor,
530+
expected_status='ERROR')
531+
532+
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
533+
'hosts available for retrying build failures for '
534+
'instance %s.' % failed_server['id'],
535+
failed_server['fault']['message'])
536+
537+
ctxt = context.get_admin_context()
538+
computes = objects.ComputeNodeList.get_all(ctxt)
539+
540+
for node in computes:
541+
self.assertEqual(node.stats.get('failed_builds'), '0')
542+
543+
@mock.patch('nova.scheduler.filters.affinity_filter.'
544+
'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
545+
def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
546+
"""Check failed count after violation of the late affinity check.
547+
https://bugs.launchpad.net/nova/+bug/1996732
548+
"""
549+
550+
created_group = self.api.post_server_groups(self.anti_affinity)
551+
flavor = self.api.get_flavors()[2]
552+
553+
# Ensure two instances are scheduled on the same host
554+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
555+
compute2_service_id = self.admin_api.get_services(
556+
host=self.compute2.host, binary='nova-compute')[0]['id']
557+
self.admin_api.put_service(compute2_service_id,
558+
{'status': 'disabled'})
559+
560+
self._boot_a_server_to_group(created_group, flavor=flavor)
561+
562+
# Expects GroupAffinityViolation exception
563+
failed_server = self._boot_a_server_to_group(created_group,
564+
flavor=flavor,
565+
expected_status='ERROR')
566+
567+
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
568+
'hosts available for retrying build failures for '
569+
'instance %s.' % failed_server['id'],
570+
failed_server['fault']['message'])
571+
572+
ctxt = context.get_admin_context()
573+
computes = objects.ComputeNodeList.get_all(ctxt)
574+
575+
for node in computes:
576+
self.assertEqual(node.stats.get('failed_builds'), '0')
577+
498578

499579
class ServerGroupAffinityConfTest(ServerGroupTestBase):
500580
api_major_version = 'v2.1'

0 commit comments

Comments
 (0)