Commit c67e69c

Enforce quota usage from placement when unshelving
When [quota]count_usage_from_placement = true or
[quota]driver = nova.quota.UnifiedLimitsDriver, cores and ram quota usage
are counted from placement. When an instance is SHELVED_OFFLOADED, it will
not have allocations in placement, so its cores and ram should not count
against quota during that time.

This means, however, that when an instance is unshelved, there is a
possibility of going over quota if the cores and ram it needs were allocated
by some other instance(s) while it was SHELVED_OFFLOADED.

This fixes a bug where quota was not being properly enforced during unshelve
of a SHELVED_OFFLOADED instance when quota usage is counted from placement.

Test coverage is also added for the "recheck" quota cases.

Closes-Bug: #2003991

Change-Id: I4ab97626c10052c7af9934a80ff8db9ddab82738
(cherry picked from commit 6f79d63)
1 parent 004a773
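The scenario the commit message describes can be seen with a small standalone
sketch. This is illustrative only (toy numbers and plain dicts, not nova
code): when cores and ram usage is counted from placement, a
SHELVED_OFFLOADED instance holds no allocations, so a sibling instance
created in the meantime can leave no headroom for the unshelve.

    # Toy illustration, not nova code. Quota: 8 cores, 16384 MB ram.
    quota = {'cores': 8, 'ram': 16384}

    # Usage counted from placement while a 4-core/8192 MB instance is
    # SHELVED_OFFLOADED (it has no allocations) and a 6-core/8192 MB
    # instance was created in the meantime:
    usage = {'cores': 6, 'ram': 8192}

    # Unshelving the offloaded instance would re-create its allocations:
    delta = {'cores': 4, 'ram': 8192}

    over_quota = any(usage[r] + delta[r] > quota[r] for r in quota)
    print(over_quota)  # True: 6 + 4 cores > 8, so the unshelve must be rejected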

File tree: 7 files changed, +265 −28 lines changed


nova/api/openstack/compute/shelve.py

Lines changed: 3 additions & 1 deletion
@@ -81,7 +81,7 @@ def _shelve_offload(self, req, id, body):
            raise exc.HTTPBadRequest(explanation=e.format_message())

    @wsgi.response(202)
-    @wsgi.expected_errors((400, 404, 409))
+    @wsgi.expected_errors((400, 403, 404, 409))
    @wsgi.action('unshelve')
    # In microversion 2.77 we support specifying 'availability_zone' to
    # unshelve a server. But before 2.77 there is no request body
@@ -142,3 +142,5 @@ def _unshelve(self, req, id, body):
            exception.ComputeHostNotFound,
        ) as e:
            raise exc.HTTPBadRequest(explanation=e.format_message())
+        except exception.OverQuota as e:
+            raise exc.HTTPForbidden(explanation=e.format_message())
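With this change the unshelve action can now return 403 instead of always
202 when the project would exceed its quota. A minimal client-side sketch of
handling that (illustrative only; the endpoint URL, token handling, and the
error message are assumptions, not part of this patch):

    import requests

    def unshelve(compute_url, token, server_id):
        # POST the pre-2.77 style unshelve action body.
        resp = requests.post(
            f"{compute_url}/servers/{server_id}/action",
            headers={"X-Auth-Token": token, "Content-Type": "application/json"},
            json={"unshelve": None},
        )
        if resp.status_code == 202:
            return  # unshelve accepted
        if resp.status_code == 403:
            # exception.OverQuota is now mapped to HTTPForbidden by this patch.
            raise RuntimeError("unshelve rejected: over quota")
        resp.raise_for_status()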

nova/compute/api.py

Lines changed: 46 additions & 0 deletions
@@ -60,6 +60,7 @@
from nova.image import glance
from nova.limit import local as local_limit
from nova.limit import placement as placement_limits
+from nova.limit import utils as limit_utils
from nova.network import constants
from nova.network import model as network_model
from nova.network import neutron
@@ -4522,6 +4523,42 @@ def _validate_unshelve_az(self, context, instance, availability_zone):
                       "vol_zone": volume['availability_zone']}
                raise exception.MismatchVolumeAZException(reason=msg)

+    @staticmethod
+    def _check_quota_unshelve_offloaded(
+        context: nova_context.RequestContext,
+        instance: 'objects.Instance',
+        request_spec: 'objects.RequestSpec'
+    ):
+        if not (CONF.quota.count_usage_from_placement or
+                limit_utils.use_unified_limits()):
+            return
+        # TODO(melwitt): This is ugly but we have to do it this way because
+        # instances quota is currently counted from the API database but cores
+        # and ram are counted from placement. That means while an instance is
+        # SHELVED_OFFLOADED, it will still consume instances quota but it will
+        # not consume cores and ram. So we need an instances delta of
+        # 0 but cores and ram deltas from the flavor.
+        # Once instances usage is also being counted from placement, we can
+        # replace this method with a normal check_num_instances_quota() call.
+        vcpus = instance.flavor.vcpus
+        memory_mb = instance.flavor.memory_mb
+        # We are not looking to create a new server, we are unshelving an
+        # existing one.
+        deltas = {'instances': 0, 'cores': vcpus, 'ram': memory_mb}
+
+        objects.Quotas.check_deltas(
+            context,
+            deltas,
+            context.project_id,
+            user_id=context.user_id,
+            check_project_id=instance.project_id,
+            check_user_id=instance.user_id,
+        )
+        # Do the same for unified limits.
+        placement_limits.enforce_num_instances_and_flavor(
+            context, context.project_id, instance.flavor, request_spec.is_bfv,
+            0, 0, delta_updates={'servers': 0})
+
    @block_extended_resource_request
    @check_instance_lock
    @check_instance_state(
@@ -4550,6 +4587,15 @@ def unshelve(
        request_spec = objects.RequestSpec.get_by_instance_uuid(
            context, instance.uuid)

+        # Check quota before we save any changes to the database, but only if
+        # we are counting quota usage from placement. When an instance is
+        # SHELVED_OFFLOADED, it will not consume cores or ram resources in
+        # placement. This means it is possible that an unshelve would cause the
+        # project/user to go over quota.
+        if instance.vm_state == vm_states.SHELVED_OFFLOADED:
+            self._check_quota_unshelve_offloaded(
+                context, instance, request_spec)
+
        # We need to check a list of preconditions and validate inputs first

        # Ensure instance is shelve offloaded
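For a concrete sense of what the new helper checks: with a hypothetical
2 vCPU / 4096 MB flavor (values made up for illustration), the deltas passed
to objects.Quotas.check_deltas would be:

    deltas = {'instances': 0, 'cores': 2, 'ram': 4096}

The 'instances' delta is 0 because a SHELVED_OFFLOADED instance already
counts against the instances quota in the API database; only cores and ram
need to be re-checked against placement-counted usage.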

nova/conductor/manager.py

Lines changed: 66 additions & 18 deletions
@@ -20,6 +20,7 @@
import eventlet
import functools
import sys
+import typing as ty

from keystoneauth1 import exceptions as ks_exc
from oslo_config import cfg
@@ -48,6 +49,7 @@
from nova.i18n import _
from nova.image import glance
from nova.limit import placement as placement_limits
+from nova.limit import utils as limit_utils
from nova import manager
from nova.network import neutron
from nova import notifications
@@ -970,6 +972,33 @@ def _restrict_request_spec_to_cell(context, instance, request_spec):
                objects.Destination(
                    cell=instance_mapping.cell_mapping))

+    def _recheck_quota(
+        self,
+        context: nova_context.RequestContext,
+        flavor: 'objects.Flavor',
+        request_spec: 'objects.RequestSpec',
+        orig_num_req: int,
+        project_id: ty.Optional[str] = None,
+        user_id: ty.Optional[str] = None
+    ) -> None:
+        # A quota "recheck" is a quota check that is performed *after* quota
+        # limited resources are consumed. It is meant to address race
+        # conditions where a request that was not over quota at the beginning
+        # of the request before resources are allocated becomes over quota
+        # after resources (like database rows or placement allocations) are
+        # created. An example of this would be a large number of requests for
+        # the same resource for the same project sent simultaneously.
+        if CONF.quota.recheck_quota:
+            # The orig_num_req is the number of instances requested, which is
+            # the delta that was quota checked before resources were allocated.
+            # This is only used for the exception message if the recheck fails
+            # for lack of enough quota.
+            compute_utils.check_num_instances_quota(
+                context, flavor, 0, 0, project_id=project_id,
+                user_id=user_id, orig_num_req=orig_num_req)
+            placement_limits.enforce_num_instances_and_flavor(
+                context, project_id, flavor, request_spec.is_bfv, 0, 0)
+
    # TODO(mriedem): Make request_spec required in ComputeTaskAPI RPC v2.0.
    @targets_cell
    def unshelve_instance(self, context, instance, request_spec=None):
@@ -1055,6 +1084,30 @@ def safe_image_show(ctx, image_id):
                host_lists = self._schedule_instances(context,
                        request_spec, [instance.uuid],
                        return_alternates=False)
+
+                # NOTE(melwitt): We recheck the quota after allocating the
+                # resources in placement, to prevent users from allocating
+                # more resources than their allowed quota in the event of a
+                # race. This is configurable because it can be expensive if
+                # strict quota limits are not required in a deployment.
+                try:
+                    # Quota should be checked for unshelve only if
+                    # resources are being counted in placement. Legacy
+                    # quotas continue to consume resources while
+                    # SHELVED_OFFLOADED and will not allocate any new
+                    # resources during unshelve.
+                    if (CONF.quota.count_usage_from_placement or
+                            limit_utils.use_unified_limits()):
+                        self._recheck_quota(
+                            context, instance.flavor, request_spec, 0,
+                            project_id=instance.project_id,
+                            user_id=instance.user_id)
+                except (exception.TooManyInstances,
+                        limit_exceptions.ProjectOverLimit):
+                    with excutils.save_and_reraise_exception():
+                        self.report_client.delete_allocation_for_instance(
+                            context, instance.uuid, force=True)
+
                host_list = host_lists[0]
                selection = host_list[0]
                scheduler_utils.populate_filter_properties(
@@ -1677,27 +1730,22 @@ def schedule_and_build_instances(self, context, build_requests,
            instances.append(instance)
            cell_mapping_cache[instance.uuid] = cell

-            # NOTE(melwitt): We recheck the quota after creating the
-            # objects to prevent users from allocating more resources
+            # NOTE(melwitt): We recheck the quota after allocating the
+            # resources to prevent users from allocating more resources
            # than their allowed quota in the event of a race. This is
            # configurable because it can be expensive if strict quota
            # limits are not required in a deployment.
-            if CONF.quota.recheck_quota:
-                try:
-                    compute_utils.check_num_instances_quota(
-                        context, instance.flavor, 0, 0,
-                        orig_num_req=len(build_requests))
-                    placement_limits.enforce_num_instances_and_flavor(
-                        context, context.project_id, instance.flavor,
-                        request_specs[0].is_bfv, 0, 0)
-                except (exception.TooManyInstances,
-                        limit_exceptions.ProjectOverLimit) as exc:
-                    with excutils.save_and_reraise_exception():
-                        self._cleanup_build_artifacts(context, exc, instances,
-                                                      build_requests,
-                                                      request_specs,
-                                                      block_device_mapping, tags,
-                                                      cell_mapping_cache)
+            try:
+                self._recheck_quota(context, instance.flavor, request_specs[0],
+                    len(build_requests), project_id=instance.project_id,
+                    user_id=instance.user_id
+                )
+            except (exception.TooManyInstances,
+                    limit_exceptions.ProjectOverLimit) as exc:
+                with excutils.save_and_reraise_exception():
+                    self._cleanup_build_artifacts(
+                        context, exc, instances, build_requests, request_specs,
+                        block_device_mapping, tags, cell_mapping_cache)

        zipped = zip(build_requests, request_specs, host_lists, instances)
        for (build_request, request_spec, host_list, instance) in zipped:
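The "recheck" pattern that _recheck_quota factors out can be summarized with
a standalone sketch (illustrative only; the in-memory store below is a
stand-in for placement and the quota engine, not nova code):

    class OverQuota(Exception):
        pass

    class Store:
        """Toy allocation store standing in for placement."""
        def __init__(self):
            self.allocations = {}  # allocation id -> cores

        def usage(self):
            return sum(self.allocations.values())

        def allocate(self, alloc_id, cores):
            self.allocations[alloc_id] = cores

        def release(self, alloc_id):
            self.allocations.pop(alloc_id, None)

    def provision(store, alloc_id, cores, limit):
        # Initial check before any resources are consumed.
        if store.usage() + cores > limit:
            raise OverQuota('initial check failed')
        store.allocate(alloc_id, cores)  # resources are now consumed
        # Recheck with a delta of 0: the new allocation is already included
        # in usage, so we only verify the total is still within the limit.
        if store.usage() > limit:
            # Roll back, analogous to delete_allocation_for_instance().
            store.release(alloc_id)
            raise OverQuota('recheck failed after allocation')

The deltas of 0 passed to compute_utils.check_num_instances_quota() and
enforce_num_instances_and_flavor() play exactly this role: the real
consumption is already reflected in the counted usage.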

nova/limit/placement.py

Lines changed: 6 additions & 2 deletions
@@ -156,7 +156,8 @@ def enforce_num_instances_and_flavor(
    is_bfvm: bool,
    min_count: int,
    max_count: int,
-    enforcer: ty.Optional[limit.Enforcer] = None
+    enforcer: ty.Optional[limit.Enforcer] = None,
+    delta_updates: ty.Optional[ty.Dict[str, int]] = None,
) -> int:
    """Return max instances possible, else raise TooManyInstances exception."""
    if not limit_utils.use_unified_limits():
@@ -169,7 +170,10 @@ def enforce_num_instances_and_flavor(
        raise ValueError("invalid max_count")

    deltas = _get_deltas_by_flavor(flavor, is_bfvm, max_count)
-    enforcer = _get_enforcer(context, project_id)
+    if delta_updates:
+        deltas.update(delta_updates)
+
+    enforcer = enforcer or _get_enforcer(context, project_id)
    try:
        enforcer.enforce(project_id, deltas)
    except limit_exceptions.ProjectOverLimit as e:
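The new delta_updates parameter lets a caller override individual resource
deltas after they are derived from the flavor. The unshelve path in
nova/compute/api.py above uses it to zero out the servers delta while still
enforcing cores and ram:

    # From the compute API change above: re-check an existing (offloaded)
    # server, so no additional 'servers' delta is counted.
    placement_limits.enforce_num_instances_and_flavor(
        context, context.project_id, instance.flavor, request_spec.is_bfv,
        0, 0, delta_updates={'servers': 0})

A caller-supplied enforcer is now also honored (enforcer or
_get_enforcer(...)) instead of being unconditionally recreated.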

nova/tests/functional/test_servers.py

Lines changed: 4 additions & 7 deletions
@@ -414,15 +414,12 @@ def _test_unshelve_offloaded_overquota_placement(self):
        # quota.
        self._create_server(flavor_id=1)

-        # FIXME(melwitt): This is bug #2003991, the unshelve is supposed to
-        # fail if we would be over quota after unshelving.
        # Now try to unshelve the earlier instance. It should fail because it
        # would put us over quota to have 4 running instances.
-        # ex = self.assertRaises(client.OpenStackApiException,
-        #                        self._unshelve_server,
-        #                        server)
-        # self.assertEqual(403, ex.response.status_code)
-        self._unshelve_server(server)
+        ex = self.assertRaises(client.OpenStackApiException,
+                               self._unshelve_server,
+                               server)
+        self.assertEqual(403, ex.response.status_code)

    def test_unshelve_offloaded_overquota_placement(self):
        # Count quota usage from placement.