Skip to content

Commit f1ae593

Browse files
committed
Add validate_instance_group_policy to the virt driver
In VMware we need to handle race conditions when spawning k8s instances in parallel while building a new cluster. Similar to how server groups are validated prior to spawning the VM on the compute host, we add a new method on the driver, `validate_instance_group_policy`, that checks driver-specific grouping policy (in this case, the K8S shard for the instance). Change-Id: I04151875fae44b72be52127e3b160f7f95abfb9e
1 parent 28b6a65 commit f1ae593

File tree

10 files changed

+178
-66
lines changed

10 files changed

+178
-66
lines changed

nova/compute/manager.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1733,6 +1733,15 @@ def _do_validation(context, instance, group):
17331733

17341734
_do_validation(context, instance, group)
17351735

1736+
def _validate_driver_instance_group_policy(self, context, instance):
1737+
lock_id = "driver-instance-group-validation-%s" % instance.uuid
1738+
1739+
@utils.synchronized(lock_id)
1740+
def _do_validation(context, instance):
1741+
self.driver.validate_instance_group_policy(context, instance)
1742+
1743+
_do_validation(context, instance)
1744+
17361745
def _log_original_error(self, exc_info, instance_uuid):
17371746
LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
17381747
exc_info=exc_info)
@@ -2407,6 +2416,8 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
24072416
# the host is set on the instance.
24082417
self._validate_instance_group_policy(context, instance,
24092418
scheduler_hints)
2419+
self._validate_driver_instance_group_policy(context,
2420+
instance)
24102421
image_meta = objects.ImageMeta.from_dict(image)
24112422

24122423
with self._build_resources(context, instance,

nova/db/main/api.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2159,6 +2159,11 @@ def _handle_k8s_hosts_query_filters(query, filters=None):
21592159
query = query.filter(
21602160
models.Instance.availability_zone == availability_zone)
21612161

2162+
skip_instance_uuid = filters.get('skip_instance_uuid')
2163+
if skip_instance_uuid:
2164+
query = query.filter(
2165+
models.Instance.uuid != skip_instance_uuid)
2166+
21622167
return query
21632168

21642169

nova/objects/compute_node.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,17 @@ def get_by_hypervisor_type(cls, context, hv_type):
506506
return base.obj_make_list(context, cls(context), objects.ComputeNode,
507507
db_computes)
508508

509+
@base.remotable_classmethod
510+
def get_k8s_hosts_by_instances_metadata(cls, context, meta_key, meta_value,
511+
filters=None):
512+
return db.get_k8s_hosts_by_instances_metadata(
513+
context, meta_key, meta_value, filters=filters)
514+
515+
@base.remotable_classmethod
516+
def get_k8s_hosts_by_instances_tag(cls, context, tag, filters=None):
517+
return db.get_k8s_hosts_by_instances_tag(
518+
context, tag, filters=filters)
519+
509520

510521
def _get_node_empty_ratio(context, max_count):
511522
"""Query the DB for non-deleted compute_nodes with 0.0/None alloc ratios

nova/scheduler/filters/shard_filter.py

Lines changed: 7 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,17 @@
1212
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
1313
# License for the specific language governing permissions and limitations
1414
# under the License.
15-
from collections import defaultdict
16-
1715
from oslo_log import log as logging
1816

1917
import nova.conf
2018
from nova import context as nova_context
21-
from nova.db.main import api as main_db_api
22-
from nova import exception
23-
from nova.objects.aggregate import AggregateList
2419
from nova.objects.build_request import BuildRequest
2520
from nova.objects.instance import Instance
2621
from nova.scheduler import filters
2722
from nova.scheduler.mixins import ProjectTagMixin
2823
from nova.scheduler import utils
2924
from nova import utils as nova_utils
25+
from nova.virt.vmwareapi import shard_util
3026

3127
LOG = logging.getLogger(__name__)
3228

@@ -97,63 +93,13 @@ def _get_metadata():
9793
"for the uuid %s", spec_obj.instance_uuid)
9894
return
9995

100-
kks_tag = next((t.tag for t in _get_tags()
101-
if t.tag.startswith(KKS_PREFIX)), None)
102-
gardener_meta = None
103-
if not kks_tag:
104-
gardener_meta = \
105-
{k: v for k, v in _get_metadata().items()
106-
if k.startswith(GARDENER_PREFIX)}
107-
108-
if not kks_tag and not gardener_meta:
109-
return None
110-
111-
q_filters = {'hv_type': VMWARE_HV_TYPE}
112-
if spec_obj.availability_zone:
113-
q_filters['availability_zone'] = spec_obj.availability_zone
114-
115-
results = None
116-
if kks_tag:
117-
results = nova_context.scatter_gather_skip_cell0(
118-
elevated, main_db_api.get_k8s_hosts_by_instances_tag,
119-
kks_tag, filters=q_filters)
120-
elif gardener_meta:
121-
(meta_key, meta_value) = next(iter(gardener_meta.items()))
122-
results = nova_context.scatter_gather_skip_cell0(
123-
elevated, main_db_api.get_k8s_hosts_by_instances_metadata,
124-
meta_key, meta_value, filters=q_filters)
125-
126-
if not results:
127-
return None
128-
129-
# hosts with count of instances from this K8S cluster
130-
# {host: <count>}
131-
k8s_hosts = defaultdict(lambda: 0)
132-
133-
for cell_uuid, cell_result in results.items():
134-
if nova_context.is_cell_failure_sentinel(cell_result):
135-
raise exception.NovaException(
136-
"Unable to schedule the K8S instance because "
137-
"cell %s is not responding." % cell_uuid)
138-
cell_hosts = dict(cell_result)
139-
for h, c in cell_hosts.items():
140-
k8s_hosts[h] += c
96+
k8s_shard_aggrs = shard_util.get_sorted_k8s_shard_aggregates(
97+
elevated, _get_metadata(), _get_tags(), spec_obj.availability_zone)
14198

142-
if not k8s_hosts:
99+
if not k8s_shard_aggrs:
143100
return None
144101

145-
all_shard_aggrs = [agg for agg in AggregateList.get_all(elevated)
146-
if agg.name.startswith(self._SHARD_PREFIX)]
147-
if not all_shard_aggrs:
148-
return None
149-
150-
shard_aggr = sorted(
151-
all_shard_aggrs,
152-
reverse=True,
153-
key=lambda aggr: sum(i for h, i in k8s_hosts.items()
154-
if h in aggr.hosts))[0]
155-
156-
return shard_aggr.name
102+
return k8s_shard_aggrs[0].name
157103

158104
def filter_all(self, filter_obj_list, spec_obj):
159105
# Only VMware
@@ -213,8 +159,8 @@ def _host_passes(self, host_state, spec_obj, k8s_shard):
213159

214160
if k8s_shard:
215161
if k8s_shard not in host_shard_names:
216-
LOG.debug("%(host_state)s is not part of the requested "
217-
"K8S cluster shard '%(k8s_shard)s'",
162+
LOG.debug("%(host_state)s is not part of the K8S "
163+
"cluster's shard '%(k8s_shard)s'",
218164
{'host_state': host_state,
219165
'k8s_shard': k8s_shard})
220166
return False

nova/tests/unit/objects/test_objects.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1055,7 +1055,7 @@ def obj_name(cls):
10551055
'CellMapping': '1.1-5d652928000a5bc369d79d5bde7e497d',
10561056
'CellMappingList': '1.1-496ef79bb2ab41041fff8bcb57996352',
10571057
'ComputeNode': '1.19-af6bd29a6c3b225da436a0d8487096f2',
1058-
'ComputeNodeList': '1.17-52f3b0962b1c86b98590144463ebb192',
1058+
'ComputeNodeList': '1.17-bb54e3fd5415be274c5515577acafe3d',
10591059
'ConsoleAuthToken': '1.1-8da320fb065080eb4d3c2e5c59f8bf52',
10601060
'CpuDiagnostics': '1.0-d256f2e442d1b837735fd17dfe8e3d47',
10611061
'Destination': '1.4-3b440d29459e2c98987ad5b25ad1cb2c',

nova/tests/unit/scheduler/filters/test_shard_filter.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
import mock
1818

19-
from nova.db.main import api as main_db_api
2019
from nova import objects
2120
from nova.scheduler.filters import shard_filter
2221
from nova import test
@@ -84,6 +83,7 @@ def test_shard_baremetal_passes(self, agg_mock, get_by_uuid):
8483
spec_obj = objects.RequestSpec(
8584
context=mock.sentinel.ctx, project_id='foo',
8685
instance_uuid=self.fake_build_req.instance_uuid,
86+
availability_zone=None,
8787
flavor=fake_flavor.fake_flavor_obj(
8888
mock.sentinel.ctx, expected_attrs=['extra_specs'],
8989
extra_specs=extra_specs))
@@ -102,6 +102,7 @@ def test_shard_project_not_found(self, agg_mock, mock_update_cache,
102102
spec_obj = objects.RequestSpec(
103103
context=mock.sentinel.ctx, project_id='bar',
104104
instance_uuid=self.fake_build_req.instance_uuid,
105+
availability_zone=None,
105106
flavor=fake_flavor.fake_flavor_obj(
106107
mock.sentinel.ctx, expected_attrs=['extra_specs']))
107108
self._assert_passes(host, spec_obj, False)
@@ -116,6 +117,7 @@ def test_shard_project_no_shards(self, agg_mock, get_by_uuid):
116117
spec_obj = objects.RequestSpec(
117118
context=mock.sentinel.ctx, project_id='foo',
118119
instance_uuid=self.fake_build_req.instance_uuid,
120+
availability_zone=None,
119121
flavor=fake_flavor.fake_flavor_obj(
120122
mock.sentinel.ctx, expected_attrs=['extra_specs']))
121123

@@ -130,6 +132,7 @@ def test_shard_host_no_shard_aggregate(self, agg_mock, get_by_uuid):
130132
spec_obj = objects.RequestSpec(
131133
context=mock.sentinel.ctx, project_id='foo',
132134
instance_uuid=self.fake_build_req.instance_uuid,
135+
availability_zone=None,
133136
flavor=fake_flavor.fake_flavor_obj(
134137
mock.sentinel.ctx, expected_attrs=['extra_specs']))
135138

@@ -144,6 +147,7 @@ def test_shard_host_no_shards_in_aggregate(self, get_by_uuid):
144147
spec_obj = objects.RequestSpec(
145148
context=mock.sentinel.ctx, project_id='foo',
146149
instance_uuid=self.fake_build_req.instance_uuid,
150+
availability_zone=None,
147151
flavor=fake_flavor.fake_flavor_obj(
148152
mock.sentinel.ctx, expected_attrs=['extra_specs']))
149153

@@ -158,6 +162,7 @@ def test_shard_project_shard_match_host_shard(self, get_by_uuid):
158162
spec_obj = objects.RequestSpec(
159163
context=mock.sentinel.ctx, project_id='foo',
160164
instance_uuid=self.fake_build_req.instance_uuid,
165+
availability_zone=None,
161166
flavor=fake_flavor.fake_flavor_obj(
162167
mock.sentinel.ctx, expected_attrs=['extra_specs']))
163168

@@ -172,6 +177,7 @@ def test_shard_project_shard_do_not_match_host_shard(self, get_by_uuid):
172177
spec_obj = objects.RequestSpec(
173178
context=mock.sentinel.ctx, project_id='foo',
174179
instance_uuid=self.fake_build_req.instance_uuid,
180+
availability_zone=None,
175181
flavor=fake_flavor.fake_flavor_obj(
176182
mock.sentinel.ctx, expected_attrs=['extra_specs']))
177183

@@ -186,6 +192,7 @@ def test_shard_project_has_multiple_shards_per_az(self, get_by_uuid):
186192
spec_obj = objects.RequestSpec(
187193
context=mock.sentinel.ctx, project_id='foo',
188194
instance_uuid=self.fake_build_req.instance_uuid,
195+
availability_zone=None,
189196
flavor=fake_flavor.fake_flavor_obj(
190197
mock.sentinel.ctx, expected_attrs=['extra_specs']))
191198

@@ -205,6 +212,7 @@ def test_shard_project_has_multiple_shards_per_az_resize_same_shard(
205212
spec_obj = objects.RequestSpec(
206213
context=mock.sentinel.ctx, project_id='foo',
207214
instance_uuid=self.fake_build_req.instance_uuid,
215+
availability_zone=None,
208216
flavor=fake_flavor.fake_flavor_obj(
209217
mock.sentinel.ctx, expected_attrs=['extra_specs']),
210218
scheduler_hints=dict(_nova_check_type=['resize'],
@@ -227,6 +235,7 @@ def test_shard_project_has_multiple_shards_per_az_resize_other_shard(
227235
flavor=fake_flavor.fake_flavor_obj(
228236
mock.sentinel.ctx, expected_attrs=['extra_specs']),
229237
instance_uuid=self.fake_build_req.instance_uuid,
238+
availability_zone=None,
230239
scheduler_hints=dict(_nova_check_type=['resize'],
231240
source_host=['host2']))
232241

@@ -245,6 +254,7 @@ def test_shard_project_has_sharding_enabled_any_host_passes(
245254
spec_obj = objects.RequestSpec(
246255
context=mock.sentinel.ctx, project_id='baz',
247256
instance_uuid=self.fake_build_req.instance_uuid,
257+
availability_zone=None,
248258
flavor=fake_flavor.fake_flavor_obj(
249259
mock.sentinel.ctx, expected_attrs=['extra_specs']))
250260
self._assert_passes(host, spec_obj, True)
@@ -261,6 +271,7 @@ def test_shard_project_has_sharding_enabled_and_single_shards(
261271
spec_obj = objects.RequestSpec(
262272
context=mock.sentinel.ctx, project_id='baz',
263273
instance_uuid=self.fake_build_req.instance_uuid,
274+
availability_zone=None,
264275
flavor=fake_flavor.fake_flavor_obj(
265276
mock.sentinel.ctx, expected_attrs=['extra_specs']))
266277
self._assert_passes(host, spec_obj, True)
@@ -287,7 +298,7 @@ def test_same_shard_for_kubernikus_cluster(self, get_context,
287298

288299
gather_host.assert_called_once_with(
289300
get_context.return_value,
290-
main_db_api.get_k8s_hosts_by_instances_tag,
301+
objects.ComputeNodeList.get_k8s_hosts_by_instances_tag,
291302
'kubernikus:kluster-example',
292303
filters={'hv_type': 'VMware vCenter Server',
293304
'availability_zone': 'az-2'})
@@ -321,7 +332,7 @@ def test_same_shard_for_gardener_cluster(self, get_context,
321332

322333
gather_host.assert_called_once_with(
323334
get_context.return_value,
324-
main_db_api.get_k8s_hosts_by_instances_metadata,
335+
objects.ComputeNodeList.get_k8s_hosts_by_instances_metadata,
325336
gardener_cluster, '1',
326337
filters={'hv_type': 'VMware vCenter Server',
327338
'availability_zone': 'az-2'})
@@ -351,7 +362,7 @@ def test_same_shard_for_nonbuild_requests(self, get_context,
351362

352363
gather_host.assert_called_once_with(
353364
get_context.return_value,
354-
main_db_api.get_k8s_hosts_by_instances_metadata,
365+
objects.ComputeNodeList.get_k8s_hosts_by_instances_metadata,
355366
gardener_cluster, '1',
356367
filters={'hv_type': 'VMware vCenter Server',
357368
'availability_zone': 'az-2'})
@@ -467,6 +478,7 @@ def test_log_level_for_missing_vc_aggregate(self, agg_mock, log_mock,
467478
spec_obj = objects.RequestSpec(
468479
context=mock.sentinel.ctx, project_id='foo',
469480
instance_uuid=self.fake_build_req.instance_uuid,
481+
availability_zone=None,
470482
flavor=fake_flavor.fake_flavor_obj(
471483
mock.sentinel.ctx, expected_attrs=['extra_specs']))
472484

nova/virt/driver.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1807,6 +1807,14 @@ def in_cluster_vmotion(self, context, instance, host_moref_value):
18071807
"""
18081808
raise NotImplementedError()
18091809

1810+
def validate_instance_group_policy(self, context, instance):
1811+
"""Validates that the instance meets driver-specific grouping policy
1812+
1813+
The driver can raise exception.RescheduledException to reject and
1814+
trigger rescheduling of the instance to a different host.
1815+
"""
1816+
pass
1817+
18101818

18111819
def load_compute_driver(virtapi, compute_driver=None):
18121820
"""Load a compute driver module.

nova/virt/vmwareapi/driver.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1319,3 +1319,6 @@ def in_cluster_vmotion(self, context, instance, host_moref_value):
13191319
vim_util.get_moref_value(current_host_ref),
13201320
vim_util.get_moref_value(host_ref),
13211321
instance=instance)
1322+
1323+
def validate_instance_group_policy(self, context, instance):
1324+
self._vmops._check_k8s_shard(instance)

0 commit comments

Comments
 (0)