Skip to content

Commit f9be867

Browse files
authored
fix: remove resources in correct order (#414)
Signed-off-by: Dario Faccin <dario.faccin@canonical.com>
1 parent 7f2a75f commit f9be867

File tree

7 files changed

+645
-436
lines changed

7 files changed

+645
-436
lines changed

charms/kserve-controller/poetry.lock

Lines changed: 414 additions & 369 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charms/kserve-controller/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ ops = "^2.22.0"
5757
serialized-data-interface = "^0.7.0"
5858
charmed-service-mesh-helpers = "^0.3.0"
5959
lightkube-extensions = "^0.3.0"
60+
tenacity = "^9.1.2"
6061

6162
[tool.poetry.group.fmt]
6263
optional = true

charms/kserve-controller/src/charm.py

Lines changed: 81 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@
1818
from pathlib import Path
1919
from typing import Dict
2020

21+
import tenacity
2122
import yaml
2223
from charmed_kubeflow_chisme.exceptions import ErrorWithStatus, GenericCharmRuntimeError
2324
from charmed_kubeflow_chisme.kubernetes import KubernetesResourceHandler
2425
from charmed_kubeflow_chisme.lightkube.batch import delete_many
2526
from charmed_kubeflow_chisme.pebble import update_layer
2627
from charmed_kubeflow_chisme.service_mesh import generate_allow_all_authorization_policy
28+
from charmed_kubeflow_chisme.types import LightkubeResourcesList
2729
from charmed_service_mesh_helpers.interfaces import GatewayMetadataRequirer
2830
from charms.istio_beacon_k8s.v0.service_mesh import MeshType, PolicyResourceManager
2931
from charms.istio_pilot.v0.istio_gateway_info import (
@@ -40,7 +42,8 @@
4042
)
4143
from jinja2 import Template
4244
from jsonschema import ValidationError
43-
from lightkube import ApiError
45+
from lightkube import ApiError, Client
46+
from lightkube.generic_resource import load_in_cluster_generic_resources
4447
from lightkube.models.core_v1 import ServicePort
4548
from lightkube_extensions.batch import create_charm_default_labels
4649
from ops import main
@@ -104,6 +107,14 @@
104107
METRICS_PORT = 8080
105108

106109

110+
# For errors when a K8s object exists while it shouldn't
111+
class ObjectStillExistsError(Exception):
112+
"""Exception for when a K8s object exists, while it should have been removed."""
113+
114+
def __init__(self, resource_name: str):
115+
self.resource_name = resource_name
116+
117+
107118
def parse_images_config(config: str) -> Dict:
108119
"""
109120
Parse a YAML config-defined images list.
@@ -140,6 +151,7 @@ def __init__(self, *args):
140151
super().__init__(*args)
141152
self.custom_images = []
142153
self.images_context = {}
154+
self.inference_service_context = {}
143155
self._ingress_gateway_requirer = GatewayRequirer(
144156
self, relation_name=SDI_INGRESS_GATEWAY_RELATION
145157
)
@@ -169,7 +181,6 @@ def __init__(self, *args):
169181
self.framework.observe(event, self._on_event)
170182

171183
self._k8s_resource_handler = None
172-
self._crd_resource_handler = None
173184
self._cm_resource_handler = None
174185
self._cluster_runtimes_resource_handler = None
175186
self._secrets_manifests_wrapper = None
@@ -236,8 +247,7 @@ def _context(self):
236247
"no_proxy": self.model.config["no-proxy"],
237248
}
238249

239-
@property
240-
def _inference_service_context(self):
250+
def generate_inference_service_context(self):
241251
"""Context for rendering the inferenceservive-config ConfigMap."""
242252
# Ensure any input is valid for deployment mode
243253
deployment_mode = self._deployment_mode
@@ -283,7 +293,7 @@ def cm_resource_handler(self):
283293
self._cm_resource_handler = KubernetesResourceHandler(
284294
field_manager=self._lightkube_field_manager,
285295
template_files=CONFIG_FILES,
286-
context={**self._inference_service_context, **self.images_context},
296+
context={**self.inference_service_context, **self.images_context},
287297
logger=log,
288298
)
289299
return self._cm_resource_handler
@@ -298,7 +308,7 @@ def cluster_runtimes_resource_handler(self):
298308
context={**self.images_context},
299309
logger=log,
300310
)
301-
311+
load_in_cluster_generic_resources(self._cluster_runtimes_resource_handler.lightkube_client)
302312
return self._cluster_runtimes_resource_handler
303313

304314
@property
@@ -584,6 +594,7 @@ def _on_event(self, event):
584594
try:
585595
self.custom_images = parse_images_config(self.model.config["custom_images"])
586596
self.images_context = self.get_images(DEFAULT_IMAGES, self.custom_images)
597+
self.inference_service_context = self.generate_inference_service_context()
587598
self.unit.status = MaintenanceStatus("Creating k8s resources")
588599
self.reconcile_authorization_policies()
589600
self.k8s_resource_handler.apply()
@@ -659,14 +670,7 @@ def _on_event(self, event):
659670
log.error(api_err)
660671
raise
661672

662-
def _on_remove(self, event):
663-
try:
664-
self.custom_images = parse_images_config(self.model.config["custom_images"])
665-
self.images_context = self.get_images(DEFAULT_IMAGES, self.custom_images)
666-
except ErrorWithStatus as err:
667-
self.model.unit.status = err.status
668-
log.error(f"Failed to handle {event} with error: {err}")
669-
return
673+
def _on_remove(self, _):
670674
self.unit.status = MaintenanceStatus("Removing k8s resources")
671675

672676
# remove AuthorizationPolicies
@@ -675,20 +679,63 @@ def _on_remove(self, event):
675679
handlers = [
676680
self.k8s_resource_handler,
677681
self.cm_resource_handler,
678-
self.cluster_runtimes_resource_handler,
679682
]
680-
681683
try:
684+
runtimes_manifests = self.cluster_runtimes_resource_handler.render_manifests()
685+
delete_many(
686+
self.cluster_runtimes_resource_handler.lightkube_client,
687+
runtimes_manifests,
688+
)
689+
for runtime_name, runtime_kind in _extract_runtimes_names(runtimes_manifests).items():
690+
self.ensure_resource_is_deleted(
691+
client=self.cluster_runtimes_resource_handler.lightkube_client,
692+
resource_kind=runtime_kind,
693+
resource_name=runtime_name,
694+
)
682695
for handler in handlers:
683696
delete_many(
684697
handler.lightkube_client,
685698
handler.render_manifests(),
686699
)
687700
except ApiError as e:
688-
log.warning(f"Failed to delete resources, with error: {e}")
701+
if e.status.code != 404:
702+
log.warning(f"Failed to delete resources, with error: {e}")
703+
raise e
704+
except ObjectStillExistsError as e:
705+
log.warning(
706+
"Failed to remove resource: %s. Manual intervention for cleanup might be required",
707+
e.resource_name,
708+
)
689709
raise e
690710
self.unit.status = MaintenanceStatus("K8s resources removed")
691711

712+
@tenacity.retry(stop=tenacity.stop_after_delay(300), wait=tenacity.wait_fixed(5), reraise=True)
713+
def ensure_resource_is_deleted(self, client: Client, resource_kind, resource_name: str):
714+
"""Check if the CRD doesn't exist with retries.
715+
716+
The function will keep retrying until the resource is deleted, and handle the
717+
404 error once it gets deleted.
718+
719+
Args:
720+
resource_kind: The kind of the resource to be checked for deletion.
resource_name: The name of the resource to be checked for deletion.
721+
client: The lightkube client to use for talking to K8s.
722+
723+
Raises:
724+
ApiError: From lightkube, if there was an error aside from 404.
725+
ObjectStillExistsError: If the resource was not deleted after retries.
726+
"""
727+
log.info("Checking if resource exists: %s", resource_name)
728+
try:
729+
client.get(resource_kind, name=resource_name)
730+
log.info('Resource "%s" exists, retrying...', resource_name)
731+
raise ObjectStillExistsError(resource_name)
732+
except ApiError as e:
733+
if e.status.code == 404:
734+
log.info('Resource "%s" does not exist!', resource_name)
735+
return
736+
# Raise any other error
737+
raise
738+
692739
def _check_container_connection(self, container: Container) -> None:
693740
"""Check if connection can be made with container.
694741
@@ -873,5 +920,22 @@ def _restart_controller_service(self) -> None:
873920
) from err
874921

875922

923+
def _extract_runtimes_names(manifests: LightkubeResourcesList) -> dict:
924+
"""
925+
Extracts a mapping of runtime resource names to their kinds.
926+
927+
Args:
928+
manifests (LightkubeResourcesList): List of runtime manifest objects,
929+
each with metadata and kind.
930+
931+
Returns:
932+
dict: Dictionary mapping resource names (str) to their kind.
933+
"""
934+
runtimes_kind_name_mapping = {}
935+
for runtime in manifests:
936+
runtimes_kind_name_mapping.update({runtime.metadata.name: runtime.__class__})
937+
return runtimes_kind_name_mapping
938+
939+
876940
if __name__ == "__main__":
877941
main(KServeControllerCharm)

charms/kserve-controller/tests/integration/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
# ConfigMap
2121
CONFIGMAP_TEMPLATE_PATH = Path("./src/templates/configmap_manifests.yaml.j2")
2222
CONFIGMAP_DATA_INGRESS_DOMAIN = "example.com"
23-
CONFIGMAP_DATA_INGRESS_GATEWAY_NAMESPACE = "kubeflow"
2423

2524
# Lightkube resources
2625
POD_DEFAULT = lightkube.generic_resource.create_namespaced_resource(

charms/kserve-controller/tests/integration/test_charm.py

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
get_pod_names,
2424
)
2525
from lightkube import Client
26+
from lightkube.resources.apiextensions_v1 import CustomResourceDefinition
2627
from lightkube.resources.core_v1 import (
2728
ConfigMap,
2829
Pod,
@@ -42,7 +43,6 @@
4243
from tests.integration.constants import (
4344
APP_NAME,
4445
CONFIGMAP_DATA_INGRESS_DOMAIN,
45-
CONFIGMAP_DATA_INGRESS_GATEWAY_NAMESPACE,
4646
CONFIGMAP_NAME,
4747
CONFIGMAP_TEMPLATE_PATH,
4848
CONTAINERS_SECURITY_CONTEXT_MAP,
@@ -77,19 +77,22 @@
7777
custom_images = json.loads(Path(CUSTOM_IMAGES_PATH).read_text())
7878

7979
explainer_image, explainer_version = custom_images["configmap__explainers__art"].split(":")
80-
configmap_context = {
81-
**custom_images,
82-
"configmap__explainers__art__image": explainer_image,
83-
"configmap__explainers__art__version": explainer_version,
84-
"deployment_mode": "Serverless",
85-
"enable_gateway_api": "false",
86-
"ingress_domain": CONFIGMAP_DATA_INGRESS_DOMAIN,
87-
"local_gateway_namespace": CONFIGMAP_DATA_LOCAL_GATEWAY_NAMESPACE,
88-
"local_gateway_name": CONFIGMAP_DATA_LOCAL_GATEWAY_NAME,
89-
"local_gateway_service_name": CONFIGMAP_DATA_LOCAL_GATEWAY_SERVICE_NAME,
90-
"ingress_gateway_namespace": CONFIGMAP_DATA_INGRESS_GATEWAY_NAMESPACE,
91-
"ingress_gateway_name": CONFIGMAP_DATA_INGRESS_GATEWAY_NAME_SERVERLESS,
92-
}
80+
81+
82+
def generate_configmap_context(ingress_gateway_namespace: str) -> dict:
83+
return {
84+
**custom_images,
85+
"configmap__explainers__art__image": explainer_image,
86+
"configmap__explainers__art__version": explainer_version,
87+
"deployment_mode": "Serverless",
88+
"enable_gateway_api": "false",
89+
"ingress_domain": CONFIGMAP_DATA_INGRESS_DOMAIN,
90+
"local_gateway_namespace": CONFIGMAP_DATA_LOCAL_GATEWAY_NAMESPACE,
91+
"local_gateway_name": CONFIGMAP_DATA_LOCAL_GATEWAY_NAME,
92+
"local_gateway_service_name": CONFIGMAP_DATA_LOCAL_GATEWAY_SERVICE_NAME,
93+
"ingress_gateway_namespace": ingress_gateway_namespace,
94+
"ingress_gateway_name": CONFIGMAP_DATA_INGRESS_GATEWAY_NAME_SERVERLESS,
95+
}
9396

9497

9598
@pytest.mark.skip_if_deployed
@@ -265,7 +268,9 @@ async def test_configmap_created(lightkube_client: lightkube.Client, ops_test: O
265268
ConfigMap, CONFIGMAP_NAME, namespace=ops_test.model_name
266269
)
267270

268-
expected_configmap = populate_template(CONFIGMAP_TEMPLATE_PATH, configmap_context)
271+
expected_configmap = populate_template(
272+
CONFIGMAP_TEMPLATE_PATH, generate_configmap_context(ops_test.model_name)
273+
)
269274
assert inferenceservice_config.data == expected_configmap["data"]
270275

271276

@@ -290,6 +295,7 @@ async def test_configmap_changes_with_config(
290295
ConfigMap, CONFIGMAP_NAME, namespace=ops_test.model_name
291296
)
292297

298+
configmap_context = generate_configmap_context(ops_test.model_name)
293299
configmap_context["configmap__batcher"] = "custom:1.0"
294300

295301
expected_configmap = populate_template(CONFIGMAP_TEMPLATE_PATH, configmap_context)
@@ -487,3 +493,41 @@ async def test_container_security_context(
487493
CONTAINERS_SECURITY_CONTEXT_MAP,
488494
ops_test.model.name,
489495
)
496+
497+
498+
@pytest.mark.abort_on_fail
499+
async def test_remove_with_resources_present(ops_test: OpsTest):
500+
"""Test remove with all resources deployed.
501+
502+
Verify that all deployed resources that need to be removed are removed.
503+
504+
This test should run after test_upgrade(), because it removes the deployed charm.
505+
"""
506+
507+
@tenacity.retry(
508+
wait=tenacity.wait_exponential(multiplier=1, min=1, max=15),
509+
stop=tenacity.stop_after_delay(5 * 60),
510+
reraise=True,
511+
)
512+
def assert_resources_removed():
513+
"""Asserts on the resource removal.
514+
515+
Retries multiple times using tenacity to allow time for the resources to be deleted.
516+
"""
517+
lightkube_client = lightkube.Client()
518+
crd_list = iter(
519+
lightkube_client.list(
520+
CustomResourceDefinition,
521+
labels=[("app.juju.is/created-by", APP_NAME)],
522+
namespace=ops_test.model_name,
523+
)
524+
)
525+
# testing for empty list (iterator)
526+
_last = object()
527+
assert next(crd_list, _last) is _last
528+
529+
# remove deployed charm and verify that it is removed alongside resources it created
530+
await ops_test.model.remove_application(app_name=APP_NAME, block_until_done=True)
531+
assert APP_NAME not in ops_test.model.applications
532+
533+
assert_resources_removed()

charms/kserve-controller/tests/integration/test_charm_ambient.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@
3737
from tests.integration.constants import (
3838
APP_NAME,
3939
CONFIGMAP_DATA_INGRESS_DOMAIN,
40-
CONFIGMAP_DATA_INGRESS_GATEWAY_NAMESPACE,
4140
CONFIGMAP_NAME,
4241
CONFIGMAP_TEMPLATE_PATH,
4342
CONTAINERS_SECURITY_CONTEXT_MAP,
@@ -71,16 +70,19 @@
7170

7271

7372
explainer_image, explainer_version = custom_images["configmap__explainers__art"].split(":")
74-
configmap_context = {
75-
**custom_images,
76-
"configmap__explainers__art__image": explainer_image,
77-
"configmap__explainers__art__version": explainer_version,
78-
"deployment_mode": "RawDeployment",
79-
"enable_gateway_api": "true",
80-
"ingress_domain": CONFIGMAP_DATA_INGRESS_DOMAIN,
81-
"ingress_gateway_namespace": CONFIGMAP_DATA_INGRESS_GATEWAY_NAMESPACE,
82-
"ingress_gateway_name": "istio-ingress-k8s",
83-
}
73+
74+
75+
def generate_configmap_context(ingress_gateway_namespace: str) -> dict:
76+
return {
77+
**custom_images,
78+
"configmap__explainers__art__image": explainer_image,
79+
"configmap__explainers__art__version": explainer_version,
80+
"deployment_mode": "RawDeployment",
81+
"enable_gateway_api": "true",
82+
"ingress_domain": CONFIGMAP_DATA_INGRESS_DOMAIN,
83+
"ingress_gateway_namespace": ingress_gateway_namespace,
84+
"ingress_gateway_name": "istio-ingress-k8s",
85+
}
8486

8587

8688
@pytest.mark.skip_if_deployed
@@ -210,7 +212,9 @@ async def test_configmap_created(lightkube_client: lightkube.Client, ops_test: O
210212
ConfigMap, CONFIGMAP_NAME, namespace=ops_test.model_name
211213
)
212214

213-
expected_configmap = populate_template(CONFIGMAP_TEMPLATE_PATH, configmap_context)
215+
expected_configmap = populate_template(
216+
CONFIGMAP_TEMPLATE_PATH, generate_configmap_context(ops_test.model_name)
217+
)
214218
assert inferenceservice_config.data == expected_configmap["data"]
215219

216220

@@ -235,6 +239,7 @@ async def test_configmap_changes_with_config(
235239
ConfigMap, CONFIGMAP_NAME, namespace=ops_test.model_name
236240
)
237241

242+
configmap_context = generate_configmap_context(ops_test.model_name)
238243
configmap_context["configmap__batcher"] = "custom:1.0"
239244

240245
expected_configmap = populate_template(CONFIGMAP_TEMPLATE_PATH, configmap_context)

0 commit comments

Comments
 (0)