1818from pathlib import Path
1919from typing import Dict
2020
21+ import tenacity
2122import yaml
2223from charmed_kubeflow_chisme .exceptions import ErrorWithStatus , GenericCharmRuntimeError
2324from charmed_kubeflow_chisme .kubernetes import KubernetesResourceHandler
2425from charmed_kubeflow_chisme .lightkube .batch import delete_many
2526from charmed_kubeflow_chisme .pebble import update_layer
2627from charmed_kubeflow_chisme .service_mesh import generate_allow_all_authorization_policy
28+ from charmed_kubeflow_chisme .types import LightkubeResourcesList
2729from charmed_service_mesh_helpers .interfaces import GatewayMetadataRequirer
2830from charms .istio_beacon_k8s .v0 .service_mesh import MeshType , PolicyResourceManager
2931from charms .istio_pilot .v0 .istio_gateway_info import (
4042)
4143from jinja2 import Template
4244from jsonschema import ValidationError
43- from lightkube import ApiError
45+ from lightkube import ApiError , Client
46+ from lightkube .generic_resource import load_in_cluster_generic_resources
4447from lightkube .models .core_v1 import ServicePort
4548from lightkube_extensions .batch import create_charm_default_labels
4649from ops import main
104107METRICS_PORT = 8080
105108
106109
class ObjectStillExistsError(Exception):
    """Raised when a K8s object still exists although it should have been removed.

    Attributes:
        resource_name: Name of the resource that is still present.
    """

    def __init__(self, resource_name: str):
        # Forward the name to Exception so `args` and `str(err)` always carry
        # it, whether the caller passed it positionally or by keyword.
        super().__init__(resource_name)
        self.resource_name = resource_name
116+
117+
107118def parse_images_config (config : str ) -> Dict :
108119 """
109120 Parse a YAML config-defined images list.
@@ -140,6 +151,7 @@ def __init__(self, *args):
140151 super ().__init__ (* args )
141152 self .custom_images = []
142153 self .images_context = {}
154+ self .inference_service_context = {}
143155 self ._ingress_gateway_requirer = GatewayRequirer (
144156 self , relation_name = SDI_INGRESS_GATEWAY_RELATION
145157 )
@@ -169,7 +181,6 @@ def __init__(self, *args):
169181 self .framework .observe (event , self ._on_event )
170182
171183 self ._k8s_resource_handler = None
172- self ._crd_resource_handler = None
173184 self ._cm_resource_handler = None
174185 self ._cluster_runtimes_resource_handler = None
175186 self ._secrets_manifests_wrapper = None
@@ -236,8 +247,7 @@ def _context(self):
236247 "no_proxy" : self .model .config ["no-proxy" ],
237248 }
238249
239- @property
240- def _inference_service_context (self ):
250+ def generate_inference_service_context (self ):
241251 """Context for rendering the inferenceservive-config ConfigMap."""
242252 # Ensure any input is valid for deployment mode
243253 deployment_mode = self ._deployment_mode
@@ -283,7 +293,7 @@ def cm_resource_handler(self):
283293 self ._cm_resource_handler = KubernetesResourceHandler (
284294 field_manager = self ._lightkube_field_manager ,
285295 template_files = CONFIG_FILES ,
286- context = {** self ._inference_service_context , ** self .images_context },
296+ context = {** self .inference_service_context , ** self .images_context },
287297 logger = log ,
288298 )
289299 return self ._cm_resource_handler
@@ -298,7 +308,7 @@ def cluster_runtimes_resource_handler(self):
298308 context = {** self .images_context },
299309 logger = log ,
300310 )
301-
311+ load_in_cluster_generic_resources ( self . _cluster_runtimes_resource_handler . lightkube_client )
302312 return self ._cluster_runtimes_resource_handler
303313
304314 @property
@@ -584,6 +594,7 @@ def _on_event(self, event):
584594 try :
585595 self .custom_images = parse_images_config (self .model .config ["custom_images" ])
586596 self .images_context = self .get_images (DEFAULT_IMAGES , self .custom_images )
597+ self .inference_service_context = self .generate_inference_service_context ()
587598 self .unit .status = MaintenanceStatus ("Creating k8s resources" )
588599 self .reconcile_authorization_policies ()
589600 self .k8s_resource_handler .apply ()
@@ -659,14 +670,7 @@ def _on_event(self, event):
659670 log .error (api_err )
660671 raise
661672
662- def _on_remove (self , event ):
663- try :
664- self .custom_images = parse_images_config (self .model .config ["custom_images" ])
665- self .images_context = self .get_images (DEFAULT_IMAGES , self .custom_images )
666- except ErrorWithStatus as err :
667- self .model .unit .status = err .status
668- log .error (f"Failed to handle { event } with error: { err } " )
669- return
673+ def _on_remove (self , _ ):
670674 self .unit .status = MaintenanceStatus ("Removing k8s resources" )
671675
672676 # remove AuthorizationPolicies
@@ -675,20 +679,63 @@ def _on_remove(self, event):
675679 handlers = [
676680 self .k8s_resource_handler ,
677681 self .cm_resource_handler ,
678- self .cluster_runtimes_resource_handler ,
679682 ]
680-
681683 try :
684+ runtimes_manifests = self .cluster_runtimes_resource_handler .render_manifests ()
685+ delete_many (
686+ self .cluster_runtimes_resource_handler .lightkube_client ,
687+ runtimes_manifests ,
688+ )
689+ for runtime_name , runtime_kind in _extract_runtimes_names (runtimes_manifests ).items ():
690+ self .ensure_resource_is_deleted (
691+ client = self .cluster_runtimes_resource_handler .lightkube_client ,
692+ resource_kind = runtime_kind ,
693+ resource_name = runtime_name ,
694+ )
682695 for handler in handlers :
683696 delete_many (
684697 handler .lightkube_client ,
685698 handler .render_manifests (),
686699 )
687700 except ApiError as e :
688- log .warning (f"Failed to delete resources, with error: { e } " )
701+ if e .status .code != 404 :
702+ log .warning (f"Failed to delete resources, with error: { e } " )
703+ raise e
704+ except ObjectStillExistsError as e :
705+ log .warning (
706+ "Failed to remove resource: %s. Manual intervention for cleanup might be required" ,
707+ e .resource_name ,
708+ )
689709 raise e
690710 self .unit .status = MaintenanceStatus ("K8s resources removed" )
691711
@tenacity.retry(stop=tenacity.stop_after_delay(300), wait=tenacity.wait_fixed(5), reraise=True)
def ensure_resource_is_deleted(
    self, client: Client, resource_kind: type, resource_name: str
) -> None:
    """Wait, with retries, until the given K8s resource no longer exists.

    A single attempt fetches the resource: a 404 response means it is gone and
    the method returns; a successful fetch means it is still present and
    ObjectStillExistsError is raised. The retry decorator re-runs the attempt
    every 5 seconds for up to 300 seconds on any exception, then re-raises the
    last one.

    Args:
        client: The lightkube client to use for talking to K8s.
        resource_kind: The resource class passed to `client.get` for the lookup.
        resource_name: The name of the resource expected to be deleted.

    Raises:
        ApiError: From lightkube, if the last attempt failed with an error
            other than 404.
        ObjectStillExistsError: If the resource was still present on the last
            attempt.
    """
    log.info("Checking if resource exists: %s", resource_name)
    try:
        client.get(resource_kind, name=resource_name)
    except ApiError as e:
        if e.status.code == 404:
            log.info('Resource "%s" does not exist!', resource_name)
            return
        # Raise any other error
        raise
    # The lookup succeeded, so the resource is still there: signal the retry
    # decorator to try again.
    log.info('Resource "%s" exists, retrying...', resource_name)
    raise ObjectStillExistsError(resource_name)
738+
692739 def _check_container_connection (self , container : Container ) -> None :
693740 """Check if connection can be made with container.
694741
@@ -873,5 +920,22 @@ def _restart_controller_service(self) -> None:
873920 ) from err
874921
875922
923+ def _extract_runtimes_names (manifests : LightkubeResourcesList ) -> dict :
924+ """
925+ Extracts a mapping of runtime resource names to their kinds.
926+
927+ Args:
928+ manifests (LightkubeResourcesList): List of runtime manifest objects,
929+ each with metadata and kind.
930+
931+ Returns:
932+ dict: Dictionary mapping resource names (str) to their kind.
933+ """
934+ runtimes_kind_name_mapping = {}
935+ for runtime in manifests :
936+ runtimes_kind_name_mapping .update ({runtime .metadata .name : runtime .__class__ })
937+ return runtimes_kind_name_mapping
938+
939+
876940if __name__ == "__main__" :
877941 main (KServeControllerCharm )
0 commit comments