2424logger = get_logger (__name__ )
2525
2626CHART_REGEX = re .compile (r"^(?P<chart>.+):(?P<version>.+)$" )
27+ DEPLOY_RESOURCE_MAX_RETRIES = 3
28+ DEPLOY_RESOURCE_RETRY_BASE_SECONDS = 10
29+ DEPLOY_RESOURCE_RETRY_MAX_SECONDS = 30
30+
31+
class MissingSerializedInstanceError(ValueError):
    """Raised when a serialized model/pk cannot be resolved from the database.

    The lookup details are kept as attributes (``model``, ``pk``,
    ``base_instance_exists``) so handlers can log or branch on them without
    parsing the message text.
    """

    def __init__(self, model: str, pk: int, base_instance_exists: bool):
        """Record the failed lookup and build the human-readable message.

        Args:
            model: Model label taken from the serialized payload.
            pk: Primary key that could not be resolved.
            base_instance_exists: Flag supplied by the raiser indicating
                whether a base-level row with this pk was found.
        """
        self.model = model
        self.pk = pk
        self.base_instance_exists = base_instance_exists
        super().__init__(
            f"No instance found for model { model } with pk { pk } (base_instance_exists={ base_instance_exists } )"
        )

    def __reduce__(self):
        """Rebuild the exception from its three constructor arguments.

        The inherited ``BaseException.__reduce__`` replays ``self.args`` (only
        the formatted message) into ``__init__``, which needs three positional
        arguments, so unpickling or copying would fail with ``TypeError``.
        Returning the original arguments keeps the exception round-trippable
        when it crosses a process or serialization boundary.
        """
        return (type(self), (self.model, self.pk, self.base_instance_exists))
42+
43+
def _retry_countdown(current_retries: int) -> int:
    """Return the delay in seconds to wait before the next retry attempt.

    The delay starts at ``DEPLOY_RESOURCE_RETRY_BASE_SECONDS`` and doubles for
    every retry already performed, but never exceeds
    ``DEPLOY_RESOURCE_RETRY_MAX_SECONDS``.

    Args:
        current_retries: Number of retries that have already happened.
    """
    backoff = (2 ** current_retries) * DEPLOY_RESOURCE_RETRY_BASE_SECONDS
    # Cap the exponential growth at the configured ceiling.
    if backoff > DEPLOY_RESOURCE_RETRY_MAX_SECONDS:
        return DEPLOY_RESOURCE_RETRY_MAX_SECONDS
    return backoff
2746
2847
2948@app .task
@@ -198,11 +217,71 @@ def get_manifest_yaml(release_name: str, namespace: str = "default") -> tuple[st
198217 return e .stdout , e .stderr
199218
200219
201- @shared_task
220+ @shared_task ( bind = True , max_retries = DEPLOY_RESOURCE_MAX_RETRIES )
202221@transaction .atomic
203- def deploy_resource (serialized_instance ):
204- instance : BaseAppInstance = deserialize (serialized_instance )
205- logger .info ("Deploying resource for instance %s" , instance )
222+ def deploy_resource (self , serialized_instance ):
223+ model = serialized_instance .get ("model" ) if isinstance (serialized_instance , dict ) else None
224+ pk = serialized_instance .get ("pk" ) if isinstance (serialized_instance , dict ) else None
225+ task_id = getattr (self .request , "id" , None )
226+
227+ logger .info (
228+ "deploy_resource.start task_id=%s model=%s pk=%s retry=%s" ,
229+ task_id ,
230+ model ,
231+ pk ,
232+ self .request .retries ,
233+ )
234+
235+ try :
236+ instance : BaseAppInstance = deserialize (serialized_instance )
237+ except MissingSerializedInstanceError as exc :
238+ retries = self .request .retries
239+ if retries < self .max_retries :
240+ countdown = _retry_countdown (retries )
241+ logger .warning (
242+ "deploy_resource.missing_instance_retry task_id=%s model=%s pk=%s retry=%s/%s "
243+ "countdown=%ss base_instance_exists=%s" ,
244+ task_id ,
245+ exc .model ,
246+ exc .pk ,
247+ retries + 1 ,
248+ self .max_retries ,
249+ countdown ,
250+ exc .base_instance_exists ,
251+ )
252+ raise self .retry (exc = exc , countdown = countdown )
253+
254+ logger .error (
255+ "deploy_resource.missing_instance_exhausted task_id=%s model=%s pk=%s retries=%s "
256+ "base_instance_exists=%s" ,
257+ task_id ,
258+ exc .model ,
259+ exc .pk ,
260+ retries ,
261+ exc .base_instance_exists ,
262+ )
263+ raise
264+
265+ logger .info (
266+ "deploy_resource.instance_resolved task_id=%s model=%s pk=%s instance_id=%s app_slug=%s" ,
267+ task_id ,
268+ model ,
269+ pk ,
270+ instance .pk ,
271+ instance .app .slug ,
272+ )
273+
274+ deleted_on = getattr (instance , "deleted_on" , None )
275+ if instance .latest_user_action in {"Deleting" , "SystemDeleting" } or deleted_on is not None :
276+ logger .info (
277+ "deploy_resource.skip_deleting task_id=%s instance_id=%s latest_user_action=%s deleted_on=%s" ,
278+ task_id ,
279+ instance .pk ,
280+ instance .latest_user_action ,
281+ deleted_on ,
282+ )
283+ return
284+
206285 values = get_merged_k8s_values (instance , ensure_up_to_date = True )
207286 release = values ["subdomain" ]
208287 chart : str = instance .chart
@@ -263,17 +342,59 @@ def deploy_resource(serialized_instance):
263342 if not valid_deployment :
264343 logger .warning (f"The deployment manifest file is INVALID for release { release } . { validation_output } " )
265344
345+ logger .info (
346+ "deploy_resource.helm_install_start task_id=%s instance_id=%s release=%s namespace=%s chart=%s version=%s" ,
347+ task_id ,
348+ instance .pk ,
349+ release ,
350+ values ["namespace" ],
351+ chart ,
352+ version ,
353+ )
354+
266355 # Install the app using Helm install
267356 output , error = helm_install (release , chart , values ["namespace" ], values_file , version )
268357 success = not error
269358
359+ if not success :
360+ retries = self .request .retries
361+ logger .warning (
362+ "deploy_resource.helm_install_failed task_id=%s instance_id=%s retry=%s/%s release=%s stderr=%s" ,
363+ task_id ,
364+ instance .pk ,
365+ retries ,
366+ self .max_retries ,
367+ release ,
368+ error ,
369+ )
370+ if retries < self .max_retries :
371+ countdown = _retry_countdown (retries )
372+ logger .info (
373+ "deploy_resource.helm_install_retry task_id=%s instance_id=%s retry=%s/%s countdown=%ss" ,
374+ task_id ,
375+ instance .pk ,
376+ retries + 1 ,
377+ self .max_retries ,
378+ countdown ,
379+ )
380+ raise self .retry (exc = RuntimeError (error or "Helm install failed" ), countdown = countdown )
381+
382+ logger .info (
383+ "deploy_resource.helm_install_done task_id=%s instance_id=%s success=%s release=%s" ,
384+ task_id ,
385+ instance .pk ,
386+ success ,
387+ release ,
388+ )
389+
270390 helm_info = {"success" : success , "info" : {"stdout" : output , "stderr" : error }}
271391
272392 instance .info = dict (helm = helm_info )
273393 # instance.app_status.status = "Created" if success else "Failed"
274394
275395 # Only update the info field to avoid overriding other modified fields elsewhere
276396 instance .save (update_fields = ["info" ])
397+ logger .info ("deploy_resource.info_saved task_id=%s instance_id=%s success=%s" , task_id , instance .pk , success )
277398
278399 # In development, also generate and validate the k8s deployment manifest
279400 if settings .DEBUG :
@@ -287,6 +408,14 @@ def deploy_resource(serialized_instance):
287408 if deployment_file :
288409 subprocess .run (["rm" , "-f" , deployment_file ])
289410
411+ logger .info (
412+ "deploy_resource.finish task_id=%s instance_id=%s success=%s valid_deployment=%s" ,
413+ task_id ,
414+ instance .pk ,
415+ success ,
416+ valid_deployment ,
417+ )
418+
290419
291420@shared_task
292421@transaction .atomic
@@ -303,7 +432,13 @@ def delete_resource(serialized_instance, initiated_by_str: str):
303432 - serialized_instance: A serialized version of the app to be deleted.
304433 - initiated_by_str: A string of enum AppActionOrigin indicating the source of the deletion (user|system).
305434 """
306- logger .debug (f"Type of serialized_instance is { type (serialized_instance )} " )
435+ logger .info (
436+ "delete_resource.start model=%s pk=%s initiated_by=%s payload_type=%s" ,
437+ serialized_instance .get ("model" ) if isinstance (serialized_instance , dict ) else None ,
438+ serialized_instance .get ("pk" ) if isinstance (serialized_instance , dict ) else None ,
439+ initiated_by_str ,
440+ type (serialized_instance ),
441+ )
307442
308443 initiated_by = AppActionOrigin (initiated_by_str )
309444 assert initiated_by == AppActionOrigin .USER or initiated_by == AppActionOrigin .SYSTEM
@@ -362,12 +497,14 @@ def deserialize(serialized_instance):
362497
363498 model_class = apps .get_model (app_label , model_name )
364499 instance = model_class .objects .get (pk = pk )
500+ logger .info ("deserialize.resolved model=%s pk=%s concrete_model=%s" , model , pk , model_class .__name__ )
365501
366502 return instance
367503 except (KeyError , ValueError ) as e :
368504 raise ValueError (f"Invalid serialized data format: { e } " )
369505 except ObjectDoesNotExist :
370- raise ValueError (f"No instance found for model { model } with pk { pk } " )
506+ base_instance_exists = BaseAppInstance .objects .filter (pk = pk ).exists ()
507+ raise MissingSerializedInstanceError (model = model , pk = pk , base_instance_exists = base_instance_exists )
371508
372509
373510@app .task
0 commit comments