@@ -326,15 +326,41 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
326326 return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperResuming )
327327
328328 case workloadv1beta2 .AppWrapperFailed :
329+ // Support for debugging failed jobs.
330+ // When an appwrapper is annotated with a non-zero debugging delay,
331+ // we hold quota for the delay period and do not delete the resources of
332+ // a failed appwrapper unless Kueue preempts it by setting Suspend to true.
333+ deletionDelay := r .debuggingFailureDeletionDelay (ctx , aw )
334+
335+ if deletionDelay > 0 && ! aw .Spec .Suspend {
336+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
337+ Type : string (workloadv1beta2 .DeletingResources ),
338+ Status : metav1 .ConditionFalse ,
339+ Reason : "DeletionPaused" ,
340+ Message : fmt .Sprintf ("%v has value %v" , workloadv1beta2 .DebuggingFailureDeletionDelayDurationAnnotation , deletionDelay ),
341+ })
342+ whenDelayed := meta .FindStatusCondition (aw .Status .Conditions , string (workloadv1beta2 .DeletingResources )).LastTransitionTime
343+
344+ now := time .Now ()
345+ deadline := whenDelayed .Add (deletionDelay )
346+ if now .Before (deadline ) {
347+ return ctrl.Result {RequeueAfter : deadline .Sub (now )}, r .Status ().Update (ctx , aw )
348+ }
349+ }
350+
329351 if meta .IsStatusConditionTrue (aw .Status .Conditions , string (workloadv1beta2 .ResourcesDeployed )) {
330352 if ! r .deleteComponents (ctx , aw ) {
331353 return ctrl.Result {RequeueAfter : 5 * time .Second }, nil
332354 }
355+ msg := "Resources deleted for failed AppWrapper"
356+ if deletionDelay > 0 && aw .Spec .Suspend {
357+ msg = "Kueue forced resource deletion by suspending AppWrapper"
358+ }
333359 meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
334360 Type : string (workloadv1beta2 .ResourcesDeployed ),
335361 Status : metav1 .ConditionFalse ,
336362 Reason : string (workloadv1beta2 .AppWrapperFailed ),
337- Message : "Resources deleted for failed AppWrapper" ,
363+ Message : msg ,
338364 })
339365 }
340366 meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
@@ -393,26 +419,36 @@ func (r *AppWrapperReconciler) workloadStatus(ctx context.Context, aw *workloadv
393419 return summary , nil
394420}
395421
422+ func (r * AppWrapperReconciler ) limitDuration (desired time.Duration ) time.Duration {
423+ if desired < 0 {
424+ return 0 * time .Second
425+ } else if desired > r .Config .FaultTolerance .GracePeriodCeiling {
426+ return r .Config .FaultTolerance .GracePeriodCeiling
427+ } else {
428+ return desired
429+ }
430+ }
431+
396432func (r * AppWrapperReconciler ) warmupGraceDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
397433 if userPeriod , ok := aw .Annotations [workloadv1beta2 .WarmupGracePeriodDurationAnnotation ]; ok {
398434 if duration , err := time .ParseDuration (userPeriod ); err == nil {
399- return duration
435+ return r . limitDuration ( duration )
400436 } else {
401437 log .FromContext (ctx ).Info ("Malformed warmup period annotation" , "annotation" , userPeriod , "error" , err )
402438 }
403439 }
404- return r .Config .FaultTolerance .WarmupGracePeriod
440+ return r .limitDuration ( r . Config .FaultTolerance .WarmupGracePeriod )
405441}
406442
407443func (r * AppWrapperReconciler ) failureGraceDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
408444 if userPeriod , ok := aw .Annotations [workloadv1beta2 .FailureGracePeriodDurationAnnotation ]; ok {
409445 if duration , err := time .ParseDuration (userPeriod ); err == nil {
410- return duration
446+ return r . limitDuration ( duration )
411447 } else {
412448 log .FromContext (ctx ).Info ("Malformed grace period annotation" , "annotation" , userPeriod , "error" , err )
413449 }
414450 }
415- return r .Config .FaultTolerance .FailureGracePeriod
451+ return r .limitDuration ( r . Config .FaultTolerance .FailureGracePeriod )
416452}
417453
418454func (r * AppWrapperReconciler ) retryLimit (ctx context.Context , aw * workloadv1beta2.AppWrapper ) int32 {
@@ -429,12 +465,34 @@ func (r *AppWrapperReconciler) retryLimit(ctx context.Context, aw *workloadv1bet
429465func (r * AppWrapperReconciler ) resettingPauseDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
430466 if userPeriod , ok := aw .Annotations [workloadv1beta2 .ResetPauseDurationAnnotation ]; ok {
431467 if duration , err := time .ParseDuration (userPeriod ); err == nil {
432- return duration
468+ return r . limitDuration ( duration )
433469 } else {
434470 log .FromContext (ctx ).Info ("Malformed reset pause annotation" , "annotation" , userPeriod , "error" , err )
435471 }
436472 }
437- return r .Config .FaultTolerance .ResetPause
473+ return r .limitDuration (r .Config .FaultTolerance .ResetPause )
474+ }
475+
476+ func (r * AppWrapperReconciler ) deletionGraceDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
477+ if userPeriod , ok := aw .Annotations [workloadv1beta2 .DeletionGracePeriodAnnotation ]; ok {
478+ if duration , err := time .ParseDuration (userPeriod ); err == nil {
479+ return r .limitDuration (duration )
480+ } else {
481+ log .FromContext (ctx ).Info ("Malformed deletion period annotation" , "annotation" , userPeriod , "error" , err )
482+ }
483+ }
484+ return r .limitDuration (r .Config .FaultTolerance .DeletionGracePeriod )
485+ }
486+
487+ func (r * AppWrapperReconciler ) debuggingFailureDeletionDelay (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
488+ if userPeriod , ok := aw .Annotations [workloadv1beta2 .DebuggingFailureDeletionDelayDurationAnnotation ]; ok {
489+ if duration , err := time .ParseDuration (userPeriod ); err == nil {
490+ return r .limitDuration (duration )
491+ } else {
492+ log .FromContext (ctx ).Info ("Malformed delay deletion annotation" , "annotation" , userPeriod , "error" , err )
493+ }
494+ }
495+ return 0 * time .Second
438496}
439497
440498func clearCondition (aw * workloadv1beta2.AppWrapper , condition workloadv1beta2.AppWrapperCondition , reason string , message string ) {
0 commit comments