@@ -600,8 +600,13 @@ func (c *FrameworkController) recoverFrameworkWorkItems(f *ci.Framework) {
600600func (c * FrameworkController ) recoverTimeoutChecks (f * ci.Framework ) {
601601 // If a check is already timeout, the timeout will be handled by the following
602602 // sync after the recover, so no need to enqueue it again.
603+ if f .Status .State == ci .FrameworkCompleted {
604+ c .enqueueFrameworkCompletedRetainTimeoutCheck (f , true )
605+ return
606+ }
603607 c .enqueueFrameworkAttemptCreationTimeoutCheck (f , true )
604608 c .enqueueFrameworkRetryDelayTimeoutCheck (f , true )
609+
605610 for _ , taskRoleStatus := range f .TaskRoleStatuses () {
606611 for _ , taskStatus := range taskRoleStatus .TaskStatuses {
607612 taskRoleName := taskRoleStatus .Name
@@ -612,23 +617,26 @@ func (c *FrameworkController) recoverTimeoutChecks(f *ci.Framework) {
612617 }
613618}
614619
615- func (c * FrameworkController ) enqueueFrameworkAttemptCreationTimeoutCheck (
620+ func (c * FrameworkController ) enqueueFrameworkCompletedRetainTimeoutCheck (
616621 f * ci.Framework , failIfTimeout bool ) bool {
617- if f .Status .State != ci .FrameworkAttemptCreationRequested {
622+ if f .Status .State != ci .FrameworkCompleted {
618623 return false
619624 }
620625
621- leftDuration := common .CurrentLeftDuration (
622- f .Status .TransitionTime ,
623- c .cConfig .ObjectLocalCacheCreationTimeoutSec )
624- if common .IsTimeout (leftDuration ) && failIfTimeout {
626+ return c .enqueueFrameworkTimeoutCheck (
627+ f , f .Status .TransitionTime , c .cConfig .FrameworkCompletedRetainSec ,
628+ failIfTimeout , "FrameworkCompletedRetainTimeoutCheck" )
629+ }
630+
631+ func (c * FrameworkController ) enqueueFrameworkAttemptCreationTimeoutCheck (
632+ f * ci.Framework , failIfTimeout bool ) bool {
633+ if f .Status .State != ci .FrameworkAttemptCreationRequested {
625634 return false
626635 }
627636
628- c .fQueue .AddAfter (f .Key (), leftDuration )
629- klog .Infof ("[%v]: enqueueFrameworkAttemptCreationTimeoutCheck after %v" ,
630- f .Key (), leftDuration )
631- return true
637+ return c .enqueueFrameworkTimeoutCheck (
638+ f , f .Status .TransitionTime , c .cConfig .ObjectLocalCacheCreationTimeoutSec ,
639+ failIfTimeout , "FrameworkAttemptCreationTimeoutCheck" )
632640}
633641
634642func (c * FrameworkController ) enqueueTaskAttemptCreationTimeoutCheck (
@@ -639,17 +647,9 @@ func (c *FrameworkController) enqueueTaskAttemptCreationTimeoutCheck(
639647 return false
640648 }
641649
642- leftDuration := common .CurrentLeftDuration (
643- taskStatus .TransitionTime ,
644- c .cConfig .ObjectLocalCacheCreationTimeoutSec )
645- if common .IsTimeout (leftDuration ) && failIfTimeout {
646- return false
647- }
648-
649- c .fQueue .AddAfter (f .Key (), leftDuration )
650- klog .Infof ("[%v][%v][%v]: enqueueTaskAttemptCreationTimeoutCheck after %v" ,
651- f .Key (), taskRoleName , taskIndex , leftDuration )
652- return true
650+ return c .enqueueFrameworkTimeoutCheck (
651+ f , taskStatus .TransitionTime , c .cConfig .ObjectLocalCacheCreationTimeoutSec ,
652+ failIfTimeout , "TaskAttemptCreationTimeoutCheck" )
653653}
654654
655655func (c * FrameworkController ) enqueueFrameworkRetryDelayTimeoutCheck (
@@ -658,17 +658,9 @@ func (c *FrameworkController) enqueueFrameworkRetryDelayTimeoutCheck(
658658 return false
659659 }
660660
661- leftDuration := common .CurrentLeftDuration (
662- f .Status .TransitionTime ,
663- f .Status .RetryPolicyStatus .RetryDelaySec )
664- if common .IsTimeout (leftDuration ) && failIfTimeout {
665- return false
666- }
667-
668- c .fQueue .AddAfter (f .Key (), leftDuration )
669- klog .Infof ("[%v]: enqueueFrameworkRetryDelayTimeoutCheck after %v" ,
670- f .Key (), leftDuration )
671- return true
661+ return c .enqueueFrameworkTimeoutCheck (
662+ f , f .Status .TransitionTime , f .Status .RetryPolicyStatus .RetryDelaySec ,
663+ failIfTimeout , "FrameworkRetryDelayTimeoutCheck" )
672664}
673665
674666func (c * FrameworkController ) enqueueTaskRetryDelayTimeoutCheck (
@@ -679,22 +671,36 @@ func (c *FrameworkController) enqueueTaskRetryDelayTimeoutCheck(
679671 return false
680672 }
681673
682- leftDuration := common .CurrentLeftDuration (
683- taskStatus .TransitionTime ,
684- taskStatus .RetryPolicyStatus .RetryDelaySec )
674+ return c .enqueueFrameworkTimeoutCheck (
675+ f , taskStatus .TransitionTime , taskStatus .RetryPolicyStatus .RetryDelaySec ,
676+ failIfTimeout , "TaskRetryDelayTimeoutCheck" )
677+ }
678+
679+ func (c * FrameworkController ) enqueueFrameworkTimeoutCheck (
680+ f * ci.Framework , startTime meta.Time , timeoutSec * int64 ,
681+ failIfTimeout bool , logSfx string ) bool {
682+ leftDuration := common .CurrentLeftDuration (startTime , timeoutSec )
685683 if common .IsTimeout (leftDuration ) && failIfTimeout {
686684 return false
687685 }
688686
687+ // The startTime may not contain OS monotonic clock, such as it is recovered
688+ // after FrameworkController restart. So the IsTimeout judgement may be affected
689+ // by OS wall clock changes, such as it should be timeout but the IsTimeout
690+ // returns false.
691+ // See wall clock and monotonic clock in Golang time/time.go.
692+ // To ensure the timeout will be eventually checked, AddAfter the Framework
693+ // for every none timeout check.
689694 c .fQueue .AddAfter (f .Key (), leftDuration )
690- klog .Infof ("[%v][%v][%v]: enqueueTaskRetryDelayTimeoutCheck after %v" ,
691- f .Key (), taskRoleName , taskIndex , leftDuration )
695+ klog .Infof (
696+ "[%v]: enqueueFrameworkTimeoutCheck after %v: %v" ,
697+ f .Key (), leftDuration , logSfx )
692698 return true
693699}
694700
695- func (c * FrameworkController ) enqueueFramework (f * ci.Framework , logSfx string ) {
701+ func (c * FrameworkController ) enqueueFrameworkSync (f * ci.Framework , logSfx string ) {
696702 c .fQueue .Add (f .Key ())
697- klog .Infof ("[%v]: enqueueFramework : %v" , f .Key (), logSfx )
703+ klog .Infof ("[%v]: enqueueFrameworkSync : %v" , f .Key (), logSfx )
698704}
699705
700706func (c * FrameworkController ) syncFrameworkStatus (f * ci.Framework ) error {
@@ -717,8 +723,21 @@ func (c *FrameworkController) syncFrameworkState(f *ci.Framework) (err error) {
717723 defer func () { klog .Infof (logPfx + "Completed" ) }()
718724
719725 if f .Status .State == ci .FrameworkCompleted {
720- klog .Infof (logPfx + "Skipped: Framework is already completed" )
721- return nil
726+ if c .enqueueFrameworkCompletedRetainTimeoutCheck (f , true ) {
727+ klog .Infof (logPfx + "Skipped: Framework is already completed" )
728+ return nil
729+ }
730+
731+ // deleteFramework
732+ logSfx := ""
733+ if * c .cConfig .LogObjectSnapshot .Framework .OnFrameworkDeletion {
734+ // Ensure the FrameworkSnapshot is exposed before the deletion.
735+ logSfx = ci .GetFrameworkSnapshotLogTail (f )
736+ }
737+ klog .Infof (logPfx + "Framework will be deleted due to " +
738+ "FrameworkCompletedRetainSec %v is expired" + logSfx ,
739+ common .SecToDuration (c .cConfig .FrameworkCompletedRetainSec ))
740+ return c .deleteFramework (f , true )
722741 }
723742
724743 var cm * core.ConfigMap
@@ -849,6 +868,10 @@ func (c *FrameworkController) syncFrameworkState(f *ci.Framework) (err error) {
849868 retryDecision )
850869
851870 f .TransitionFrameworkState (ci .FrameworkCompleted )
871+
872+ c .enqueueFrameworkCompletedRetainTimeoutCheck (f , false )
873+ klog .Infof (logPfx +
874+ "Waiting Framework to be deleted after FrameworkCompletedRetainSec" )
852875 return nil
853876 }
854877 }
@@ -868,14 +891,13 @@ func (c *FrameworkController) syncFrameworkState(f *ci.Framework) (err error) {
868891 }
869892
870893 // retryFramework
871- klog .Infof (logPfx + "Retry Framework" )
872-
873- // The completed FrameworkAttempt has been persisted, so it is safe to also
874- // expose it as one history snapshot.
894+ logSfx := ""
875895 if * c .cConfig .LogObjectSnapshot .Framework .OnFrameworkRetry {
876- klog .Infof (logPfx + "Framework will be retried" +
877- ci .GetFrameworkSnapshotLogTail (f ))
896+ // The completed FrameworkAttempt has been persisted, so it is safe to
897+ // also expose it as one history snapshot.
898+ logSfx = ci .GetFrameworkSnapshotLogTail (f )
878899 }
900+ klog .Infof (logPfx + "Framework will be retried" + logSfx )
879901
880902 f .Status .RetryPolicyStatus .TotalRetriedCount ++
881903 if retryDecision .IsAccountable {
@@ -968,6 +990,47 @@ func (c *FrameworkController) syncFrameworkState(f *ci.Framework) (err error) {
968990 }
969991}
970992
993+ func (c * FrameworkController ) deleteFramework (
994+ f * ci.Framework , force bool ) error {
995+ errPfx := fmt .Sprintf (
996+ "[%v]: Failed to delete Framework %v: force: %v: " ,
997+ f .Key (), f .UID , force )
998+
999+ // Do not set zero GracePeriodSeconds to do force deletion in any case, since
1000+ // it will also immediately delete Pod in PodUnknown state, while the Pod may
1001+ // be still running.
1002+ deleteErr := c .fClient .FrameworkcontrollerV1 ().Frameworks (f .Namespace ).Delete (
1003+ f .Name , & meta.DeleteOptions {Preconditions : & meta.Preconditions {UID : & f .UID }})
1004+ if deleteErr != nil {
1005+ if ! apiErrors .IsNotFound (deleteErr ) {
1006+ return fmt .Errorf (errPfx + "%v" , deleteErr )
1007+ }
1008+ } else {
1009+ if force {
1010+ // Confirm it is deleted instead of still deleting.
1011+ remoteF , getErr := c .fClient .FrameworkcontrollerV1 ().Frameworks (f .Namespace ).Get (
1012+ f .Name , meta.GetOptions {})
1013+ if getErr != nil {
1014+ if ! apiErrors .IsNotFound (getErr ) {
1015+ return fmt .Errorf (errPfx +
1016+ "Framework cannot be got from remote: %v" , getErr )
1017+ }
1018+ } else {
1019+ if f .UID == remoteF .UID {
1020+ return fmt .Errorf (errPfx +
1021+ "Framework with DeletionTimestamp %v still exist after deletion" ,
1022+ remoteF .DeletionTimestamp )
1023+ }
1024+ }
1025+ }
1026+ }
1027+
1028+ klog .Infof (
1029+ "[%v]: Succeeded to delete Framework %v: force: %v" ,
1030+ f .Key (), f .UID , force )
1031+ return nil
1032+ }
1033+
9711034// Get Framework's current ConfigMap object, if not found, then clean up existing
9721035// controlled ConfigMap if any.
9731036// Returned cm is either managed or nil, if it is the managed cm, it is not
@@ -1348,14 +1411,13 @@ func (c *FrameworkController) syncTaskState(
13481411 }
13491412
13501413 // retryTask
1351- klog .Infof (logPfx + "Retry Task" )
1352-
1353- // The completed TaskAttempt has been persisted, so it is safe to also
1354- // expose it as one history snapshot.
1414+ logSfx := ""
13551415 if * c .cConfig .LogObjectSnapshot .Framework .OnTaskRetry {
1356- klog .Infof (logPfx + "Task will be retried" +
1357- ci .GetFrameworkSnapshotLogTail (f ))
1416+ // The completed TaskAttempt has been persisted, so it is safe to also
1417+ // expose it as one history snapshot.
1418+ logSfx = ci .GetFrameworkSnapshotLogTail (f )
13581419 }
1420+ klog .Infof (logPfx + "Task will be retried" + logSfx )
13591421
13601422 taskStatus .RetryPolicyStatus .TotalRetriedCount ++
13611423 if retryDecision .IsAccountable {
@@ -1645,15 +1707,15 @@ func (c *FrameworkController) completeTaskAttempt(
16451707 // To ensure the completed TaskAttempt is persisted before exposed,
16461708 // we need to wait until next sync to expose it, so manually enqueue a sync.
16471709 klog .Infof (logPfx + "Waiting the completed TaskAttempt to be persisted" )
1648- c .enqueueFramework (f , "TaskAttemptCompleted" )
1710+ c .enqueueFrameworkSync (f , "TaskAttemptCompleted" )
16491711 } else {
16501712 f .TransitionTaskState (taskRoleName , taskIndex , ci .TaskAttemptDeletionPending )
16511713
16521714 // To ensure the CompletionStatus is persisted before deleting the pod,
16531715 // we need to wait until next sync to delete the pod, so manually enqueue
16541716 // a sync.
16551717 klog .Infof (logPfx + "Waiting the CompletionStatus to be persisted" )
1656- c .enqueueFramework (f , "TaskAttemptDeletionPending" )
1718+ c .enqueueFrameworkSync (f , "TaskAttemptDeletionPending" )
16571719 }
16581720}
16591721
@@ -1707,15 +1769,15 @@ func (c *FrameworkController) completeFrameworkAttempt(
17071769 // To ensure the completed FrameworkAttempt is persisted before exposed,
17081770 // we need to wait until next sync to expose it, so manually enqueue a sync.
17091771 klog .Infof (logPfx + "Waiting the completed FrameworkAttempt to be persisted" )
1710- c .enqueueFramework (f , "FrameworkAttemptCompleted" )
1772+ c .enqueueFrameworkSync (f , "FrameworkAttemptCompleted" )
17111773 } else {
17121774 f .TransitionFrameworkState (ci .FrameworkAttemptDeletionPending )
17131775
17141776 // To ensure the CompletionStatus is persisted before deleting the cm,
17151777 // we need to wait until next sync to delete the cm, so manually enqueue
17161778 // a sync.
17171779 klog .Infof (logPfx + "Waiting the CompletionStatus to be persisted" )
1718- c .enqueueFramework (f , "FrameworkAttemptDeletionPending" )
1780+ c .enqueueFrameworkSync (f , "FrameworkAttemptDeletionPending" )
17191781 }
17201782}
17211783
@@ -1776,7 +1838,7 @@ func (c *FrameworkController) getExpectedFrameworkStatusInfo(key string) *Expect
17761838}
17771839
17781840func (c * FrameworkController ) deleteExpectedFrameworkStatusInfo (key string ) {
1779- klog .Infof ("[%v]: deleteExpectedFrameworkStatusInfo: " , key )
1841+ klog .Infof ("[%v]: deleteExpectedFrameworkStatusInfo" , key )
17801842 c .fExpectedStatusInfos .Delete (key )
17811843}
17821844
0 commit comments