Skip to content

Commit 256dbab

Browse files
authored
add initial grace period before triggering a failure due to missing components (#273)
1 parent ba18866 commit 256dbab

File tree

2 files changed

+22
-12
lines changed

2 files changed

+22
-12
lines changed

internal/controller/appwrapper/appwrapper_controller.go

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -268,21 +268,26 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
268268
}
269269

270270
// Detect externally deleted components and transition to Failed with no GracePeriod or retry
271-
detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected)
272271
if compStatus.deployed != compStatus.expected {
273-
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
274-
Type: string(workloadv1beta2.Unhealthy),
275-
Status: metav1.ConditionTrue,
276-
Reason: "MissingComponent",
277-
Message: detailMsg,
278-
})
279-
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg)
280-
return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed)
272+
// There may be a lag before created resources become visible in the cache; don't react too quickly.
273+
whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime
274+
graceDuration := r.admissionGraceDuration(ctx, aw)
275+
if time.Now().After(whenDeployed.Add(graceDuration)) {
276+
detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected)
277+
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
278+
Type: string(workloadv1beta2.Unhealthy),
279+
Status: metav1.ConditionTrue,
280+
Reason: "MissingComponent",
281+
Message: detailMsg,
282+
})
283+
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg)
284+
return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed)
285+
}
281286
}
282287

283288
// If a component's controller has put it into a failed state, we do not need
284289
// to allow a grace period. The situation will not self-correct.
285-
detailMsg = fmt.Sprintf("Found %v failed components", compStatus.failed)
290+
detailMsg := fmt.Sprintf("Found %v failed components", compStatus.failed)
286291
if compStatus.failed > 0 {
287292
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
288293
Type: string(workloadv1beta2.Unhealthy),

test/e2e/appwrapper_test.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,13 @@ var _ = Describe("AppWrapper E2E Test", func() {
297297
Expect(aw.Status.Retries).Should(Equal(int32(2)))
298298
})
299299

300-
It("Deleting a Running Component yields a failed AppWrapper", func() {
301-
aw := createAppWrapper(ctx, pytorchjob(2, 500))
300+
It("Deleting a Running Component yields a failed AppWrapper", Label("slow"), func() {
301+
aw := toAppWrapper(pytorchjob(2, 500))
302+
if aw.Annotations == nil {
303+
aw.Annotations = make(map[string]string)
304+
}
305+
aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation] = "5s"
306+
Expect(getClient(ctx).Create(ctx, aw)).To(Succeed())
302307
appwrappers = append(appwrappers, aw)
303308
Eventually(AppWrapperPhase(ctx, aw), 60*time.Second).Should(Equal(workloadv1beta2.AppWrapperRunning))
304309
aw = getAppWrapper(ctx, types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace})

0 commit comments

Comments
 (0)