-
Notifications
You must be signed in to change notification settings - Fork 38
feat: add updaterun metrics and update Progressing condition #1107
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
29e041a
c42963b
7557f26
b2e7813
71a7cd5
a11c447
10783fa
2ab2781
b27396b
e09fbbe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,9 @@ var ( | |
| // stageUpdatingWaitTime is the time to wait before rechecking the stage update status. | ||
| // Put it as a variable for convenient testing. | ||
| stageUpdatingWaitTime = 60 * time.Second | ||
|
|
||
| // updateRunStuckThreshold is the time to wait on a single cluster update before marking update run as stuck. | ||
| updateRunStuckThreshold = 60 * time.Second | ||
| ) | ||
|
|
||
| // execute executes the update run by updating the clusters in the updating stage specified by updatingStageIndex. | ||
|
|
@@ -161,17 +164,26 @@ func (r *Reconciler) executeUpdatingStage( | |
| markClusterUpdatingFailed(clusterStatus, updateRun.Generation, unexpectedErr.Error()) | ||
| return 0, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unexpectedErr.Error()) | ||
| } | ||
|
|
||
| finished, updateErr := checkClusterUpdateResult(binding, clusterStatus, updatingStageStatus, updateRun) | ||
| if finished { | ||
| finishedClusterCount++ | ||
| continue | ||
| } else { | ||
| // If cluster update has been running for more than 1 minute, mark the update run as stuck. | ||
| timeElapsed := time.Since(clusterStartedCond.LastTransitionTime.Time) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. checkClusterUpdateResult waits until all the resources become available, which depends on the "timeToReady" setting by the user. Therefore, 'updateRunStuckThreshold' has to be the greater of 60 seconds or that number
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I realized one thing: "timeToReady" is rolling-update specific. It's under
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good point, does it mean that we need to add this to the update run API too or create some wait between clusters when there are untrackable resources?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I noticed this problem when reviewing the code. Should we keep the default value as 1 min to keep it consistent with the rollingUpdate config? Later, we can support the customized value?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea, looks like we need to do similar thing in updateRun. That will be a new feature. |
||
| if timeElapsed > updateRunStuckThreshold { | ||
| klog.V(2).InfoS("Time waiting for cluster update to finish passes threshold, mark the update run as stuck", "time elapsed", timeElapsed, "threshold", updateRunStuckThreshold, "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "clusterStagedUpdateRun", updateRunRef) | ||
| markUpdateRunStuck(updateRun, updatingStageStatus.StageName, clusterStatus.ClusterName) | ||
| } | ||
| } | ||
| // No need to continue as we only support one cluster updating at a time for now. | ||
| return clusterUpdatingWaitTime, updateErr | ||
| } | ||
|
|
||
| if finishedClusterCount == len(updatingStageStatus.Clusters) { | ||
| // All the clusters in the stage have been updated. | ||
| markUpdateRunWaiting(updateRun, updatingStageStatus.StageName) | ||
| markStageUpdatingWaiting(updatingStageStatus, updateRun.Generation) | ||
| klog.V(2).InfoS("The stage has finished all cluster updating", "stage", updatingStageStatus.StageName, "clusterStagedUpdateRun", updateRunRef) | ||
| // Check if the after stage tasks are ready. | ||
|
|
@@ -180,6 +192,8 @@ func (r *Reconciler) executeUpdatingStage( | |
| return 0, err | ||
| } | ||
| if approved { | ||
| // Mark updateRun as progressing again. | ||
| markUpdateRunStarted(updateRun) | ||
| markStageUpdatingSucceeded(updatingStageStatus, updateRun.Generation) | ||
| // No need to wait to get to the next stage. | ||
| return 0, nil | ||
|
|
@@ -448,6 +462,28 @@ func markUpdateRunStarted(updateRun *placementv1beta1.ClusterStagedUpdateRun) { | |
| }) | ||
| } | ||
|
|
||
| // markUpdateRunStuck marks the updateRun as stuck in memory. | ||
| func markUpdateRunStuck(updateRun *placementv1beta1.ClusterStagedUpdateRun, stageName, clusterName string) { | ||
| meta.SetStatusCondition(&updateRun.Status.Conditions, metav1.Condition{ | ||
| Type: string(placementv1beta1.StagedUpdateRunConditionProgressing), | ||
| Status: metav1.ConditionFalse, | ||
| ObservedGeneration: updateRun.Generation, | ||
| Reason: condition.UpdateRunStuckReason, | ||
| Message: fmt.Sprintf("The updateRun is stuck waiting for cluster %s in stage %s to finish updating", clusterName, stageName), | ||
| }) | ||
| } | ||
|
|
||
| // markUpdateRunWaiting marks the updateRun as waiting in memory. | ||
| func markUpdateRunWaiting(updateRun *placementv1beta1.ClusterStagedUpdateRun, stageName string) { | ||
| meta.SetStatusCondition(&updateRun.Status.Conditions, metav1.Condition{ | ||
| Type: string(placementv1beta1.StagedUpdateRunConditionProgressing), | ||
| Status: metav1.ConditionFalse, | ||
| ObservedGeneration: updateRun.Generation, | ||
| Reason: condition.UpdateRunWaitingReason, | ||
| Message: fmt.Sprintf("The updateRun is waiting for after-stage tasks in stage %s to complete", stageName), ||
| }) | ||
| } | ||
|
|
||
| // markStageUpdatingStarted marks the stage updating status as started in memory. | ||
| func markStageUpdatingStarted(stageUpdatingStatus *placementv1beta1.StageUpdatingStatus, generation int64) { | ||
| if stageUpdatingStatus.StartTime == nil { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this seems to be a bit too aggressive, a normal deployment with many replicas can take a few mins to get ready
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let me set it as 5 minutes for now. It will not cause updateRun to stop. It just marks it in the status so users can investigate CRP and see if there's an issue. If the issue is fixed or recovered automatically, updateRun can continue and the status is removed. We also add metrics to track it so that we could have a better understanding and tweak it.