Skip to content

Commit 6b9b766

Browse files
Improve opj handler returned errors (#310)
* release and operate return error map * add some comment * add some comment * release failed targets with retry * concurrent map write
1 parent 2fa4811 commit 6b9b766

File tree

6 files changed

+113
-24
lines changed

6 files changed

+113
-24
lines changed

pkg/controllers/operationjob/operationjob_controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ func (r *ReconcileOperationJob) Reconcile(ctx context.Context, req reconcile.Req
138138
return reconcile.Result{}, err
139139
}
140140

141+
if err := r.ensureFailedTargetsReleased(ctx, instance, candidates); err != nil {
142+
return reconcile.Result{}, err
143+
}
144+
141145
err = r.doReconcile(ctx, instance, candidates)
142146
return requeueResult(requeueAfter), err
143147
}

pkg/controllers/operationjob/operationjob_manager.go

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,8 @@ func (r *ReconcileOperationJob) operateTargets(
174174
if len(candidates) == 0 {
175175
return nil
176176
}
177-
return operator.OperateTargets(ctx, candidates, operationJob)
177+
errMap := operator.OperateTargets(ctx, candidates, operationJob)
178+
return ctrlutils.AggregateErrors(ojutils.ConvertErrMapToList(errMap))
178179
}
179180

180181
func (r *ReconcileOperationJob) getTargetsOpsStatus(
@@ -244,11 +245,10 @@ func (r *ReconcileOperationJob) getTargetsOpsStatus(
244245
// ensureActiveDeadlineAndTTL calculate time to ActiveDeadlineSeconds and TTLSecondsAfterFinished and release targets
245246
func (r *ReconcileOperationJob) ensureActiveDeadlineAndTTL(ctx context.Context, operationJob *appsv1alpha1.OperationJob, candidates []*OpsCandidate, logger logr.Logger) (bool, *time.Duration, error) {
246247
if operationJob.Spec.ActiveDeadlineSeconds != nil {
247-
var allowReleaseCandidates []*OpsCandidate
248248
for i := range candidates {
249249
candidate := candidates[i]
250-
// just skip if target operation already finished, or not started
251-
if IsCandidateOpsFinished(candidate) || candidate.OpsStatus.StartTime == nil {
250+
// just skip if target not started
251+
if candidate.OpsStatus.StartTime == nil {
252252
continue
253253
}
254254
leftTime := time.Duration(*operationJob.Spec.ActiveDeadlineSeconds)*time.Second - time.Since(candidate.OpsStatus.StartTime.Time)
@@ -257,17 +257,10 @@ func (r *ReconcileOperationJob) ensureActiveDeadlineAndTTL(ctx context.Context,
257257
} else {
258258
logger.Info("should end but still processing")
259259
r.Recorder.Eventf(operationJob, corev1.EventTypeNormal, "Timeout", "Try to fail OperationJob for timeout...")
260-
// mark operationjob and targets failed and release targets
260+
// mark target failed if timeout
261261
MarkCandidateFailed(candidate)
262-
allowReleaseCandidates = append(allowReleaseCandidates, candidate)
263262
}
264263
}
265-
if len(allowReleaseCandidates) > 0 {
266-
releaseErr := r.releaseTargets(ctx, operationJob, allowReleaseCandidates, false)
267-
operationJob.Status = r.calculateStatus(operationJob, candidates)
268-
updateErr := r.updateStatus(ctx, operationJob)
269-
return false, nil, controllerutils.AggregateErrors([]error{releaseErr, updateErr})
270-
}
271264
}
272265

273266
if operationJob.Spec.TTLSecondsAfterFinished != nil {
@@ -286,26 +279,55 @@ func (r *ReconcileOperationJob) ensureActiveDeadlineAndTTL(ctx context.Context,
286279
return false, nil, nil
287280
}
288281

282+
// ensureFailedTargetsReleased select failed but unreleased targets and call releaseTargets
283+
func (r *ReconcileOperationJob) ensureFailedTargetsReleased(ctx context.Context, operationJob *appsv1alpha1.OperationJob, candidates []*OpsCandidate) error {
284+
var allowReleaseCandidates []*OpsCandidate
285+
for i := range candidates {
286+
if IsCandidateOpsFailed(candidates[i]) && !IsCandidateOpsReleased(candidates[i]) {
287+
allowReleaseCandidates = append(allowReleaseCandidates, candidates[i])
288+
}
289+
}
290+
if len(allowReleaseCandidates) > 0 {
291+
releaseErr := r.releaseTargets(ctx, operationJob, allowReleaseCandidates, false)
292+
operationJob.Status = r.calculateStatus(operationJob, candidates)
293+
updateErr := r.updateStatus(ctx, operationJob)
294+
return controllerutils.AggregateErrors([]error{releaseErr, updateErr})
295+
}
296+
return nil
297+
}
298+
289299
// releaseTargets try to release the targets from operation when the operationJob is deleted
290300
func (r *ReconcileOperationJob) releaseTargets(ctx context.Context, operationJob *appsv1alpha1.OperationJob, candidates []*OpsCandidate, needUpdateStatus bool) error {
291301
actionHandler, enablePodOpsLifecycle, err := r.getActionHandler(operationJob)
292302
if err != nil {
293303
return err
294304
}
295-
releaseErr := actionHandler.ReleaseTargets(ctx, candidates, operationJob)
305+
306+
// start to release targets
307+
releaseErrMap := actionHandler.ReleaseTargets(ctx, candidates, operationJob)
296308
_, _ = controllerutils.SlowStartBatch(len(candidates), controllerutils.SlowStartInitialBatchSize, false, func(i int, _ error) error {
297309
candidate := candidates[i]
298310
// cancel lifecycle if necessary
299311
if enablePodOpsLifecycle {
300312
err = r.cleanCandidateOpsLifecycle(ctx, true, candidate, operationJob)
301-
releaseErr = controllerutils.AggregateErrors([]error{releaseErr, err})
313+
releaseErrMap[candidate.PodName] = controllerutils.AggregateErrors([]error{releaseErrMap[candidate.PodName], err})
302314
}
303315
// mark candidate as failed if not finished
304316
if !IsCandidateOpsFinished(candidate) {
305317
candidate.OpsStatus.Progress = appsv1alpha1.OperationProgressFailed
306318
}
307319
return nil
308320
})
321+
322+
// mark target as released if error not occurred
323+
for _, candidate := range candidates {
324+
if releaseErrMap[candidate.PodName] == nil {
325+
MarkCandidateReleased(candidate)
326+
}
327+
}
328+
releaseErr := ctrlutils.AggregateErrors(ojutils.ConvertErrMapToList(releaseErrMap))
329+
330+
// update candidates status to job status
309331
if !needUpdateStatus {
310332
return releaseErr
311333
}

pkg/controllers/operationjob/opscore/candidate.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ type OpsCandidate struct {
3232
OpsStatus *appsv1alpha1.OpsStatus
3333
}
3434

35+
const (
36+
ExtraInfoReleased = "Released"
37+
)
38+
3539
func DecideCandidateByPartition(instance *appsv1alpha1.OperationJob, candidates []*OpsCandidate) []*OpsCandidate {
3640
if instance.Spec.Partition == nil {
3741
return candidates
@@ -81,6 +85,23 @@ func IsCandidateOpsFinished(candidate *OpsCandidate) bool {
8185
candidate.OpsStatus.Progress == appsv1alpha1.OperationProgressSucceeded
8286
}
8387

88+
func IsCandidateOpsReleased(candidate *OpsCandidate) bool {
89+
if candidate.OpsStatus == nil || candidate.OpsStatus.ExtraInfo == nil {
90+
return false
91+
}
92+
if val, exist := candidate.OpsStatus.ExtraInfo[ExtraInfoReleased]; exist && val == "true" {
93+
return true
94+
}
95+
return false
96+
}
97+
98+
func IsCandidateOpsFailed(candidate *OpsCandidate) bool {
99+
if candidate.OpsStatus == nil || candidate.OpsStatus.Progress == "" {
100+
return false
101+
}
102+
return candidate.OpsStatus.Progress == appsv1alpha1.OperationProgressFailed
103+
}
104+
84105
func IsCandidateServiceAvailable(candidate *OpsCandidate) bool {
85106
if candidate.Pod == nil || candidate.Pod.Labels == nil {
86107
return false
@@ -94,3 +115,13 @@ func MarkCandidateFailed(candidate *OpsCandidate) {
94115
candidate.OpsStatus.Progress = appsv1alpha1.OperationProgressFailed
95116
}
96117
}
118+
119+
func MarkCandidateReleased(candidate *OpsCandidate) {
120+
if candidate.OpsStatus == nil {
121+
candidate.OpsStatus = &appsv1alpha1.OpsStatus{}
122+
}
123+
if candidate.OpsStatus.ExtraInfo == nil {
124+
candidate.OpsStatus.ExtraInfo = map[string]string{}
125+
}
126+
candidate.OpsStatus.ExtraInfo[ExtraInfoReleased] = "true"
127+
}

pkg/controllers/operationjob/opscore/handler.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,12 @@ type ActionHandler interface {
3838
// Setup sets up action with manager in AddToMgr, i.e., watch, cache...
3939
Setup(controller.Controller, *mixin.ReconcilerMixin) error
4040

41-
// OperateTargets do real operation to targets
42-
OperateTargets(context.Context, []*OpsCandidate, *appsv1alpha1.OperationJob) error
41+
// OperateTargets do real operation to targets, and returns an error map to each target name
42+
OperateTargets(context.Context, []*OpsCandidate, *appsv1alpha1.OperationJob) map[string]error
4343

4444
// GetOpsProgress returns target's current opsStatus, e.g., progress, reason, message
4545
GetOpsProgress(context.Context, *OpsCandidate, *appsv1alpha1.OperationJob) (progress ActionProgress, err error)
4646

47-
// ReleaseTargets releases the target from operation when the operationJob is deleted
48-
ReleaseTargets(context.Context, []*OpsCandidate, *appsv1alpha1.OperationJob) error
47+
// ReleaseTargets releases the target from operation when failed, and returns an error map to each target name
48+
ReleaseTargets(context.Context, []*OpsCandidate, *appsv1alpha1.OperationJob) map[string]error
4949
}

pkg/controllers/operationjob/replace/replace.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package replace
1919
import (
2020
"context"
2121
"fmt"
22+
"sync"
2223

2324
"github.com/go-logr/logr"
2425
corev1 "k8s.io/api/core/v1"
@@ -62,9 +63,13 @@ func (p *PodReplaceHandler) Setup(controller controller.Controller, reconcileMix
6263
return controller.Watch(&source.Kind{Type: &corev1.Pod{}}, &OriginPodHandler{Client: reconcileMixin.Client})
6364
}
6465

65-
func (p *PodReplaceHandler) OperateTargets(ctx context.Context, candidates []*OpsCandidate, operationJob *appsv1alpha1.OperationJob) error {
66-
_, err := controllerutils.SlowStartBatch(len(candidates), controllerutils.SlowStartInitialBatchSize, false, func(i int, _ error) error {
66+
func (p *PodReplaceHandler) OperateTargets(ctx context.Context, candidates []*OpsCandidate, operationJob *appsv1alpha1.OperationJob) map[string]error {
67+
errMap := &sync.Map{}
68+
_, _ = controllerutils.SlowStartBatch(len(candidates), controllerutils.SlowStartInitialBatchSize, false, func(i int, _ error) (err error) {
6769
candidate := candidates[i]
70+
defer func() {
71+
errMap.Store(candidate.PodName, err)
72+
}()
6873
if candidate.Pod == nil || candidate.Pod.Labels == nil {
6974
return nil
7075
}
@@ -99,7 +104,7 @@ func (p *PodReplaceHandler) OperateTargets(ctx context.Context, candidates []*Op
99104
}
100105
return nil
101106
})
102-
return err
107+
return ojutils.ConvertSyncErrMap(errMap)
103108
}
104109

105110
func (p *PodReplaceHandler) GetOpsProgress(ctx context.Context, candidate *OpsCandidate, operationJob *appsv1alpha1.OperationJob) (progress ActionProgress, err error) {
@@ -173,9 +178,13 @@ func (p *PodReplaceHandler) GetOpsProgress(ctx context.Context, candidate *OpsCa
173178
return
174179
}
175180

176-
func (p *PodReplaceHandler) ReleaseTargets(ctx context.Context, candidates []*OpsCandidate, operationJob *appsv1alpha1.OperationJob) error {
177-
_, err := controllerutils.SlowStartBatch(len(candidates), controllerutils.SlowStartInitialBatchSize, false, func(i int, _ error) error {
181+
func (p *PodReplaceHandler) ReleaseTargets(ctx context.Context, candidates []*OpsCandidate, operationJob *appsv1alpha1.OperationJob) map[string]error {
182+
errMap := &sync.Map{}
183+
_, _ = controllerutils.SlowStartBatch(len(candidates), controllerutils.SlowStartInitialBatchSize, false, func(i int, _ error) (err error) {
178184
candidate := candidates[i]
185+
defer func() {
186+
errMap.Store(candidate.PodName, err)
187+
}()
179188
if candidate.Pod == nil || candidate.Pod.DeletionTimestamp != nil {
180189
return nil
181190
}
@@ -196,7 +205,8 @@ func (p *PodReplaceHandler) ReleaseTargets(ctx context.Context, candidates []*Op
196205
ojutils.SetOpsStatusError(candidate, ojutils.ReasonUpdateObjectFailed, retErr.Error())
197206
return retErr
198207
}
208+
candidate.OpsStatus.ExtraInfo[ExtraInfoReleased] = "true"
199209
return nil
200210
})
201-
return err
211+
return ojutils.ConvertSyncErrMap(errMap)
202212
}

pkg/controllers/operationjob/utils/common.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package utils
1818

1919
import (
2020
"context"
21+
"sync"
2122

2223
corev1 "k8s.io/api/core/v1"
2324
"k8s.io/apimachinery/pkg/types"
@@ -89,3 +90,24 @@ func UpdatePodWithRetry(ctx context.Context, c client.Client, obj client.Object,
8990
return c.Update(ctx, pod)
9091
})
9192
}
93+
94+
func ConvertErrMapToList(errMap map[string]error) []error {
95+
var errList []error
96+
for _, v := range errMap {
97+
errList = append(errList, v)
98+
}
99+
return errList
100+
}
101+
102+
func ConvertSyncErrMap(errMap *sync.Map) map[string]error {
103+
ret := make(map[string]error)
104+
errMap.Range(func(key, value any) bool {
105+
if value == nil {
106+
ret[key.(string)] = nil
107+
} else {
108+
ret[key.(string)] = value.(error)
109+
}
110+
return true
111+
})
112+
return ret
113+
}

0 commit comments

Comments
 (0)