Skip to content

Commit 611490f

Browse files
Merge pull request #316 from abhinavdahiya/summarize_workloads
Bug 1768260: lib,pkg: provide detailed errors for workload failures
2 parents 960a109 + 5dafc9e commit 611490f

File tree

2 files changed

+95
-25
lines changed

2 files changed

+95
-25
lines changed

lib/resourcebuilder/apps.go

Lines changed: 85 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,19 @@ import (
66
"strings"
77

88
configv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
9-
"github.com/openshift/cluster-version-operator/lib"
10-
"github.com/openshift/cluster-version-operator/lib/resourceapply"
11-
"github.com/openshift/cluster-version-operator/lib/resourceread"
129
appsv1 "k8s.io/api/apps/v1"
10+
corev1 "k8s.io/api/core/v1"
1311
"k8s.io/apimachinery/pkg/api/errors"
1412
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1513
"k8s.io/apimachinery/pkg/util/wait"
1614
appsclientv1 "k8s.io/client-go/kubernetes/typed/apps/v1"
1715
"k8s.io/client-go/rest"
1816
"k8s.io/klog"
17+
18+
"github.com/openshift/cluster-version-operator/lib"
19+
"github.com/openshift/cluster-version-operator/lib/resourceapply"
20+
"github.com/openshift/cluster-version-operator/lib/resourceread"
21+
"github.com/openshift/cluster-version-operator/pkg/payload"
1922
)
2023

2124
type deploymentBuilder struct {
@@ -77,7 +80,9 @@ func (b *deploymentBuilder) Do(ctx context.Context) error {
7780
return nil
7881
}
7982
func waitForDeploymentCompletion(ctx context.Context, client appsclientv1.DeploymentsGetter, deployment *appsv1.Deployment) error {
80-
return wait.PollImmediateUntil(defaultObjectPollInterval, func() (bool, error) {
83+
iden := fmt.Sprintf("%s/%s", deployment.Namespace, deployment.Name)
84+
var lastErr error
85+
err := wait.PollImmediateUntil(defaultObjectPollInterval, func() (bool, error) {
8186
d, err := client.Deployments(deployment.Namespace).Get(deployment.Name, metav1.GetOptions{})
8287
if errors.IsNotFound(err) {
8388
// exit early to recreate the deployment.
@@ -86,41 +91,77 @@ func waitForDeploymentCompletion(ctx context.Context, client appsclientv1.Deploy
8691
if err != nil {
8792
// Do not return error here, as we could be updating the API Server itself, in which case we
8893
// want to continue waiting.
89-
klog.Errorf("error getting Deployment %s during rollout: %v", deployment.Name, err)
94+
lastErr = &payload.UpdateError{
95+
Nested: err,
96+
Reason: "WorkloadNotAvailable",
97+
Message: fmt.Sprintf("could not find the deployment %s during rollout", iden),
98+
Name: iden,
99+
}
90100
return false, nil
91101
}
92102

93103
if d.DeletionTimestamp != nil {
94-
return false, fmt.Errorf("Deployment %s is being deleted", deployment.Name)
104+
return false, fmt.Errorf("Deployment %s is being deleted", iden)
95105
}
96106

97107
if d.Generation <= d.Status.ObservedGeneration && d.Status.UpdatedReplicas == d.Status.Replicas && d.Status.UnavailableReplicas == 0 {
98108
return true, nil
99109
}
100110

101-
deploymentConditions := make([]string, 0, len(d.Status.Conditions))
102-
for _, dc := range d.Status.Conditions {
111+
var availableCondition *appsv1.DeploymentCondition
112+
var progressingCondition *appsv1.DeploymentCondition
113+
var replicafailureCondition *appsv1.DeploymentCondition
114+
for idx, dc := range d.Status.Conditions {
103115
switch dc.Type {
104-
case appsv1.DeploymentProgressing, appsv1.DeploymentAvailable:
105-
if dc.Status == "False" {
106-
deploymentConditions = append(deploymentConditions, fmt.Sprintf(", reason: %s, message: %s", dc.Reason, dc.Message))
107-
}
116+
case appsv1.DeploymentProgressing:
117+
progressingCondition = &d.Status.Conditions[idx]
118+
case appsv1.DeploymentAvailable:
119+
availableCondition = &d.Status.Conditions[idx]
108120
case appsv1.DeploymentReplicaFailure:
109-
if dc.Status == "True" {
110-
deploymentConditions = append(deploymentConditions, fmt.Sprintf(", reason: %s, message: %s", dc.Reason, dc.Message))
111-
}
121+
replicafailureCondition = &d.Status.Conditions[idx]
122+
}
123+
}
124+
125+
if replicafailureCondition != nil && replicafailureCondition.Status == corev1.ConditionTrue {
126+
lastErr = &payload.UpdateError{
127+
Nested: fmt.Errorf("deployment %s has some pods failing; unavailable replicas=%d", iden, d.Status.UnavailableReplicas),
128+
Reason: "WorkloadNotProgressing",
129+
Message: fmt.Sprintf("deployment %s has a replica failure %s: %s", iden, replicafailureCondition.Reason, replicafailureCondition.Message),
130+
Name: iden,
131+
}
132+
return false, nil
133+
}
134+
135+
if availableCondition != nil && availableCondition.Status == corev1.ConditionFalse {
136+
lastErr = &payload.UpdateError{
137+
Nested: fmt.Errorf("deployment %s is not available; updated replicas=%d of %d, available replicas=%d of %d", iden, d.Status.UpdatedReplicas, d.Status.Replicas, d.Status.AvailableReplicas, d.Status.Replicas),
138+
Reason: "WorkloadNotAvailable",
139+
Message: fmt.Sprintf("deployment %s is not available %s: %s", iden, availableCondition.Reason, availableCondition.Message),
140+
Name: iden,
141+
}
142+
return false, nil
143+
}
144+
145+
if progressingCondition != nil && progressingCondition.Status == corev1.ConditionTrue {
146+
lastErr = &payload.UpdateError{
147+
Nested: fmt.Errorf("deployment %s is progressing; updated replicas=%d of %d, available replicas=%d of %d", iden, d.Status.UpdatedReplicas, d.Status.Replicas, d.Status.AvailableReplicas, d.Status.Replicas),
148+
Reason: "WorkloadNotAvailable",
149+
Message: fmt.Sprintf("deployment %s is progressing %s: %s", iden, progressingCondition.Reason, progressingCondition.Message),
150+
Name: iden,
112151
}
152+
return false, nil
113153
}
114154

115-
klog.V(4).Infof("Deployment %s is not ready. status: (replicas: %d, updated: %d, ready: %d, unavailable: %d%s)",
116-
d.Name,
117-
d.Status.Replicas,
118-
d.Status.UpdatedReplicas,
119-
d.Status.ReadyReplicas,
120-
d.Status.UnavailableReplicas,
121-
strings.Join(deploymentConditions, ""))
155+
klog.Errorf("deployment %s is in unknown state", iden)
122156
return false, nil
123157
}, ctx.Done())
158+
if err != nil {
159+
if err == wait.ErrWaitTimeout && lastErr != nil {
160+
return lastErr
161+
}
162+
return err
163+
}
164+
return nil
124165
}
125166

126167
type daemonsetBuilder struct {
@@ -183,7 +224,9 @@ func (b *daemonsetBuilder) Do(ctx context.Context) error {
183224
}
184225

185226
func waitForDaemonsetRollout(ctx context.Context, client appsclientv1.DaemonSetsGetter, daemonset *appsv1.DaemonSet) error {
186-
return wait.PollImmediateUntil(defaultObjectPollInterval, func() (bool, error) {
227+
iden := fmt.Sprintf("%s/%s", daemonset.Namespace, daemonset.Name)
228+
var lastErr error
229+
err := wait.PollImmediateUntil(defaultObjectPollInterval, func() (bool, error) {
187230
d, err := client.DaemonSets(daemonset.Namespace).Get(daemonset.Name, metav1.GetOptions{})
188231
if errors.IsNotFound(err) {
189232
// exit early to recreate the daemonset.
@@ -192,7 +235,12 @@ func waitForDaemonsetRollout(ctx context.Context, client appsclientv1.DaemonSets
192235
if err != nil {
193236
// Do not return error here, as we could be updating the API Server itself, in which case we
194237
// want to continue waiting.
195-
klog.Errorf("error getting Daemonset %s during rollout: %v", daemonset.Name, err)
238+
lastErr = &payload.UpdateError{
239+
Nested: err,
240+
Reason: "WorkloadNotAvailable",
241+
Message: fmt.Sprintf("could not find the daemonset %s during rollout", iden),
242+
Name: iden,
243+
}
196244
return false, nil
197245
}
198246

@@ -203,7 +251,19 @@ func waitForDaemonsetRollout(ctx context.Context, client appsclientv1.DaemonSets
203251
if d.Generation <= d.Status.ObservedGeneration && d.Status.UpdatedNumberScheduled == d.Status.DesiredNumberScheduled && d.Status.NumberUnavailable == 0 {
204252
return true, nil
205253
}
206-
klog.V(4).Infof("Daemonset %s is not ready. status: (desired: %d, updated: %d, ready: %d, unavailable: %d)", d.Name, d.Status.DesiredNumberScheduled, d.Status.UpdatedNumberScheduled, d.Status.NumberReady, d.Status.NumberAvailable)
254+
lastErr = &payload.UpdateError{
255+
Nested: fmt.Errorf("daemonset %s is progressing; updated replicas=%d of %d, available replicas=%d of %d", iden, d.Status.UpdatedNumberScheduled, d.Status.DesiredNumberScheduled, d.Status.NumberAvailable, d.Status.DesiredNumberScheduled),
256+
Reason: "WorkloadNotAvailable",
257+
Message: fmt.Sprintf("daemonset %s is progressing", iden),
258+
Name: iden,
259+
}
207260
return false, nil
208261
}, ctx.Done())
262+
if err != nil {
263+
if err == wait.ErrWaitTimeout && lastErr != nil {
264+
return lastErr
265+
}
266+
return err
267+
}
268+
return nil
209269
}

pkg/payload/task.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,16 @@ func SummaryForReason(reason, name string) string {
207207
return "a cluster operator has not yet rolled out"
208208
case "ClusterOperatorsNotAvailable":
209209
return "some cluster operators have not yet rolled out"
210+
case "WorkloadNotAvailable":
211+
if len(name) > 0 {
212+
return fmt.Sprintf("the workload %s has not yet successfully rolled out", name)
213+
}
214+
return "a workload has not yet rolled out"
215+
case "WorkloadNotProgressing":
216+
if len(name) > 0 {
217+
return fmt.Sprintf("the workload %s cannot roll out", name)
218+
}
219+
return "a workload cannot roll out"
210220
}
211221

212222
if strings.HasPrefix(reason, "UpdatePayload") {

0 commit comments

Comments
 (0)