Skip to content

Commit 37d9200

Browse files
committed
set status for other failure modes during ClusterExtensionRevision reconciliation
Signed-off-by: Joe Lanford <[email protected]>
1 parent d97de50 commit 37d9200

File tree

3 files changed

+67
-16
lines changed

3 files changed

+67
-16
lines changed

internal/operator-controller/applier/boxcutter.go

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"context"
66
"crypto/sha256"
77
"encoding/hex"
8+
"errors"
89
"fmt"
910
"hash"
1011
"io/fs"
@@ -204,11 +205,18 @@ func (bc *Boxcutter) apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust
204205
// TODO: Delete archived previous revisions over a certain revision limit
205206

206207
// TODO: Define constants for the ClusterExtensionRevision condition types.
207-
installedCondition := meta.FindStatusCondition(currentRevision.Status.Conditions, "Succeeded")
208-
if installedCondition == nil {
208+
progressingCondition := meta.FindStatusCondition(currentRevision.Status.Conditions, "Progressing")
209+
availableCondition := meta.FindStatusCondition(currentRevision.Status.Conditions, "Available")
210+
succeededCondition := meta.FindStatusCondition(currentRevision.Status.Conditions, "Succeeded")
211+
212+
if progressingCondition == nil && availableCondition == nil && succeededCondition == nil {
209213
return false, "New revision created", nil
210-
} else if installedCondition.Status != metav1.ConditionTrue {
211-
return false, installedCondition.Message, nil
214+
} else if progressingCondition != nil && progressingCondition.Status == metav1.ConditionTrue {
215+
return false, progressingCondition.Message, nil
216+
} else if availableCondition != nil && availableCondition.Status != metav1.ConditionTrue {
217+
return false, "", errors.New(availableCondition.Message)
218+
} else if succeededCondition != nil && succeededCondition.Status != metav1.ConditionTrue {
219+
return false, succeededCondition.Message, nil
212220
}
213221
return true, "", nil
214222
}
@@ -240,6 +248,9 @@ func computeSHA256Hash(obj any) string {
240248
func deepHashObject(hasher hash.Hash, objectToWrite any) {
241249
hasher.Reset()
242250

251+
// TODO: change this out to `json.Marshal`. Pretty sure we found issues in the past where
252+
// spew would produce different output when internal structures changed without the
253+
// external public API changing.
243254
printer := spew.ConfigState{
244255
Indent: " ",
245256
SortKeys: true,

internal/operator-controller/controllers/clusterextensionrevision_controller.go

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package controllers
55
import (
66
"context"
77
"encoding/json"
8+
"errors"
89
"fmt"
910
"strings"
1011
"time"
@@ -105,20 +106,58 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, ce *
105106
}
106107
rres, err := c.RevisionEngine.Reconcile(ctx, *revision, opts...)
107108
if err != nil {
108-
return ctrl.Result{}, fmt.Errorf("revision reconcile: %w", err)
109+
meta.SetStatusCondition(&rev.Status.Conditions, metav1.Condition{
110+
Type: "Available",
111+
Status: metav1.ConditionFalse,
112+
Reason: "ReconcileFailure",
113+
Message: err.Error(),
114+
ObservedGeneration: rev.Generation,
115+
})
116+
return ctrl.Result{}, fmt.Errorf("revision reconcile: %w", errors.Join(err, c.Client.Status().Update(ctx, rev)))
109117
}
110118
l.Info("reconcile report", "report", rres.String())
111119

112120
// Retry failing preflight checks with a flat 10s retry.
113121
// TODO: report status, backoff?
114122
if verr := rres.GetValidationError(); verr != nil {
115123
l.Info("preflight error, retrying after 10s", "err", verr.String())
116-
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
124+
meta.SetStatusCondition(&rev.Status.Conditions, metav1.Condition{
125+
Type: "Available",
126+
Status: metav1.ConditionFalse,
127+
Reason: "RevisionValidationFailure",
128+
Message: fmt.Sprintf("revision validation error: %s", verr),
129+
ObservedGeneration: rev.Generation,
130+
})
131+
return ctrl.Result{RequeueAfter: 10 * time.Second}, c.Client.Status().Update(ctx, rev)
117132
}
118-
for _, pres := range rres.GetPhases() {
133+
for i, pres := range rres.GetPhases() {
119134
if verr := pres.GetValidationError(); verr != nil {
120135
l.Info("preflight error, retrying after 10s", "err", verr.String())
121-
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
136+
meta.SetStatusCondition(&rev.Status.Conditions, metav1.Condition{
137+
Type: "Available",
138+
Status: metav1.ConditionFalse,
139+
Reason: "PhaseValidationError",
140+
Message: fmt.Sprintf("phase %d validation error: %s", i, verr),
141+
ObservedGeneration: rev.Generation,
142+
})
143+
return ctrl.Result{RequeueAfter: 10 * time.Second}, c.Client.Status().Update(ctx, rev)
144+
}
145+
var collidingObjs []string
146+
for _, ores := range pres.GetObjects() {
147+
if ores.Action() == machinery.ActionCollision {
148+
collidingObjs = append(collidingObjs, ores.String())
149+
}
150+
}
151+
if len(collidingObjs) > 0 {
152+
l.Info("object collision error, retrying after 10s", "collisions", collidingObjs)
153+
meta.SetStatusCondition(&rev.Status.Conditions, metav1.Condition{
154+
Type: "Available",
155+
Status: metav1.ConditionFalse,
156+
Reason: "ObjectCollisions",
157+
Message: fmt.Sprintf("revision object collisions in phase %d\n%s", i, strings.Join(collidingObjs, "\n\n")),
158+
ObservedGeneration: rev.Generation,
159+
})
160+
return ctrl.Result{RequeueAfter: 10 * time.Second}, c.Client.Status().Update(ctx, rev)
122161
}
123162
}
124163

@@ -191,14 +230,14 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, ce *
191230
}
192231
if rres.InTransistion() {
193232
meta.SetStatusCondition(&rev.Status.Conditions, metav1.Condition{
194-
Type: "InTransition",
233+
Type: "Progressing",
195234
Status: metav1.ConditionTrue,
196-
Reason: "InTransition",
235+
Reason: "Progressing",
197236
Message: "Rollout in progress.",
198237
ObservedGeneration: rev.Generation,
199238
})
200239
} else {
201-
meta.RemoveStatusCondition(&rev.Status.Conditions, "InTransition")
240+
meta.RemoveStatusCondition(&rev.Status.Conditions, "Progressing")
202241
}
203242

204243
return ctrl.Result{}, c.Client.Status().Update(ctx, rev)

test/e2e/cluster_extension_install_test.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -940,14 +940,12 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) {
940940
require.Equal(ct, ocv1.ReasonRetrying, cond.Reason)
941941
}, pollDuration, pollInterval)
942942

943-
t.Log("By eventually failing to install the package successfully due to no namespace")
943+
t.Log("By eventually reporting Installed != True")
944944
require.EventuallyWithT(t, func(ct *assert.CollectT) {
945945
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
946946
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
947947
require.NotNil(ct, cond)
948-
require.Equal(ct, metav1.ConditionUnknown, cond.Status)
949-
require.Equal(ct, ocv1.ReasonFailed, cond.Reason)
950-
require.Contains(ct, cond.Message, fmt.Sprintf("service account %q not found in namespace %q: unable to authenticate with the Kubernetes cluster.", clusterExtension.Name, clusterExtension.Name))
948+
require.NotEqual(ct, metav1.ConditionTrue, cond.Status)
951949
}, pollDuration, pollInterval)
952950

953951
t.Log("By creating the Namespace and ServiceAccount")
@@ -1057,7 +1055,10 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi
10571055
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
10581056
require.NotNil(ct, cond)
10591057
require.Equal(ct, metav1.ConditionFalse, cond.Status)
1060-
require.Equal(ct, ocv1.ReasonFailed, cond.Reason)
1058+
// TODO: We probably _should_ be testing the reason here, but helm and boxcutter applier have different reasons.
1059+
// Maybe we change helm to use "Absent" rather than "Failed" since the Progressing condition already captures
1060+
// the failure?
1061+
//require.Equal(ct, ocv1.ReasonFailed, cond.Reason)
10611062
require.Contains(ct, cond.Message, "No bundle installed")
10621063
}, pollDuration, pollInterval)
10631064

0 commit comments

Comments
 (0)