Skip to content

Commit f5ab29f

Browse files
authored
🐛 CRD Upgrade Safety pre-flight fixes (#1104)
* (bugfix): crd preflight passes on old crd not found crd preflight updated to run on both install and upgrade e2e test added to ensure fixing error in an initial install failure results in successful install (or upgrade in helm semantics) Signed-off-by: everettraven <[email protected]> * address reviews Signed-off-by: everettraven <[email protected]> * add comment regarding eating errors Signed-off-by: everettraven <[email protected]> * sanity Signed-off-by: everettraven <[email protected]> * sanity pt.2 Signed-off-by: everettraven <[email protected]> * address comment about comments Signed-off-by: everettraven <[email protected]> * fix lint error Signed-off-by: everettraven <[email protected]> --------- Signed-off-by: everettraven <[email protected]>
1 parent 2490a01 commit f5ab29f

File tree

4 files changed

+268
-14
lines changed

4 files changed

+268
-14
lines changed

internal/controllers/clusterextension_controller.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,15 @@ func (r *ClusterExtensionReconciler) reconcile(ctx context.Context, ext *ocv1alp
271271
}
272272

273273
l.V(1).Info("applying bundle contents")
274+
// NOTE: We need to be cautious of eating errors here.
275+
// We should always return any error that occurs during an
276+
// attempt to apply content to the cluster. Only when there is
277+
// a verifiable reason to eat the error (i.e it is recoverable)
278+
// should an exception be made.
279+
// The following kinds of errors should be returned up the stack
280+
// to ensure exponential backoff can occur:
281+
// - Permission errors (it is not possible to watch changes to permissions.
282+
// The only way to eventually recover from permission errors is to keep retrying).
274283
managedObjs, state, err := r.Applier.Apply(ctx, unpackResult.Bundle, ext, lbls)
275284
if err != nil {
276285
setInstalledStatusConditionFailed(ext, err.Error())

internal/rukpak/preflights/crdupgradesafety/crdupgradesafety.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"helm.sh/helm/v3/pkg/release"
1111
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
1212
apiextensionsv1client "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset/typed/apiextensions/v1"
13+
apierrors "k8s.io/apimachinery/pkg/api/errors"
1314
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1415
"k8s.io/apimachinery/pkg/runtime"
1516

@@ -64,11 +65,15 @@ func NewPreflight(crdCli apiextensionsv1client.CustomResourceDefinitionInterface
6465
return p
6566
}
6667

67-
func (p *Preflight) Install(_ context.Context, _ *release.Release) error {
68-
return nil
68+
func (p *Preflight) Install(ctx context.Context, rel *release.Release) error {
69+
return p.runPreflight(ctx, rel)
6970
}
7071

7172
func (p *Preflight) Upgrade(ctx context.Context, rel *release.Release) error {
73+
return p.runPreflight(ctx, rel)
74+
}
75+
76+
func (p *Preflight) runPreflight(ctx context.Context, rel *release.Release) error {
7277
if rel == nil {
7378
return nil
7479
}
@@ -96,6 +101,11 @@ func (p *Preflight) Upgrade(ctx context.Context, rel *release.Release) error {
96101

97102
oldCrd, err := p.crdClient.Get(ctx, newCrd.Name, metav1.GetOptions{})
98103
if err != nil {
104+
// if there is no existing CRD, there is nothing to break
105+
// so it is immediately successful.
106+
if apierrors.IsNotFound(err) {
107+
continue
108+
}
99109
return fmt.Errorf("getting existing resource for CRD %q: %w", newCrd.Name, err)
100110
}
101111

internal/rukpak/preflights/crdupgradesafety/crdupgradesafety_test.go

Lines changed: 145 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@ import (
1212
"helm.sh/helm/v3/pkg/release"
1313
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
1414
apiextensionsv1client "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset/typed/apiextensions/v1"
15+
apierrors "k8s.io/apimachinery/pkg/api/errors"
1516
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1617
"k8s.io/apimachinery/pkg/runtime"
18+
"k8s.io/apimachinery/pkg/runtime/schema"
1719

1820
"github.com/operator-framework/operator-controller/internal/rukpak/preflights/crdupgradesafety"
1921
"github.com/operator-framework/operator-controller/internal/rukpak/util"
@@ -71,8 +73,140 @@ func getManifestString(t *testing.T, crdFile string) string {
7173
// TestInstall exists only for completeness as Install() is currently a no-op. It can be used as
7274
// a template for real tests in the future if the func is implemented.
7375
func TestInstall(t *testing.T) {
74-
preflight := newMockPreflight(nil, nil, nil)
75-
require.Nil(t, preflight.Install(context.Background(), nil))
76+
tests := []struct {
77+
name string
78+
oldCrdPath string
79+
validator *kappcus.Validator
80+
release *release.Release
81+
wantErrMsgs []string
82+
wantCrdGetErr error
83+
}{
84+
{
85+
name: "nil release",
86+
},
87+
{
88+
name: "release with no objects",
89+
release: &release.Release{
90+
Name: "test-release",
91+
},
92+
},
93+
{
94+
name: "release with invalid manifest",
95+
release: &release.Release{
96+
Name: "test-release",
97+
Manifest: "abcd",
98+
},
99+
wantErrMsgs: []string{"json: cannot unmarshal string into Go value of type unstructured.detector"},
100+
},
101+
{
102+
name: "release with no CRD objects",
103+
release: &release.Release{
104+
Name: "test-release",
105+
Manifest: getManifestString(t, "no-crds.json"),
106+
},
107+
},
108+
{
109+
name: "fail to get old crd other than not found error",
110+
release: &release.Release{
111+
Name: "test-release",
112+
Manifest: getManifestString(t, "crd-valid-upgrade.json"),
113+
},
114+
wantCrdGetErr: fmt.Errorf("error!"),
115+
wantErrMsgs: []string{"error!"},
116+
},
117+
{
118+
name: "fail to get old crd, not found error",
119+
release: &release.Release{
120+
Name: "test-release",
121+
Manifest: getManifestString(t, "crd-valid-upgrade.json"),
122+
},
123+
wantCrdGetErr: apierrors.NewNotFound(schema.GroupResource{Group: apiextensionsv1.SchemeGroupVersion.Group, Resource: "customresourcedefinitions"}, "not found"),
124+
},
125+
{
126+
name: "invalid crd manifest file",
127+
release: &release.Release{
128+
Name: "test-release",
129+
Manifest: getManifestString(t, "crd-invalid"),
130+
},
131+
wantErrMsgs: []string{"json: cannot unmarshal"},
132+
},
133+
{
134+
name: "custom validator",
135+
oldCrdPath: "old-crd.json",
136+
release: &release.Release{
137+
Name: "test-release",
138+
Manifest: getManifestString(t, "old-crd.json"),
139+
},
140+
validator: &kappcus.Validator{
141+
Validations: []kappcus.Validation{
142+
kappcus.NewValidationFunc("test", func(old, new apiextensionsv1.CustomResourceDefinition) error {
143+
return fmt.Errorf("custom validation error!!")
144+
}),
145+
},
146+
},
147+
wantErrMsgs: []string{"custom validation error!!"},
148+
},
149+
{
150+
name: "valid upgrade",
151+
oldCrdPath: "old-crd.json",
152+
release: &release.Release{
153+
Name: "test-release",
154+
Manifest: getManifestString(t, "crd-valid-upgrade.json"),
155+
},
156+
},
157+
{
158+
name: "new crd validation failures (all except existing field removal)",
159+
// Not really intended to test kapp validators, although it does anyway to a large extent.
160+
// This test is primarily meant to ensure that we are actually using all of them.
161+
oldCrdPath: "old-crd.json",
162+
release: &release.Release{
163+
Name: "test-release",
164+
Manifest: getManifestString(t, "crd-invalid-upgrade.json"),
165+
},
166+
wantErrMsgs: []string{
167+
`"NoScopeChange"`,
168+
`"NoStoredVersionRemoved"`,
169+
`enums added`,
170+
`new required fields added`,
171+
`maximum constraint added when one did not exist previously`,
172+
`maximum items constraint added`,
173+
`maximum length constraint added`,
174+
`maximum properties constraint added`,
175+
`minimum constraint added when one did not exist previously`,
176+
`minimum items constraint added`,
177+
`minimum length constraint added`,
178+
`minimum properties constraint added`,
179+
`new value added as default`,
180+
},
181+
},
182+
{
183+
name: "new crd validation failure for existing field removal",
184+
// Separate test from above as this error will cause the validator to
185+
// return early and skip some of the above validations.
186+
oldCrdPath: "old-crd.json",
187+
release: &release.Release{
188+
Name: "test-release",
189+
Manifest: getManifestString(t, "crd-field-removed.json"),
190+
},
191+
wantErrMsgs: []string{
192+
`"NoExistingFieldRemoved"`,
193+
},
194+
},
195+
}
196+
197+
for _, tc := range tests {
198+
t.Run(tc.name, func(t *testing.T) {
199+
preflight := newMockPreflight(getCrdFromManifestFile(t, tc.oldCrdPath), tc.wantCrdGetErr, tc.validator)
200+
err := preflight.Install(context.Background(), tc.release)
201+
if len(tc.wantErrMsgs) != 0 {
202+
for _, expectedErrMsg := range tc.wantErrMsgs {
203+
require.ErrorContainsf(t, err, expectedErrMsg, "")
204+
}
205+
} else {
206+
require.NoError(t, err)
207+
}
208+
})
209+
}
76210
}
77211

78212
func TestUpgrade(t *testing.T) {
@@ -109,14 +243,22 @@ func TestUpgrade(t *testing.T) {
109243
},
110244
},
111245
{
112-
name: "fail to get old crd",
246+
name: "fail to get old crd, other than not found error",
113247
release: &release.Release{
114248
Name: "test-release",
115249
Manifest: getManifestString(t, "crd-valid-upgrade.json"),
116250
},
117251
wantCrdGetErr: fmt.Errorf("error!"),
118252
wantErrMsgs: []string{"error!"},
119253
},
254+
{
255+
name: "fail to get old crd, not found error",
256+
release: &release.Release{
257+
Name: "test-release",
258+
Manifest: getManifestString(t, "crd-valid-upgrade.json"),
259+
},
260+
wantCrdGetErr: apierrors.NewNotFound(schema.GroupResource{Group: apiextensionsv1.SchemeGroupVersion.Group, Resource: "customresourcedefinitions"}, "not found"),
261+
},
120262
{
121263
name: "invalid crd manifest file",
122264
release: &release.Release{

test/e2e/cluster_extension_install_test.go

Lines changed: 102 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,14 @@ func createServiceAccount(ctx context.Context, name types.NamespacedName, cluste
5050
if err != nil {
5151
return nil, err
5252
}
53+
54+
return sa, createClusterRoleAndBindingForSA(ctx, name.Name, sa, clusterExtensionName)
55+
}
56+
57+
func createClusterRoleAndBindingForSA(ctx context.Context, name string, sa *corev1.ServiceAccount, clusterExtensionName string) error {
5358
cr := &rbacv1.ClusterRole{
5459
ObjectMeta: metav1.ObjectMeta{
55-
Name: name.Name,
60+
Name: name,
5661
},
5762
Rules: []rbacv1.PolicyRule{
5863
{
@@ -144,33 +149,33 @@ func createServiceAccount(ctx context.Context, name types.NamespacedName, cluste
144149
},
145150
},
146151
}
147-
err = c.Create(ctx, cr)
152+
err := c.Create(ctx, cr)
148153
if err != nil {
149-
return nil, err
154+
return err
150155
}
151156
crb := &rbacv1.ClusterRoleBinding{
152157
ObjectMeta: metav1.ObjectMeta{
153-
Name: name.Name,
158+
Name: name,
154159
},
155160
Subjects: []rbacv1.Subject{
156161
{
157162
Kind: "ServiceAccount",
158-
Name: name.Name,
159-
Namespace: name.Namespace,
163+
Name: sa.Name,
164+
Namespace: sa.Namespace,
160165
},
161166
},
162167
RoleRef: rbacv1.RoleRef{
163168
APIGroup: "rbac.authorization.k8s.io",
164169
Kind: "ClusterRole",
165-
Name: name.Name,
170+
Name: name,
166171
},
167172
}
168173
err = c.Create(ctx, crb)
169174
if err != nil {
170-
return nil, err
175+
return err
171176
}
172177

173-
return sa, nil
178+
return nil
174179
}
175180

176181
func testInit(t *testing.T) (*ocv1alpha1.ClusterExtension, *catalogd.ClusterCatalog, *corev1.ServiceAccount) {
@@ -605,6 +610,94 @@ func TestClusterExtensionInstallReResolvesWhenManagedContentChanged(t *testing.T
605610
}, pollDuration, pollInterval)
606611
}
607612

613+
func TestClusterExtensionRecoversFromInitialInstallFailedWhenFailureFixed(t *testing.T) {
614+
t.Log("When a cluster extension is installed from a catalog")
615+
t.Log("When the extension bundle format is registry+v1")
616+
617+
clusterExtension, extensionCatalog, _ := testInit(t)
618+
name := rand.String(10)
619+
sa := &corev1.ServiceAccount{
620+
ObjectMeta: metav1.ObjectMeta{
621+
Name: name,
622+
Namespace: "default",
623+
},
624+
}
625+
err := c.Create(context.Background(), sa)
626+
require.NoError(t, err)
627+
628+
defer testCleanup(t, extensionCatalog, clusterExtension, sa)
629+
defer getArtifactsOutput(t)
630+
631+
clusterExtension.Spec = ocv1alpha1.ClusterExtensionSpec{
632+
PackageName: "prometheus",
633+
InstallNamespace: "default",
634+
ServiceAccount: ocv1alpha1.ServiceAccountReference{
635+
Name: sa.Name,
636+
},
637+
}
638+
t.Log("It resolves the specified package with correct bundle path")
639+
t.Log("By creating the ClusterExtension resource")
640+
require.NoError(t, c.Create(context.Background(), clusterExtension))
641+
642+
t.Log("By eventually reporting a successful resolution and bundle path")
643+
require.EventuallyWithT(t, func(ct *assert.CollectT) {
644+
assert.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
645+
assert.Len(ct, clusterExtension.Status.Conditions, len(conditionsets.ConditionTypes))
646+
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1alpha1.TypeResolved)
647+
if !assert.NotNil(ct, cond) {
648+
return
649+
}
650+
assert.Equal(ct, metav1.ConditionTrue, cond.Status)
651+
assert.Equal(ct, ocv1alpha1.ReasonSuccess, cond.Reason)
652+
assert.Contains(ct, cond.Message, "resolved to")
653+
assert.Equal(ct, &ocv1alpha1.BundleMetadata{Name: "prometheus-operator.1.2.0", Version: "1.2.0"}, clusterExtension.Status.ResolvedBundle)
654+
}, pollDuration, pollInterval)
655+
656+
t.Log("By eventually reporting a successful unpacked")
657+
require.EventuallyWithT(t, func(ct *assert.CollectT) {
658+
assert.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
659+
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1alpha1.TypeUnpacked)
660+
if !assert.NotNil(ct, cond) {
661+
return
662+
}
663+
assert.Equal(ct, metav1.ConditionTrue, cond.Status)
664+
assert.Equal(ct, ocv1alpha1.ReasonUnpackSuccess, cond.Reason)
665+
assert.Contains(ct, cond.Message, "unpack successful")
666+
}, pollDuration, pollInterval)
667+
668+
t.Log("By eventually failing to install the package successfully due to insufficient ServiceAccount permissions")
669+
require.EventuallyWithT(t, func(ct *assert.CollectT) {
670+
assert.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
671+
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1alpha1.TypeInstalled)
672+
if !assert.NotNil(ct, cond) {
673+
return
674+
}
675+
assert.Equal(ct, metav1.ConditionFalse, cond.Status)
676+
assert.Equal(ct, ocv1alpha1.ReasonInstallationFailed, cond.Reason)
677+
assert.Contains(ct, cond.Message, "forbidden")
678+
}, pollDuration, pollInterval)
679+
680+
t.Log("By fixing the ServiceAccount permissions")
681+
require.NoError(t, createClusterRoleAndBindingForSA(context.Background(), name, sa, clusterExtension.Name))
682+
683+
// NOTE: In order to ensure predictable results we need to ensure we have a single
684+
// known failure with a singular fix operation. Additionally, due to the exponential
685+
// backoff of this eventually check we MUST ensure we do not touch the ClusterExtension
686+
// after creating and binding the needed permissions to the ServiceAccount.
687+
t.Log("By eventually installing the package successfully")
688+
require.EventuallyWithT(t, func(ct *assert.CollectT) {
689+
assert.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
690+
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1alpha1.TypeInstalled)
691+
if !assert.NotNil(ct, cond) {
692+
return
693+
}
694+
assert.Equal(ct, metav1.ConditionTrue, cond.Status)
695+
assert.Equal(ct, ocv1alpha1.ReasonSuccess, cond.Reason)
696+
assert.Contains(ct, cond.Message, "Installed bundle")
697+
assert.NotEmpty(ct, clusterExtension.Status.InstalledBundle)
698+
}, pollDuration, pollInterval)
699+
}
700+
608701
// getArtifactsOutput gets all the artifacts from the test run and saves them to the artifact path.
609702
// Currently it saves:
610703
// - clusterextensions

0 commit comments

Comments
 (0)