Skip to content

Commit f182d0e

Browse files
author
Jeff Peeler
committed
fix(olm): clean up resources on CSV deletion
This ensures proper resource deletion is done upon CSV deletion. Since this touches a lot of different places, here's a summary of changes made: The RBAC has been modified to be owned by CSV instead of the operator group. An operator group may remain after a CSV is deleted, but the associated resources shouldn't. Similarly, created service accounts were missing an owner reference to the CSV. Due to the large amount of CSV requeueing and potential in progress handling of a CSV, RBAC couldn't be deleted in handleClusterServiceVersionDeletion (because sometimes the RBAC would be recreated by another CSV sync). Instead, a new queue was created for GC-ing resources. The sync loop specifically is used to do deletes so that the loop can return an error (an error being if the CSV is not yet deleted) and will be scheduled to try again later. The requeueing code has been changed to not requeue if the CSV is not in the cache to help not delay the new GC sync loop. The new queue does not utilize an informer or indexer, so the event and the resource are placed directly on the queue rather than relying on the indexer to retrieve by key in the processing loop (processNextWorkItem).
1 parent f8a1453 commit f182d0e

File tree

7 files changed

+226
-29
lines changed

7 files changed

+226
-29
lines changed

pkg/controller/operators/olm/operator.go

Lines changed: 154 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@ import (
88
"time"
99

1010
v1 "github.com/operator-framework/operator-lifecycle-manager/pkg/api/apis/operators/v1"
11+
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubestate"
1112
"github.com/sirupsen/logrus"
1213
log "github.com/sirupsen/logrus"
1314
corev1 "k8s.io/api/core/v1"
15+
rbacv1 "k8s.io/api/rbac/v1"
1416
extinf "k8s.io/apiextensions-apiserver/pkg/client/informers/externalversions"
1517
k8serrors "k8s.io/apimachinery/pkg/api/errors"
1618
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -32,6 +34,7 @@ import (
3234
"github.com/operator-framework/operator-lifecycle-manager/pkg/api/client/informers/externalversions"
3335
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/certs"
3436
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
37+
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/operators/olm/envvar"
3538
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/resolver"
3639
csvutility "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/csv"
3740
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/event"
@@ -40,10 +43,9 @@ import (
4043
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
4144
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorlister"
4245
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/ownerutil"
46+
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/proxy"
4347
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/queueinformer"
4448
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/scoped"
45-
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/proxy"
46-
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/operators/olm/envvar"
4749
"github.com/operator-framework/operator-lifecycle-manager/pkg/metrics"
4850
)
4951

@@ -65,6 +67,7 @@ type Operator struct {
6567
csvQueueSet *queueinformer.ResourceQueueSet
6668
csvCopyQueueSet *queueinformer.ResourceQueueSet
6769
csvGCQueueSet *queueinformer.ResourceQueueSet
70+
objGCQueueSet *queueinformer.ResourceQueueSet
6871
apiServiceQueue workqueue.RateLimitingInterface
6972
csvIndexers map[string]cache.Indexer
7073
recorder record.EventRecorder
@@ -118,6 +121,7 @@ func newOperatorWithConfig(ctx context.Context, config *operatorConfig) (*Operat
118121
csvQueueSet: queueinformer.NewEmptyResourceQueueSet(),
119122
csvCopyQueueSet: queueinformer.NewEmptyResourceQueueSet(),
120123
csvGCQueueSet: queueinformer.NewEmptyResourceQueueSet(),
124+
objGCQueueSet: queueinformer.NewEmptyResourceQueueSet(),
121125
apiServiceQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "apiservice"),
122126
resolver: config.strategyResolver,
123127
apiReconciler: config.apiReconciler,
@@ -215,7 +219,7 @@ func newOperatorWithConfig(ctx context.Context, config *operatorConfig) (*Operat
215219
}
216220

217221
subInformer := extInformerFactory.Operators().V1alpha1().Subscriptions()
218-
op.lister.OperatorsV1alpha1().RegisterSubscriptionLister(namespace, subInformer.Lister())
222+
op.lister.OperatorsV1alpha1().RegisterSubscriptionLister(namespace, subInformer.Lister())
219223
subQueueInformer, err := queueinformer.NewQueueInformer(
220224
ctx,
221225
queueinformer.WithLogger(op.logger),
@@ -319,6 +323,38 @@ func newOperatorWithConfig(ctx context.Context, config *operatorConfig) (*Operat
319323
if err := op.RegisterQueueInformer(serviceAccountQueueInformer); err != nil {
320324
return nil, err
321325
}
326+
327+
objGCQueue := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), fmt.Sprintf("%s/obj-gc", namespace))
328+
op.objGCQueueSet.Set(namespace, objGCQueue)
329+
objGCQueueInformer, err := queueinformer.NewQueue(
330+
ctx,
331+
queueinformer.WithLogger(op.logger),
332+
queueinformer.WithQueue(objGCQueue),
333+
queueinformer.WithSyncer(queueinformer.LegacySyncHandler(op.syncGCObject).ToSyncer()),
334+
)
335+
if err != nil {
336+
return nil, err
337+
}
338+
if err := op.RegisterQueueInformer(objGCQueueInformer); err != nil {
339+
return nil, err
340+
}
341+
342+
}
343+
344+
// add queue for all namespaces as well
345+
objGCQueue := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), fmt.Sprintf("%s/obj-gc", ""))
346+
op.objGCQueueSet.Set("", objGCQueue)
347+
objGCQueueInformer, err := queueinformer.NewQueue(
348+
ctx,
349+
queueinformer.WithLogger(op.logger),
350+
queueinformer.WithQueue(objGCQueue),
351+
queueinformer.WithSyncer(queueinformer.LegacySyncHandler(op.syncGCObject).ToSyncer()),
352+
)
353+
if err != nil {
354+
return nil, err
355+
}
356+
if err := op.RegisterQueueInformer(objGCQueueInformer); err != nil {
357+
return nil, err
322358
}
323359

324360
k8sInformerFactory := informers.NewSharedInformerFactory(op.opClient.KubernetesInterface(), config.resyncPeriod)
@@ -444,7 +480,7 @@ func newOperatorWithConfig(ctx context.Context, config *operatorConfig) (*Operat
444480
op.resolver = &install.StrategyResolver{
445481
ProxyInjectorBuilderFunc: proxyEnvInjector.GetDeploymentInitializer,
446482
}
447-
483+
448484
return op, nil
449485
}
450486

@@ -536,6 +572,60 @@ func (a *Operator) RegisterCSVWatchNotification(csvNotification csvutility.Watch
536572
a.csvNotification = csvNotification
537573
}
538574

575+
func (a *Operator) syncGCObject(obj interface{}) (syncError error) {
576+
metaObj, ok := obj.(metav1.Object)
577+
if !ok {
578+
a.logger.Warn("object sync: casting to metav1.Object failed")
579+
return
580+
}
581+
logger := a.logger.WithFields(logrus.Fields{
582+
"name": metaObj.GetName(),
583+
"namespace": metaObj.GetNamespace(),
584+
"self": metaObj.GetSelfLink(),
585+
})
586+
587+
switch metaObj.(type) {
588+
case *rbacv1.ClusterRole:
589+
if name, ns, ok := ownerutil.GetOwnerByKindLabel(metaObj, v1alpha1.ClusterServiceVersionKind); ok {
590+
_, err := a.lister.OperatorsV1alpha1().ClusterServiceVersionLister().ClusterServiceVersions(ns).Get(name)
591+
if err == nil {
592+
logger.Debugf("CSV still present, must wait until it is deleted (owners=%v/%v)", ns, name)
593+
syncError = fmt.Errorf("cleanup must wait")
594+
return
595+
} else if !k8serrors.IsNotFound(err) {
596+
syncError = err
597+
return
598+
}
599+
}
600+
601+
if err := a.opClient.DeleteClusterRole(metaObj.GetName(), &metav1.DeleteOptions{}); err != nil {
602+
logger.WithError(err).Warn("cannot delete cluster role")
603+
break
604+
}
605+
logger.Debugf("Deleted cluster role %v due to no owning CSV", metaObj.GetName())
606+
case *rbacv1.ClusterRoleBinding:
607+
if name, ns, ok := ownerutil.GetOwnerByKindLabel(metaObj, v1alpha1.ClusterServiceVersionKind); ok {
608+
_, err := a.lister.OperatorsV1alpha1().ClusterServiceVersionLister().ClusterServiceVersions(ns).Get(name)
609+
if err == nil {
610+
logger.Debugf("CSV still present, must wait until it is deleted (owners=%v)", name)
611+
syncError = fmt.Errorf("cleanup must wait")
612+
return
613+
} else if !k8serrors.IsNotFound(err) {
614+
syncError = err
615+
return
616+
}
617+
}
618+
619+
if err := a.opClient.DeleteClusterRoleBinding(metaObj.GetName(), &metav1.DeleteOptions{}); err != nil {
620+
logger.WithError(err).Warn("cannot delete cluster role binding")
621+
break
622+
}
623+
logger.Debugf("Deleted cluster role binding %v due to no owning CSV", metaObj.GetName())
624+
}
625+
626+
return
627+
}
628+
539629
func (a *Operator) syncObject(obj interface{}) (syncError error) {
540630
// Assert as metav1.Object
541631
metaObj, ok := obj.(metav1.Object)
@@ -550,14 +640,33 @@ func (a *Operator) syncObject(obj interface{}) (syncError error) {
550640
"self": metaObj.GetSelfLink(),
551641
})
552642

553-
// Requeue all owner CSVs
554-
if ownerutil.IsOwnedByKind(metaObj, v1alpha1.ClusterServiceVersionKind) {
555-
logger.Debug("requeueing owner csvs")
556-
a.requeueOwnerCSVs(metaObj)
557-
}
558-
559643
// Requeues objects that can't have ownerrefs (cluster -> namespace, cross-namespace)
560644
if ownerutil.IsOwnedByKindLabel(metaObj, v1alpha1.ClusterServiceVersionKind) {
645+
name, ns, ok := ownerutil.GetOwnerByKindLabel(metaObj, v1alpha1.ClusterServiceVersionKind)
646+
if !ok {
647+
logger.Error("unexpected owner label retrieval failure")
648+
}
649+
_, err := a.lister.OperatorsV1alpha1().ClusterServiceVersionLister().ClusterServiceVersions(ns).Get(name)
650+
if !k8serrors.IsNotFound(err) {
651+
logger.Debug("requeueing owner csvs from owner label")
652+
a.requeueOwnerCSVs(metaObj)
653+
} else {
654+
switch metaObj.(type) {
655+
case *rbacv1.ClusterRole, *rbacv1.ClusterRoleBinding:
656+
resourceEvent := kubestate.NewResourceEvent(
657+
kubestate.ResourceUpdated,
658+
metaObj,
659+
)
660+
syncError = a.objGCQueueSet.RequeueEvent(ns, resourceEvent)
661+
logger.Debugf("syncObject - requeued update event for %v, res=%v", resourceEvent, syncError)
662+
return
663+
}
664+
}
665+
666+
}
667+
668+
// Requeue all owner CSVs
669+
if ownerutil.IsOwnedByKind(metaObj, v1alpha1.ClusterServiceVersionKind) {
561670
logger.Debug("requeueing owner csvs")
562671
a.requeueOwnerCSVs(metaObj)
563672
}
@@ -712,6 +821,25 @@ func (a *Operator) handleClusterServiceVersionDeletion(obj interface{}) {
712821
}
713822
}
714823
}
824+
825+
ownerSelector := ownerutil.CSVOwnerSelector(clusterServiceVersion)
826+
crbs, err := a.lister.RbacV1().ClusterRoleBindingLister().List(ownerSelector)
827+
if err != nil {
828+
logger.WithError(err).Warn("cannot list cluster role bindings")
829+
}
830+
for _, crb := range crbs {
831+
syncError := a.objGCQueueSet.RequeueEvent("", kubestate.NewResourceEvent(kubestate.ResourceUpdated, crb))
832+
logger.Debugf("handleCSVdeletion - requeued update event for %v, res=%v", crb, syncError)
833+
}
834+
835+
crs, err := a.lister.RbacV1().ClusterRoleLister().List(ownerSelector)
836+
if err != nil {
837+
logger.WithError(err).Warn("cannot list cluster roles")
838+
}
839+
for _, cr := range crs {
840+
syncError := a.objGCQueueSet.RequeueEvent("", kubestate.NewResourceEvent(kubestate.ResourceUpdated, cr))
841+
logger.Debugf("handleCSVdeletion - requeued update event for %v, res=%v", cr, syncError)
842+
}
715843
}
716844

717845
func (a *Operator) removeDanglingChildCSVs(csv *v1alpha1.ClusterServiceVersion) error {
@@ -835,6 +963,7 @@ func (a *Operator) syncClusterServiceVersion(obj interface{}) (syncError error)
835963
a.csvCopyQueueSet.Requeue(outCSV.GetNamespace(), outCSV.GetName())
836964
}
837965

966+
logger.Debug("done syncing CSV")
838967
return
839968
}
840969

@@ -1580,14 +1709,14 @@ func (a *Operator) handleDeletion(obj interface{}) {
15801709
})
15811710
logger.Debug("handling resource deletion")
15821711

1583-
logger.Debug("requeueing owner csvs")
1712+
logger.Debug("requeueing owner csvs due to deletion")
15841713
a.requeueOwnerCSVs(metaObj)
15851714

15861715
// Requeue CSVs with provided and required labels (for CRDs)
15871716
if labelSets, err := a.apiLabeler.LabelSetsFor(metaObj); err != nil {
15881717
logger.WithError(err).Warn("couldn't create label set")
15891718
} else if len(labelSets) > 0 {
1590-
logger.Debug("requeueing providing/requiring csvs")
1719+
logger.Debug("requeueing providing/requiring csvs due to deletion")
15911720
a.requeueCSVsByLabelSet(logger, labelSets...)
15921721
}
15931722
}
@@ -1619,8 +1748,13 @@ func (a *Operator) requeueOwnerCSVs(ownee metav1.Object) {
16191748
owners := ownerutil.GetOwnersByKind(ownee, v1alpha1.ClusterServiceVersionKind)
16201749
if len(owners) > 0 && ownee.GetNamespace() != metav1.NamespaceAll {
16211750
for _, ownerCSV := range owners {
1751+
_, err := a.lister.OperatorsV1alpha1().ClusterServiceVersionLister().ClusterServiceVersions(ownee.GetNamespace()).Get(ownerCSV.Name)
1752+
if k8serrors.IsNotFound(err) {
1753+
logger.Debugf("skipping requeue since CSV %v is not in cache", ownerCSV.Name)
1754+
continue
1755+
}
16221756
// Since cross-namespace CSVs can't exist we're guaranteed the owner will be in the same namespace
1623-
err := a.csvQueueSet.Requeue(ownee.GetNamespace(), ownerCSV.Name)
1757+
err = a.csvQueueSet.Requeue(ownee.GetNamespace(), ownerCSV.Name)
16241758
if err != nil {
16251759
logger.Warn(err.Error())
16261760
}
@@ -1630,7 +1764,13 @@ func (a *Operator) requeueOwnerCSVs(ownee metav1.Object) {
16301764

16311765
// Requeue owners based on labels
16321766
if name, ns, ok := ownerutil.GetOwnerByKindLabel(ownee, v1alpha1.ClusterServiceVersionKind); ok {
1633-
err := a.csvQueueSet.Requeue(ns, name)
1767+
_, err := a.lister.OperatorsV1alpha1().ClusterServiceVersionLister().ClusterServiceVersions(ns).Get(name)
1768+
if k8serrors.IsNotFound(err) {
1769+
logger.Debugf("skipping requeue since CSV %v is not in cache", name)
1770+
return
1771+
}
1772+
1773+
err = a.csvQueueSet.Requeue(ns, name)
16341774
if err != nil {
16351775
logger.Warn(err.Error())
16361776
}

pkg/controller/operators/olm/operatorgroup.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import (
1818
"github.com/operator-framework/operator-lifecycle-manager/pkg/api/apis/operators/v1alpha1"
1919
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
2020
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/resolver"
21-
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/ownerutil"
21+
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/ownerutil"
2222
)
2323

2424
const (
@@ -253,7 +253,7 @@ func (a *Operator) ensureProvidedAPIClusterRole(operatorGroup *v1.OperatorGroup,
253253
},
254254
Rules: []rbacv1.PolicyRule{{Verbs: verbs, APIGroups: []string{group}, Resources: []string{resource}, ResourceNames: resourceNames}},
255255
}
256-
err := ownerutil.AddOwnerLabels(clusterRole, operatorGroup)
256+
err := ownerutil.AddOwnerLabels(clusterRole, csv)
257257
if err != nil {
258258
return err
259259
}

pkg/controller/registry/resolver/rbac.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ func RBACForClusterServiceVersion(csv *v1alpha1.ClusterServiceVersion) (map[stri
117117
// Create ServiceAccount if necessary
118118
if _, ok := permissions[permission.ServiceAccountName]; !ok {
119119
serviceAccount := &corev1.ServiceAccount{}
120+
ownerutil.AddOwner(serviceAccount, csv, false, false)
120121
serviceAccount.SetName(permission.ServiceAccountName)
121122

122123
permissions[permission.ServiceAccountName] = NewOperatorPermissions(serviceAccount)

pkg/lib/queueinformer/config.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func (c *queueInformerConfig) complete() {
4343
}
4444

4545
// validate returns an error if the config isn't valid.
46-
func (c *queueInformerConfig) validate() (err error) {
46+
func (c *queueInformerConfig) validateQueueInformer() (err error) {
4747
switch config := c; {
4848
case config.provider == nil:
4949
err = newInvalidConfigError("nil metrics provider")
@@ -62,6 +62,24 @@ func (c *queueInformerConfig) validate() (err error) {
6262
return
6363
}
6464

65+
// difference from above is that this intentionally verifies without index/informer
66+
func (c *queueInformerConfig) validateQueue() (err error) {
67+
switch config := c; {
68+
case config.provider == nil:
69+
err = newInvalidConfigError("nil metrics provider")
70+
case config.logger == nil:
71+
err = newInvalidConfigError("nil logger")
72+
case config.queue == nil:
73+
err = newInvalidConfigError("nil queue")
74+
case config.keyFunc == nil:
75+
err = newInvalidConfigError("nil key function")
76+
case config.syncer == nil:
77+
err = newInvalidConfigError("nil syncer")
78+
}
79+
80+
return
81+
}
82+
6583
func defaultKeyFunc(obj interface{}) (string, bool) {
6684
// Get keys nested in resource events up to depth 2
6785
keyable := false

pkg/lib/queueinformer/queueinformer.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,25 @@ func (q *QueueInformer) metricHandlers() *cache.ResourceEventHandlerFuncs {
105105
}
106106
}
107107

108+
func NewQueue(ctx context.Context, options ...Option) (*QueueInformer, error) {
109+
config := defaultConfig()
110+
config.apply(options)
111+
112+
if err := config.validateQueue(); err != nil {
113+
return nil, err
114+
}
115+
116+
queue := &QueueInformer{
117+
MetricsProvider: config.provider,
118+
logger: config.logger,
119+
queue: config.queue,
120+
keyFunc: config.keyFunc,
121+
syncer: config.syncer,
122+
}
123+
124+
return queue, nil
125+
}
126+
108127
// NewQueueInformer returns a new QueueInformer configured with options.
109128
func NewQueueInformer(ctx context.Context, options ...Option) (*QueueInformer, error) {
110129
// Get default config and apply given options
@@ -116,7 +135,7 @@ func NewQueueInformer(ctx context.Context, options ...Option) (*QueueInformer, e
116135
}
117136

118137
func newQueueInformerFromConfig(ctx context.Context, config *queueInformerConfig) (*QueueInformer, error) {
119-
if err := config.validate(); err != nil {
138+
if err := config.validateQueueInformer(); err != nil {
120139
return nil, err
121140
}
122141

0 commit comments

Comments
 (0)