Skip to content

Commit 5b5d690

Browse files
authored
Prevent OOMKill while listing CRDs (#4573)
- Also set GOMEMLIMIT (which helps avoid OOMKill)
1 parent b097e79 commit 5b5d690

File tree

7 files changed

+32
-17
lines changed

7 files changed

+32
-17
lines changed

v2/charts/azure-service-operator/templates/apps_v1_deployment_azureserviceoperator-controller-manager.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ spec:
6767
- --webhook-cert-dir={{ .Values.webhook.certDir }}
6868
{{- end }}
6969
env:
70+
- name: GOMEMLIMIT
71+
value: {{ .Values.go.memLimit }}
7072
- name: AZURE_CLIENT_ID
7173
valueFrom:
7274
secretKeyRef:

v2/charts/azure-service-operator/values.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ priorityClassName: ""
186186
# Number of pod replicas to create for the deployment
187187
replicaCount: 2
188188

189-
# Recomended initial values for resources
189+
# Recommended initial values for resources
190190
# adjust them as necessary
191191
resources:
192192
limits:
@@ -196,6 +196,9 @@ resources:
196196
cpu: 200m
197197
memory: 256Mi
198198

199+
go:
200+
memLimit: 400MiB # This should be set to ~80-90% of the hard memory limit set above in resources
201+
199202
# Number of old revisions to retain to allow rollback
200203
# Default Kubernetes value is set to 10
201204
revisionHistoryLimit: 10

v2/cmd/controller/app/setup.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ func SetupControllerManager(ctx context.Context, setupLog logr.Logger, flgs *Fla
142142
setupLog.Error(err, "failed to initialize CRD client")
143143
os.Exit(1)
144144
}
145-
existingCRDs, err := crdManager.ListCRDs(ctx)
145+
existingCRDs := apiextensions.CustomResourceDefinitionList{}
146+
err = crdManager.ListCRDs(ctx, &existingCRDs)
146147
if err != nil {
147148
setupLog.Error(err, "failed to list current CRDs")
148149
os.Exit(1)
@@ -155,7 +156,7 @@ func SetupControllerManager(ctx context.Context, setupLog logr.Logger, flgs *Fla
155156
// Note that this step will restart the pod when it succeeds
156157
err = crdManager.Install(ctx, crdmanagement.Options{
157158
CRDPatterns: flgs.CRDPatterns,
158-
ExistingCRDs: existingCRDs,
159+
ExistingCRDs: &existingCRDs,
159160
Path: crdmanagement.CRDLocation,
160161
Namespace: cfg.PodNamespace,
161162
})
@@ -184,7 +185,7 @@ func SetupControllerManager(ctx context.Context, setupLog logr.Logger, flgs *Fla
184185
// TODO: the nontrivial startup cost of reading the local copy of CRDs into memory. Since "none" is
185186
// TODO: us approximating the standard operator experience we don't perform this assertion currently as most
186187
// TODO: operators don't.
187-
readyResources := crdmanagement.MakeCRDMap(existingCRDs)
188+
readyResources := crdmanagement.MakeCRDMap(existingCRDs.Items)
188189

189190
if cfg.OperatorMode.IncludesWatchers() {
190191
//nolint:contextcheck

v2/config/manager/manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ spec:
6969
resources:
7070
limits:
7171
cpu: 500m
72-
memory: 512Mi
72+
memory: 512Mi # Make sure to change the GOMEMLIMIT env variable if you change this too
7373
requests:
7474
cpu: 200m
7575
memory: 256Mi

v2/config/manager/manager_image_patch.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ spec:
1313
- image: localhost:5000/azureserviceoperator:latest
1414
name: manager
1515
env:
16+
- name: GOMEMLIMIT
17+
value: 400MiB # This should be set to ~80-90% of the hard memory limit on the pod
1618
- name: AZURE_CLIENT_ID
1719
valueFrom:
1820
secretKeyRef:

v2/internal/crdmanagement/manager.go

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -143,30 +143,36 @@ func NewManager(logger logr.Logger, kubeClient kubeclient.Client, leaderElection
143143
}
144144
}
145145

146-
func (m *Manager) ListCRDs(ctx context.Context) ([]apiextensions.CustomResourceDefinition, error) {
147-
list := apiextensions.CustomResourceDefinitionList{}
146+
// ListCRDs lists ASO CRDs.
147+
// This accepts a list rather than returning one to allow re-using the same list object (they're large, and having multiple
148+
// copies of the collection results in huge memory usage).
149+
func (m *Manager) ListCRDs(ctx context.Context, list *apiextensions.CustomResourceDefinitionList) error {
150+
// Clear the existing list, if there is one.
151+
list.Items = nil
152+
list.Continue = ""
153+
list.ResourceVersion = ""
148154

149155
selector := labels.NewSelector()
150156
requirement, err := labels.NewRequirement(ServiceOperatorAppLabel, selection.Equals, []string{ServiceOperatorAppValue})
151157
if err != nil {
152-
return nil, err
158+
return err
153159
}
154160
selector = selector.Add(*requirement)
155161

156162
match := client.MatchingLabelsSelector{
157163
Selector: selector,
158164
}
159165

160-
err = m.kubeClient.List(ctx, &list, match)
166+
err = m.kubeClient.List(ctx, list, match)
161167
if err != nil {
162-
return nil, eris.Wrapf(err, "failed to list CRDs")
168+
return eris.Wrapf(err, "failed to list CRDs")
163169
}
164170

165171
for _, crd := range list.Items {
166172
m.logger.V(Verbose).Info("Found an existing CRD", "CRD", crd.Name)
167173
}
168174

169-
return list.Items, nil
175+
return nil
170176
}
171177

172178
func (m *Manager) LoadOperatorCRDs(path string, podNamespace string) ([]apiextensions.CustomResourceDefinition, error) {
@@ -345,11 +351,11 @@ func (m *Manager) applyCRDs(
345351
// Double-checked locking, we need to make sure once we have the lock there's still work to do, as it may
346352
// already have been done while we were waiting for the lock.
347353
m.logger.V(Status).Info("Double-checked locking - ensure there's still CRDs to apply...")
348-
existingCRDs, err := m.ListCRDs(ctx)
354+
err := m.ListCRDs(ctx, options.ExistingCRDs)
349355
if err != nil {
350356
return eris.Wrap(err, "failed to list current CRDs")
351357
}
352-
instructions, err = m.DetermineCRDsToInstallOrUpgrade(goalCRDs, existingCRDs, options.CRDPatterns)
358+
instructions, err = m.DetermineCRDsToInstallOrUpgrade(goalCRDs, options.ExistingCRDs.Items, options.CRDPatterns)
353359
if err != nil {
354360
return eris.Wrap(err, "failed to determine CRDs to apply")
355361
}
@@ -411,7 +417,7 @@ type Options struct {
411417
Path string
412418
Namespace string
413419
CRDPatterns string
414-
ExistingCRDs []apiextensions.CustomResourceDefinition
420+
ExistingCRDs *apiextensions.CustomResourceDefinitionList
415421
}
416422

417423
func (m *Manager) Install(ctx context.Context, options Options) error {
@@ -420,7 +426,7 @@ func (m *Manager) Install(ctx context.Context, options Options) error {
420426
return eris.Wrap(err, "failed to load CRDs from disk")
421427
}
422428

423-
installationInstructions, err := m.DetermineCRDsToInstallOrUpgrade(goalCRDs, options.ExistingCRDs, options.CRDPatterns)
429+
installationInstructions, err := m.DetermineCRDsToInstallOrUpgrade(goalCRDs, options.ExistingCRDs.Items, options.CRDPatterns)
424430
if err != nil {
425431
return eris.Wrap(err, "failed to determine CRDs to apply")
426432
}

v2/internal/crdmanagement/manager_test.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -415,9 +415,10 @@ func Test_ListCRDs_ListsOnlyCRDsMatchingLabel(t *testing.T) {
415415
logger := testcommon.NewTestLogger(t)
416416
crdManager := crdmanagement.NewManager(logger, kubeClient, nil)
417417

418-
crds, err := crdManager.ListCRDs(ctx)
418+
crds := &apiextensions.CustomResourceDefinitionList{}
419+
err := crdManager.ListCRDs(ctx, crds)
419420
g.Expect(err).ToNot(HaveOccurred())
420-
g.Expect(crds).To(HaveLen(1))
421+
g.Expect(crds.Items).To(HaveLen(1))
421422
}
422423

423424
// This test requires that the task target `bundle-crds` has been run

0 commit comments

Comments
 (0)