Skip to content

Commit 0ef2dfb

Browse files
committed
Add node failure tolerations to all service operators
This change adds 120s tolerations for the node.kubernetes.io/not-ready and node.kubernetes.io/unreachable taints to reduce pod failover time during a node failure. The total eviction time is now ~160s (compared to 5min+ with the default). 120s was chosen to prevent pod rescheduling, e.g. on kubelet restarts or network issues. Jira: OSPRH-18450. Signed-off-by: Martin Schuppert <[email protected]>
1 parent 40c250a commit 0ef2dfb

File tree

7 files changed

+63
-0
lines changed

7 files changed

+63
-0
lines changed

bindata/operator/managers.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,15 @@ spec:
8585
runAsNonRoot: true
8686
serviceAccountName: {{ .Name }}-operator-controller-manager
8787
terminationGracePeriodSeconds: 10
88+
tolerations:
89+
- key: "node.kubernetes.io/not-ready"
90+
operator: "Exists"
91+
effect: "NoExecute"
92+
tolerationSeconds: 120
93+
- key: "node.kubernetes.io/unreachable"
94+
operator: "Exists"
95+
effect: "NoExecute"
96+
tolerationSeconds: 120
8897
{{- if isEnvVarTrue .Deployment.Manager.Env "ENABLE_WEBHOOKS" }}
8998
volumes:
9099
- name: cert

bindata/operator/operator.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,15 @@ spec:
133133
runAsNonRoot: true
134134
serviceAccountName: openstack-operator-controller-manager
135135
terminationGracePeriodSeconds: 10
136+
tolerations:
137+
- effect: NoExecute
138+
key: node.kubernetes.io/not-ready
139+
operator: Exists
140+
tolerationSeconds: 120
141+
- effect: NoExecute
142+
key: node.kubernetes.io/unreachable
143+
operator: Exists
144+
tolerationSeconds: 120
136145
volumes:
137146
- name: cert
138147
secret:

bindata/operator/rabbit.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,12 @@ spec:
4646
memory: {{ .RabbitmqOperator.Deployment.Manager.Resources.Requests.Memory }}
4747
serviceAccountName: rabbitmq-cluster-operator-controller-manager
4848
terminationGracePeriodSeconds: 10
49+
tolerations:
50+
- key: "node.kubernetes.io/not-ready"
51+
operator: "Exists"
52+
effect: "NoExecute"
53+
tolerationSeconds: 120
54+
- key: "node.kubernetes.io/unreachable"
55+
operator: "Exists"
56+
effect: "NoExecute"
57+
tolerationSeconds: 120

config/manager/manager.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,12 @@ spec:
7272
customRequests: replace_me #NOTE: this is used via the Makefile to inject a custom template that kustomize won't allow
7373
serviceAccountName: openstack-operator-controller-manager
7474
terminationGracePeriodSeconds: 10
75+
tolerations:
76+
- key: "node.kubernetes.io/not-ready"
77+
operator: "Exists"
78+
effect: "NoExecute"
79+
tolerationSeconds: 120
80+
- key: "node.kubernetes.io/unreachable"
81+
operator: "Exists"
82+
effect: "NoExecute"
83+
tolerationSeconds: 120

config/operator/deployment/deployment.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,12 @@ spec:
106106
memory: 128Mi
107107
serviceAccountName: openstack-operator-controller-operator
108108
terminationGracePeriodSeconds: 10
109+
tolerations:
110+
- key: "node.kubernetes.io/not-ready"
111+
operator: "Exists"
112+
effect: "NoExecute"
113+
tolerationSeconds: 120
114+
- key: "node.kubernetes.io/unreachable"
115+
operator: "Exists"
116+
effect: "NoExecute"
117+
tolerationSeconds: 120

config/operator/managers.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,15 @@ spec:
8585
runAsNonRoot: true
8686
serviceAccountName: {{ .Name }}-operator-controller-manager
8787
terminationGracePeriodSeconds: 10
88+
tolerations:
89+
- key: "node.kubernetes.io/not-ready"
90+
operator: "Exists"
91+
effect: "NoExecute"
92+
tolerationSeconds: 120
93+
- key: "node.kubernetes.io/unreachable"
94+
operator: "Exists"
95+
effect: "NoExecute"
96+
tolerationSeconds: 120
8897
{{- if isEnvVarTrue .Deployment.Manager.Env "ENABLE_WEBHOOKS" }}
8998
volumes:
9099
- name: cert

config/operator/rabbit.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,12 @@ spec:
4646
memory: {{ .RabbitmqOperator.Deployment.Manager.Resources.Requests.Memory }}
4747
serviceAccountName: rabbitmq-cluster-operator-controller-manager
4848
terminationGracePeriodSeconds: 10
49+
tolerations:
50+
- key: "node.kubernetes.io/not-ready"
51+
operator: "Exists"
52+
effect: "NoExecute"
53+
tolerationSeconds: 120
54+
- key: "node.kubernetes.io/unreachable"
55+
operator: "Exists"
56+
effect: "NoExecute"
57+
tolerationSeconds: 120

0 commit comments

Comments
 (0)