Skip to content

Commit fcc5124

Browse files
Merge pull request #1545 from stuggi/OSPRH-18450
Add node failure tolerations to all service operators and openstackclient
2 parents a2af118 + 0ef2dfb commit fcc5124

File tree

9 files changed

+89
-0
lines changed

9 files changed

+89
-0
lines changed

bindata/operator/managers.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,15 @@ spec:
8585
runAsNonRoot: true
8686
serviceAccountName: {{ .Name }}-operator-controller-manager
8787
terminationGracePeriodSeconds: 10
88+
tolerations:
89+
- key: "node.kubernetes.io/not-ready"
90+
operator: "Exists"
91+
effect: "NoExecute"
92+
tolerationSeconds: 120
93+
- key: "node.kubernetes.io/unreachable"
94+
operator: "Exists"
95+
effect: "NoExecute"
96+
tolerationSeconds: 120
8897
{{- if isEnvVarTrue .Deployment.Manager.Env "ENABLE_WEBHOOKS" }}
8998
volumes:
9099
- name: cert

bindata/operator/operator.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,15 @@ spec:
133133
runAsNonRoot: true
134134
serviceAccountName: openstack-operator-controller-manager
135135
terminationGracePeriodSeconds: 10
136+
tolerations:
137+
- effect: NoExecute
138+
key: node.kubernetes.io/not-ready
139+
operator: Exists
140+
tolerationSeconds: 120
141+
- effect: NoExecute
142+
key: node.kubernetes.io/unreachable
143+
operator: Exists
144+
tolerationSeconds: 120
136145
volumes:
137146
- name: cert
138147
secret:

bindata/operator/rabbit.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,12 @@ spec:
4646
memory: {{ .RabbitmqOperator.Deployment.Manager.Resources.Requests.Memory }}
4747
serviceAccountName: rabbitmq-cluster-operator-controller-manager
4848
terminationGracePeriodSeconds: 10
49+
tolerations:
50+
- key: "node.kubernetes.io/not-ready"
51+
operator: "Exists"
52+
effect: "NoExecute"
53+
tolerationSeconds: 120
54+
- key: "node.kubernetes.io/unreachable"
55+
operator: "Exists"
56+
effect: "NoExecute"
57+
tolerationSeconds: 120

config/manager/manager.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,12 @@ spec:
7272
customRequests: replace_me #NOTE: this is used via the Makefile to inject a custom template that kustomize won't allow
7373
serviceAccountName: openstack-operator-controller-manager
7474
terminationGracePeriodSeconds: 10
75+
tolerations:
76+
- key: "node.kubernetes.io/not-ready"
77+
operator: "Exists"
78+
effect: "NoExecute"
79+
tolerationSeconds: 120
80+
- key: "node.kubernetes.io/unreachable"
81+
operator: "Exists"
82+
effect: "NoExecute"
83+
tolerationSeconds: 120

config/operator/deployment/deployment.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,12 @@ spec:
106106
memory: 128Mi
107107
serviceAccountName: openstack-operator-controller-operator
108108
terminationGracePeriodSeconds: 10
109+
tolerations:
110+
- key: "node.kubernetes.io/not-ready"
111+
operator: "Exists"
112+
effect: "NoExecute"
113+
tolerationSeconds: 120
114+
- key: "node.kubernetes.io/unreachable"
115+
operator: "Exists"
116+
effect: "NoExecute"
117+
tolerationSeconds: 120

config/operator/managers.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,15 @@ spec:
8585
runAsNonRoot: true
8686
serviceAccountName: {{ .Name }}-operator-controller-manager
8787
terminationGracePeriodSeconds: 10
88+
tolerations:
89+
- key: "node.kubernetes.io/not-ready"
90+
operator: "Exists"
91+
effect: "NoExecute"
92+
tolerationSeconds: 120
93+
- key: "node.kubernetes.io/unreachable"
94+
operator: "Exists"
95+
effect: "NoExecute"
96+
tolerationSeconds: 120
8897
{{- if isEnvVarTrue .Deployment.Manager.Env "ENABLE_WEBHOOKS" }}
8998
volumes:
9099
- name: cert

config/operator/rabbit.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,12 @@ spec:
4646
memory: {{ .RabbitmqOperator.Deployment.Manager.Resources.Requests.Memory }}
4747
serviceAccountName: rabbitmq-cluster-operator-controller-manager
4848
terminationGracePeriodSeconds: 10
49+
tolerations:
50+
- key: "node.kubernetes.io/not-ready"
51+
operator: "Exists"
52+
effect: "NoExecute"
53+
tolerationSeconds: 120
54+
- key: "node.kubernetes.io/unreachable"
55+
operator: "Exists"
56+
effect: "NoExecute"
57+
tolerationSeconds: 120

controllers/client/openstackclient_controller.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,18 @@ func (r *OpenStackClientReconciler) Reconcile(ctx context.Context, req ctrl.Requ
378378
)
379379
}
380380

381+
// If the pod has been stuck in the Terminating state for more than 3 minutes, force-delete it.
382+
if osclient.DeletionTimestamp != nil {
383+
terminatingDuration := time.Since(osclient.DeletionTimestamp.Time)
384+
if terminatingDuration > time.Minute*3 {
385+
// Force delete only truly stuck pods
386+
err := r.Client.Delete(ctx, osclient, client.GracePeriodSeconds(0))
387+
if err != nil {
388+
return ctrl.Result{}, fmt.Errorf("Failed to force delete pod: %w", err)
389+
}
390+
}
391+
}
392+
381393
podReady := false
382394

383395
for _, condition := range osclient.Status.Conditions {

pkg/openstackclient/funcs.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,20 @@ func ClientPodSpec(
9595
VolumeMounts: volumeMounts,
9696
},
9797
},
98+
Tolerations: []corev1.Toleration{
99+
{
100+
Key: "node.kubernetes.io/not-ready",
101+
Operator: corev1.TolerationOpExists,
102+
Effect: corev1.TaintEffectNoExecute,
103+
TolerationSeconds: &[]int64{120}[0],
104+
},
105+
{
106+
Key: "node.kubernetes.io/unreachable",
107+
Operator: corev1.TolerationOpExists,
108+
Effect: corev1.TaintEffectNoExecute,
109+
TolerationSeconds: &[]int64{120}[0],
110+
},
111+
},
98112
}
99113

100114
if instance.Spec.NodeSelector != nil {

0 commit comments

Comments
 (0)