deployment: add startupProbe for nfd-master

marquiz · marquiz · commit fb6484fb8dcf · 2024-12-12T20:00:49.000+02:00
This patch mitigates inadvertent termination of nfd-master pods by the
liveness probe on big clusters.

With a recent change, nfd-master started to wait (block) for informer
caches to sync before starting the main loop. Consequently, this change
also made the gRPC health endpoint not respond until the caches have
been synced. In big clusters, syncing the NodeFeature object cache
takes a long time as the objects are big and there's (at least) one per
each node in the cluster. Thus, in big clusters, the liveness probe
kicks in and kills the nfd-master pod before it's ready.
diff --git a/deployment/base/master/master-deployment.yaml b/deployment/base/master/master-deployment.yaml
@@ -28,16 +28,16 @@ spec:
             requests:
               cpu: 100m
               memory: 128Mi
+          startupProbe:
+            grpc:
+              port: 8082
+            failureThreshold: 30
           livenessProbe:
             grpc:
               port: 8082
-            initialDelaySeconds: 10
-            periodSeconds: 10
           readinessProbe:
             grpc:
               port: 8082
-            initialDelaySeconds: 5
-            periodSeconds: 10
             failureThreshold: 10
           command:
             - "nfd-master"
diff --git a/deployment/helm/node-feature-discovery/templates/master.yaml b/deployment/helm/node-feature-discovery/templates/master.yaml
@@ -47,6 +47,21 @@ spec:
             {{- toYaml .Values.master.securityContext | nindent 12 }}
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
+          startupProbe:
+            grpc:
+              port: {{ .Values.master.healthPort | default "8082" }}
+          {{- with .Values.master.startupProbe.initialDelaySeconds }}
+            initialDelaySeconds: {{ . }}
+          {{- end }}
+          {{- with .Values.master.startupProbe.failureThreshold }}
+            failureThreshold: {{ . }}
+          {{- end }}
+          {{- with .Values.master.startupProbe.periodSeconds }}
+            periodSeconds: {{ . }}
+          {{- end }}
+          {{- with .Values.master.startupProbe.timeoutSeconds }}
+            timeoutSeconds: {{ . }}
+          {{- end }}
           livenessProbe:
             grpc:
               port: {{ .Values.master.healthPort | default "8082" }}
diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml
@@ -152,18 +152,23 @@ master:
                 operator: In
                 values: [""]
                 
+  startupProbe:
+    grpc:
+      port: 8082
+    failureThreshold: 30
+    # periodSeconds: 10
   livenessProbe:
     grpc:
       port: 8082
-    initialDelaySeconds: 10
     # failureThreshold: 3
+    # initialDelaySeconds: 0
     # periodSeconds: 10
     # timeoutSeconds: 1
   readinessProbe:
     grpc:
       port: 8082
-    initialDelaySeconds: 5
     failureThreshold: 10
+    # initialDelaySeconds: 0
     # periodSeconds: 10
     # timeoutSeconds: 1
     # successThreshold: 1
diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md
@@ -201,11 +201,15 @@ API's you need to install the prometheus operator in your cluster.
 | `master.extraArgs`                          | array   | []                               | Additional [command line arguments](../reference/master-commandline-reference.md) to pass to nfd-master                                                                                               |
 | `master.extraEnvs`                          | array   | []                               | Additional environment variables to pass to nfd-master                                                                                                                                                |
 | `master.revisionHistoryLimit`               | integer |                                  | Specify how many old ReplicaSets for this Deployment you want to retain. [revisionHistoryLimit](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#revision-history-limit)         |
-| `master.livenessProbe.initialDelaySeconds`  | integer | 10                               | Specifies the number of seconds after the container has started before liveness probes are initiated.                                                                                                 |
+| `master.startupProbe.initialDelaySeconds`   | integer | 0 (by Kubernetes)                | Specifies the number of seconds after the container has started before startup probes are initiated.                                                                                                  |
+| `master.startupProbe.failureThreshold`      | integer | 30                               | Specifies the number of consecutive failures of startup probes before the container is restarted.                                                                                                     |
+| `master.startupProbe.periodSeconds`         | integer | 10 (by Kubernetes)               | Specifies how often (in seconds) to perform the startup probe.                                                                                                                                        |
+| `master.startupProbe.timeoutSeconds`        | integer | 1 (by Kubernetes)                | Specifies the number of seconds after which the probe times out.                                                                                                                                      |
+| `master.livenessProbe.initialDelaySeconds`  | integer | 0 (by Kubernetes)                | Specifies the number of seconds after the container has started before liveness probes are initiated.                                                                                                 |
 | `master.livenessProbe.failureThreshold`     | integer | 3 (by Kubernetes)                | Specifies the number of consecutive failures of liveness probes before considering the pod as not ready.                                                                                              |
 | `master.livenessProbe.periodSeconds`        | integer | 10 (by Kubernetes)               | Specifies how often (in seconds) to perform the liveness probe.                                                                                                                                       |
 | `master.livenessProbe.timeoutSeconds`       | integer | 1 (by Kubernetes)                | Specifies the number of seconds after which the probe times out.                                                                                                                                      |
-| `master.readinessProbe.initialDelaySeconds` | integer | 5                                | Specifies the number of seconds after the container has started before readiness probes are initiated.                                                                                                |
+| `master.readinessProbe.initialDelaySeconds` | integer | 0 (by Kubernetes)                | Specifies the number of seconds after the container has started before readiness probes are initiated.                                                                                                |
 | `master.readinessProbe.failureThreshold`    | integer | 10                               | Specifies the number of consecutive failures of readiness probes before considering the pod as not ready.                                                                                             |
 | `master.readinessProbe.periodSeconds`       | integer | 10 (by Kubernetes)               | Specifies how often (in seconds) to perform the readiness probe.                                                                                                                                      |
 | `master.readinessProbe.timeoutSeconds`      | integer | 1 (by Kubernetes)                | Specifies the number of seconds after which the probe times out.                                                                                                                                      |