Skip to content

Commit 54d24bd

Browse files
committed
nfd-gc: add healthz endpoint
1 parent e21bf81 commit 54d24bd

File tree

6 files changed

+68
-3
lines changed

6 files changed

+68
-3
lines changed

cmd/nfd-gc/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ func initFlags(flagset *flag.FlagSet) *nfdgarbagecollector.Args {
8484
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
8585
"Kubeconfig to use")
8686
flagset.IntVar(&args.Port, "port", 8080,
87-
"Port on which to expose metrics.")
87+
"Port which metrics and healthz endpoints are served on")
8888

8989
klog.InitFlags(flagset)
9090

deployment/base/gc/gc.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,19 @@ spec:
1919
- name: nfd-gc
2020
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
2121
imagePullPolicy: Always
22+
livenessProbe:
23+
httpGet:
24+
path: /healthz
25+
port: http
26+
initialDelaySeconds: 10
27+
periodSeconds: 10
28+
readinessProbe:
29+
httpGet:
30+
path: /healthz
31+
port: http
32+
initialDelaySeconds: 5
33+
periodSeconds: 10
34+
failureThreshold: 10
2235
resources:
2336
limits:
2437
cpu: 20m

deployment/helm/node-feature-discovery/templates/nfd-gc.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,41 @@ spec:
4444
- name: gc
4545
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
4646
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
47+
livenessProbe:
48+
httpGet:
49+
path: /healthz
50+
port: http
51+
{{- with .Values.gc.livenessProbe.initialDelaySeconds }}
52+
initialDelaySeconds: {{ . }}
53+
{{- end }}
54+
{{- with .Values.gc.livenessProbe.failureThreshold }}
55+
failureThreshold: {{ . }}
56+
{{- end }}
57+
{{- with .Values.gc.livenessProbe.periodSeconds }}
58+
periodSeconds: {{ . }}
59+
{{- end }}
60+
{{- with .Values.gc.livenessProbe.timeoutSeconds }}
61+
timeoutSeconds: {{ . }}
62+
{{- end }}
63+
readinessProbe:
64+
httpGet:
65+
path: /healthz
66+
port: http
67+
{{- with .Values.gc.readinessProbe.initialDelaySeconds }}
68+
initialDelaySeconds: {{ . }}
69+
{{- end }}
70+
{{- with .Values.gc.readinessProbe.failureThreshold }}
71+
failureThreshold: {{ . }}
72+
{{- end }}
73+
{{- with .Values.gc.readinessProbe.periodSeconds }}
74+
periodSeconds: {{ . }}
75+
{{- end }}
76+
{{- with .Values.gc.readinessProbe.timeoutSeconds }}
77+
timeoutSeconds: {{ . }}
78+
{{- end }}
79+
{{- with .Values.gc.readinessProbe.successThreshold }}
80+
successThreshold: {{ . }}
81+
{{- end }}
4782
env:
4883
- name: NODE_NAME
4984
valueFrom:

deployment/helm/node-feature-discovery/values.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,18 @@ gc:
565565

566566
podSecurityContext: {}
567567

568+
livenessProbe:
569+
initialDelaySeconds: 10
570+
# failureThreshold: 3
571+
# periodSeconds: 10
572+
# timeoutSeconds: 1
573+
readinessProbe:
574+
initialDelaySeconds: 5
575+
failureThreshold: 10
576+
# periodSeconds: 10
577+
# timeoutSeconds: 1
578+
# successThreshold: 1
579+
568580
resources:
569581
limits:
570582
memory: 1Gi

docs/deployment/helm.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ API's you need to install the prometheus operator in your cluster.
330330
| `gc.podDisruptionBudget.minAvailable` | integer | 1 | Specifies minAvailable for the podDisruptionBudget configuration for nfd-gc |
331331
| `gc.podDisruptionBudget.maxUnavailable` | integer | NULL | Specifies maxUnavailable for the podDisruptionBudget configuration for nfd-gc |
332332
| `gc.podDisruptionBudget.unhealthyPodEvictionPolicy` | string | AlwaysAllow | Specifies unhealthyPodEvictionPolicy for the podDisruptionBudget configuration for nfd-gc |
333-
| `gc.port` | integer | 8080 | Port on which to serve Prometheus metrics. |
333+
| `gc.port` | integer | 8080 | Port on which to serve HTTP for the metrics and health endpoints. |
334334
| `gc.nodeSelector` | dict | {} | Garbage collector pod [node selector][nodeselector] |
335335
| `gc.tolerations` | dict | {} | Garbage collector pod [node tolerations][toleration] |
336336
| `gc.annotations` | dict | {} | Garbage collector pod [annotations][annotations] |

pkg/nfd-gc/nfd-gc.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ func New(args *Args) (NfdGarbageCollector, error) {
8282
}, nil
8383
}
8484

85+
func (n *nfdGarbageCollector) Healthz(writer http.ResponseWriter, _ *http.Request) {
86+
writer.WriteHeader(http.StatusOK)
87+
}
88+
8589
func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) {
8690
kind := "NodeFeature"
8791
if err := n.client.Resource(gvrNF).Namespace(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil {
@@ -252,7 +256,8 @@ func (n *nfdGarbageCollector) Run() error {
252256
httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
253257
registerVersion(version.Get())
254258

255-
// TODO: health probe endpoint could be added here
259+
// Register health endpoint (at this point we're "ready and live")
260+
httpMux.HandleFunc("/healthz", n.Healthz)
256261

257262
// Start HTTP server
258263
httpServer := http.Server{Addr: fmt.Sprintf(":%d", n.args.Port), Handler: httpMux}

0 commit comments

Comments
 (0)