Skip to content

Commit 0ad5e50

Browse files
authored
Merge pull request #1609 from ozhuraki/worker-health
nfd-worker: Add liveness probe
2 parents c4ff25d + 8b63d17 commit 0ad5e50

File tree

5 files changed

+82
-1
lines changed

5 files changed

+82
-1
lines changed

cmd/nfd-worker/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ import (
3232

3333
const (
3434
// ProgramName is the canonical name of this program
35-
ProgramName = "nfd-worker"
35+
ProgramName = "nfd-worker"
36+
GrpcHealthPort = 8082
3637
)
3738

3839
func main() {
@@ -79,6 +80,7 @@ func main() {
7980
utils.ConfigureGrpcKlog()
8081

8182
// Get new NfdWorker instance
83+
args.GrpcHealthPort = GrpcHealthPort
8284
instance, err := worker.NewNfdWorker(args)
8385
if err != nil {
8486
klog.ErrorS(err, "failed to initialize NfdWorker instance")

deployment/base/worker-daemonset/worker-daemonset.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,17 @@ spec:
1919
- name: nfd-worker
2020
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
2121
imagePullPolicy: Always
22+
livenessProbe:
23+
grpc:
24+
port: 8082
25+
initialDelaySeconds: 10
26+
periodSeconds: 10
27+
readinessProbe:
28+
grpc:
29+
port: 8082
30+
initialDelaySeconds: 5
31+
periodSeconds: 10
32+
failureThreshold: 10
2233
command:
2334
- "nfd-worker"
2435
args:

deployment/helm/node-feature-discovery/templates/worker.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,17 @@ spec:
4343
{{- toYaml .Values.worker.securityContext | nindent 12 }}
4444
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
4545
imagePullPolicy: {{ .Values.image.pullPolicy }}
46+
livenessProbe:
47+
grpc:
48+
port: 8082
49+
initialDelaySeconds: 10
50+
periodSeconds: 10
51+
readinessProbe:
52+
grpc:
53+
port: 8082
54+
initialDelaySeconds: 5
55+
periodSeconds: 10
56+
failureThreshold: 10
4657
env:
4758
- name: NODE_NAME
4859
valueFrom:

deployment/helm/node-feature-discovery/values.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,20 @@ worker:
394394
runAsNonRoot: true
395395
# runAsUser: 1000
396396

397+
# livenessProbe: {}
398+
## NOTE: Currently not configurable; the values below only document the defaults hard-coded in the worker template.
399+
# grpc:
400+
# port: 8082
401+
# initialDelaySeconds: 10
402+
# periodSeconds: 10
403+
# readinessProbe: {}
404+
## NOTE: Currently not configurable; the values below only document the defaults hard-coded in the worker template.
405+
# grpc:
406+
# port: 8082
407+
# initialDelaySeconds: 5
408+
# periodSeconds: 10
409+
# failureThreshold: 10
410+
397411
serviceAccount:
398412
# Specifies whether a service account should be created.
399413
# We create this by default to make it easier for downstream users to apply PodSecurityPolicies.

pkg/nfd-worker/nfd-worker.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"crypto/x509"
2222
"encoding/json"
2323
"fmt"
24+
"net"
2425
"os"
2526
"path/filepath"
2627
"regexp"
@@ -33,6 +34,8 @@ import (
3334
"google.golang.org/grpc"
3435
"google.golang.org/grpc/credentials"
3536
"google.golang.org/grpc/credentials/insecure"
37+
"google.golang.org/grpc/health"
38+
"google.golang.org/grpc/health/grpc_health_v1"
3639
"k8s.io/apimachinery/pkg/api/errors"
3740
"k8s.io/apimachinery/pkg/types"
3841
"k8s.io/apimachinery/pkg/util/validation"
@@ -104,6 +107,7 @@ type Args struct {
104107
Server string
105108
ServerNameOverride string
106109
MetricsPort int
110+
GrpcHealthPort int
107111

108112
Overrides ConfigOverrideArgs
109113
}
@@ -124,6 +128,7 @@ type nfdWorker struct {
124128
config *NFDConfig
125129
kubernetesNamespace string
126130
grpcClient pb.LabelerClient
131+
healthServer *grpc.Server
127132
nfdClient *nfdclient.Clientset
128133
stop chan struct{} // channel for signaling stop
129134
featureSources []source.FeatureSource
@@ -187,6 +192,29 @@ func (i *infiniteTicker) Reset(d time.Duration) {
187192
}
188193
}
189194

195+
func (w *nfdWorker) startGrpcHealthServer(errChan chan<- error) error {
196+
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
197+
if err != nil {
198+
return fmt.Errorf("failed to listen: %w", err)
199+
}
200+
201+
s := grpc.NewServer()
202+
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
203+
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)
204+
205+
go func() {
206+
defer func() {
207+
lis.Close()
208+
}()
209+
if err := s.Serve(lis); err != nil {
210+
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
211+
}
212+
klog.InfoS("gRPC health server stopped")
213+
}()
214+
w.healthServer = s
215+
return nil
216+
}
217+
190218
// Run feature discovery.
191219
func (w *nfdWorker) runFeatureDiscovery() error {
192220
discoveryStart := time.Now()
@@ -262,8 +290,20 @@ func (w *nfdWorker) Run() error {
262290
return nil
263291
}
264292

293+
grpcErr := make(chan error, 1)
294+
295+
// Start gRPC server for liveness probe (at this point we're "live")
296+
if w.args.GrpcHealthPort != 0 {
297+
if err := w.startGrpcHealthServer(grpcErr); err != nil {
298+
return fmt.Errorf("failed to start gRPC health server: %w", err)
299+
}
300+
}
301+
265302
for {
266303
select {
304+
case err := <-grpcErr:
305+
return fmt.Errorf("error in serving gRPC: %w", err)
306+
267307
case <-labelTrigger.C:
268308
err = w.runFeatureDiscovery()
269309
if err != nil {
@@ -294,6 +334,9 @@ func (w *nfdWorker) Run() error {
294334

295335
case <-w.stop:
296336
klog.InfoS("shutting down nfd-worker")
337+
if w.healthServer != nil {
338+
w.healthServer.GracefulStop()
339+
}
297340
configWatch.Close()
298341
w.certWatch.Close()
299342
return nil

0 commit comments

Comments
 (0)