Skip to content

Commit a053efd

Browse files
committed
nfd-master: run a separate gRPC health server
This patch separates the gRPC health server from the deprecated gRPC server (disabled by default, replaced by the NodeFeature CRD API) used for node labeling requests. The new health server runs on the hardcoded TCP port 8082. The main motivation for this change is to make Kubernetes' built-in gRPC liveness probes work when TLS is enabled (the probes don't support TLS). The health server itself is a naive implementation (as it was before), basically only checking that nfd-master has started and hasn't crashed. The patch adds a TODO note to improve the functionality.
1 parent b3919f3 commit a053efd

File tree

4 files changed

+56
-15
lines changed

4 files changed

+56
-15
lines changed

cmd/nfd-master/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ import (
3232

3333
const (
3434
// ProgramName is the canonical name of this program
35-
ProgramName = "nfd-master"
35+
ProgramName = "nfd-master"
36+
GrpcHealthPort = 8082
3637
)
3738

3839
func main() {
@@ -100,6 +101,7 @@ func main() {
100101
utils.ConfigureGrpcKlog()
101102

102103
// Get new NfdMaster instance
104+
args.GrpcHealthPort = GrpcHealthPort
103105
instance, err := master.NewNfdMaster(args)
104106
if err != nil {
105107
klog.ErrorS(err, "failed to initialize NfdMaster instance")

deployment/base/master/master-deployment.yaml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@ spec:
2323
imagePullPolicy: Always
2424
livenessProbe:
2525
grpc:
26-
port: 8080
26+
port: 8082
2727
initialDelaySeconds: 10
2828
periodSeconds: 10
2929
readinessProbe:
3030
grpc:
31-
port: 8080
31+
port: 8082
3232
initialDelaySeconds: 5
3333
periodSeconds: 10
3434
failureThreshold: 10
@@ -37,5 +37,3 @@ spec:
3737
ports:
3838
- name: metrics
3939
containerPort: 8081
40-
- name: grpc
41-
containerPort: 8080

deployment/helm/node-feature-discovery/templates/master.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,12 @@ spec:
4343
imagePullPolicy: {{ .Values.image.pullPolicy }}
4444
livenessProbe:
4545
grpc:
46-
port: 8080
46+
port: 8082
4747
initialDelaySeconds: 10
4848
periodSeconds: 10
4949
readinessProbe:
5050
grpc:
51-
port: 8080
51+
port: 8082
5252
initialDelaySeconds: 5
5353
periodSeconds: 10
5454
failureThreshold: 10

pkg/nfd-master/nfd-master.go

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ type Args struct {
116116
CrdController bool
117117
EnableNodeFeatureApi bool
118118
Port int
119+
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
120+
// Could be removed when gRPC labeler service is dropped (when nfd-worker tests stop running nfd-master).
121+
GrpcHealthPort int
119122
Prune bool
120123
VerifyNodeName bool
121124
Options string
@@ -144,6 +147,7 @@ type nfdMaster struct {
144147
nodeName string
145148
configFilePath string
146149
server *grpc.Server
150+
healthServer *grpc.Server
147151
stop chan struct{}
148152
ready chan bool
149153
apihelper apihelper.APIHelpers
@@ -270,7 +274,11 @@ func (m *nfdMaster) Run() error {
270274

271275
// Run gRPC server
272276
grpcErr := make(chan error, 1)
273-
go m.runGrpcServer(grpcErr)
277+
// If the NodeFeature API is enabled, don't register the labeler API
278+
// server. Otherwise, register the labeler server.
279+
if !m.args.EnableNodeFeatureApi {
280+
go m.runGrpcServer(grpcErr)
281+
}
274282

275283
// Run updater that handles events from the nfd CRD API.
276284
if m.nfdController != nil {
@@ -281,6 +289,13 @@ func (m *nfdMaster) Run() error {
281289
}
282290
}
283291

292+
// Start gRPC server for liveness probe (at this point we're "live")
293+
if m.args.GrpcHealthPort != 0 {
294+
if err := m.startGrpcHealthServer(grpcErr); err != nil {
295+
return fmt.Errorf("failed to start gRPC health server: %w", err)
296+
}
297+
}
298+
284299
// Notify that we're ready to accept connections
285300
m.ready <- true
286301
close(m.ready)
@@ -323,6 +338,32 @@ func (m *nfdMaster) Run() error {
323338
}
324339
}
325340

341+
// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes.
342+
// TODO: improve status checking e.g. with watchdog in the main event loop and
343+
// checking that the node updater pool is alive.
344+
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
345+
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort))
346+
if err != nil {
347+
return fmt.Errorf("failed to listen: %w", err)
348+
}
349+
350+
s := grpc.NewServer()
351+
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
352+
klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort)
353+
354+
go func() {
355+
defer func() {
356+
lis.Close()
357+
}()
358+
if err := s.Serve(lis); err != nil {
359+
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
360+
}
361+
klog.InfoS("gRPC health server stopped")
362+
}()
363+
m.healthServer = s
364+
return nil
365+
}
366+
326367
func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
327368
// Create server listening for TCP connections
328369
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port))
@@ -352,13 +393,8 @@ func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
352393
}
353394
m.server = grpc.NewServer(serverOpts...)
354395

355-
// If the NodeFeature API is enabled, don't register the labeler API
356-
// server. Otherwise, register the labeler server.
357-
if !m.args.EnableNodeFeatureApi {
358-
pb.RegisterLabelerServer(m.server, m)
359-
}
396+
pb.RegisterLabelerServer(m.server, m)
360397

361-
grpc_health_v1.RegisterHealthServer(m.server, health.NewServer())
362398
klog.InfoS("gRPC server serving", "port", m.args.Port)
363399

364400
// Run gRPC server
@@ -421,7 +457,12 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
421457

422458
// Stop NfdMaster
423459
func (m *nfdMaster) Stop() {
424-
m.server.GracefulStop()
460+
if m.server != nil {
461+
m.server.GracefulStop()
462+
}
463+
if m.healthServer != nil {
464+
m.healthServer.GracefulStop()
465+
}
425466

426467
if m.nfdController != nil {
427468
m.nfdController.stop()

0 commit comments

Comments
 (0)