Skip to content

Commit 4501bed

Browse files
authored
Merge pull request #1535 from marquiz/devel/grpc-probe
nfd-master: run a separate gRPC health server
2 parents 64eba87 + a053efd commit 4501bed

File tree

4 files changed

+56
-15
lines changed

4 files changed

+56
-15
lines changed

cmd/nfd-master/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ import (
3232

3333
const (
3434
// ProgramName is the canonical name of this program
35-
ProgramName = "nfd-master"
35+
ProgramName = "nfd-master"
36+
GrpcHealthPort = 8082
3637
)
3738

3839
func main() {
@@ -100,6 +101,7 @@ func main() {
100101
utils.ConfigureGrpcKlog()
101102

102103
// Get new NfdMaster instance
104+
args.GrpcHealthPort = GrpcHealthPort
103105
instance, err := master.NewNfdMaster(args)
104106
if err != nil {
105107
klog.ErrorS(err, "failed to initialize NfdMaster instance")

deployment/base/master/master-deployment.yaml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@ spec:
2323
imagePullPolicy: Always
2424
livenessProbe:
2525
grpc:
26-
port: 8080
26+
port: 8082
2727
initialDelaySeconds: 10
2828
periodSeconds: 10
2929
readinessProbe:
3030
grpc:
31-
port: 8080
31+
port: 8082
3232
initialDelaySeconds: 5
3333
periodSeconds: 10
3434
failureThreshold: 10
@@ -37,5 +37,3 @@ spec:
3737
ports:
3838
- name: metrics
3939
containerPort: 8081
40-
- name: grpc
41-
containerPort: 8080

deployment/helm/node-feature-discovery/templates/master.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,12 @@ spec:
4343
imagePullPolicy: {{ .Values.image.pullPolicy }}
4444
livenessProbe:
4545
grpc:
46-
port: 8080
46+
port: 8082
4747
initialDelaySeconds: 10
4848
periodSeconds: 10
4949
readinessProbe:
5050
grpc:
51-
port: 8080
51+
port: 8082
5252
initialDelaySeconds: 5
5353
periodSeconds: 10
5454
failureThreshold: 10

pkg/nfd-master/nfd-master.go

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ type Args struct {
116116
CrdController bool
117117
EnableNodeFeatureApi bool
118118
Port int
119+
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
120+
// Could be removed when gRPC labeler service is dropped (when nfd-worker tests stop running nfd-master).
121+
GrpcHealthPort int
119122
Prune bool
120123
VerifyNodeName bool
121124
Options string
@@ -144,6 +147,7 @@ type nfdMaster struct {
144147
nodeName string
145148
configFilePath string
146149
server *grpc.Server
150+
healthServer *grpc.Server
147151
stop chan struct{}
148152
ready chan bool
149153
apihelper apihelper.APIHelpers
@@ -270,7 +274,11 @@ func (m *nfdMaster) Run() error {
270274

271275
// Run gRPC server
272276
grpcErr := make(chan error, 1)
273-
go m.runGrpcServer(grpcErr)
277+
// If the NodeFeature API is enabled, don't register the labeler API
278+
// server. Otherwise, register the labeler server.
279+
if !m.args.EnableNodeFeatureApi {
280+
go m.runGrpcServer(grpcErr)
281+
}
274282

275283
// Run updater that handles events from the nfd CRD API.
276284
if m.nfdController != nil {
@@ -281,6 +289,13 @@ func (m *nfdMaster) Run() error {
281289
}
282290
}
283291

292+
// Start gRPC server for liveness probe (at this point we're "live")
293+
if m.args.GrpcHealthPort != 0 {
294+
if err := m.startGrpcHealthServer(grpcErr); err != nil {
295+
return fmt.Errorf("failed to start gRPC health server: %w", err)
296+
}
297+
}
298+
284299
// Notify that we're ready to accept connections
285300
m.ready <- true
286301
close(m.ready)
@@ -323,6 +338,32 @@ func (m *nfdMaster) Run() error {
323338
}
324339
}
325340

341+
// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes.
342+
// TODO: improve status checking e.g. with watchdog in the main event loop and
343+
// checking that node updater pool is alive.
//
// The server listens on TCP port m.args.GrpcHealthPort and is served from a
// background goroutine, so this method returns as soon as the listener is
// bound. The only error returned directly is a failure to bind the listener;
// an error from serving is delivered asynchronously on errChan instead.
// On success the server is stored in m.healthServer.
344+
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
345+
// No host is given in the address, only the port.
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort))
346+
if err != nil {
347+
return fmt.Errorf("failed to listen: %w", err)
348+
}
349+
350+
// A dedicated server with default options that only carries the health service.
s := grpc.NewServer()
351+
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
352+
klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort)
353+
354+
// Serve in the background; Serve blocks until the server stops.
go func() {
355+
// Make sure the listener is closed whenever this goroutine exits.
defer func() {
356+
lis.Close()
357+
}()
358+
if err := s.Serve(lis); err != nil {
359+
// NOTE(review): this send blocks if errChan is full and nobody is
// receiving — confirm the channel's capacity and readers at the call site.
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
360+
}
361+
klog.InfoS("gRPC health server stopped")
362+
}()
363+
// Keep a reference so the server can be shut down later via m.healthServer.
m.healthServer = s
364+
return nil
365+
}
366+
326367
func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
327368
// Create server listening for TCP connections
328369
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port))
@@ -352,13 +393,8 @@ func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
352393
}
353394
m.server = grpc.NewServer(serverOpts...)
354395

355-
// If the NodeFeature API is enabled, don't register the labeler API
356-
// server. Otherwise, register the labeler server.
357-
if !m.args.EnableNodeFeatureApi {
358-
pb.RegisterLabelerServer(m.server, m)
359-
}
396+
pb.RegisterLabelerServer(m.server, m)
360397

361-
grpc_health_v1.RegisterHealthServer(m.server, health.NewServer())
362398
klog.InfoS("gRPC server serving", "port", m.args.Port)
363399

364400
// Run gRPC server
@@ -421,7 +457,12 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
421457

422458
// Stop NfdMaster
423459
func (m *nfdMaster) Stop() {
424-
m.server.GracefulStop()
460+
if m.server != nil {
461+
m.server.GracefulStop()
462+
}
463+
if m.healthServer != nil {
464+
m.healthServer.GracefulStop()
465+
}
425466

426467
if m.nfdController != nil {
427468
m.nfdController.stop()

0 commit comments

Comments
 (0)