@@ -20,7 +20,7 @@ import (
2020 "encoding/json"
2121 "fmt"
2222 "maps"
23- "net"
23+ "net/http "
2424 "os"
2525 "path"
2626 "path/filepath"
@@ -31,10 +31,9 @@ import (
3131 "time"
3232
3333 "github.com/google/uuid"
34+ "github.com/prometheus/client_golang/prometheus"
35+ "github.com/prometheus/client_golang/prometheus/promhttp"
3436 "golang.org/x/net/context"
35- "google.golang.org/grpc"
36- "google.golang.org/grpc/health"
37- "google.golang.org/grpc/health/grpc_health_v1"
3837 corev1 "k8s.io/api/core/v1"
3938 apiequality "k8s.io/apimachinery/pkg/api/equality"
4039 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -115,18 +114,14 @@ type ConfigOverrideArgs struct {
115114
116115// Args holds command line arguments
117116type Args struct {
118- ConfigFile string
119- Instance string
120- Klog map [string ]* utils.KlogFlagVal
121- Kubeconfig string
122- Port int
123- // GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
124- // Could be removed when gRPC labler service is dropped (when nfd-worker tests stop running nfd-master).
125- GrpcHealthPort int
117+ ConfigFile string
118+ Instance string
119+ Klog map [string ]* utils.KlogFlagVal
120+ Kubeconfig string
121+ Port int
126122 Prune bool
127123 Options string
128124 EnableLeaderElection bool
129- MetricsPort int
130125
131126 Overrides ConfigOverrideArgs
132127}
@@ -139,7 +134,6 @@ type deniedNs struct {
139134type NfdMaster interface {
140135 Run () error
141136 Stop ()
142- WaitForReady (time.Duration ) bool
143137}
144138
145139type nfdMaster struct {
@@ -149,10 +143,7 @@ type nfdMaster struct {
149143 namespace string
150144 nodeName string
151145 configFilePath string
152- server * grpc.Server
153- healthServer * grpc.Server
154146 stop chan struct {}
155- ready chan struct {}
156147 kubeconfig * restclient.Config
157148 k8sClient k8sclient.Interface
158149 nfdClient nfdclientset.Interface
@@ -166,7 +157,6 @@ func NewNfdMaster(opts ...NfdMasterOption) (NfdMaster, error) {
166157 nfd := & nfdMaster {
167158 nodeName : utils .NodeName (),
168159 namespace : utils .GetKubernetesNamespace (),
169- ready : make (chan struct {}),
170160 stop : make (chan struct {}),
171161 }
172162
@@ -298,22 +288,22 @@ func (m *nfdMaster) Run() error {
298288 }
299289 }
300290
291+ httpMux := http .NewServeMux ()
292+
301293 // Register to metrics server
302- if m .args .MetricsPort > 0 {
303- m := utils .CreateMetricsServer (m .args .MetricsPort ,
304- buildInfo ,
305- nodeUpdateRequests ,
306- nodeUpdates ,
307- nodeUpdateFailures ,
308- nodeLabelsRejected ,
309- nodeERsRejected ,
310- nodeTaintsRejected ,
311- nfrProcessingTime ,
312- nfrProcessingErrors )
313- go m .Run ()
314- registerVersion (version .Get ())
315- defer m .Stop ()
316- }
294+ promRegistry := prometheus .NewRegistry ()
295+ promRegistry .MustRegister (
296+ buildInfo ,
297+ nodeUpdateRequests ,
298+ nodeUpdates ,
299+ nodeUpdateFailures ,
300+ nodeLabelsRejected ,
301+ nodeERsRejected ,
302+ nodeTaintsRejected ,
303+ nfrProcessingTime ,
304+ nfrProcessingErrors )
305+ httpMux .Handle ("/metrics" , promhttp .HandlerFor (promRegistry , promhttp.HandlerOpts {}))
306+ registerVersion (version .Get ())
317307
318308 // Run updater that handles events from the nfd CRD API.
319309 if m .nfdController != nil {
@@ -324,60 +314,29 @@ func (m *nfdMaster) Run() error {
324314 }
325315 }
326316
327- // Start gRPC server for liveness probe (at this point we're "live")
328- grpcErr := make (chan error )
329- if m .args .GrpcHealthPort != 0 {
330- if err := m .startGrpcHealthServer (grpcErr ); err != nil {
331- return fmt .Errorf ("failed to start gRPC health server: %w" , err )
332- }
333- }
334-
335- // Notify that we're ready to accept connections
336- close (m .ready )
337-
338- // NFD-Master main event loop
339- for {
340- select {
341- case err := <- grpcErr :
342- return fmt .Errorf ("error in serving gRPC: %w" , err )
343-
344- case <- m .stop :
345- klog .InfoS ("shutting down nfd-master" )
346- return nil
347- }
348- }
349- }
350-
351- // startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes.
352- // TODO: improve status checking e.g. with watchdog in the main event loop and
353- // checking that node updater pool is alive.
354- func (m * nfdMaster ) startGrpcHealthServer (errChan chan <- error ) error {
355- lis , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , m .args .GrpcHealthPort ))
356- if err != nil {
357- return fmt .Errorf ("failed to listen: %w" , err )
358- }
359-
360- s := grpc .NewServer ()
361- grpc_health_v1 .RegisterHealthServer (s , health .NewServer ())
362- klog .InfoS ("gRPC health server serving" , "port" , m .args .GrpcHealthPort )
317+ // Register health probe (at this point we're "ready and live")
318+ httpMux .HandleFunc ("/healthz" , m .Healthz )
363319
320+ // Start HTTP server
321+ httpServer := http.Server {Addr : fmt .Sprintf (":%d" , m .args .Port ), Handler : httpMux }
364322 go func () {
365- defer func () {
366- lis .Close ()
367- }()
368- if err := s .Serve (lis ); err != nil {
369- errChan <- fmt .Errorf ("gRPC health server exited with an error: %w" , err )
370- }
371- klog .InfoS ("gRPC health server stopped" )
323+ klog .InfoS ("http server starting" , "port" , httpServer .Addr )
324+ klog .InfoS ("http server stopped" , "exitCode" , httpServer .ListenAndServe ())
372325 }()
373- m .healthServer = s
326+ defer httpServer .Close ()
327+
328+ <- m .stop
329+ klog .InfoS ("shutting down nfd-master" )
374330 return nil
375331}
376332
333+ func (m * nfdMaster ) Healthz (writer http.ResponseWriter , _ * http.Request ) {
334+ writer .WriteHeader (http .StatusOK )
335+ }
336+
377337// nfdAPIUpdateHandler handles events from the nfd API controller.
378338func (m * nfdMaster ) nfdAPIUpdateHandler () {
379- // We want to unconditionally update all nodes at startup if gRPC is
380- // disabled (i.e. NodeFeature API is enabled)
339+ // We want to unconditionally update all nodes at startup
381340 updateAll := true
382341 updateNodes := make (map [string ]struct {})
383342 nodeFeatureGroup := make (map [string ]struct {})
@@ -431,13 +390,6 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
431390
432391// Stop NfdMaster
433392func (m * nfdMaster ) Stop () {
434- if m .server != nil {
435- m .server .GracefulStop ()
436- }
437- if m .healthServer != nil {
438- m .healthServer .GracefulStop ()
439- }
440-
441393 if m .nfdController != nil {
442394 m .nfdController .stop ()
443395 }
@@ -447,16 +399,6 @@ func (m *nfdMaster) Stop() {
447399 close (m .stop )
448400}
449401
450- // Wait until NfdMaster is able to accept connections.
451- func (m * nfdMaster ) WaitForReady (timeout time.Duration ) bool {
452- select {
453- case <- m .ready :
454- return true
455- case <- time .After (timeout ):
456- }
457- return false
458- }
459-
460402// Prune erases all NFD related properties from the node objects of the cluster.
461403func (m * nfdMaster ) prune () error {
462404 if m .config .NoPublish {
0 commit comments