Skip to content

Commit 0b0aed2

Browse files
committed
nfd-master: serve metrics and healthz endpoint on the same port
Changes the health endpoint from gRPC to plain HTTP and, at the same time, starts serving both the metrics and healthz endpoints on a single port. Replaces the -metrics and -grpc-health command line flags with a single -port flag, and updates the Helm and kustomize deployments correspondingly.
1 parent 4f24a38 commit 0b0aed2

File tree

6 files changed

+63
-129
lines changed

6 files changed

+63
-129
lines changed

cmd/nfd-master/main.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
108108
"Config file to use.")
109109
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
110110
"Kubeconfig to use")
111-
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
112-
"Port on which to expose metrics.")
113-
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
114-
"Port on which to expose the grpc health endpoint.")
111+
flagset.IntVar(&args.Port, "port", 8080,
112+
"Port which metrics and healthz endpoints are served on")
115113
flagset.BoolVar(&args.Prune, "prune", false,
116114
"Prune all NFD related attributes from all nodes of the cluster and exit.")
117115
flagset.StringVar(&args.Options, "options", "",

deployment/base/master/master-deployment.yaml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,16 @@ spec:
3333
port: 8082
3434
failureThreshold: 30
3535
livenessProbe:
36-
grpc:
37-
port: 8082
36+
httpGet:
37+
path: /healthz
38+
port: http
3839
readinessProbe:
39-
grpc:
40-
port: 8082
40+
httpGet:
41+
path: /healthz
42+
port: http
4143
failureThreshold: 10
4244
command:
4345
- "nfd-master"
4446
ports:
45-
- name: metrics
46-
containerPort: 8081
47+
- name: http
48+
containerPort: 8080

deployment/helm/node-feature-discovery/templates/master.yaml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,9 @@ spec:
4949
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
5050
imagePullPolicy: {{ .Values.image.pullPolicy }}
5151
startupProbe:
52-
grpc:
53-
port: {{ .Values.master.healthPort | default "8082" }}
52+
httpGet:
53+
path: /healthz
54+
port: http
5455
{{- with .Values.master.startupProbe.initialDelaySeconds }}
5556
initialDelaySeconds: {{ . }}
5657
{{- end }}
@@ -64,8 +65,9 @@ spec:
6465
timeoutSeconds: {{ . }}
6566
{{- end }}
6667
livenessProbe:
67-
grpc:
68-
port: {{ .Values.master.healthPort | default "8082" }}
68+
httpGet:
69+
path: /healthz
70+
port: http
6971
{{- with .Values.master.livenessProbe.initialDelaySeconds }}
7072
initialDelaySeconds: {{ . }}
7173
{{- end }}
@@ -79,8 +81,9 @@ spec:
7981
timeoutSeconds: {{ . }}
8082
{{- end }}
8183
readinessProbe:
82-
grpc:
83-
port: {{ .Values.master.healthPort | default "8082" }}
84+
httpGet:
85+
path: /healthz
86+
port: http
8487
{{- with .Values.master.readinessProbe.initialDelaySeconds }}
8588
initialDelaySeconds: {{ . }}
8689
{{- end }}
@@ -97,10 +100,8 @@ spec:
97100
successThreshold: {{ . }}
98101
{{- end }}
99102
ports:
100-
- containerPort: {{ .Values.master.metricsPort | default "8081" }}
101-
name: metrics
102-
- containerPort: {{ .Values.master.healthPort | default "8082" }}
103-
name: health
103+
- containerPort: {{ .Values.master.port | default "8080" }}
104+
name: http
104105
env:
105106
- name: NODE_NAME
106107
valueFrom:
@@ -140,8 +141,7 @@ spec:
140141
{{- range $key, $value := .Values.featureGates }}
141142
- "-feature-gates={{ $key }}={{ $value }}"
142143
{{- end }}
143-
- "-metrics={{ .Values.master.metricsPort | default "8081" }}"
144-
- "-grpc-health={{ .Values.master.healthPort | default "8082" }}"
144+
- "-port={{ .Values.master.port | default "8080" }}"
145145
{{- with .Values.master.extraArgs }}
146146
{{- toYaml . | nindent 12 }}
147147
{{- end }}

deployment/helm/node-feature-discovery/values.yaml

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,7 @@ master:
6767
# retryPeriod: 2s
6868
# nfdApiParallelism: 10
6969
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
70-
metricsPort: 8081
71-
healthPort: 8082
70+
port: 8080
7271
instance:
7372
featureApi:
7473
resyncPeriod:
@@ -149,20 +148,14 @@ master:
149148
values: [""]
150149

151150
startupProbe:
152-
grpc:
153-
port: 8082
154151
failureThreshold: 30
155152
# periodSeconds: 10
156-
livenessProbe:
157-
grpc:
158-
port: 8082
153+
livenessProbe: {}
159154
# failureThreshold: 3
160155
# initialDelaySeconds: 0
161156
# periodSeconds: 10
162157
# timeoutSeconds: 1
163158
readinessProbe:
164-
grpc:
165-
port: 8082
166159
failureThreshold: 10
167160
# initialDelaySeconds: 0
168161
# periodSeconds: 10

docs/deployment/helm.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,7 @@ API's you need to install the prometheus operator in your cluster.
176176
| `master.*` | dict | | NFD master deployment configuration |
177177
| `master.enable` | bool | true | Specifies whether nfd-master should be deployed |
178178
| `master.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace |
179-
| `master.metricsPort` | integer | 8081 | Port on which to expose metrics from components to prometheus operator. **DEPRECATED**: will be replaced by `master.port` in NFD v0.18. |
180-
| `master.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `master.port` in NFD v0.18. |
179+
| `master.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. |
181180
| `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments |
182181
| `master.resyncPeriod` | string | | NFD API controller resync period. |
183182
| `master.extraLabelNs` | array | [] | List of allowed extra label namespaces |

pkg/nfd-master/nfd-master.go

Lines changed: 38 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ import (
2020
"encoding/json"
2121
"fmt"
2222
"maps"
23-
"net"
23+
"net/http"
2424
"os"
2525
"path"
2626
"path/filepath"
@@ -31,10 +31,9 @@ import (
3131
"time"
3232

3333
"github.com/google/uuid"
34+
"github.com/prometheus/client_golang/prometheus"
35+
"github.com/prometheus/client_golang/prometheus/promhttp"
3436
"golang.org/x/net/context"
35-
"google.golang.org/grpc"
36-
"google.golang.org/grpc/health"
37-
"google.golang.org/grpc/health/grpc_health_v1"
3837
corev1 "k8s.io/api/core/v1"
3938
apiequality "k8s.io/apimachinery/pkg/api/equality"
4039
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -115,18 +114,14 @@ type ConfigOverrideArgs struct {
115114

116115
// Args holds command line arguments
117116
type Args struct {
118-
ConfigFile string
119-
Instance string
120-
Klog map[string]*utils.KlogFlagVal
121-
Kubeconfig string
122-
Port int
123-
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
124-
// Could be removed when gRPC labler service is dropped (when nfd-worker tests stop running nfd-master).
125-
GrpcHealthPort int
117+
ConfigFile string
118+
Instance string
119+
Klog map[string]*utils.KlogFlagVal
120+
Kubeconfig string
121+
Port int
126122
Prune bool
127123
Options string
128124
EnableLeaderElection bool
129-
MetricsPort int
130125

131126
Overrides ConfigOverrideArgs
132127
}
@@ -139,7 +134,6 @@ type deniedNs struct {
139134
type NfdMaster interface {
140135
Run() error
141136
Stop()
142-
WaitForReady(time.Duration) bool
143137
}
144138

145139
type nfdMaster struct {
@@ -149,10 +143,7 @@ type nfdMaster struct {
149143
namespace string
150144
nodeName string
151145
configFilePath string
152-
server *grpc.Server
153-
healthServer *grpc.Server
154146
stop chan struct{}
155-
ready chan struct{}
156147
kubeconfig *restclient.Config
157148
k8sClient k8sclient.Interface
158149
nfdClient nfdclientset.Interface
@@ -166,7 +157,6 @@ func NewNfdMaster(opts ...NfdMasterOption) (NfdMaster, error) {
166157
nfd := &nfdMaster{
167158
nodeName: utils.NodeName(),
168159
namespace: utils.GetKubernetesNamespace(),
169-
ready: make(chan struct{}),
170160
stop: make(chan struct{}),
171161
}
172162

@@ -298,22 +288,22 @@ func (m *nfdMaster) Run() error {
298288
}
299289
}
300290

291+
httpMux := http.NewServeMux()
292+
301293
// Register to metrics server
302-
if m.args.MetricsPort > 0 {
303-
m := utils.CreateMetricsServer(m.args.MetricsPort,
304-
buildInfo,
305-
nodeUpdateRequests,
306-
nodeUpdates,
307-
nodeUpdateFailures,
308-
nodeLabelsRejected,
309-
nodeERsRejected,
310-
nodeTaintsRejected,
311-
nfrProcessingTime,
312-
nfrProcessingErrors)
313-
go m.Run()
314-
registerVersion(version.Get())
315-
defer m.Stop()
316-
}
294+
promRegistry := prometheus.NewRegistry()
295+
promRegistry.MustRegister(
296+
buildInfo,
297+
nodeUpdateRequests,
298+
nodeUpdates,
299+
nodeUpdateFailures,
300+
nodeLabelsRejected,
301+
nodeERsRejected,
302+
nodeTaintsRejected,
303+
nfrProcessingTime,
304+
nfrProcessingErrors)
305+
httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
306+
registerVersion(version.Get())
317307

318308
// Run updater that handles events from the nfd CRD API.
319309
if m.nfdController != nil {
@@ -324,60 +314,29 @@ func (m *nfdMaster) Run() error {
324314
}
325315
}
326316

327-
// Start gRPC server for liveness probe (at this point we're "live")
328-
grpcErr := make(chan error)
329-
if m.args.GrpcHealthPort != 0 {
330-
if err := m.startGrpcHealthServer(grpcErr); err != nil {
331-
return fmt.Errorf("failed to start gRPC health server: %w", err)
332-
}
333-
}
334-
335-
// Notify that we're ready to accept connections
336-
close(m.ready)
337-
338-
// NFD-Master main event loop
339-
for {
340-
select {
341-
case err := <-grpcErr:
342-
return fmt.Errorf("error in serving gRPC: %w", err)
343-
344-
case <-m.stop:
345-
klog.InfoS("shutting down nfd-master")
346-
return nil
347-
}
348-
}
349-
}
350-
351-
// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes.
352-
// TODO: improve status checking e.g. with watchdog in the main event loop and
353-
// cheking that node updater pool is alive.
354-
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
355-
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort))
356-
if err != nil {
357-
return fmt.Errorf("failed to listen: %w", err)
358-
}
359-
360-
s := grpc.NewServer()
361-
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
362-
klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort)
317+
// Register health probe (at this point we're "ready and live")
318+
httpMux.HandleFunc("/healthz", m.Healthz)
363319

320+
// Start HTTP server
321+
httpServer := http.Server{Addr: fmt.Sprintf(":%d", m.args.Port), Handler: httpMux}
364322
go func() {
365-
defer func() {
366-
lis.Close()
367-
}()
368-
if err := s.Serve(lis); err != nil {
369-
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
370-
}
371-
klog.InfoS("gRPC health server stopped")
323+
klog.InfoS("http server starting", "port", httpServer.Addr)
324+
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
372325
}()
373-
m.healthServer = s
326+
defer httpServer.Close()
327+
328+
<-m.stop
329+
klog.InfoS("shutting down nfd-master")
374330
return nil
375331
}
376332

333+
func (m *nfdMaster) Healthz(writer http.ResponseWriter, _ *http.Request) {
334+
writer.WriteHeader(http.StatusOK)
335+
}
336+
377337
// nfdAPIUpdateHandler handles events from the nfd API controller.
378338
func (m *nfdMaster) nfdAPIUpdateHandler() {
379-
// We want to unconditionally update all nodes at startup if gRPC is
380-
// disabled (i.e. NodeFeature API is enabled)
339+
// We want to unconditionally update all nodes at startup
381340
updateAll := true
382341
updateNodes := make(map[string]struct{})
383342
nodeFeatureGroup := make(map[string]struct{})
@@ -431,13 +390,6 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
431390

432391
// Stop NfdMaster
433392
func (m *nfdMaster) Stop() {
434-
if m.server != nil {
435-
m.server.GracefulStop()
436-
}
437-
if m.healthServer != nil {
438-
m.healthServer.GracefulStop()
439-
}
440-
441393
if m.nfdController != nil {
442394
m.nfdController.stop()
443395
}
@@ -447,16 +399,6 @@ func (m *nfdMaster) Stop() {
447399
close(m.stop)
448400
}
449401

450-
// Wait until NfdMaster is able able to accept connections.
451-
func (m *nfdMaster) WaitForReady(timeout time.Duration) bool {
452-
select {
453-
case <-m.ready:
454-
return true
455-
case <-time.After(timeout):
456-
}
457-
return false
458-
}
459-
460402
// Prune erases all NFD related properties from the node objects of the cluster.
461403
func (m *nfdMaster) prune() error {
462404
if m.config.NoPublish {

0 commit comments

Comments
 (0)