Skip to content

Commit 4f24a38

Browse files
authored
Merge pull request #1929 from marquiz/devel/health
nfd-worker: use single http port for metrics and healthz
2 parents 837d0cf + 850c544 commit 4f24a38

File tree

6 files changed

+50
-85
lines changed

6 files changed

+50
-85
lines changed

cmd/nfd-worker/main.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,8 @@ func initFlags(flagset *flag.FlagSet) (*worker.Args, *worker.ConfigOverrideArgs)
108108
"Kubeconfig to use")
109109
flagset.BoolVar(&args.Oneshot, "oneshot", false,
110110
"Do not publish feature labels")
111-
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
112-
"Port on which to expose metrics.")
113-
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
114-
"Port on which to expose the grpc health endpoint.")
111+
flagset.IntVar(&args.Port, "port", 8080,
112+
"Port on which metrics and healthz endpoints are served")
115113
flagset.StringVar(&args.Options, "options", "",
116114
"Specify config options from command line. Config options are specified "+
117115
"in the same format as in the config file (i.e. json or yaml). These options")

deployment/base/worker-daemonset/worker-daemonset.yaml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ spec:
2020
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
2121
imagePullPolicy: Always
2222
livenessProbe:
23-
grpc:
24-
port: 8082
23+
httpGet:
24+
path: /healthz
25+
port: http
2526
initialDelaySeconds: 10
2627
periodSeconds: 10
2728
readinessProbe:
28-
grpc:
29-
port: 8082
29+
httpGet:
30+
path: /healthz
31+
port: http
3032
initialDelaySeconds: 5
3133
periodSeconds: 10
3234
failureThreshold: 10
@@ -40,5 +42,5 @@ spec:
4042
cpu: 5m
4143
memory: 64Mi
4244
ports:
43-
- name: metrics
44-
containerPort: 8081
45+
- name: http
46+
containerPort: 8080

deployment/helm/node-feature-discovery/templates/worker.yaml

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,9 @@ spec:
4747
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
4848
imagePullPolicy: {{ .Values.image.pullPolicy }}
4949
livenessProbe:
50-
grpc:
51-
port: {{ .Values.worker.healthPort | default "8082" }}
50+
httpGet:
51+
path: /healthz
52+
port: http
5253
{{- with .Values.worker.livenessProbe.initialDelaySeconds }}
5354
initialDelaySeconds: {{ . }}
5455
{{- end }}
@@ -62,8 +63,9 @@ spec:
6263
timeoutSeconds: {{ . }}
6364
{{- end }}
6465
readinessProbe:
65-
grpc:
66-
port: {{ .Values.worker.healthPort | default "8082" }}
66+
httpGet:
67+
path: /healthz
68+
port: http
6769
{{- with .Values.worker.readinessProbe.initialDelaySeconds }}
6870
initialDelaySeconds: {{ . }}
6971
{{- end }}
@@ -104,16 +106,13 @@ spec:
104106
{{- range $key, $value := .Values.featureGates }}
105107
- "-feature-gates={{ $key }}={{ $value }}"
106108
{{- end }}
107-
- "-metrics={{ .Values.worker.metricsPort | default "8081"}}"
108-
- "-grpc-health={{ .Values.worker.healthPort | default "8082" }}"
109+
- "-port={{ .Values.worker.port | default "8080"}}"
109110
{{- with .Values.worker.extraArgs }}
110111
{{- toYaml . | nindent 8 }}
111112
{{- end }}
112113
ports:
113-
- containerPort: {{ .Values.worker.metricsPort | default "8081"}}
114-
name: metrics
115-
- containerPort: {{ .Values.worker.healthPort | default "8082" }}
116-
name: health
114+
- containerPort: {{ .Values.worker.port | default "8080"}}
115+
name: http
117116
volumeMounts:
118117
- name: host-boot
119118
mountPath: "/host-boot"

deployment/helm/node-feature-discovery/values.yaml

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -418,8 +418,7 @@ worker:
418418
# matchName: {op: In, value: ["SWAP", "X86", "ARM"]}
419419
### <NFD-WORKER-CONF-END-DO-NOT-REMOVE>
420420

421-
metricsPort: 8081
422-
healthPort: 8082
421+
port: 8080
423422
daemonsetAnnotations: {}
424423
podSecurityContext: {}
425424
# fsGroup: 2000
@@ -433,15 +432,11 @@ worker:
433432
# runAsUser: 1000
434433

435434
livenessProbe:
436-
grpc:
437-
port: 8082
438435
initialDelaySeconds: 10
439436
# failureThreshold: 3
440437
# periodSeconds: 10
441438
# timeoutSeconds: 1
442439
readinessProbe:
443-
grpc:
444-
port: 8082
445440
initialDelaySeconds: 5
446441
failureThreshold: 10
447442
# periodSeconds: 10

docs/deployment/helm.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -233,8 +233,7 @@ API's you need to install the prometheus operator in your cluster.
233233
| `worker.*` | dict | | NFD worker daemonset configuration |
234234
| `worker.enable` | bool | true | Specifies whether nfd-worker should be deployed |
235235
| `worker.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace |
236-
| `worker.metricsPort` | int | 8081 | Port on which to expose metrics from components to prometheus operator. **DEPRECATED**: will be replaced by `worker.port` in NFD v0.18. |
237-
| `worker.healthPort` | int | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `worker.port` in NFD v0.18. |
236+
| `worker.port` | int | 8080 | Port on which to serve http for metrics and healthz endpoints. |
238237
| `worker.config` | dict | | NFD worker [configuration](../reference/worker-configuration-reference) |
239238
| `worker.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings |
240239
| `worker.securityContext` | dict | {} | Container [security settings](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container) |

pkg/nfd-worker/nfd-worker.go

Lines changed: 29 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,18 @@ package nfdworker
1919
import (
2020
"encoding/json"
2121
"fmt"
22-
"net"
22+
"net/http"
2323
"os"
2424
"path/filepath"
2525
"regexp"
2626
"sort"
2727
"strings"
2828
"time"
2929

30+
"github.com/prometheus/client_golang/prometheus"
31+
"github.com/prometheus/client_golang/prometheus/promhttp"
3032
"golang.org/x/exp/maps"
3133
"golang.org/x/net/context"
32-
"google.golang.org/grpc"
33-
"google.golang.org/grpc/health"
34-
"google.golang.org/grpc/health/grpc_health_v1"
3534
"k8s.io/apimachinery/pkg/api/errors"
3635
"k8s.io/apimachinery/pkg/types"
3736
"k8s.io/apimachinery/pkg/util/validation"
@@ -93,14 +92,13 @@ type Labels map[string]string
9392

9493
// Args are the command line arguments of NfdWorker.
9594
type Args struct {
96-
ConfigFile string
97-
Klog map[string]*utils.KlogFlagVal
98-
Kubeconfig string
99-
Oneshot bool
100-
Options string
101-
MetricsPort int
102-
GrpcHealthPort int
103-
NoOwnerRefs bool
95+
ConfigFile string
96+
Klog map[string]*utils.KlogFlagVal
97+
Kubeconfig string
98+
Oneshot bool
99+
Options string
100+
Port int
101+
NoOwnerRefs bool
104102

105103
Overrides ConfigOverrideArgs
106104
}
@@ -118,7 +116,6 @@ type nfdWorker struct {
118116
configFilePath string
119117
config *NFDConfig
120118
kubernetesNamespace string
121-
healthServer *grpc.Server
122119
k8sClient k8sclient.Interface
123120
nfdClient nfdclient.Interface
124121
stop chan struct{} // channel for signaling stop
@@ -206,6 +203,10 @@ func newDefaultConfig() *NFDConfig {
206203
}
207204
}
208205

206+
func (w *nfdWorker) Healthz(writer http.ResponseWriter, _ *http.Request) {
207+
writer.WriteHeader(http.StatusOK)
208+
}
209+
209210
func (i *infiniteTicker) Reset(d time.Duration) {
210211
switch {
211212
case d > 0:
@@ -217,29 +218,6 @@ func (i *infiniteTicker) Reset(d time.Duration) {
217218
}
218219
}
219220

220-
func (w *nfdWorker) startGrpcHealthServer(errChan chan<- error) error {
221-
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
222-
if err != nil {
223-
return fmt.Errorf("failed to listen: %w", err)
224-
}
225-
226-
s := grpc.NewServer()
227-
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
228-
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)
229-
230-
go func() {
231-
defer func() {
232-
lis.Close()
233-
}()
234-
if err := s.Serve(lis); err != nil {
235-
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
236-
}
237-
klog.InfoS("gRPC health server stopped")
238-
}()
239-
w.healthServer = s
240-
return nil
241-
}
242-
243221
// Run feature discovery.
244222
func (w *nfdWorker) runFeatureDiscovery() error {
245223
discoveryStart := time.Now()
@@ -324,15 +302,13 @@ func (w *nfdWorker) Run() error {
324302
labelTrigger.Reset(w.config.Core.SleepInterval.Duration)
325303
defer labelTrigger.Stop()
326304

305+
httpMux := http.NewServeMux()
306+
327307
// Register to metrics server
328-
if w.args.MetricsPort > 0 {
329-
m := utils.CreateMetricsServer(w.args.MetricsPort,
330-
buildInfo,
331-
featureDiscoveryDuration)
332-
go m.Run()
333-
registerVersion(version.Get())
334-
defer m.Stop()
335-
}
308+
promRegistry := prometheus.NewRegistry()
309+
promRegistry.MustRegister(buildInfo, featureDiscoveryDuration)
310+
httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
311+
registerVersion(version.Get())
336312

337313
err = w.runFeatureDiscovery()
338314
if err != nil {
@@ -344,20 +320,19 @@ func (w *nfdWorker) Run() error {
344320
return nil
345321
}
346322

347-
grpcErr := make(chan error)
323+
// Register health endpoint (at this point we're "ready and live")
324+
httpMux.HandleFunc("/healthz", w.Healthz)
348325

349-
// Start gRPC server for liveness probe (at this point we're "live")
350-
if w.args.GrpcHealthPort != 0 {
351-
if err := w.startGrpcHealthServer(grpcErr); err != nil {
352-
return fmt.Errorf("failed to start gRPC health server: %w", err)
353-
}
354-
}
326+
// Start HTTP server
327+
httpServer := http.Server{Addr: fmt.Sprintf(":%d", w.args.Port), Handler: httpMux}
328+
go func() {
329+
klog.InfoS("http server starting", "port", httpServer.Addr)
330+
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
331+
}()
332+
defer httpServer.Close()
355333

356334
for {
357335
select {
358-
case err := <-grpcErr:
359-
return fmt.Errorf("error in serving gRPC: %w", err)
360-
361336
case <-labelTrigger.C:
362337
err = w.runFeatureDiscovery()
363338
if err != nil {
@@ -366,9 +341,6 @@ func (w *nfdWorker) Run() error {
366341

367342
case <-w.stop:
368343
klog.InfoS("shutting down nfd-worker")
369-
if w.healthServer != nil {
370-
w.healthServer.GracefulStop()
371-
}
372344
return nil
373345
}
374346
}

0 commit comments

Comments
 (0)