Skip to content

Commit 797c66f

Browse files
authored
Merge pull request #1947 from marquiz/devel/health-topology-updater
nfd-topology-updater: serve metrics and healthz on the same port
2 parents 2653488 + 0b9a8cf commit 797c66f

File tree

6 files changed

+43
-78
lines changed

6 files changed

+43
-78
lines changed

cmd/nfd-topology-updater/main.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
107107
"Do not create or update NodeResourceTopology objects.")
108108
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
109109
"Kube config file.")
110-
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
111-
"Port on which to expose metrics.")
112-
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
113-
"Port on which to expose the grpc health endpoint.")
110+
flagset.IntVar(&args.Port, "port", 8080,
111+
"Port which metrics and healthz endpoints are served on")
114112
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
115113
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
116114
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",

deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ spec:
2020
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
2121
imagePullPolicy: Always
2222
livenessProbe:
23-
grpc:
24-
port: 8082
23+
httpGet:
24+
path: /healthz
25+
port: http
2526
initialDelaySeconds: 10
2627
periodSeconds: 10
2728
readinessProbe:
28-
grpc:
29-
port: 8082
29+
httpGet:
30+
path: /healthz
31+
port: http
3032
initialDelaySeconds: 5
3133
periodSeconds: 10
3234
failureThreshold: 10
@@ -41,5 +43,5 @@ spec:
4143
cpu: 50m
4244
memory: 40Mi
4345
ports:
44-
- name: metrics
45-
containerPort: 8081
46+
- name: http
47+
containerPort: 8080

deployment/helm/node-feature-discovery/templates/topologyupdater.yaml

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,9 @@ spec:
4545
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
4646
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
4747
livenessProbe:
48-
grpc:
49-
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
48+
httpGet:
49+
path: /healthz
50+
port: http
5051
{{- with .Values.topologyUpdater.livenessProbe.initialDelaySeconds }}
5152
initialDelaySeconds: {{ . }}
5253
{{- end }}
@@ -60,8 +61,9 @@ spec:
6061
timeoutSeconds: {{ . }}
6162
{{- end }}
6263
readinessProbe:
63-
grpc:
64-
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
64+
httpGet:
65+
path: /healthz
66+
port: http
6567
{{- with .Values.topologyUpdater.readinessProbe.initialDelaySeconds }}
6668
initialDelaySeconds: {{ . }}
6769
{{- end }}
@@ -113,16 +115,13 @@ spec:
113115
# Disable kubelet state tracking by giving an empty path
114116
- "-kubelet-state-dir="
115117
{{- end }}
116-
- "-metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}"
117-
- "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}"
118+
- "-port={{ .Values.topologyUpdater.port | default "8080"}}"
118119
{{- with .Values.topologyUpdater.extraArgs }}
119120
{{- toYaml . | nindent 10 }}
120121
{{- end }}
121122
ports:
122-
- containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
123-
name: metrics
124-
- containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }}
125-
name: health
123+
- containerPort: {{ .Values.topologyUpdater.port | default "8080"}}
124+
name: http
126125
volumeMounts:
127126
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
128127
- name: kubelet-config

deployment/helm/node-feature-discovery/values.yaml

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -502,8 +502,7 @@ topologyUpdater:
502502
rbac:
503503
create: true
504504

505-
metricsPort: 8081
506-
healthPort: 8082
505+
port: 8080
507506
kubeletConfigPath:
508507
kubeletPodResourcesSockPath:
509508
updateInterval: 60s
@@ -517,17 +516,13 @@ topologyUpdater:
517516
drop: [ "ALL" ]
518517
readOnlyRootFilesystem: true
519518
runAsUser: 0
520-
519+
521520
livenessProbe:
522-
grpc:
523-
port: 8082
524521
initialDelaySeconds: 10
525522
# failureThreshold: 3
526523
# periodSeconds: 10
527524
# timeoutSeconds: 1
528525
readinessProbe:
529-
grpc:
530-
port: 8082
531526
initialDelaySeconds: 5
532527
failureThreshold: 10
533528
# periodSeconds: 10

docs/deployment/helm.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,8 +274,7 @@ API's you need to install the prometheus operator in your cluster.
274274
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
275275
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
276276
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
277-
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. |
278-
| `topologyUpdater.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. |
277+
| `topologyUpdater.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. |
279278
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
280279
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
281280
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |

pkg/nfd-topology-updater/nfd-topology-updater.go

Lines changed: 21 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,13 @@ package nfdtopologyupdater
1818

1919
import (
2020
"fmt"
21-
"net"
21+
"net/http"
2222
"net/url"
2323
"os"
2424
"path/filepath"
2525

2626
"golang.org/x/net/context"
2727

28-
"google.golang.org/grpc"
29-
"google.golang.org/grpc/health"
30-
"google.golang.org/grpc/health/grpc_health_v1"
3128
"k8s.io/apimachinery/pkg/api/errors"
3229
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3330
"k8s.io/apimachinery/pkg/types"
@@ -37,6 +34,7 @@ import (
3734

3835
"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
3936
topologyclientset "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned"
37+
"github.com/prometheus/client_golang/prometheus"
4038
"sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-updater/kubeletnotifier"
4139
"sigs.k8s.io/node-feature-discovery/pkg/podres"
4240
"sigs.k8s.io/node-feature-discovery/pkg/resourcemonitor"
@@ -56,13 +54,12 @@ const (
5654

5755
// Args are the command line arguments
5856
type Args struct {
59-
MetricsPort int
57+
Port int
6058
NoPublish bool
6159
Oneshot bool
6260
KubeConfigFile string
6361
ConfigFile string
6462
KubeletStateDir string
65-
GrpcHealthPort int
6663

6764
Klog map[string]*utils.KlogFlagVal
6865
}
@@ -90,7 +87,6 @@ type nfdTopologyUpdater struct {
9087
ownerRefs []metav1.OwnerReference
9188
k8sClient k8sclient.Interface
9289
kubeletConfigFunc func() (*kubeletconfigv1beta1.KubeletConfiguration, error)
93-
healthServer *grpc.Server
9490
}
9591

9692
// NewTopologyUpdater creates a new NfdTopologyUpdater instance.
@@ -134,27 +130,8 @@ func (w *nfdTopologyUpdater) detectTopologyPolicyAndScope() (string, string, err
134130
return klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope, nil
135131
}
136132

137-
func (w *nfdTopologyUpdater) startGrpcHealthServer(errChan chan<- error) error {
138-
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
139-
if err != nil {
140-
return fmt.Errorf("failed to listen: %w", err)
141-
}
142-
143-
s := grpc.NewServer()
144-
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
145-
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)
146-
147-
go func() {
148-
defer func() {
149-
lis.Close()
150-
}()
151-
if err := s.Serve(lis); err != nil {
152-
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
153-
}
154-
klog.InfoS("gRPC health server stopped")
155-
}()
156-
w.healthServer = s
157-
return nil
133+
func (w *nfdTopologyUpdater) Healthz(writer http.ResponseWriter, _ *http.Request) {
134+
writer.WriteHeader(http.StatusOK)
158135
}
159136

160137
// Run nfdTopologyUpdater. Returns if a fatal error is encountered, or, after
@@ -187,15 +164,14 @@ func (w *nfdTopologyUpdater) Run() error {
187164
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
188165
}
189166

167+
httpMux := http.NewServeMux()
168+
190169
// Register to metrics server
191-
if w.args.MetricsPort > 0 {
192-
m := utils.CreateMetricsServer(w.args.MetricsPort,
193-
buildInfo,
194-
scanErrors)
195-
go m.Run()
196-
registerVersion(version.Get())
197-
defer m.Stop()
198-
}
170+
promRegistry := prometheus.NewRegistry()
171+
promRegistry.MustRegister(
172+
buildInfo,
173+
scanErrors)
174+
registerVersion(version.Get())
199175

200176
var resScan resourcemonitor.ResourcesScanner
201177

@@ -215,20 +191,19 @@ func (w *nfdTopologyUpdater) Run() error {
215191
return fmt.Errorf("failed to obtain node resource information: %w", err)
216192
}
217193

218-
grpcErr := make(chan error)
194+
// Register health probe (at this point we're "ready and live")
195+
httpMux.HandleFunc("/healthz", w.Healthz)
219196

220-
// Start gRPC server for liveness probe (at this point we're "live")
221-
if w.args.GrpcHealthPort != 0 {
222-
if err := w.startGrpcHealthServer(grpcErr); err != nil {
223-
return fmt.Errorf("failed to start gRPC health server: %w", err)
224-
}
225-
}
197+
// Start HTTP server
198+
httpServer := http.Server{Addr: fmt.Sprintf(":%d", w.args.Port), Handler: httpMux}
199+
go func() {
200+
klog.InfoS("http server starting", "port", httpServer.Addr)
201+
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
202+
}()
203+
defer httpServer.Close()
226204

227205
for {
228206
select {
229-
case err := <-grpcErr:
230-
return fmt.Errorf("error in serving gRPC: %w", err)
231-
232207
case info := <-w.eventSource:
233208
klog.V(4).InfoS("event received, scanning...", "event", info.Event)
234209
scanResponse, err := resScan.Scan()
@@ -257,9 +232,6 @@ func (w *nfdTopologyUpdater) Run() error {
257232

258233
case <-w.stop:
259234
klog.InfoS("shutting down nfd-topology-updater")
260-
if w.healthServer != nil {
261-
w.healthServer.GracefulStop()
262-
}
263235
return nil
264236
}
265237
}

0 commit comments

Comments
 (0)