Skip to content

Commit d862cac

Browse files
authored
Merge pull request #2418 from rexagod/livez
feat: add `livez` endpoint
2 parents f4ab888 + 6f8f7d1 commit d862cac

File tree

9 files changed

+57
-18
lines changed

9 files changed

+57
-18
lines changed

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo
342342

343343
After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.
344344

345+
#### Healthcheck Endpoints
346+
347+
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
348+
349+
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this as a liveness probe.
350+
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available on both ports, we recommend using the telemetry metrics endpoint as a readiness probe.
351+
* `/healthz`: Returns a 200 status code if the application is running. We recommend using this as a startup probe.
352+
345353
#### Limited privileges environment
346354

347355
If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:

README.md.tpl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo
343343

344344
After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.
345345

346+
#### Healthcheck Endpoints
347+
348+
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
349+
350+
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this as a liveness probe.
351+
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available on both ports, we recommend using the telemetry metrics endpoint as a readiness probe.
352+
* `/healthz`: Returns a 200 status code if the application is running. We recommend using this as a startup probe.
353+
346354
#### Limited privileges environment
347355

348356
If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:

examples/autosharding/statefulset.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ spec:
3737
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
3838
livenessProbe:
3939
httpGet:
40-
path: /healthz
40+
path: /livez
4141
port: 8080
4242
initialDelaySeconds: 5
4343
timeoutSeconds: 5
@@ -49,7 +49,7 @@ spec:
4949
name: telemetry
5050
readinessProbe:
5151
httpGet:
52-
path: /
52+
path: /metrics
5353
port: 8081
5454
initialDelaySeconds: 5
5555
timeoutSeconds: 5

examples/daemonsetsharding/daemonset.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ spec:
3232
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
3333
livenessProbe:
3434
httpGet:
35-
path: /healthz
35+
path: /livez
3636
port: 8080
3737
initialDelaySeconds: 5
3838
timeoutSeconds: 5
@@ -44,7 +44,7 @@ spec:
4444
name: telemetry
4545
readinessProbe:
4646
httpGet:
47-
path: /
47+
path: /metrics
4848
port: 8081
4949
initialDelaySeconds: 5
5050
timeoutSeconds: 5

examples/daemonsetsharding/deployment-no-node-pods.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ spec:
2727
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
2828
livenessProbe:
2929
httpGet:
30-
path: /healthz
30+
path: /livez
3131
port: 8080
3232
initialDelaySeconds: 5
3333
timeoutSeconds: 5
@@ -39,7 +39,7 @@ spec:
3939
name: telemetry
4040
readinessProbe:
4141
httpGet:
42-
path: /
42+
path: /metrics
4343
port: 8081
4444
initialDelaySeconds: 5
4545
timeoutSeconds: 5

examples/daemonsetsharding/deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ spec:
2626
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
2727
livenessProbe:
2828
httpGet:
29-
path: /healthz
29+
path: /livez
3030
port: 8080
3131
initialDelaySeconds: 5
3232
timeoutSeconds: 5
@@ -38,7 +38,7 @@ spec:
3838
name: telemetry
3939
readinessProbe:
4040
httpGet:
41-
path: /
41+
path: /metrics
4242
port: 8081
4343
initialDelaySeconds: 5
4444
timeoutSeconds: 5

examples/standard/deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ spec:
2424
- image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
2525
livenessProbe:
2626
httpGet:
27-
path: /healthz
27+
path: /livez
2828
port: 8080
2929
initialDelaySeconds: 5
3030
timeoutSeconds: 5
@@ -36,7 +36,7 @@ spec:
3636
name: telemetry
3737
readinessProbe:
3838
httpGet:
39-
path: /
39+
path: /metrics
4040
port: 8081
4141
initialDelaySeconds: 5
4242
timeoutSeconds: 5

jsonnet/kube-state-metrics/kube-state-metrics.libsonnet

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,11 +193,11 @@
193193
},
194194
livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
195195
port: 8080,
196-
path: '/healthz',
196+
path: '/livez',
197197
} },
198198
readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
199199
port: 8081,
200-
path: '/',
200+
path: '/metrics',
201201
} },
202202
};
203203

pkg/app/server.go

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ import (
3030
"strings"
3131
"time"
3232

33+
"gopkg.in/yaml.v3"
34+
"k8s.io/client-go/kubernetes"
35+
_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
36+
"k8s.io/client-go/tools/clientcmd"
37+
"k8s.io/klog/v2"
38+
3339
"github.com/oklog/run"
3440
"github.com/prometheus/client_golang/prometheus"
3541
"github.com/prometheus/client_golang/prometheus/collectors"
@@ -38,10 +44,6 @@ import (
3844
"github.com/prometheus/client_golang/prometheus/promhttp"
3945
"github.com/prometheus/common/version"
4046
"github.com/prometheus/exporter-toolkit/web"
41-
"gopkg.in/yaml.v3"
42-
_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
43-
"k8s.io/client-go/tools/clientcmd"
44-
"k8s.io/klog/v2"
4547

4648
"k8s.io/kube-state-metrics/v2/internal/discovery"
4749
"k8s.io/kube-state-metrics/v2/internal/store"
@@ -59,6 +61,7 @@ import (
5961
const (
6062
metricsPath = "/metrics"
6163
healthzPath = "/healthz"
64+
livezPath = "/livez"
6265
)
6366

6467
// promLogger implements promhttp.Logger
@@ -321,7 +324,7 @@ func RunKubeStateMetrics(ctx context.Context, opts *options.Options) error {
321324
WebConfigFile: &tlsConfig,
322325
}
323326

324-
metricsMux := buildMetricsServer(m, durationVec)
327+
metricsMux := buildMetricsServer(m, durationVec, kubeClient)
325328
metricsServerListenAddress := net.JoinHostPort(opts.Host, strconv.Itoa(opts.Port))
326329
metricsServer := http.Server{
327330
Handler: metricsMux,
@@ -393,7 +396,7 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
393396
return mux
394397
}
395398

396-
func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec) *http.ServeMux {
399+
func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec, client kubernetes.Interface) *http.ServeMux {
397400
mux := http.NewServeMux()
398401

399402
// TODO: This doesn't belong into serveMetrics
@@ -403,7 +406,23 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
403406
mux.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol))
404407
mux.Handle("/debug/pprof/trace", http.HandlerFunc(pprof.Trace))
405408

409+
// Add metricsPath
406410
mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m))
411+
412+
// Add livezPath
413+
mux.Handle(livezPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
414+
415+
// Query the Kube API to make sure we are not affected by a network outage.
416+
got := client.CoreV1().RESTClient().Get().AbsPath("/livez").Do(context.Background())
417+
if got.Error() != nil {
418+
w.WriteHeader(http.StatusServiceUnavailable)
419+
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
420+
return
421+
}
422+
w.WriteHeader(http.StatusOK)
423+
w.Write([]byte(http.StatusText(http.StatusOK)))
424+
}))
425+
407426
// Add healthzPath
408427
mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) {
409428
w.WriteHeader(http.StatusOK)
@@ -424,6 +443,10 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
424443
Address: healthzPath,
425444
Text: "Healthz",
426445
},
446+
{
447+
Address: livezPath,
448+
Text: "Livez",
449+
},
427450
},
428451
}
429452
landingPage, err := web.NewLandingPage(landingConfig)

0 commit comments

Comments
 (0)