Skip to content

Commit dae1f06

Browse files
committed
fix: add readyz endpoint
Discourage the usage of `/metrics` for any probe, and use `/readyz` in place of the earlier telemetry metrics endpoint to secure the exposition data. Signed-off-by: Pranshu Srivastava <[email protected]>
1 parent 85d1423 commit dae1f06

File tree

9 files changed

+46
-39
lines changed

9 files changed

+46
-39
lines changed

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -348,9 +348,11 @@ After running the above, if you see `Clusterrolebinding "cluster-admin-binding"
348348

349349
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
350350

351-
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe.
352-
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe.
353-
* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe.
351+
* `/healthz`: Returns a 200 status code if the application is running. We recommend using this for the startup probe.
352+
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this for the liveness probe.
353+
* `/readyz`: Returns a 200 status code if the application is ready to accept traffic. We recommend using this for the readiness probe.
354+
355+
Note that using the telemetry metrics endpoint for any probe is discouraged when proxying the exposition data.
354356

355357
#### Limited privileges environment
356358

README.md.tpl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -349,9 +349,11 @@ After running the above, if you see `Clusterrolebinding "cluster-admin-binding"
349349

350350
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
351351

352-
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe.
353-
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe.
354-
* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe.
352+
* `/healthz`: Returns a 200 status code if the application is running. We recommend using this for the startup probe.
353+
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this for the liveness probe.
354+
* `/readyz`: Returns a 200 status code if the application is ready to accept traffic. We recommend using this for the readiness probe.
355+
356+
Note that using the telemetry metrics endpoint for any probe is discouraged when proxying the exposition data.
355357

356358
#### Limited privileges environment
357359

examples/autosharding/statefulset.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ spec:
3838
livenessProbe:
3939
httpGet:
4040
path: /livez
41-
port: 8080
41+
port: http-metrics
4242
initialDelaySeconds: 5
4343
timeoutSeconds: 5
4444
name: kube-state-metrics
@@ -49,8 +49,8 @@ spec:
4949
name: telemetry
5050
readinessProbe:
5151
httpGet:
52-
path: /metrics
53-
port: 8081
52+
path: /readyz
53+
port: http-metrics
5454
initialDelaySeconds: 5
5555
timeoutSeconds: 5
5656
securityContext:

examples/daemonsetsharding/daemonset.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ spec:
3333
livenessProbe:
3434
httpGet:
3535
path: /livez
36-
port: 8080
36+
port: http-metrics
3737
initialDelaySeconds: 5
3838
timeoutSeconds: 5
3939
name: kube-state-metrics-shard
@@ -44,8 +44,8 @@ spec:
4444
name: telemetry
4545
readinessProbe:
4646
httpGet:
47-
path: /metrics
48-
port: 8081
47+
path: /readyz
48+
port: http-metrics
4949
initialDelaySeconds: 5
5050
timeoutSeconds: 5
5151
securityContext:

examples/daemonsetsharding/deployment-no-node-pods.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ spec:
2828
livenessProbe:
2929
httpGet:
3030
path: /livez
31-
port: 8080
31+
port: http-metrics
3232
initialDelaySeconds: 5
3333
timeoutSeconds: 5
3434
name: kube-state-metrics
@@ -39,8 +39,8 @@ spec:
3939
name: telemetry
4040
readinessProbe:
4141
httpGet:
42-
path: /metrics
43-
port: 8081
42+
path: /readyz
43+
port: http-metrics
4444
initialDelaySeconds: 5
4545
timeoutSeconds: 5
4646
securityContext:

examples/daemonsetsharding/deployment.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ spec:
2727
livenessProbe:
2828
httpGet:
2929
path: /livez
30-
port: 8080
30+
port: http-metrics
3131
initialDelaySeconds: 5
3232
timeoutSeconds: 5
3333
name: kube-state-metrics
@@ -38,8 +38,8 @@ spec:
3838
name: telemetry
3939
readinessProbe:
4040
httpGet:
41-
path: /metrics
42-
port: 8081
41+
path: /readyz
42+
port: http-metrics
4343
initialDelaySeconds: 5
4444
timeoutSeconds: 5
4545
securityContext:

examples/standard/deployment.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ spec:
2525
livenessProbe:
2626
httpGet:
2727
path: /livez
28-
port: 8080
28+
port: http-metrics
2929
initialDelaySeconds: 5
3030
timeoutSeconds: 5
3131
name: kube-state-metrics
@@ -36,8 +36,8 @@ spec:
3636
name: telemetry
3737
readinessProbe:
3838
httpGet:
39-
path: /metrics
40-
port: 8081
39+
path: /readyz
40+
port: http-metrics
4141
initialDelaySeconds: 5
4242
timeoutSeconds: 5
4343
securityContext:

jsonnet/kube-state-metrics/kube-state-metrics.libsonnet

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,12 +192,12 @@
192192
seccompProfile: { type: 'RuntimeDefault' },
193193
},
194194
livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
195-
port: 8080,
195+
port: "http-metrics",
196196
path: '/livez',
197197
} },
198198
readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
199-
port: 8081,
200-
path: '/metrics',
199+
port: "http-metrics",
200+
path: '/readyz',
201201
} },
202202
};
203203

pkg/app/server.go

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ const (
6262
metricsPath = "/metrics"
6363
healthzPath = "/healthz"
6464
livezPath = "/livez"
65+
readyzPath = "/readyz"
6566
)
6667

6768
// promLogger implements promhttp.Logger
@@ -396,6 +397,19 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
396397
return mux
397398
}
398399

400+
func handleClusterDelegationForProber(client kubernetes.Interface, probeType string) http.HandlerFunc {
401+
return func(w http.ResponseWriter, _ *http.Request) {
402+
got := client.CoreV1().RESTClient().Get().AbsPath(probeType).Do(context.Background())
403+
if got.Error() != nil {
404+
w.WriteHeader(http.StatusServiceUnavailable)
405+
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
406+
return
407+
}
408+
w.WriteHeader(http.StatusOK)
409+
w.Write([]byte(http.StatusText(http.StatusOK)))
410+
}
411+
}
412+
399413
func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec, client kubernetes.Interface) *http.ServeMux {
400414
mux := http.NewServeMux()
401415

@@ -410,24 +424,13 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
410424
mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m))
411425

412426
// Add livezPath
413-
mux.Handle(livezPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
427+
mux.Handle(livezPath, handleClusterDelegationForProber(client, livezPath))
414428

415-
// Query the Kube API to make sure we are not affected by a network outage.
416-
got := client.CoreV1().RESTClient().Get().AbsPath("/livez").Do(context.Background())
417-
if got.Error() != nil {
418-
w.WriteHeader(http.StatusServiceUnavailable)
419-
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
420-
return
421-
}
422-
w.WriteHeader(http.StatusOK)
423-
w.Write([]byte(http.StatusText(http.StatusOK)))
424-
}))
429+
// Add readyzPath
430+
mux.Handle(readyzPath, handleClusterDelegationForProber(client, readyzPath))
425431

426432
// Add healthzPath
427-
mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) {
428-
w.WriteHeader(http.StatusOK)
429-
w.Write([]byte(http.StatusText(http.StatusOK)))
430-
})
433+
mux.Handle(healthzPath, handleClusterDelegationForProber(client, healthzPath))
431434

432435
// Add index
433436
landingConfig := web.LandingConfig{

0 commit comments

Comments
 (0)