Skip to content

Commit f7618df

Browse files
authored
Merge pull request #2442 from rexagod/readyz
fix: add `readyz` endpoint
2 parents a1fb0ce + dbb0276 commit f7618df

File tree

10 files changed

+76
-39
lines changed

10 files changed

+76
-39
lines changed

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -346,11 +346,13 @@ After running the above, if you see `Clusterrolebinding "cluster-admin-binding"
346346

347347
#### Healthcheck Endpoints
348348

349-
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
349+
The following healthcheck endpoints are available (`self` refers to the telemetry port, while `main` refers to the exposition port):
350350

351-
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe.
352-
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe.
353-
* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe.
351+
* `/healthz` (exposed on `main`): Returns a 200 status code if the application is running. We recommend using this for the startup probe.
352+
* `/livez` (exposed on `main`): Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this for the liveness probe.
353+
* `/readyz` (exposed on `self`): Returns a 200 status code if the application is ready to accept requests and expose metrics. We recommend using this for the readiness probe.
354+
355+
Note that using the telemetry metrics endpoint for any probe is discouraged when proxying the exposition data.
354356

355357
#### Limited privileges environment
356358

README.md.tpl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -347,11 +347,13 @@ After running the above, if you see `Clusterrolebinding "cluster-admin-binding"
347347

348348
#### Healthcheck Endpoints
349349

350-
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
350+
The following healthcheck endpoints are available (`self` refers to the telemetry port, while `main` refers to the exposition port):
351351

352-
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe.
353-
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe.
354-
* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe.
352+
* `/healthz` (exposed on `main`): Returns a 200 status code if the application is running. We recommend using this for the startup probe.
353+
* `/livez` (exposed on `main`): Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this for the liveness probe.
354+
* `/readyz` (exposed on `self`): Returns a 200 status code if the application is ready to accept requests and expose metrics. We recommend using this for the readiness probe.
355+
356+
Note that using the telemetry metrics endpoint for any probe is discouraged when proxying the exposition data.
355357

356358
#### Limited privileges environment
357359

examples/autosharding/statefulset.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ spec:
3838
livenessProbe:
3939
httpGet:
4040
path: /livez
41-
port: 8080
41+
port: http-metrics
4242
initialDelaySeconds: 5
4343
timeoutSeconds: 5
4444
name: kube-state-metrics
@@ -49,8 +49,8 @@ spec:
4949
name: telemetry
5050
readinessProbe:
5151
httpGet:
52-
path: /metrics
53-
port: 8081
52+
path: /readyz
53+
port: telemetry
5454
initialDelaySeconds: 5
5555
timeoutSeconds: 5
5656
securityContext:

examples/daemonsetsharding/daemonset.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ spec:
3333
livenessProbe:
3434
httpGet:
3535
path: /livez
36-
port: 8080
36+
port: http-metrics
3737
initialDelaySeconds: 5
3838
timeoutSeconds: 5
3939
name: kube-state-metrics-shard
@@ -44,8 +44,8 @@ spec:
4444
name: telemetry
4545
readinessProbe:
4646
httpGet:
47-
path: /metrics
48-
port: 8081
47+
path: /readyz
48+
port: telemetry
4949
initialDelaySeconds: 5
5050
timeoutSeconds: 5
5151
securityContext:

examples/daemonsetsharding/deployment-no-node-pods.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ spec:
2828
livenessProbe:
2929
httpGet:
3030
path: /livez
31-
port: 8080
31+
port: http-metrics
3232
initialDelaySeconds: 5
3333
timeoutSeconds: 5
3434
name: kube-state-metrics
@@ -39,8 +39,8 @@ spec:
3939
name: telemetry
4040
readinessProbe:
4141
httpGet:
42-
path: /metrics
43-
port: 8081
42+
path: /readyz
43+
port: telemetry
4444
initialDelaySeconds: 5
4545
timeoutSeconds: 5
4646
securityContext:

examples/daemonsetsharding/deployment.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ spec:
2727
livenessProbe:
2828
httpGet:
2929
path: /livez
30-
port: 8080
30+
port: http-metrics
3131
initialDelaySeconds: 5
3232
timeoutSeconds: 5
3333
name: kube-state-metrics
@@ -38,8 +38,8 @@ spec:
3838
name: telemetry
3939
readinessProbe:
4040
httpGet:
41-
path: /metrics
42-
port: 8081
41+
path: /readyz
42+
port: telemetry
4343
initialDelaySeconds: 5
4444
timeoutSeconds: 5
4545
securityContext:

examples/standard/deployment.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ spec:
2525
livenessProbe:
2626
httpGet:
2727
path: /livez
28-
port: 8080
28+
port: http-metrics
2929
initialDelaySeconds: 5
3030
timeoutSeconds: 5
3131
name: kube-state-metrics
@@ -36,8 +36,8 @@ spec:
3636
name: telemetry
3737
readinessProbe:
3838
httpGet:
39-
path: /metrics
40-
port: 8081
39+
path: /readyz
40+
port: telemetry
4141
initialDelaySeconds: 5
4242
timeoutSeconds: 5
4343
securityContext:

jsonnet/kube-state-metrics/kube-state-metrics.libsonnet

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,12 +192,12 @@
192192
seccompProfile: { type: 'RuntimeDefault' },
193193
},
194194
livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
195-
port: 8080,
195+
port: "http-metrics",
196196
path: '/livez',
197197
} },
198198
readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
199-
port: 8081,
200-
path: '/metrics',
199+
port: "telemetry",
200+
path: '/readyz',
201201
} },
202202
};
203203

pkg/app/server.go

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ const (
6262
metricsPath = "/metrics"
6363
healthzPath = "/healthz"
6464
livezPath = "/livez"
65+
readyzPath = "/readyz"
6566
)
6667

6768
// promLogger implements promhttp.Logger
@@ -376,6 +377,18 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
376377
// Add metricsPath
377378
mux.Handle(metricsPath, promhttp.HandlerFor(registry, promhttp.HandlerOpts{ErrorLog: promLogger{}}))
378379

380+
// Add readyzPath
381+
mux.Handle(readyzPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
382+
count, err := util.GatherAndCount(registry)
383+
if err != nil || count == 0 {
384+
w.WriteHeader(http.StatusServiceUnavailable)
385+
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
386+
return
387+
}
388+
w.WriteHeader(http.StatusOK)
389+
w.Write([]byte(http.StatusText(http.StatusOK)))
390+
}))
391+
379392
// Add index
380393
landingConfig := web.LandingConfig{
381394
Name: "kube-state-metrics",
@@ -396,6 +409,19 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
396409
return mux
397410
}
398411

412+
func handleClusterDelegationForProber(client kubernetes.Interface, probeType string) http.HandlerFunc {
413+
return func(w http.ResponseWriter, _ *http.Request) {
414+
got := client.CoreV1().RESTClient().Get().AbsPath(probeType).Do(context.Background())
415+
if got.Error() != nil {
416+
w.WriteHeader(http.StatusServiceUnavailable)
417+
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
418+
return
419+
}
420+
w.WriteHeader(http.StatusOK)
421+
w.Write([]byte(http.StatusText(http.StatusOK)))
422+
}
423+
}
424+
399425
func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec, client kubernetes.Interface) *http.ServeMux {
400426
mux := http.NewServeMux()
401427

@@ -410,18 +436,7 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
410436
mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m))
411437

412438
// Add livezPath
413-
mux.Handle(livezPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
414-
415-
// Query the Kube API to make sure we are not affected by a network outage.
416-
got := client.CoreV1().RESTClient().Get().AbsPath("/livez").Do(context.Background())
417-
if got.Error() != nil {
418-
w.WriteHeader(http.StatusServiceUnavailable)
419-
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
420-
return
421-
}
422-
w.WriteHeader(http.StatusOK)
423-
w.Write([]byte(http.StatusText(http.StatusOK)))
424-
}))
439+
mux.Handle(livezPath, handleClusterDelegationForProber(client, livezPath))
425440

426441
// Add healthzPath
427442
mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) {

pkg/util/utils.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import (
2121
"runtime"
2222
"strings"
2323

24-
"github.com/prometheus/common/version"
2524
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2625
"k8s.io/apimachinery/pkg/runtime/schema"
2726
"k8s.io/client-go/discovery"
@@ -32,6 +31,9 @@ import (
3231
"k8s.io/klog/v2"
3332
testUnstructuredMock "k8s.io/sample-controller/pkg/apis/samplecontroller/v1alpha1"
3433

34+
"github.com/prometheus/client_golang/prometheus"
35+
"github.com/prometheus/common/version"
36+
3537
"k8s.io/kube-state-metrics/v2/pkg/customresource"
3638
)
3739

@@ -154,3 +156,19 @@ func GVRFromType(resourceName string, expectedType interface{}) *schema.GroupVer
154156
Resource: r,
155157
}
156158
}
159+
160+
// GatherAndCount gathers all metrics from the provided Gatherer and counts
161+
// them. It returns the number of metric children in all gathered metric
162+
// families together.
163+
func GatherAndCount(g prometheus.Gatherer) (int, error) {
164+
got, err := g.Gather()
165+
if err != nil {
166+
return 0, fmt.Errorf("gathering metrics failed: %w", err)
167+
}
168+
169+
result := 0
170+
for _, mf := range got {
171+
result += len(mf.GetMetric())
172+
}
173+
return result, nil
174+
}

0 commit comments

Comments
 (0)