diff --git a/Makefile b/Makefile index f51e7e7ac..acf280038 100644 --- a/Makefile +++ b/Makefile @@ -40,22 +40,23 @@ dev: generate .PHONY: dev-port-forward dev-port-forward: - kubectl --context k3d-kubernetes-mixin port-forward service/lgtm 3000:3000 4317:4317 4318:4318 9090:9090 + kubectl --context kind-kubernetes-mixin wait --for=condition=Ready pods -l app=lgtm --timeout=300s + kubectl --context kind-kubernetes-mixin port-forward service/lgtm 3000:3000 4317:4317 4318:4318 9090:9090 dev-reload: generate @cp -v prometheus_alerts.yaml scripts/provisioning/prometheus/ && \ cp -v prometheus_rules.yaml scripts/provisioning/prometheus/ && \ - kubectl --context k3d-kubernetes-mixin rollout restart deployment/lgtm && \ + kubectl --context kind-kubernetes-mixin rollout restart deployment/lgtm && \ echo '╔═══════════════════════════════════════════════════════════════╗' && \ echo '║ ║' && \ echo '║ 🔄 Reloading Alert and Recording Rules... ║' && \ echo '║ ║' && \ echo '╚═══════════════════════════════════════════════════════════════╝' && \ - kubectl --context k3d-kubernetes-mixin rollout status deployment/lgtm + kubectl --context kind-kubernetes-mixin rollout status deployment/lgtm .PHONY: dev-down dev-down: - k3d cluster delete kubernetes-mixin + kind delete cluster --name kubernetes-mixin .PHONY: generate generate: prometheus_alerts.yaml prometheus_rules.yaml $(OUT_DIR) diff --git a/README.md b/README.md index b2690f01f..f2d92199d 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,41 @@ [![ci](https://github.com/kubernetes-monitoring/kubernetes-mixin/actions/workflows/ci.yaml/badge.svg)](https://github.com/kubernetes-monitoring/kubernetes-mixin/actions/workflows/ci.yaml) -> NOTE: This project is *pre-release* stage. Flags, configuration, behaviour and design may change significantly in following releases. - A set of Grafana dashboards and Prometheus alerts for Kubernetes. +## Local development + +Run the following command to setup a local [kind](https://kind.sigs.k8s.io) cluster: + +```shell +make dev +``` + +You should see the following output if successful: + +```shell +╔═══════════════════════════════════════════════════════════════╗ +║ 🚀 Development Environment Ready! 🚀 ║ +║ ║ +║ Run `make dev-port-forward` ║ +║ Grafana will be available at http://localhost:3000 ║ +║ ║ +║ Data will be available in a few minutes. ║ +║ ║ +║ Dashboards will refresh every 10s, run `make generate` ║ +║ and refresh your browser to see the changes. ║ +║ ║ +║ Alert and recording rules require `make dev-reload`. ║ +║ ║ +╚═══════════════════════════════════════════════════════════════╝ +``` + +To delete the cluster, run the following: + +```shell +make dev-down +``` + ## Releases > Note: Releases up until `release-0.12` are changes in their own branches. Changelogs are included in releases starting from [version-0.13.0](https://github.com/kubernetes-monitoring/kubernetes-mixin/releases/tag/version-0.13.0). @@ -33,7 +64,7 @@ Some alerts now use Prometheus filters made available in Prometheus 2.11.0, whic Warning: This compatibility matrix was initially created based on experience, we do not guarantee the compatibility, it may be updated based on new learnings. -Warning: By default the expressions will generate *grafana 7.2+* compatible rules using the *$__rate_interval* variable for rate functions. If you need backward compatible rules please set *grafana72: false* in your *_config* +Warning: By default the expressions will generate *grafana 7.2+* compatible rules using the *$\_\_rate_interval* variable for rate functions. If you need backward compatible rules please set *grafana72: false* in your *\_config* ### Release steps @@ -75,6 +106,7 @@ node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m This mixin is designed to be vendored into the repo with your infrastructure config. To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler): You then have three options for deploying your dashboards + 1. Generate the config files and deploy them yourself 2. Use ksonnet to deploy this mixin along with Prometheus and Grafana 3. Use prometheus-operator to deploy this mixin (TODO) @@ -109,11 +141,12 @@ The `prometheus_alerts.yaml` and `prometheus_rules.yaml` file then need to passe ### Dashboards for Windows Nodes There exist separate dashboards for windows resources. -1) Compute Resources / Cluster(Windows) -2) Compute Resources / Namespace(Windows) -3) Compute Resources / Pod(Windows) -4) USE Method / Cluster(Windows) -5) USE Method / Node(Windows) + +1. Compute Resources / Cluster(Windows) +2. Compute Resources / Namespace(Windows) +3. Compute Resources / Pod(Windows) +4. USE Method / Cluster(Windows) +5. USE Method / Node(Windows) These dashboards are based on metrics populated by [windows-exporter](https://github.com/prometheus-community/windows_exporter) from each Windows node. @@ -270,14 +303,14 @@ Same result can be achieved by modyfying the existing `config.libsonnet` with th While the community has not yet fully agreed on alert severities and their to be used, this repository assumes the following paradigms when setting the severities: -* Critical: An issue, that needs to page a person to take instant action -* Warning: An issue, that needs to be worked on but in the regular work queue or for during office hours rather than paging the oncall -* Info: Is meant to support a trouble shooting process by informing about a non-normal situation for one or more systems but not worth a page or ticket on its own. +- Critical: An issue, that needs to page a person to take instant action +- Warning: An issue, that needs to be worked on but in the regular work queue or for during office hours rather than paging the oncall +- Info: Is meant to support a trouble shooting process by informing about a non-normal situation for one or more systems but not worth a page or ticket on its own. ### Architecture and Technical Decisions -* For more motivation, see "[The RED Method: How to instrument your services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal?iframe=no&w=100%&sidebar=yes&bg=no)" talk from CloudNativeCon Austin. -* For more information about monitoring mixins, see this [design doc](DESIGN.md). +- For more motivation, see "[The RED Method: How to instrument your services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal?iframe=no&w=100%&sidebar=yes&bg=no)" talk from CloudNativeCon Austin. +- For more information about monitoring mixins, see this [design doc](DESIGN.md). ## Note diff --git a/scripts/lgtm.sh b/scripts/lgtm.sh index b513ae926..81bdd069c 100755 --- a/scripts/lgtm.sh +++ b/scripts/lgtm.sh @@ -1,27 +1,47 @@ #!/bin/bash - set -ex -# export time in milliseconds -# export OTEL_METRIC_EXPORT_INTERVAL=500 - -# use http instead of https (needed because of https://github.com/open-telemetry/opentelemetry-go/issues/4834) -# export OTEL_EXPORTER_OTLP_INSECURE="true" - -# https://github.com/grafana/docker-otel-lgtm/tree/main/examples - -# docker run -p 3001:3000 -p 4317:4317 -p 4318:4318 \ -# -v ./provisioning/dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards \ -# -v ../dashboards_out:/kubernetes-mixin/dashboards_out \ -# --rm -ti grafana/otel-lgtm - cp ../prometheus_alerts.yaml provisioning/prometheus/ cp ../prometheus_rules.yaml provisioning/prometheus/ -# set up 1-node k3d cluster -k3d cluster create kubernetes-mixin \ - -v "$PWD"/provisioning:/kubernetes-mixin/provisioning \ - -v "$PWD"/../dashboards_out:/kubernetes-mixin/dashboards_out +# Create kind cluster with kube-scheduler resource metrics enabled +kind create cluster --name kubernetes-mixin --config - <