diff --git a/Makefile b/Makefile
index 37086c989..f51e7e7ac 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,52 @@ OUT_DIR ?=dashboards_out
 .PHONY: all
 all: fmt generate lint test
 
+# Create the local k3d cluster and install the monitoring stack (see
+# scripts/lgtm.sh), then print usage hints. Rules and dashboards are generated first.
+.PHONY: dev
+dev: generate
+	@cd scripts && ./lgtm.sh && \
+	echo '' && \
+	echo '╔═══════════════════════════════════════════════════════════════╗' && \
+	echo '║             🚀 Development Environment Ready! 🚀              ║' && \
+	echo '║                                                               ║' && \
+	echo '║   Run `make dev-port-forward`                                 ║' && \
+	echo '║   Grafana will be available at http://localhost:3000          ║' && \
+	echo '║                                                               ║' && \
+	echo '║   Data will be available in a few minutes.                    ║' && \
+	echo '║                                                               ║' && \
+	echo '║   Dashboards will refresh every 10s, run `make generate`      ║' && \
+	echo '║   and refresh your browser to see the changes.                ║' && \
+	echo '║                                                               ║' && \
+	echo '║   Alert and recording rules require `make dev-reload`.        ║' && \
+	echo '║                                                               ║' && \
+	echo '╚═══════════════════════════════════════════════════════════════╝'
+
+# Forward the lgtm service ports to localhost: Grafana (3000), OTLP gRPC/HTTP
+# (4317/4318) and Prometheus (9090).
+.PHONY: dev-port-forward
+dev-port-forward:
+	kubectl --context k3d-kubernetes-mixin port-forward service/lgtm 3000:3000 4317:4317 4318:4318 9090:9090
+
+# Copy the regenerated rule files into the provisioning directory mounted into
+# the cluster and restart the lgtm deployment so it starts with the new rules.
+.PHONY: dev-reload
+dev-reload: generate
+	@cp -v prometheus_alerts.yaml scripts/provisioning/prometheus/ && \
+	cp -v prometheus_rules.yaml scripts/provisioning/prometheus/ && \
+	kubectl --context k3d-kubernetes-mixin rollout restart deployment/lgtm && \
+	echo '╔═══════════════════════════════════════════════════════════════╗' && \
+	echo '║                                                               ║' && \
+	echo '║           🔄 Reloading Alert and Recording Rules...           ║' && \
+	echo '║                                                               ║' && \
+	echo '╚═══════════════════════════════════════════════════════════════╝' && \
+	kubectl --context k3d-kubernetes-mixin rollout status deployment/lgtm
+
+# Delete the k3d cluster created by `make dev`.
+.PHONY: dev-down
+dev-down:
+	k3d cluster delete kubernetes-mixin
+
 .PHONY: generate
 generate: prometheus_alerts.yaml prometheus_rules.yaml $(OUT_DIR)
 
diff --git a/scripts/lgtm.sh b/scripts/lgtm.sh
new file mode 100755
index 000000000..b513ae926
--- /dev/null
+++ b/scripts/lgtm.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+#
+# Bring up the local development stack: a 1-node k3d cluster running
+# grafana/otel-lgtm, an OpenTelemetry collector that scrapes the cluster,
+# kube-state-metrics and node_exporter.
+# Re-runnable: an existing cluster is reused and the helm releases are upgraded.
+
+set -euxo pipefail
+
+# Every path below is relative to this script's directory, so do not depend on
+# the caller's working directory.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+CLUSTER_NAME=kubernetes-mixin
+# k3d registers the kubeconfig context as "k3d-<cluster name>". Target it
+# explicitly so a different current context is never modified by accident.
+KUBE_CONTEXT="k3d-${CLUSTER_NAME}"
+
+# export time in milliseconds
+# export OTEL_METRIC_EXPORT_INTERVAL=500
+
+# use http instead of https (needed because of https://github.com/open-telemetry/opentelemetry-go/issues/4834)
+# export OTEL_EXPORTER_OTLP_INSECURE="true"
+
+# https://github.com/grafana/docker-otel-lgtm/tree/main/examples
+
+# docker run -p 3001:3000 -p 4317:4317 -p 4318:4318 \
+#   -v ./provisioning/dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards \
+#   -v ../dashboards_out:/kubernetes-mixin/dashboards_out \
+#   --rm -ti grafana/otel-lgtm
+
+cp ../prometheus_alerts.yaml provisioning/prometheus/
+cp ../prometheus_rules.yaml provisioning/prometheus/
+
+# set up 1-node k3d cluster; skip creation when it already exists so that
+# `make dev` can be re-run without failing under `set -e`
+if ! k3d cluster list "${CLUSTER_NAME}" >/dev/null 2>&1; then
+    k3d cluster create "${CLUSTER_NAME}" \
+        -v "$PWD"/provisioning:/kubernetes-mixin/provisioning \
+        -v "$PWD"/../dashboards_out:/kubernetes-mixin/dashboards_out
+fi
+
+# run grafana, prometheus
+kubectl --context "${KUBE_CONTEXT}" apply -f lgtm.yaml
+# kubectl port-forward service/lgtm 3000:3000 4317:4317 4318:4318 9090:9090
+
+# register the chart repositories used below and refresh their indexes once
+helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+
+# scrape kube-state-metrics, node_exporter, cAdvisor, kubelet, kube-proxy, kube-apiserver, kube-controller-manager, kube-scheduler... write to prometheus
+helm --kube-context "${KUBE_CONTEXT}" upgrade --install otel-collector-deployment open-telemetry/opentelemetry-collector \
+    -n default \
+    -f otel-collector-deployment.values.yaml
+
+# install kube-state-metrics
+helm --kube-context "${KUBE_CONTEXT}" upgrade --install kube-state-metrics prometheus-community/kube-state-metrics \
+    -n default
+
+# install node_exporter
+helm --kube-context "${KUBE_CONTEXT}" upgrade --install prometheus-node-exporter prometheus-community/prometheus-node-exporter \
+    -n default
+
+# TODO OATs:
+# https://github.com/grafana/oats
+
+# test metrics in prometheus
+# test recording rules in prometheus
+# test alerting rules in prometheus
+# e2e test dashboards?
diff --git a/scripts/lgtm.yaml b/scripts/lgtm.yaml
new file mode 100644
index 000000000..c0e617029
--- /dev/null
+++ b/scripts/lgtm.yaml
@@ -0,0 +1,112 @@
+# Modified to support custom prometheus config and volume mounts
+# https://raw.githubusercontent.com/grafana/docker-otel-lgtm/refs/heads/main/k8s/lgtm.yaml
+# this is intended for demo / testing purposes only, not for production usage
+apiVersion: v1
+kind: Service
+metadata:
+  name: lgtm
+spec:
+  selector:
+    app: lgtm
+  ports:
+    - name: grafana
+      protocol: TCP
+      port: 3000
+      targetPort: 3000
+    - name: otel-grpc
+      protocol: TCP
+      port: 4317
+      targetPort: 4317
+    - name: otel-http
+      protocol: TCP
+      port: 4318
+      targetPort: 4318
+    - name: prometheus
+      protocol: TCP
+      port: 9090
+      targetPort: 9090
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: lgtm
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: lgtm
+  template:
+    metadata:
+      labels:
+        app: lgtm
+    spec:
+      containers:
+        - name: lgtm
+          image: grafana/otel-lgtm:latest
+          ports:
+            - containerPort: 3000
+            - containerPort: 4317
+            - containerPort: 4318
+            - containerPort: 9090
+          readinessProbe:
+            exec:
+              command:
+                - cat
+                - /tmp/ready
+          # Append the mixin's Prometheus config (rule_files) to the image's own
+          # config, then exec the image's run-all.sh.
+          command: ['/bin/sh']
+          args:
+            - -c
+            - |
+              cat /kubernetes-mixin/prometheus/prometheus.yaml >> /otel-lgtm/prometheus.yaml
+              exec /otel-lgtm/run-all.sh
+          # NOTE: By default OpenShift does not allow writing the root directory.
+          # That's why the data dirs for grafana, prometheus and loki cannot be
+          # created and the pod never becomes ready.
+          # See: https://github.com/grafana/docker-otel-lgtm/issues/132
+          volumeMounts:
+            - name: tempo-data
+              mountPath: /data/tempo
+            - name: grafana-data
+              mountPath: /data/grafana
+            - name: loki-data
+              mountPath: /data/loki
+            - name: loki-storage
+              mountPath: /loki
+            - name: p8s-storage
+              mountPath: /data/prometheus
+            - name: pyroscope-storage
+              mountPath: /data/pyroscope
+
+            # kubernetes-mixin additions: Prometheus rule files, Grafana dashboard provider config, generated dashboards
+            - name: prometheus
+              mountPath: /kubernetes-mixin/prometheus
+            - name: dashboards
+              mountPath: /otel-lgtm/grafana/conf/provisioning/dashboards
+            - name: dashboards-out
+              mountPath: /kubernetes-mixin/dashboards_out
+      volumes:
+        - name: tempo-data
+          emptyDir: {}
+        - name: loki-data
+          emptyDir: {}
+        - name: grafana-data
+          emptyDir: {}
+        - name: loki-storage
+          emptyDir: {}
+        - name: p8s-storage
+          emptyDir: {}
+        - name: pyroscope-storage
+          emptyDir: {}
+        # hostPath directories live on the k3d node; scripts/lgtm.sh maps them
+        # from the repository checkout when it creates the cluster.
+        - name: dashboards
+          hostPath:
+            path: /kubernetes-mixin/provisioning/dashboards
+        - name: prometheus
+          hostPath:
+            path: /kubernetes-mixin/provisioning/prometheus
+        - name: dashboards-out
+          hostPath:
+            path: /kubernetes-mixin/dashboards_out
diff --git a/scripts/otel-collector-deployment.values.yaml b/scripts/otel-collector-deployment.values.yaml
new file mode 100644
index 000000000..dbf4ccfc3
--- /dev/null
+++ b/scripts/otel-collector-deployment.values.yaml
@@ -0,0 +1,161 @@
+# Based on the following guide:
+# https://grafana.com/docs/grafana-cloud/monitor-infrastructure/kubernetes-monitoring/configuration/config-other-methods/otel-collector/
+mode: deployment
+
+image:
+  repository: otel/opentelemetry-collector-contrib
+
+clusterRole:
+  create: true
+  rules:
+    - apiGroups:
+        - ''
+      resources:
+        - nodes
+        - nodes/proxy
+        - services
+        - endpoints
+        - pods
+        - events
+        - namespaces
+        - namespaces/status
+        - pods/status
+        - replicationcontrollers
+        - replicationcontrollers/status
+        - resourcequotas
+      verbs:
+        - get
+        - list
+        - watch
+    - nonResourceURLs:
+        - /metrics
+      verbs:
+        - get
+    - apiGroups:
+        - apps
+      resources:
+        - daemonsets
+        - deployments
+        - replicasets
+        - statefulsets
+      verbs:
+        - get
+        - list
+        - watch
+    - apiGroups:
+        - extensions
+      resources:
+        - daemonsets
+        - deployments
+        - replicasets
+      verbs:
+        - get
+        - list
+        - watch
+    - apiGroups:
+        - batch
+      resources:
+        - jobs
+        - cronjobs
+      verbs:
+        - get
+        - list
+        - watch
+    - apiGroups:
+        - autoscaling
+      resources:
+        - horizontalpodautoscalers
+      verbs:
+        - get
+        - list
+        - watch
+
+config:
+  receivers:
+    prometheus:
+      config:
+        scrape_configs:
+          - job_name: cadvisor
+            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+            kubernetes_sd_configs:
+              - role: node
+            relabel_configs:
+              - replacement: kubernetes.default.svc.cluster.local:443
+                target_label: __address__
+              - regex: (.+)
+                replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor
+                source_labels:
+                  - __meta_kubernetes_node_name
+                target_label: __metrics_path__
+            scheme: https
+            tls_config:
+              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+              insecure_skip_verify: false
+              server_name: kubernetes
+
+          - job_name: kubelet
+            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+            kubernetes_sd_configs:
+              - role: node
+            relabel_configs:
+              - replacement: kubernetes.default.svc.cluster.local:443
+                target_label: __address__
+              - regex: (.+)
+                replacement: /api/v1/nodes/$${1}/proxy/metrics
+                source_labels:
+                  - __meta_kubernetes_node_name
+                target_label: __metrics_path__
+            scheme: https
+            tls_config:
+              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+              insecure_skip_verify: false
+              server_name: kubernetes
+
+          - job_name: kube-state-metrics
+            kubernetes_sd_configs:
+              - role: pod
+            relabel_configs:
+              - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+                action: keep
+                regex: kube-state-metrics
+
+          - job_name: node-exporter
+            kubernetes_sd_configs:
+              - role: pod
+            relabel_configs:
+              - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+                action: keep
+                regex: prometheus-node-exporter.*
+              - source_labels: [__meta_kubernetes_pod_node_name]
+                target_label: instance
+              - source_labels: [__meta_kubernetes_namespace]
+                target_label: namespace
+
+  processors:
+    batch: {}
+
+  exporters:
+    prometheusremotewrite:
+      external_labels:
+        cluster: 'kubernetes-mixin'
+      endpoint: 'http://lgtm:9090/api/v1/write'
+
+    prometheus:
+      endpoint: "0.0.0.0:8889"
+      resource_to_telemetry_conversion:
+        enabled: true
+
+  service:
+    extensions: [health_check]
+    pipelines:
+      metrics:
+        receivers: [prometheus]
+        processors: [batch]
+        exporters: [prometheus, prometheusremotewrite]
+
+ports:
+  prometheus:
+    enabled: true
+    containerPort: 8889
+    servicePort: 8889
+    protocol: TCP
diff --git a/scripts/provisioning/dashboards/dashboards.yaml b/scripts/provisioning/dashboards/dashboards.yaml
new file mode 100644
index 000000000..41f04ed28
--- /dev/null
+++ b/scripts/provisioning/dashboards/dashboards.yaml
@@ -0,0 +1,9 @@
+apiVersion: 1
+
+providers:
+  - name: dashboards
+    type: file
+    updateIntervalSeconds: 10
+    options:
+      path: /kubernetes-mixin/dashboards_out
+      foldersFromFilesStructure: true
diff --git a/scripts/provisioning/prometheus/.gitignore b/scripts/provisioning/prometheus/.gitignore
new file mode 100644
index 000000000..1a7b57d50
--- /dev/null
+++ b/scripts/provisioning/prometheus/.gitignore
@@ -0,0 +1,2 @@
+prometheus_alerts.yaml
+prometheus_rules.yaml
diff --git a/scripts/provisioning/prometheus/prometheus.yaml b/scripts/provisioning/prometheus/prometheus.yaml
new file mode 100644
index 000000000..61924b1ed
--- /dev/null
+++ b/scripts/provisioning/prometheus/prometheus.yaml
@@ -0,0 +1,3 @@
+rule_files:
+  - "/kubernetes-mixin/prometheus/prometheus_rules.yaml"
+  - "/kubernetes-mixin/prometheus/prometheus_alerts.yaml"