From 867b52af7dd98fab39256b346fcfeda56508fc3c Mon Sep 17 00:00:00 2001
From: Stephen Lang
Date: Thu, 3 Jul 2025 19:24:43 +0100
Subject: [PATCH] wip: local k3d dev env (cAdvisor+kubelet)

---
 scripts/lgtm.sh                               |  51 ++++++
 scripts/lgtm.yaml                             |  96 ++++++++++++
 scripts/otel-collector-deployment.values.yaml | 145 ++++++++++++++++++
 .../provisioning/dashboards/dashboards.yaml   |  10 ++
 4 files changed, 302 insertions(+)
 create mode 100755 scripts/lgtm.sh
 create mode 100644 scripts/lgtm.yaml
 create mode 100644 scripts/otel-collector-deployment.values.yaml
 create mode 100644 scripts/provisioning/dashboards/dashboards.yaml

diff --git a/scripts/lgtm.sh b/scripts/lgtm.sh
new file mode 100755
index 000000000..69d934efa
--- /dev/null
+++ b/scripts/lgtm.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Local dev environment: a 1-node k3d cluster running grafana/otel-lgtm plus an
+# OpenTelemetry Collector that scrapes cAdvisor and kubelet metrics.
+
+set -euxo pipefail
+
+# Resolve paths relative to this script so it can be run from any directory.
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+repo_root="$(dirname "$script_dir")"
+
+# export time in milliseconds
+# export OTEL_METRIC_EXPORT_INTERVAL=500
+
+# use http instead of https (needed because of https://github.com/open-telemetry/opentelemetry-go/issues/4834)
+# export OTEL_EXPORTER_OTLP_INSECURE="true"
+
+# https://github.com/grafana/docker-otel-lgtm/tree/main/examples
+
+# docker run -p 3001:3000 -p 4317:4317 -p 4318:4318 \
+#   -v ./provisioning/dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards \
+#   -v ../dashboards_out:/kubernetes-mixin/dashboards_out \
+#   --rm -ti grafana/otel-lgtm
+
+# set up 1-node k3d cluster; these mounts back the hostPath volumes in lgtm.yaml
+k3d cluster create kubernetes-mixin \
+  -v "$script_dir/provisioning:/kubernetes-mixin/provisioning" \
+  -v "$repo_root/dashboards_out:/kubernetes-mixin/dashboards_out"
+
+# run grafana, prometheus
+# install dashboards in grafana
+# wget https://raw.githubusercontent.com/grafana/docker-otel-lgtm/refs/heads/main/k8s/lgtm.yaml
+kubectl apply -f "$script_dir/lgtm.yaml"
+# kubectl port-forward service/lgtm 3001:3000 4317:4317 4318:4318
+
+# scrape kube-state-metrics, node_exporter, cAdvisor, kubelet, kube-proxy, kube-apiserver, kube-controller-manager, kube-scheduler... write to prometheus
+helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
+helm repo update
+helm upgrade --install otel-collector-deployment open-telemetry/opentelemetry-collector \
+  -n default \
+  -f "$script_dir/otel-collector-deployment.values.yaml"
+
+# TODO install kube-state-metrics, node_exporter
+
+# TODO OATs:
+# https://github.com/grafana/oats
+# test metrics in prometheus
+# test recording rules in prometheus
+# test alerting rules in prometheus
+
+# TODO: e2e test dashboards?
diff --git a/scripts/lgtm.yaml b/scripts/lgtm.yaml
new file mode 100644
index 000000000..97d8becf4
--- /dev/null
+++ b/scripts/lgtm.yaml
@@ -0,0 +1,96 @@
+# this is intended for demo / testing purposes only, not for production usage
+apiVersion: v1
+kind: Service
+metadata:
+  name: lgtm
+spec:
+  selector:
+    app: lgtm
+  ports:
+    - name: grafana
+      protocol: TCP
+      port: 3000
+      targetPort: 3000
+    - name: otel-grpc
+      protocol: TCP
+      port: 4317
+      targetPort: 4317
+    - name: otel-http
+      protocol: TCP
+      port: 4318
+      targetPort: 4318
+    - name: prometheus
+      protocol: TCP
+      port: 9090
+      targetPort: 9090
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: lgtm
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: lgtm
+  template:
+    metadata:
+      labels:
+        app: lgtm
+    spec:
+      containers:
+        - name: lgtm
+          image: grafana/otel-lgtm:latest
+          ports:
+            - containerPort: 3000
+            - containerPort: 4317
+            - containerPort: 4318
+            - containerPort: 9090
+          readinessProbe:
+            exec:
+              command:
+                - cat
+                - /tmp/ready
+          # NOTE: By default OpenShift does not allow writing the root directory.
+          # That's why the data dirs for grafana, prometheus and loki cannot be
+          # created and the pod never becomes ready.
+          # See: https://github.com/grafana/docker-otel-lgtm/issues/132
+          volumeMounts:
+            - name: tempo-data
+              mountPath: /data/tempo
+            - name: grafana-data
+              mountPath: /data/grafana
+            - name: loki-data
+              mountPath: /data/loki
+            - name: loki-storage
+              mountPath: /loki
+            - name: p8s-storage
+              mountPath: /data/prometheus
+            - name: pyroscope-storage
+              mountPath: /data/pyroscope
+
+            # dashboard provisioning config and the generated dashboards it points at
+            - name: dashboards
+              mountPath: /otel-lgtm/grafana/conf/provisioning/dashboards
+            - name: dashboards-out
+              mountPath: /kubernetes-mixin/dashboards_out
+      volumes:
+        - name: tempo-data
+          emptyDir: {}
+        - name: loki-data
+          emptyDir: {}
+        - name: grafana-data
+          emptyDir: {}
+        - name: loki-storage
+          emptyDir: {}
+        - name: p8s-storage
+          emptyDir: {}
+        - name: pyroscope-storage
+          emptyDir: {}
+        # these hostPath directories are mounted into the k3d node by lgtm.sh
+        - name: dashboards
+          hostPath:
+            path: /kubernetes-mixin/provisioning/dashboards
+        - name: dashboards-out
+          hostPath:
+            path: /kubernetes-mixin/dashboards_out
diff --git a/scripts/otel-collector-deployment.values.yaml b/scripts/otel-collector-deployment.values.yaml
new file mode 100644
index 000000000..f9f699eab
--- /dev/null
+++ b/scripts/otel-collector-deployment.values.yaml
@@ -0,0 +1,145 @@
+# Based on the following guide:
+# https://grafana.com/docs/grafana-cloud/monitor-infrastructure/kubernetes-monitoring/configuration/config-other-methods/otel-collector/
+mode: deployment
+
+image:
+  repository: otel/opentelemetry-collector-contrib
+
+clusterRole:
+  create: true
+  rules:
+    - apiGroups:
+        - ''
+      resources:
+        - nodes
+        - nodes/proxy
+        - services
+        - endpoints
+        - pods
+        - events
+        - namespaces
+        - namespaces/status
+        - pods/status
+        - replicationcontrollers
+        - replicationcontrollers/status
+        - resourcequotas
+      verbs:
+        - get
+        - list
+        - watch
+    - nonResourceURLs:
+        - /metrics
+      verbs:
+        - get
+    - apiGroups:
+        - apps
+      resources:
+        - daemonsets
+        - deployments
+        - replicasets
+        - statefulsets
+      verbs:
+        - get
+        - list
+        - watch
+    - apiGroups:
+        - extensions
+      resources:
+        - daemonsets
+        - deployments
+        - replicasets
+      verbs:
+        - get
+        - list
+        - watch
+    - apiGroups:
+        - batch
+      resources:
+        - jobs
+        - cronjobs
+      verbs:
+        - get
+        - list
+        - watch
+    - apiGroups:
+        - autoscaling
+      resources:
+        - horizontalpodautoscalers
+      verbs:
+        - get
+        - list
+        - watch
+
+config:
+  receivers:
+    prometheus:
+      config:
+        scrape_configs:
+          # both jobs reach each node's metrics through the API server's node proxy
+          - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+            job_name: cadvisor
+            kubernetes_sd_configs:
+              - role: node
+            relabel_configs:
+              - replacement: kubernetes.default.svc.cluster.local:443
+                target_label: __address__
+              - regex: (.+)
+                # `$$` is the collector's escape for a literal `$`, leaving `${1}` for relabeling
+                replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor
+                source_labels:
+                  - __meta_kubernetes_node_name
+                target_label: __metrics_path__
+            scheme: https
+            tls_config:
+              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+              insecure_skip_verify: false
+              server_name: kubernetes
+
+          - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+            job_name: kubelet
+            kubernetes_sd_configs:
+              - role: node
+            relabel_configs:
+              - replacement: kubernetes.default.svc.cluster.local:443
+                target_label: __address__
+              - regex: (.+)
+                replacement: /api/v1/nodes/$${1}/proxy/metrics
+                source_labels:
+                  - __meta_kubernetes_node_name
+                target_label: __metrics_path__
+            scheme: https
+            tls_config:
+              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+              insecure_skip_verify: false
+              server_name: kubernetes
+
+  processors:
+    batch: {}
+
+  exporters:
+    # push to the `prometheus` port (9090) of the lgtm Service defined in lgtm.yaml
+    prometheusremotewrite:
+      external_labels:
+        cluster: 'kubernetes-mixin'
+      endpoint: 'http://lgtm:9090/api/v1/write'
+
+    # also serve the scraped metrics on 8889, published via `ports.prometheus` below
+    prometheus:
+      endpoint: '0.0.0.0:8889'
+      resource_to_telemetry_conversion:
+        enabled: true
+
+  service:
+    extensions: [health_check]
+    pipelines:
+      metrics:
+        receivers: [prometheus]
+        processors: [batch]
+        exporters: [prometheus, prometheusremotewrite]
+
+ports:
+  prometheus:
+    enabled: true
+    containerPort: 8889
+    servicePort: 8889
+    protocol: TCP
diff --git a/scripts/provisioning/dashboards/dashboards.yaml b/scripts/provisioning/dashboards/dashboards.yaml
new file mode 100644
index 000000000..41f04ed28
--- /dev/null
+++ b/scripts/provisioning/dashboards/dashboards.yaml
@@ -0,0 +1,10 @@
+# Grafana dashboard provider: loads the dashboards mounted at /kubernetes-mixin/dashboards_out
+apiVersion: 1
+
+providers:
+  - name: dashboards
+    type: file
+    updateIntervalSeconds: 10
+    options:
+      path: /kubernetes-mixin/dashboards_out
+      foldersFromFilesStructure: true