diff --git a/charts/sourcegraph/examples/prometheus/README.md b/charts/sourcegraph/examples/prometheus/README.md
new file mode 100644
index 00000000..e95fca83
--- /dev/null
+++ b/charts/sourcegraph/examples/prometheus/README.md
@@ -0,0 +1,32 @@
+# Prometheus ConfigMap Override
+
+## Why
+
+- Some self-hosted customers run their instances on non-standard Kubernetes clusters, such as k3s, which expose metrics under different names or labels
+- Our Grafana dashboards expect metrics to be on our Prometheus container with specific names
+- With the default ConfigMap, the Grafana graphs do not show some metrics, although they may exist in Prometheus
+- Use this ConfigMap to rename the k3s metrics to match our Grafana dashboard queries
+
+## How to Use
+
+- Apply the override ConfigMap via `kubectl apply -f prometheus-override-k3s.ConfigMap.yaml`
+- Add the new ConfigMap's name to your Helm values override file, for example:
+```yaml
+prometheus:
+  existingConfig: prometheus-override-k3s
+```
+- Re-apply your Helm values override file, which may restart the Prometheus pod, but should not restart other services
+
+## Notes
+
+- Copied from https://github.com/sourcegraph/deploy/blob/main/install/prometheus-override.ConfigMap.yaml
+- If this situation (matching symptoms and root cause) is found with other types of Kubernetes clusters, new Prometheus override ConfigMaps could be created
+
+## Troubleshooting Empty Grafana Dashboards
+
+- There are a handful of steps in the metrics pipeline where data could be getting lost:
+  - Are the cAdvisor, node-exporter, Prometheus, and Grafana containers all running, and healthy?
+  - Are any of these pods reporting any issues in their Kubernetes events, or container logs?
+  - Is network connectivity open from Prometheus to each of the cAdvisor / node-exporter containers?
+  - Is network connectivity open from Grafana to Prometheus?
+  - Does Prometheus have access to Kubernetes RBAC roles to use Service Discovery to find the IP addresses of cAdvisor and node-exporter pods?
\ No newline at end of file
diff --git a/charts/sourcegraph/examples/prometheus/prometheus-override-k3s.ConfigMap.yaml b/charts/sourcegraph/examples/prometheus/prometheus-override-k3s.ConfigMap.yaml
new file mode 100644
index 00000000..5343dee1
--- /dev/null
+++ b/charts/sourcegraph/examples/prometheus/prometheus-override-k3s.ConfigMap.yaml
@@ -0,0 +1,211 @@
+# Prometheus configuration override for Sourcegraph on k3s.
+# Select it from Helm values with prometheus.existingConfig: prometheus-override-k3s.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  labels:
+    deploy: sourcegraph
+  name: prometheus-override-k3s
+data:
+  # Rule files are provided empty.
+  node_rules.yml: ''
+  extra_rules.yml: ''
+  # Main Prometheus configuration.
+  prometheus.yml: |
+    global:
+      scrape_interval: 30s
+      evaluation_interval: 30s
+
+    # Alerts go to the Alertmanager at 127.0.0.1:9093 under the /alertmanager prefix.
+    alerting:
+      alertmanagers:
+        - static_configs:
+            - targets: ['127.0.0.1:9093']
+          path_prefix: /alertmanager
+
+    rule_files:
+      - '*_rules.yml'
+      - '/sg_config_prometheus/*_rules.yml'
+      - '/sg_prometheus_add_ons/*_rules.yml'
+
+    scrape_configs:
+      # Keep only the https port of the kubernetes service in the default namespace.
+      - job_name: 'kubernetes-apiservers'
+        kubernetes_sd_configs:
+          - role: endpoints
+        scheme: https
+        tls_config:
+          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+        relabel_configs:
+          - source_labels:
+              - __meta_kubernetes_namespace
+              - __meta_kubernetes_service_name
+              - __meta_kubernetes_endpoint_port_name
+            action: keep
+            regex: default;kubernetes;https
+
+      # Node metrics, requested from kubernetes.default.svc:443 for every discovered node.
+      - job_name: 'kubernetes-nodes'
+        scheme: https
+        tls_config:
+          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+          insecure_skip_verify: true
+        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+        kubernetes_sd_configs:
+          - role: node
+        relabel_configs:
+          - action: labelmap
+            regex: __meta_kubernetes_node_label_(.+)
+          - target_label: __address__
+            replacement: kubernetes.default.svc:443
+
+          # ------------------------------------------------------------------
+          # k3s and cAdvisor-specific customization
+          # Name container metrics after their container name labels.
+          # Note that 'io.kubernetes.container.name' and 'io.kubernetes.pod.name'
+          # must be provided in cAdvisor.
+          # ------------------------------------------------------------------
+          # Point the metrics path at the node's proxied cAdvisor endpoint.
+          - source_labels: [__meta_kubernetes_node_name]
+            regex: (.+)
+            target_label: __metrics_path__
+            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+        metric_relabel_configs:
+          # name = "<container>-<pod>"
+          - source_labels: [container, pod]
+            regex: (.+)
+            action: replace
+            target_label: name
+            separator: '-'
+          # Copy container into container_label_io_kubernetes_container_name.
+          - source_labels: [container]
+            regex: (.+)
+            action: replace
+            target_label: container_label_io_kubernetes_container_name
+          # Copy pod into container_label_io_kubernetes_pod_name.
+          - source_labels: [pod]
+            regex: (.+)
+            action: replace
+            target_label: container_label_io_kubernetes_pod_name
+          # ------------------------------------------------------------------
+
+      # Endpoints of services that carry the Sourcegraph scrape annotation.
+      - job_name: 'kubernetes-service-endpoints'
+        kubernetes_sd_configs:
+          - role: endpoints
+        relabel_configs:
+          # Quoted so YAML reads the pattern as the string "true", not a boolean.
+          - source_labels: [__meta_kubernetes_service_annotation_sourcegraph_prometheus_scrape]
+            action: keep
+            regex: 'true'
+          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+            action: replace
+            target_label: __scheme__
+            regex: (https?)
+          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+            action: replace
+            target_label: __metrics_path__
+            regex: (.+)
+          # Swap the discovered port for the one given in the port annotation.
+          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+            action: replace
+            target_label: __address__
+            regex: (.+)(?::\d+);(\d+)
+            replacement: $1:$2
+          - action: labelmap
+            regex: __meta_kubernetes_service_label_(.+)
+          - source_labels: [__meta_kubernetes_namespace]
+            action: replace
+            target_label: ns
+          - source_labels: [__meta_kubernetes_service_name]
+            action: replace
+            target_label: kubernetes_name
+          # Sourcegraph specific customization. We want a nicer name for job
+          - source_labels: [app]
+            action: replace
+            target_label: job
+          # Sourcegraph specific customization. We want a nicer name for instance
+          - source_labels: [__meta_kubernetes_pod_name]
+            action: replace
+            target_label: instance
+
+      # Probe annotated services through the blackbox address (module http_2xx).
+      - job_name: 'kubernetes-services'
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        kubernetes_sd_configs:
+          - role: service
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
+            action: keep
+            regex: 'true'
+          - source_labels: [__address__]
+            target_label: __param_target
+          - target_label: __address__
+            replacement: blackbox
+          - source_labels: [__param_target]
+            target_label: instance
+          - action: labelmap
+            regex: __meta_kubernetes_service_label_(.+)
+          - source_labels: [__meta_kubernetes_service_namespace]
+            target_label: ns
+          - source_labels: [__meta_kubernetes_service_name]
+            target_label: kubernetes_name
+
+      - job_name: 'kubernetes-pods'
+        kubernetes_sd_configs:
+          - role: pod
+        relabel_configs:
+          # Sourcegraph specific customization, only scrape pods with our annotation
+          - source_labels: [__meta_kubernetes_pod_annotation_sourcegraph_prometheus_scrape]
+            action: keep
+            regex: 'true'
+          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
+            action: replace
+            target_label: __metrics_path__
+            regex: (.+)
+          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
+            action: replace
+            regex: (.+):(?:\d+);(\d+)
+            replacement: ${1}:${2}
+            target_label: __address__
+          - action: labelmap
+            regex: __meta_kubernetes_pod_label_(.+)
+          - source_labels: [__meta_kubernetes_pod_name]
+            action: replace
+            target_label: kubernetes_pod_name
+
+          # ------------------------------------------------------------------
+          # k3s and cAdvisor-specific customization
+          # ------------------------------------------------------------------
+          # NOTE(review): other jobs fill ns from a __meta_kubernetes_* namespace
+          # label; this rule reads a plain namespace label instead. Confirm intent.
+          - source_labels: [namespace]
+            action: replace
+            target_label: ns
+        metric_relabel_configs:
+          # NOTE(review): the hostname sourcegraph-0 and namespace default are
+          # hard-coded; with action keep, samples that do not match are dropped.
+          # Adjust both values to your cluster before applying.
+          - source_labels: [kubernetes_io_hostname]
+            regex: sourcegraph-0
+            action: keep
+          - source_labels: [namespace]
+            regex: default
+            action: keep
+          # ------------------------------------------------------------------
+
+      # Scrape prometheus itself for metrics.
+      - job_name: 'builtin-prometheus'
+        static_configs:
+          - targets: ['127.0.0.1:9092']
+            labels:
+              app: prometheus
+      - job_name: 'builtin-alertmanager'
+        metrics_path: /alertmanager/metrics
+        static_configs:
+          - targets: ['127.0.0.1:9093']
+            labels:
+              app: alertmanager