Skip to content
Merged

fixes #6486

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 158 additions & 3 deletions k8s-production/observability-stack/alloy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ rules:
resources:
- nodes
- nodes/proxy
- nodes/metrics
- services
- endpoints
- pods
Expand All @@ -21,6 +22,11 @@ rules:
resources:
- configmaps
verbs: ["get"]
- apiGroups: ["metrics.k8s.io"]
resources:
- nodes
- pods
verbs: ["get", "list"]
- apiGroups: ["apps"]
resources:
- daemonsets
Expand Down Expand Up @@ -152,7 +158,7 @@ data:
values = {
traceID = "trace_id",
spanID = "span_id",
level = null,
level = "level",
}
}

Expand Down Expand Up @@ -192,7 +198,7 @@ data:
send = false
}

send_exemplars = true
send_exemplars = false
send_native_histograms = false
}

Expand Down Expand Up @@ -336,6 +342,151 @@ data:
job_name = "alloy-internal"
scrape_interval = "30s"
}

// ===== Kubernetes Infrastructure Monitoring =====

// Discover Kubernetes Service objects, restricted to the
// "countly-observability" namespace. Filtering down to the
// kube-state-metrics service happens in a later relabel stage.
discovery.kubernetes "kube_state_metrics" {
  namespaces {
    names = ["countly-observability"]
  }

  role = "service"
}

// Narrow the discovered services down to the kube-state-metrics
// "http-metrics" port and pin the scrape address to port 8080.
discovery.relabel "kube_state_metrics" {
  targets = discovery.kubernetes.kube_state_metrics.targets

  // Keep only targets whose service name is exactly "kube-state-metrics".
  rule {
    action        = "keep"
    source_labels = ["__meta_kubernetes_service_name"]
    regex         = "kube-state-metrics"
  }

  // Keep only the service port named "http-metrics".
  rule {
    action        = "keep"
    source_labels = ["__meta_kubernetes_service_port_name"]
    regex         = "http-metrics"
  }

  // Rewrite the address: capture the host part (everything before the
  // first ':'), discard any existing port, and append ":8080".
  rule {
    source_labels = ["__address__"]
    regex         = "([^:]+)(?::\\d+)?"
    replacement   = "${1}:8080"
    target_label  = "__address__"
  }
}

// Scrape the relabelled kube-state-metrics targets every 30s and send
// the samples to the default remote_write component.
prometheus.scrape "kube_state_metrics" {
  job_name   = "kube-state-metrics"
  targets    = discovery.relabel.kube_state_metrics.output
  forward_to = [prometheus.remote_write.default.receiver]

  scrape_interval = "30s"
  scrape_timeout  = "10s"

  // Labels present in the scraped data win over target labels on conflict.
  honor_labels = true
}

// Discover pods in the "countly-observability" namespace. The
// node-exporter pods are selected from this set in a later relabel stage.
discovery.kubernetes "node_exporter" {
  namespaces {
    names = ["countly-observability"]
  }

  role = "pod"
}

// Select node-exporter pods, tag each target with its node name, and
// point the scrape address at the host IP on port 9100.
discovery.relabel "node_exporter" {
  targets = discovery.kubernetes.node_exporter.targets

  // Keep only pods carrying the label app=node-exporter.
  rule {
    action        = "keep"
    source_labels = ["__meta_kubernetes_pod_label_app"]
    regex         = "node-exporter"
  }

  // Keep only the container port named "http-metrics".
  rule {
    action        = "keep"
    source_labels = ["__meta_kubernetes_pod_container_port_name"]
    regex         = "http-metrics"
  }

  // Copy the name of the node the pod runs on into a "node" label.
  rule {
    target_label  = "node"
    source_labels = ["__meta_kubernetes_pod_node_name"]
  }

  // Scrape via the host IP rather than the pod address, on port 9100.
  rule {
    source_labels = ["__meta_kubernetes_pod_host_ip"]
    replacement   = "${1}:9100"
    target_label  = "__address__"
  }
}

// Scrape node-exporter targets every 30s. Samples are routed through
// prometheus.relabel.node_exporter (below) before being remote-written.
//
// Alloy's prometheus.scrape has no `metric_relabel_configs` argument
// (that is Prometheus YAML syntax); metric-level relabeling must be done
// in a separate prometheus.relabel component, otherwise the configuration
// fails to load with an unknown-attribute error.
prometheus.scrape "node_exporter" {
  targets         = discovery.relabel.node_exporter.output
  forward_to      = [prometheus.relabel.node_exporter.receiver]
  job_name        = "node-exporter"
  scrape_interval = "30s"
  scrape_timeout  = "29s"
}

// Limit metrics to reduce timeout issues:
// drop detailed filesystem metrics for docker/k8s internal mounts.
prometheus.relabel "node_exporter" {
  forward_to = [prometheus.remote_write.default.receiver]

  // Source label values are joined with ";" before matching, so each
  // regex reads "<metric name>;<mountpoint>".
  rule {
    source_labels = ["__name__", "mountpoint"]
    regex         = "node_filesystem.*;/var/lib/docker/.*"
    action        = "drop"
  }

  rule {
    source_labels = ["__name__", "mountpoint"]
    regex         = "node_filesystem.*;/var/lib/kubelet/.*"
    action        = "drop"
  }
}

// Discover every cluster node; the resulting targets are relabelled
// below to reach the kubelet's cAdvisor endpoint.
// NOTE(review): no node selector is applied here. If Alloy runs as one
// instance per node without clustering, each instance would scrape all
// nodes and produce duplicate series — confirm against the deployment.
discovery.kubernetes "kubelet" {
  role = "node"
}

// Turn node targets into kubelet cAdvisor scrape targets:
// https://<node address>:10250/metrics/cadvisor, labelled with the node name.
discovery.relabel "kubelet_cadvisor" {
  targets = discovery.kubernetes.kubelet.targets

  // Keep the host part of the address and force port 10250.
  rule {
    source_labels = ["__address__"]
    target_label  = "__address__"
    regex         = "([^:]+):?(\\d+)?"
    replacement   = "${1}:10250"
  }

  // Expose the Kubernetes node name as a "node" label.
  rule {
    target_label  = "node"
    source_labels = ["__meta_kubernetes_node_name"]
  }

  // Scrape the cAdvisor path instead of the default metrics path.
  rule {
    replacement  = "/metrics/cadvisor"
    target_label = "__metrics_path__"
  }

  // Scrape over HTTPS.
  rule {
    replacement  = "https"
    target_label = "__scheme__"
  }
}

// Scrape cAdvisor metrics from the relabelled kubelet targets every 30s
// and send them to the default remote_write component.
prometheus.scrape "kubelet_cadvisor" {
  job_name   = "kubelet-cadvisor"
  targets    = discovery.relabel.kubelet_cadvisor.output
  forward_to = [prometheus.remote_write.default.receiver]

  scrape_interval = "30s"
  scrape_timeout  = "10s"

  // Authenticate using the token read from the service account token file.
  bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"

  // Server certificate verification is disabled for this scrape.
  // NOTE(review): if the kubelet serving certificates are signed by a CA
  // available to the pod, a ca_file would be safer — confirm.
  tls_config {
    insecure_skip_verify = true
  }
}

//////////////////////////////////////////////////////////////
// === Profiles pipeline (Pyroscope Push API) ================
Expand Down Expand Up @@ -455,6 +606,8 @@ spec:
- name: varlibdockercontainers
mountPath: /var/lib/docker/containers
readOnly: true
- name: alloy-storage
mountPath: /tmp/alloy
resources:
requests:
cpu: 200m
Expand Down Expand Up @@ -483,4 +636,6 @@ spec:
path: /var/log
- name: varlibdockercontainers
hostPath:
path: /var/lib/docker/containers
path: /var/lib/docker/containers
- name: alloy-storage
emptyDir: {}
Original file line number Diff line number Diff line change
Expand Up @@ -124,18 +124,9 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(countly_http_server_duration_milliseconds_count{http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[$__rate_interval])) by (service_name)",
"expr": "sum(rate(countly_http_server_duration_milliseconds_count{service_name=~\"$service_name\",http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[${__rate_interval}])) by (service_name)",
"legendFormat": "{{service_name}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"countly-frontend\"}[$__rate_interval]))",
"legendFormat": "countly-frontend",
"refId": "B"
}
],
"title": "🔄 Request Rate (req/s)",
Expand Down Expand Up @@ -226,18 +217,9 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.95, sum(rate(countly_http_server_duration_milliseconds_bucket{http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[$__rate_interval])) by (service_name, le))",
"expr": "histogram_quantile(0.95, sum(rate(countly_http_server_duration_milliseconds_bucket{service_name=~\"$service_name\",http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[${__rate_interval}])) by (service_name, le))",
"legendFormat": "{{service_name}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"countly-frontend\"}[$__rate_interval])) by (le)) * 1000",
"legendFormat": "countly-frontend",
"refId": "B"
}
],
"title": "⏱️ Response Time P95 (ms)",
Expand Down Expand Up @@ -330,18 +312,9 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(countly_http_server_duration_milliseconds_count{http_status_code=~\"5..\",http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[$__rate_interval])) by (service_name) / sum(rate(countly_http_server_duration_milliseconds_count{http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[$__rate_interval])) by (service_name)",
"expr": "sum(rate(countly_http_server_duration_milliseconds_count{service_name=~\"$service_name\",http_status_code=~\"5..\",http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[${__rate_interval}])) by (service_name) / sum(rate(countly_http_server_duration_milliseconds_count{service_name=~\"$service_name\",http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[${__rate_interval}])) by (service_name)",
"legendFormat": "{{service_name}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"countly-frontend\",http_response_status_code=~\"5..\"}[$__rate_interval])) / sum(rate(http_server_request_duration_seconds_count{job=\"countly-frontend\"}[$__rate_interval]))",
"legendFormat": "countly-frontend",
"refId": "B"
}
],
"title": "💥 Error Rate (5xx)",
Expand Down Expand Up @@ -432,7 +405,7 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "increase(process_memory_usage_bytes[30s]) / 6",
"expr": "process_memory_usage_bytes{service_name=~\"$service_name\"} / 1024 / 1024 / 1024",
"legendFormat": "{{service_name}}",
"refId": "A"
}
Expand Down Expand Up @@ -606,7 +579,7 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "up{job=~\"countly.*\"}",
"expr": "count by (service_name) (countly_http_server_duration_milliseconds_count{service_name=~\"$service_name\"}) > 0",
"format": "table",
"instant": true,
"legendFormat": "",
Expand All @@ -617,7 +590,7 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "increase(process_memory_usage_bytes[30s]) / 6",
"expr": "process_memory_usage_bytes{service_name=~\"$service_name\"} / 1024 / 1024 / 1024",
"format": "table",
"instant": true,
"legendFormat": "",
Expand All @@ -628,7 +601,7 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(countly_http_server_duration_milliseconds_count{http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[$__rate_interval])) by (service_name)",
"expr": "sum(rate(countly_http_server_duration_milliseconds_count{service_name=~\"$service_name\",http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[${__rate_interval}])) by (service_name)",
"format": "table",
"instant": true,
"legendFormat": "",
Expand All @@ -639,7 +612,7 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(countly_http_server_duration_milliseconds_count{http_status_code=~\"5..\",http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[$__rate_interval])) by (service_name) / sum(rate(countly_http_server_duration_milliseconds_count{http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[$__rate_interval])) by (service_name)",
"expr": "sum(rate(countly_http_server_duration_milliseconds_count{service_name=~\"$service_name\",http_status_code=~\"5..\",http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[${__rate_interval}])) by (service_name) / sum(rate(countly_http_server_duration_milliseconds_count{service_name=~\"$service_name\",http_route!~\".*/(v1|v2)/(traces|metrics|logs).*\"}[${__rate_interval}])) by (service_name)",
"format": "table",
"instant": true,
"legendFormat": "",
Expand Down Expand Up @@ -863,7 +836,7 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(otelcol_receiver_accepted_spans_total[$__rate_interval])",
"expr": "rate(otelcol_receiver_accepted_spans_total[${__rate_interval}])",
"legendFormat": "Spans/sec",
"refId": "A"
},
Expand All @@ -872,7 +845,7 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(otelcol_receiver_accepted_metric_points_total[$__rate_interval])",
"expr": "rate(otelcol_receiver_accepted_metric_points_total[${__rate_interval}])",
"legendFormat": "Metrics/sec",
"refId": "B"
}
Expand Down Expand Up @@ -957,7 +930,7 @@
"type": "loki",
"uid": "loki"
},
"expr": "sum(rate({namespace=\"countly\"}[$__rate_interval])) by (app)",
"expr": "sum(rate({namespace=\"countly\"}[${__rate_interval}])) by (app)",
"legendFormat": "{{app}}",
"refId": "A"
}
Expand Down Expand Up @@ -1042,7 +1015,7 @@
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(pyroscope_receive_http_request_duration_seconds_count[$__rate_interval])",
"expr": "rate(pyroscope_receive_http_request_duration_seconds_count[${__rate_interval}])",
"legendFormat": "Profiles/sec",
"refId": "A"
}
Expand Down Expand Up @@ -1143,19 +1116,19 @@
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(up{job=~\"countly.*\"}, job)",
"definition": "label_values(countly_http_server_duration_milliseconds_count, service_name)",
"hide": 0,
"includeAll": true,
"label": "Service",
"multi": true,
"name": "service",
"name": "service_name",
"options": [],
"query": {
"query": "label_values(up{job=~\"countly.*\"}, job)",
"query": "label_values(countly_http_server_duration_milliseconds_count, service_name)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"regex": "/^countly-.*/",
"skipUrlSync": false,
"sort": 1,
"type": "query"
Expand Down
Loading
Loading