From b283e11e679b646f2e979c44d907509760d3d577 Mon Sep 17 00:00:00 2001
From: Alexander Soelberg Heidarsson <89837986+alex5517@users.noreply.github.com>
Date: Mon, 29 Sep 2025 11:44:26 +0200
Subject: [PATCH] feat: Add new alerts and enhance configuration

---
 .../alerts/alerts.libsonnet              | 136 +++++++++++++++++-
 .../config.libsonnet                     |  24 +++-
 .../dashboards.jsonnet                   |   6 +-
 .../dashboards/collector.libsonnet       |   7 +-
 .../dashboards/utils/variables.libsonnet |   2 +-
 5 files changed, 159 insertions(+), 16 deletions(-)

diff --git a/opentelemetry-collector-mixin/alerts/alerts.libsonnet b/opentelemetry-collector-mixin/alerts/alerts.libsonnet
index 90a897be2..5e4af76bd 100644
--- a/opentelemetry-collector-mixin/alerts/alerts.libsonnet
+++ b/opentelemetry-collector-mixin/alerts/alerts.libsonnet
@@ -5,17 +5,143 @@
         name: 'otelcol',
         rules: [
           {
-            alert: 'OtelcolSendingQueueFull',
+            alert: 'ReceiverDroppedSpans',
             expr: |||
-              otelcol_exporter_queue_size >= otelcol_exporter_queue_capacity
+              rate(otelcol_receiver_refused_spans_total[5m]) > 0
             |||,
-            'for': '30m',
+            'for': '2m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              description: 'The {{ $labels.receiver }} receiver is dropping spans at a rate of {{ humanize $value }} per second.',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures',
+            },
+          },
+          {
+            alert: 'ReceiverDroppedMetrics',
+            expr: |||
+              rate(otelcol_receiver_refused_metric_points_total[5m]) > 0
+            |||,
+            'for': '2m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              description: 'The {{ $labels.receiver }} receiver is dropping metrics at a rate of {{ humanize $value }} per second.',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures',
+            },
+          },
+          {
+            alert: 'ReceiverDroppedLogs',
+            expr: |||
+              rate(otelcol_receiver_refused_log_records_total[5m]) > 0
+            |||,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              description: 'The {{ $labels.receiver }} receiver is dropping logs at a rate of {{ humanize $value }} per second.',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures',
+            },
+          },
+          {
+            alert: 'ExporterDroppedSpans',
+            expr: |||
+              rate(otelcol_exporter_send_failed_spans_total[5m]) > 0
+            |||,
+            'for': '2m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              description: 'The {{ $labels.exporter }} exporter is dropping spans at a rate of {{ humanize $value }} per second.',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures',
+            },
+          },
+          {
+            alert: 'ExporterDroppedMetrics',
+            expr: |||
+              rate(otelcol_exporter_send_failed_metric_points_total[5m]) > 0
+            |||,
+            'for': '2m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              description: 'The {{ $labels.exporter }} exporter is dropping metrics at a rate of {{ humanize $value }} per second.',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures',
+            },
+          },
+          {
+            alert: 'ExporterDroppedLogs',
+            expr: |||
+              rate(otelcol_exporter_send_failed_log_records_total[5m]) > 0
+            |||,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              description: 'The {{ $labels.exporter }} exporter is dropping logs at a rate of {{ humanize $value }} per second.',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures',
+            },
+          },
+          {
+            alert: 'ExporterQueueSize',
+            expr: |||
+              otelcol_exporter_queue_size > otelcol_exporter_queue_capacity * 0.8
+            |||,
+            'for': '1m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              description: 'The {{ $labels.exporter }} exporter queue is more than 80% full (current size: {{ $value }}).',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
+            },
+          },
+          {
+            alert: 'SendQueueFailedSpans',
+            expr: |||
+              rate(otelcol_exporter_enqueue_failed_spans_total[5m]) > 0
+            |||,
+            'for': '1m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              description: 'The {{ $labels.exporter }} sending queue is failing to accept spans at a rate of {{ humanize $value }} per second.',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
+            },
+          },
+          {
+            alert: 'SendQueueFailedMetricPoints',
+            expr: |||
+              rate(otelcol_exporter_enqueue_failed_metric_points_total[5m]) > 0
+            |||,
+            'for': '1m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              description: 'The {{ $labels.exporter }} sending queue is failing to accept metric points at a rate of {{ humanize $value }} per second.',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
+            },
+          },
+          {
+            alert: 'SendQueueFailedLogRecords',
+            expr: |||
+              rate(otelcol_exporter_enqueue_failed_log_records_total[5m]) > 0
+            |||,
+            'for': '1m',
             labels: {
               severity: 'warning',
             },
             annotations: {
-              summary: 'The sending queue has filled up.',
-              description: 'The sending queue is full for {{ $labels.instance }}. The collector might start dropping data',
+              description: 'The {{ $labels.exporter }} sending queue is failing to accept log records at a rate of {{ humanize $value }} per second.',
+              runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
             },
           },
         ],
diff --git a/opentelemetry-collector-mixin/config.libsonnet b/opentelemetry-collector-mixin/config.libsonnet
index fe921f26d..85d042e4e 100644
--- a/opentelemetry-collector-mixin/config.libsonnet
+++ b/opentelemetry-collector-mixin/config.libsonnet
@@ -1,14 +1,30 @@
 {
   _config+:: {
+    // Selector to apply to all dashboard variables, panel queries, alerts and recording rules.
     // Can be used to filter metrics to specific OpenTelemetry Collector instances.
     // Example: 'job="integrations/otel-collector"'
     filteringSelector: '',
 
+    // The labels used to differentiate between clusters, namespaces and jobs.
+    clusterLabel: 'cluster',
+    namespaceLabel: 'namespace',
+    jobLabel: 'job',
+
+    // Configuration for which group labels are enabled.
+    labels: {
+      cluster: false,
+      namespace: false,
+      job: true,
+    },
+
     // Labels that represent a group of instances.
     // Used in dashboard variables and alert aggregations.
-    // Examples: ['job'] or ['environment', 'job', 'cluster']
-    groupLabels: ['job'],
+    groupLabels: [
+      label
+      for label in std.objectFields($._config.labels)
+      if $._config.labels[label]
+    ],
 
     // Labels that represent a single instance.
     // Used in dashboard variables and legend formats.
@@ -26,10 +42,12 @@
       refresh: '60s',
 
       // Timezone for Grafana dashboards:: UTC, browser, ...
-      grafanaTimezone: 'UTC',
+      grafanaTimezone: 'browser',
 
       // Tags for Grafana dashboards
       dashboardTags: ['otelcol'],
+
+      dashboardNamePrefix: 'OpenTelemetry Collector / ',
     },
 
     // Default datasource name
diff --git a/opentelemetry-collector-mixin/dashboards.jsonnet b/opentelemetry-collector-mixin/dashboards.jsonnet
index c0d94ffa6..3d6535735 100644
--- a/opentelemetry-collector-mixin/dashboards.jsonnet
+++ b/opentelemetry-collector-mixin/dashboards.jsonnet
@@ -2,10 +2,6 @@ local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
 local cfg = import 'config.libsonnet';
 
 {
-  [name]: dashboards[name] {
-    timezone: cfg._config.grafana.grafanaTimezone,
-    refresh: cfg._config.grafana.refresh,
-    tags: cfg._config.grafana.dashboardTags,
-  }
+  [name]: dashboards[name]
   for name in std.objectFields(dashboards)
 }
diff --git a/opentelemetry-collector-mixin/dashboards/collector.libsonnet b/opentelemetry-collector-mixin/dashboards/collector.libsonnet
index 70582c322..55dfd06ec 100644
--- a/opentelemetry-collector-mixin/dashboards/collector.libsonnet
+++ b/opentelemetry-collector-mixin/dashboards/collector.libsonnet
@@ -9,12 +9,15 @@ local cfg = import '../config.libsonnet';
   grafanaDashboards+:: {
     'collector.json':
       g.dashboard.new(
-        'OpenTelemetry collector health',
+        cfg._config.grafana.dashboardNamePrefix + 'Operational',
       )
       + g.dashboard.withDescription('A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.')
       + g.dashboard.graphTooltip.withSharedCrosshair()
-      + g.dashboard.withVariables(variables.multiInstance)
+      + g.dashboard.withEditable(false)
       + g.dashboard.withUid(cfg._config.grafanaDashboardIDs['collector.json'])
+      + g.dashboard.withTimezone(cfg._config.grafana.grafanaTimezone)
+      + g.dashboard.withTags(cfg._config.grafana.dashboardTags)
+      + g.dashboard.withVariables(variables.multiInstance)
       + g.dashboard.withPanels(
         g.util.grid.wrapPanels([
           // Overview row
diff --git a/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet b/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet
index e52ec94c4..d514202d1 100644
--- a/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet
+++ b/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet
@@ -8,7 +8,7 @@ commonlib.variables.new(
   varMetric='otelcol_process_uptime',
   enableLokiLogs=false,
   customAllValue='.*',
-  prometheusDatasourceName='datasource',
+  prometheusDatasourceName=cfg._config.datasourceName,
   prometheusDatasourceLabel='Data source',
   adHocEnabled=false,
 )
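
Note on the config change: `groupLabels` is now derived from the `labels` map, and `std.objectFields` returns field names sorted alphabetically, so the resulting order is alphabetical rather than declaration order. A minimal sketch of how the defaults evaluate; the file name and import path are illustrative, not part of the patch:

    // eval-config.jsonnet -- illustrative; run with `jsonnet eval-config.jsonnet`.
    local cfg = (import 'opentelemetry-collector-mixin/config.libsonnet')._config;

    {
      // Defaults above: cluster: false, namespace: false, job: true.
      // std.objectFields(cfg.labels) -> ['cluster', 'job', 'namespace'] (sorted),
      // and the comprehension keeps only the keys mapped to true.
      groupLabels: cfg.groupLabels,  // evaluates to ['job']
    }

With `labels+: { cluster: true }` overridden by a consumer, the same expression should yield `['cluster', 'job']`.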
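
A consumer-side sketch of picking up the new alerts and toggles, assuming the conventional mixin entrypoint (`mixin.libsonnet` exposing `prometheusAlerts` alongside the `grafanaDashboards` that `dashboards.jsonnet` already imports):

    // render-alerts.jsonnet -- illustrative consumer of the mixin.
    local mixin = (import 'opentelemetry-collector-mixin/mixin.libsonnet') + {
      _config+:: {
        // Same shape as the filteringSelector example in config.libsonnet.
        filteringSelector: 'job="integrations/otel-collector"',
        // Also aggregate by cluster, via the new labels toggle.
        labels+: { cluster: true },
      },
    };

    // Emit the Prometheus rule groups as a YAML string.
    std.manifestYamlDoc(mixin.prometheusAlerts)

Rendered with `jsonnet -S render-alerts.jsonnet > alerts.yaml`, the result can be validated with `promtool check rules alerts.yaml`.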