Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 131 additions & 5 deletions opentelemetry-collector-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,143 @@
name: 'otelcol',
rules: [
{
alert: 'OtelcolSendingQueueFull',
alert: 'ReceiverDroppedSpans',
expr: |||
otelcol_exporter_queue_size >= otelcol_exporter_queue_capacity
rate(otelcol_receiver_refused_spans_total[5m]) > 0
|||,
'for': '30m',
'for': '2m',
labels: {
severity: 'critical',
},
annotations: {
description: 'The {{ $labels.receiver }} receiver is dropping spans at a rate of {{ humanize $value }} per second.',
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures',
},
},
{
// Alert rule: a receiver is refusing metric points.
alert: 'ReceiverDroppedMetrics',
// PromQL: per-second rate of otelcol_receiver_refused_metric_points_total
// over a 5m window, keeping only series whose rate is above zero.
expr: |||
rate(otelcol_receiver_refused_metric_points_total[5m]) > 0
|||,
// 'for' is quoted because it is a Jsonnet keyword. The expression must keep
// matching for 2 minutes before the alert fires.
'for': '2m',
labels: {
severity: 'critical',
},
annotations: {
// Prometheus templating fills in $labels and $value; humanize formats the rate.
// Assumes the matching series carries a `receiver` label — TODO confirm.
description: 'The {{ $labels.receiver }} receiver is dropping metrics at a rate of {{ humanize $value }} per second.',
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures',
},
},
{
  // Alert rule: a receiver is refusing log records.
  alert: 'ReceiverDroppedLogs',
  // PromQL: per-second rate of otelcol_receiver_refused_log_records_total
  // over a 5m window, keeping only series whose rate is above zero.
  expr: |||
    rate(otelcol_receiver_refused_log_records_total[5m]) > 0
  |||,
  // 'for' is quoted because it is a Jsonnet keyword.
  // NOTE(review): the span and metric receiver rules use '2m'; confirm that
  // the longer 5m hold time for logs is intentional.
  'for': '5m',
  labels: {
    severity: 'critical',
  },
  annotations: {
    // Wording matches the other receiver rules ("The <name> receiver is ...")
    // so the rendered message does not read "The otlp is dropping logs".
    // Assumes the matching series carries a `receiver` label — TODO confirm.
    description: 'The {{ $labels.receiver }} receiver is dropping logs at a rate of {{ humanize $value }} per second.',
    runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures',
  },
},
{
// Alert rule: an exporter is failing to send spans.
alert: 'ExporterDroppedSpans',
// PromQL: per-second rate of otelcol_exporter_send_failed_spans_total
// over a 5m window, keeping only series whose rate is above zero.
expr: |||
rate(otelcol_exporter_send_failed_spans_total[5m]) > 0
|||,
// 'for' is quoted because it is a Jsonnet keyword. The expression must keep
// matching for 2 minutes before the alert fires.
'for': '2m',
labels: {
severity: 'critical',
},
annotations: {
// Prometheus templating fills in $labels and $value; humanize formats the rate.
// Assumes the matching series carries an `exporter` label — TODO confirm.
description: 'The {{ $labels.exporter }} exporter is dropping spans at a rate of {{ humanize $value }} per second.',
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures',
},
},
{
// Alert rule: an exporter is failing to send metric points.
alert: 'ExporterDroppedMetrics',
// PromQL: per-second rate of otelcol_exporter_send_failed_metric_points_total
// over a 5m window, keeping only series whose rate is above zero.
expr: |||
rate(otelcol_exporter_send_failed_metric_points_total[5m]) > 0
|||,
// 'for' is quoted because it is a Jsonnet keyword. The expression must keep
// matching for 2 minutes before the alert fires.
'for': '2m',
labels: {
severity: 'critical',
},
annotations: {
// Prometheus templating fills in $labels and $value; humanize formats the rate.
// Assumes the matching series carries an `exporter` label — TODO confirm.
description: 'The {{ $labels.exporter }} exporter is dropping metrics at a rate of {{ humanize $value }} per second.',
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures',
},
},
{
  // Alert rule: an exporter is failing to send log records.
  alert: 'ExporterDroppedLogs',
  // PromQL: per-second rate of otelcol_exporter_send_failed_log_records_total
  // over a 5m window, keeping only series whose rate is above zero.
  expr: |||
    rate(otelcol_exporter_send_failed_log_records_total[5m]) > 0
  |||,
  // 'for' is quoted because it is a Jsonnet keyword.
  // NOTE(review): the span and metric exporter rules use '2m'; confirm that
  // the longer 5m hold time for logs is intentional.
  'for': '5m',
  labels: {
    severity: 'critical',
  },
  annotations: {
    // Wording matches the other exporter rules ("The <name> exporter is ...")
    // so the rendered message names what kind of component is failing.
    // Assumes the matching series carries an `exporter` label — TODO confirm.
    description: 'The {{ $labels.exporter }} exporter is dropping logs at a rate of {{ humanize $value }} per second.',
    runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures',
  },
},
{
// Alert rule: an exporter queue is filling up.
alert: 'ExporterQueueSize',
// PromQL: matches when otelcol_exporter_queue_size is above 80% (0.8) of
// otelcol_exporter_queue_capacity for the same label set.
expr: |||
otelcol_exporter_queue_size > otelcol_exporter_queue_capacity * 0.8
|||,
// 'for' is quoted because it is a Jsonnet keyword. The expression must keep
// matching for 1 minute before the alert fires.
'for': '1m',
labels: {
severity: 'warning',
},
annotations: {
// Prometheus templating fills in $labels and $value.
// Assumes the matching series carries an `exporter` label — TODO confirm.
description: 'The {{ $labels.exporter }} queue has reached a size of {{ $value }}.',
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
},
},
{
  // Alert rule: an exporter's sending queue is rejecting spans.
  alert: 'SendQueueFailedSpans',
  // PromQL: per-second rate of otelcol_exporter_enqueue_failed_spans_total
  // over a 5m window, keeping only series whose rate is above zero.
  expr: |||
    rate(otelcol_exporter_enqueue_failed_spans_total[5m]) > 0
  |||,
  // 'for' is quoted because it is a Jsonnet keyword. The expression must keep
  // matching for 1 minute before the alert fires.
  'for': '1m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    // $value is the per-second rate produced by rate(), not a count of spans,
    // so the message reports it as a rate and formats it with humanize.
    // Assumes the matching series carries an `exporter` label — TODO confirm.
    description: 'The {{ $labels.exporter }} sending queue is failing to accept spans at a rate of {{ humanize $value }} per second.',
    runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
  },
},
{
  // Alert rule: an exporter's sending queue is rejecting metric points.
  alert: 'SendQueueFailedMetricPoints',
  // PromQL: per-second rate of otelcol_exporter_enqueue_failed_metric_points_total
  // over a 5m window, keeping only series whose rate is above zero.
  expr: |||
    rate(otelcol_exporter_enqueue_failed_metric_points_total[5m]) > 0
  |||,
  // 'for' is quoted because it is a Jsonnet keyword. The expression must keep
  // matching for 1 minute before the alert fires.
  'for': '1m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    // $value is the per-second rate produced by rate(), not a count of metric
    // points, so the message reports it as a rate and formats it with humanize.
    // Assumes the matching series carries an `exporter` label — TODO confirm.
    description: 'The {{ $labels.exporter }} sending queue is failing to accept metric points at a rate of {{ humanize $value }} per second.',
    runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
  },
},
{
alert: 'SendQueueFailedLogRecords',
expr: |||
rate(otelcol_exporter_enqueue_failed_log_records_total[5m]) > 0
|||,
'for': '1m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'The sending queue has filled up.',
description: 'The sending queue is full for {{ $labels.instance }}. The collector might start dropping data',
description: 'The {{ $labels.exporter }} sending queue failed to accept {{ $value }} log records.',
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
},
},
],
Expand Down
24 changes: 21 additions & 3 deletions opentelemetry-collector-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,14 +1,30 @@
{
_config+:: {

// Selector to apply to all dashboard variables, panel queries, alerts and recording rules.
// Can be used to filter metrics to specific OpenTelemetry Collector instances.
// Example: 'job="integrations/otel-collector"'
filteringSelector: '',

// Label names used to differentiate between Kubernetes clusters, namespaces, and jobs.
clusterLabel: 'cluster',
namespaceLabel: 'namespace',
jobLabel: 'job',

// Configuration for which group labels are enabled.
labels: {
cluster: false,
namespace: false,
job: true,
},

// Labels that represent a group of instances.
// Used in dashboard variables and alert aggregations.
// Examples: ['job'] or ['environment', 'job', 'cluster']
groupLabels: ['job'],
groupLabels: [
label
for label in std.objectFields($._config.labels)
if $._config.labels[label]
],

// Labels that represent a single instance.
// Used in dashboard variables and legend formats.
Expand All @@ -26,10 +42,12 @@
refresh: '60s',

// Timezone for Grafana dashboards: UTC, browser, ...
grafanaTimezone: 'UTC',
grafanaTimezone: 'browser',

// Tags for Grafana dashboards
dashboardTags: ['otelcol'],

dashboardNamePrefix: 'OpenTelemetry Collector / ',
},

// Default datasource name
Expand Down
6 changes: 1 addition & 5 deletions opentelemetry-collector-mixin/dashboards.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
local cfg = import 'config.libsonnet';

{
[name]: dashboards[name] {
timezone: cfg._config.grafana.grafanaTimezone,
refresh: cfg._config.grafana.refresh,
tags: cfg._config.grafana.dashboardTags,
}
[name]: dashboards[name]
for name in std.objectFields(dashboards)
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@ local cfg = import '../config.libsonnet';
grafanaDashboards+:: {
'collector.json':
g.dashboard.new(
'OpenTelemetry collector health',
cfg._config.grafana.dashboardNamePrefix + 'Operational',
)
+ g.dashboard.withDescription('A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.')
+ g.dashboard.graphTooltip.withSharedCrosshair()
+ g.dashboard.withVariables(variables.multiInstance)
+ g.dashboard.withEditable(false)
+ g.dashboard.withUid(cfg._config.grafanaDashboardIDs['collector.json'])
+ g.dashboard.withTimezone(cfg._config.grafana.grafanaTimezone)
+ g.dashboard.withTags(cfg._config.grafana.dashboardTags)
+ g.dashboard.withVariables(variables.multiInstance)
+ g.dashboard.withPanels(
g.util.grid.wrapPanels([
// Overview row
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ commonlib.variables.new(
varMetric='otelcol_process_uptime',
enableLokiLogs=false,
customAllValue='.*',
prometheusDatasourceName='datasource',
prometheusDatasourceName=cfg._config.datasourceName,
prometheusDatasourceLabel='Data source',
adHocEnabled=false,
)