Skip to content

Commit b58ebd9

Browse files
timojohlo and trouaux authored
fix(logs): logic to avoid empty alert rules. rm tpl (#1216)
* fix(logs): logic to avoid empty alert rules * resolve alertname * add check for monitoring crd * Update logs/charts/values.yaml Co-authored-by: Thomas Rouaux <33913696+trouaux@users.noreply.github.com> * update README.md --------- Co-authored-by: Thomas Rouaux <33913696+trouaux@users.noreply.github.com>
1 parent b2c7048 commit b58ebd9

File tree

8 files changed

+101
-106
lines changed

8 files changed

+101
-106
lines changed

logs/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,11 @@ The **Logs** Plugin comes with a [Failover Connector](https://github.com/open-te
9595
| openTelemetry.openSearchLogs.index | string | `nil` | Name for OpenSearch index |
9696
| openTelemetry.prometheus.additionalLabels | object | `{}` | Label selectors for the Prometheus resources to be picked up by prometheus-operator. |
9797
| openTelemetry.prometheus.podMonitor | object | `{"enabled":true}` | Activates the pod-monitoring for the Logs Collector. |
98-
| openTelemetry.prometheus.rules | object | `{"additionalRuleLabels":null,"annotations":{},"create":true,"disabled":["ReconcileErrors","WorkqueueDepth","ReceiverRefusedMetric"],"labels":{}}` | Default rules for monitoring the opentelemetry components. |
98+
| openTelemetry.prometheus.rules | object | `{"additionalRuleLabels":null,"annotations":{},"create":true,"enabled":["FilelogRefusedLogs","LogsOTelLogsMissing","LogsOTelLogsDecreasing","ReconcileErrors","ReceiverRefusedMetric","WorkqueueDepth"],"labels":{}}` | Default rules for monitoring the opentelemetry components. |
9999
| openTelemetry.prometheus.rules.additionalRuleLabels | string | `nil` | Additional labels for PrometheusRule alerts. |
100100
| openTelemetry.prometheus.rules.annotations | object | `{}` | Annotations for PrometheusRules. |
101101
| openTelemetry.prometheus.rules.create | bool | `true` | Enables PrometheusRule resources to be created. |
102-
| openTelemetry.prometheus.rules.disabled | list | `["ReconcileErrors","WorkqueueDepth","ReceiverRefusedMetric"]` | PrometheusRules to disable. |
102+
| openTelemetry.prometheus.rules.enabled | list | `["FilelogRefusedLogs","LogsOTelLogsMissing","LogsOTelLogsDecreasing","ReconcileErrors","ReceiverRefusedMetric","WorkqueueDepth"]` | PrometheusRules to enable. |
103103
| openTelemetry.prometheus.rules.labels | object | `{}` | Labels for PrometheusRules. |
104104
| openTelemetry.prometheus.serviceMonitor | object | `{"enabled":true}` | Activates the service-monitoring for the Logs Collector. |
105105
| openTelemetry.region | string | `nil` | Region label for Logging |

logs/charts/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
apiVersion: v2
55
name: opentelemetry-operator
6-
version: 0.11.8
6+
version: 0.11.9
77
description: OpenTelemetry Operator Helm chart for Kubernetes
88
icon: https://raw.githubusercontent.com/cncf/artwork/a718fa97fffec1b9fd14147682e9e3ac0c8817cb/projects/opentelemetry/icon/color/opentelemetry-icon-color.png
99
type: application

logs/charts/alerts/collector-alerts.yaml

Lines changed: 0 additions & 54 deletions
This file was deleted.

logs/charts/alerts/operator-alerts.yaml

Lines changed: 0 additions & 28 deletions
This file was deleted.

logs/charts/ci/test-values.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,11 @@ openTelemetry:
5555
key2: value2
5656

5757
rules:
58-
create: true
59-
disabled:
58+
enabled:
6059
- FilelogRefusedLogs
60+
- ReconcileErrors
61+
- FilelogRefusedLogs
62+
- LogsOTelLogsMissing
6163

6264
testFramework:
6365
enabled: true

logs/charts/templates/alerts.yaml

Lines changed: 84 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
{{- if and (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") .Values.openTelemetry.prometheus.rules.create }}
2-
{{- $root := . -}}
3-
{{- range $path, $bytes := .Files.Glob "alerts/*.yaml" }}
1+
{{- /* Render the PrometheusRule only when the monitoring.coreos.com/v1 API is available, rules.create is true, and rules.enabled lists at least one alert. */}}
{{- if and (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") .Values.openTelemetry.prometheus.rules.create .Values.openTelemetry.prometheus.rules.enabled }}
42
apiVersion: monitoring.coreos.com/v1
53
kind: PrometheusRule
64
metadata:
7-
name: {{ printf "%s-%s" $.Release.Name $path | replace "/" "-" | trimSuffix "-yaml" | trunc 63 }}
5+
name: {{ printf "%s-alerts" $.Release.Name | trunc 63 }}
86
labels:
9-
{{- include "plugin.labels" $root | nindent 4 }}
10-
{{- include "plugin.prometheusLabels" $root | nindent 4 }}
7+
{{- include "plugin.labels" . | nindent 4 }}
8+
{{- include "plugin.prometheusLabels" . | nindent 4 }}
119
{{- if $.Values.openTelemetry.prometheus.rules.labels }}
1210
{{ toYaml $.Values.openTelemetry.prometheus.rules.labels | indent 4 }}
1311
{{- end }}
@@ -16,10 +14,84 @@ metadata:
1614
{{ toYaml $.Values.openTelemetry.prometheus.rules.annotations | indent 4 }}
1715
{{- end }}
1816
spec:
19-
{{- with $root -}}
20-
{{- $content := printf "%s" $bytes }}
21-
{{ tpl $content . | indent 2 }}
22-
{{- end }}
23-
---
24-
{{- end }}
17+
groups:
18+
- name: logs-plugin-alerts
19+
rules:
20+
{{- if (has "FilelogRefusedLogs" .Values.openTelemetry.prometheus.rules.enabled) }}
21+
- alert: FilelogRefusedLogs
22+
expr: sum(rate(otelcol_receiver_refused_log_records_total{receiver=~"filelog"}[1m])) > 0
23+
for: 5m
24+
labels:
25+
severity: warning
26+
playbook: https://github.com/cloudoperators/greenhouse-extensions/tree/main/logs/playbooks/FilelogRefusedLogs.md
27+
{{- include "plugin.additionalRuleLabels" . | nindent 10 }}
28+
annotations:
29+
summary: Logs are not successfully pushed into the filelog-receiver
30+
description: Filelog receiver is increasingly rejecting logs
31+
{{- end }}
32+
33+
{{- if (has "ReceiverRefusedMetric" .Values.openTelemetry.prometheus.rules.enabled) }}
34+
- alert: ReceiverRefusedMetric
35+
expr: sum(rate(otelcol_receiver_refused_metric_points_total{}[1m])) > 0
36+
for: 5m
37+
labels:
38+
severity: warning
39+
playbook: https://github.com/cloudoperators/greenhouse-extensions/tree/main/logs/playbooks/ReceiverRefusedMetric.md
40+
{{- include "plugin.additionalRuleLabels" . | nindent 10 }}
41+
annotations:
42+
summary: OTel is refusing metric points
43+
description: The OTel Collector has refused metric points for over 5 minutes. This may indicate malformed metrics. Review configuration and incoming traffic for issues.
44+
{{- end }}
45+
46+
{{- if (has "LogsOTelLogsMissing" .Values.openTelemetry.prometheus.rules.enabled) }}
47+
- alert: LogsOTelLogsMissing
48+
expr: sum by (region, k8s_node_name) (rate(otelcol_exporter_sent_log_records_total{job=~".*/opentelemetry-collector-logs", exporter !~"debug"}[60m])) == 0
49+
for: 120m
50+
labels:
51+
severity: warning
52+
playbook: https://github.com/cloudoperators/greenhouse-extensions/tree/main/logs/playbooks/OTelLogsMissing.md
53+
{{- include "plugin.additionalRuleLabels" . | nindent 10 }}
54+
annotations:
55+
summary: OTel is not shipping logs
56+
description: 'otel-logs on {{`{{ $labels.k8s_node_name }}`}} in {{`{{ $labels.region }}`}} is not shipping logs. Please check.'
57+
{{- end }}
58+
59+
{{- if (has "LogsOTelLogsDecreasing" .Values.openTelemetry.prometheus.rules.enabled) }}
60+
- alert: LogsOTelLogsDecreasing
61+
# Keep `region` in the aggregation on both sides of the division: the description interpolates $labels.region, which would otherwise be dropped and render empty.
expr: sum(rate(otelcol_exporter_sent_log_records_total{job="logs/opentelemetry-collector-logs"}[1h]offset 2h)) by (k8s_cluster_name, region)/sum(rate(otelcol_exporter_sent_log_records_total{job="logs/opentelemetry-collector-logs"}[1h])) by (k8s_cluster_name, region) > 4
62+
for: 2h
63+
labels:
64+
severity: warning
65+
playbook: https://github.com/cloudoperators/greenhouse-extensions/tree/main/logs/playbooks/LogsOTelLogsDecreasing.md
66+
{{- include "plugin.additionalRuleLabels" . | nindent 10 }}
67+
annotations:
68+
summary: OTel log volume is decreasing, check log volume.
69+
description: 'OTel on {{`{{ $labels.k8s_cluster_name }}`}} in {{`{{ $labels.region }}`}} is sending 4 times fewer logs in the last 2h. Please check.'
70+
{{- end }}
71+
72+
{{- if (has "ReconcileErrors" .Values.openTelemetry.prometheus.rules.enabled) }}
73+
- alert: ReconcileErrors
74+
expr: rate(controller_runtime_reconcile_total{controller="opentelemetrycollector",result="error"}[5m]) > 0
75+
for: 5m
76+
labels:
77+
severity: warning
78+
playbook: https://github.com/cloudoperators/greenhouse-extensions/tree/main/logs/playbooks/ReconcileErrors.md
79+
{{- include "plugin.additionalRuleLabels" . | nindent 10 }}
80+
annotations:
81+
summary: OpenTelemetryCollector Reconciliation
82+
description: Reconciliation errors for opentelemetrycollector are increasing
83+
{{- end }}
84+
85+
{{- if (has "WorkqueueDepth" .Values.openTelemetry.prometheus.rules.enabled) }}
86+
- alert: WorkqueueDepth
87+
# Fires while the operator's opentelemetrycollector workqueue holds pending items; previously this duplicated the ReconcileErrors expression.
expr: workqueue_depth{name="opentelemetrycollector"} > 0
88+
for: 5m
89+
labels:
90+
severity: warning
91+
playbook: https://github.com/cloudoperators/greenhouse-extensions/tree/main/logs/playbooks/WorkqueueDepth.md
92+
{{- include "plugin.additionalRuleLabels" . | nindent 10 }}
93+
annotations:
94+
summary: WorkqueueDepth is increasing
95+
description: Check manager logs for reasons why this might happen
96+
{{- end }}
2597
{{- end }}

logs/charts/values.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,14 @@ openTelemetry:
9999
# -- Enables PrometheusRule resources to be created.
100100
create: true
101101

102-
# -- PrometheusRules to disable.
103-
disabled:
102+
# -- PrometheusRules to enable.
103+
enabled:
104+
- FilelogRefusedLogs
105+
- LogsOTelLogsMissing
106+
- LogsOTelLogsDecreasing
104107
- ReconcileErrors
105-
- WorkqueueDepth
106108
- ReceiverRefusedMetric
109+
- WorkqueueDepth
107110

108111
# -- Labels for PrometheusRules.
109112
labels: {}

logs/plugindefinition.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ kind: ClusterPluginDefinition
66
metadata:
77
name: logs
88
spec:
9-
version: 0.11.8
9+
version: 0.11.9
1010
displayName: Logs
1111
description: Observability framework for instrumenting, generating, collecting, and exporting logs.
1212
icon: https://raw.githubusercontent.com/cloudoperators/greenhouse-extensions/main/logs/logo.png
1313
helmChart:
1414
name: opentelemetry-operator
1515
repository: oci://ghcr.io/cloudoperators/greenhouse-extensions/charts
16-
version: 0.11.8
16+
version: 0.11.9
1717
options:
1818
- default: true
1919
description: Activates the standard configuration for logs
@@ -86,8 +86,8 @@ spec:
8686
name: openTelemetry.prometheus.additionalLabels
8787
required: false
8888
type: map
89-
- description: openTelemetry.prometheus.rules.create
90-
name: openTelemetry.prometheus.rules.create
89+
- description: openTelemetry.prometheus.rules.enabled
90+
name: openTelemetry.prometheus.rules.enabled
9191
required: false
9292
type: list
9393
- default: true

0 commit comments

Comments
 (0)