Skip to content

Commit e65e5d7

Browse files
authored
Add elastic agent alerting rule templates (#15572)
Add alerting rule templates to the Elastic Agent package: * CPU usage spike * Excessive memory usage * High pipeline queue * Dropped events * Output errors * Excessive restarts * Unhealthy status
1 parent 788aaa4 commit e65e5d7

9 files changed

+245
-2
lines changed

packages/elastic_agent/changelog.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
# newer versions go on top
2+
- version: "2.6.4"
3+
changes:
4+
- description: Adds alerting rule templates
5+
type: enhancement
6+
link: https://github.com/elastic/integrations/pull/15572
27
- version: "2.6.3"
38
changes:
49
- description: Elastic Agent memory charts now prioritise RSS memory for more accurate usage reporting.
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"id": "elastic-agent-cpu-usage-spike-rule",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[Elastic Agent] CPU usage spike",
6+
"tags": ["Elastic Agent", "Resource Consumption"],
7+
"ruleTypeId": ".es-query",
8+
"schedule": {
9+
"interval": "1m"
10+
},
11+
"params": {
12+
"searchType": "esqlQuery",
13+
"timeWindowSize": 7,
14+
"timeWindowUnit": "m",
15+
"threshold": [0],
16+
"thresholdComparator": ">",
17+
"size": 100,
18+
"esqlQuery": {
19+
"esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS cpu_process_pct = MAX(system.process.cpu.total.pct) * 100\n BY elastic_agent.id, process.name,\n time_bucket = BUCKET(@timestamp, 1 minute)\n// Count the 1-minute time buckets that are above 80% by process and agent\n| WHERE cpu_process_pct >= 80\n| STATS count_above_threshold = COUNT(*)\n BY elastic_agent.id, process.name\n// Alert if there are 5 or more occurrences\n| WHERE count_above_threshold >= 5"
20+
},
21+
"aggType": "count",
22+
"groupBy": "row",
23+
"termSize": 5,
24+
"sourceFields": [],
25+
"timeField": "@timestamp",
26+
"excludeHitsFromPreviousRun": true
27+
},
28+
"alertDelay": {
29+
"active": 1
30+
}
31+
},
32+
"coreMigrationVersion": "8.8.0",
33+
"typeMigrationVersion": "10.1.0"
34+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"id": "elastic-agent-dropped-events",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[Elastic Agent] Dropped events",
6+
"tags": ["Elastic Agent", "Pipeline and Queues"],
7+
"ruleTypeId": ".es-query",
8+
"schedule": {
9+
"interval": "1m"
10+
},
11+
"params": {
12+
"searchType": "esqlQuery",
13+
"timeWindowSize": 3,
14+
"timeWindowUnit": "m",
15+
"threshold": [0],
16+
"thresholdComparator": ">",
17+
"size": 100,
18+
"esqlQuery": {
19+
"esql": "FROM metrics-elastic_agent.*beat-default, *:metrics-elastic_agent.*beat-default*\n| WHERE data_stream.dataset LIKE \"elastic_agent.*beat\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS \n events_dropped_max = max(to_long(beat.stats.libbeat.pipeline.events.dropped)),\n events_dropped_min = min(to_long(beat.stats.libbeat.pipeline.events.dropped)), \n pipeline_acked_max = max(to_long(beat.stats.libbeat.pipeline.queue.acked)), \n pipeline_acked_min = min(to_long(beat.stats.libbeat.pipeline.queue.acked)) \n BY time_bucket = DATE_TRUNC(1 minute, @timestamp), elastic_agent.id, component.id\n| EVAL \n events_dropped = events_dropped_max - events_dropped_min, \n events_acked = pipeline_acked_max - pipeline_acked_min\n// Cast to double: dividing two longs is integer division in ES|QL and would truncate the ratio to 0\n| EVAL drop_pct = CASE(\n events_acked > 0, TO_DOUBLE(events_dropped) / events_acked, \n 0.0\n)\n| WHERE drop_pct >= 0.05\n| STATS MAX(drop_pct) BY elastic_agent.id, component.id"
20+
},
21+
"aggType": "count",
22+
"groupBy": "row",
23+
"termSize": 5,
24+
"sourceFields": [],
25+
"timeField": "@timestamp",
26+
"excludeHitsFromPreviousRun": true
27+
},
28+
"alertDelay": {
29+
"active": 1
30+
}
31+
},
32+
"coreMigrationVersion": "8.8.0",
33+
"typeMigrationVersion": "10.1.0"
34+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"id": "elastic-agent-excessive-memory-usage-rule",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[Elastic Agent] Excessive memory usage",
6+
"tags": ["Elastic Agent", "Resource Consumption"],
7+
"ruleTypeId": ".es-query",
8+
"schedule": {
9+
"interval": "1m"
10+
},
11+
"params": {
12+
"searchType": "esqlQuery",
13+
"timeWindowSize": 5,
14+
"timeWindowUnit": "m",
15+
"threshold": [0],
16+
"thresholdComparator": ">",
17+
"size": 100,
18+
"esqlQuery": {
19+
"esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS max_memory_per_process = MAX(system.process.memory.rss.pct * 100) BY agent.id, process.name\n| STATS total_memory_usage = SUM(max_memory_per_process) BY agent.id\n| WHERE total_memory_usage > 50"
20+
},
21+
"aggType": "count",
22+
"groupBy": "row",
23+
"termSize": 5,
24+
"sourceFields": [],
25+
"timeField": "@timestamp",
26+
"excludeHitsFromPreviousRun": true
27+
},
28+
"alertDelay": {
29+
"active": 1
30+
}
31+
},
32+
"coreMigrationVersion": "8.8.0",
33+
"typeMigrationVersion": "10.1.0"
34+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"id": "elastic-agent-excessive-restarts",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[Elastic Agent] Excessive restarts",
6+
"tags": ["Elastic Agent"],
7+
"ruleTypeId": ".es-query",
8+
"schedule": {
9+
"interval": "1m"
10+
},
11+
"params": {
12+
"searchType": "esqlQuery",
13+
"timeWindowSize": 5,
14+
"timeWindowUnit": "m",
15+
"threshold": [0],
16+
"thresholdComparator": ">",
17+
"size": 100,
18+
"esqlQuery": {
19+
"esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS restart_count = COUNT_DISTINCT(process.cpu.start_time) BY host.name, process.name, bucket(@timestamp,5 minute) \n| WHERE restart_count > 10\n| STATS MAX(restart_count) BY host.name, process.name"
20+
},
21+
"aggType": "count",
22+
"groupBy": "row",
23+
"termSize": 5,
24+
"sourceFields": [],
25+
"timeField": "@timestamp",
26+
"excludeHitsFromPreviousRun": true
27+
},
28+
"alertDelay": {
29+
"active": 1
30+
}
31+
},
32+
"coreMigrationVersion": "8.8.0",
33+
"typeMigrationVersion": "10.1.0"
34+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"id": "elastic-agent-high-pipeline-queue",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[Elastic Agent] High pipeline queue",
6+
"tags": ["Elastic Agent", "Pipeline and Queues"],
7+
"ruleTypeId": ".es-query",
8+
"schedule": {
9+
"interval": "1m"
10+
},
11+
"params": {
12+
"searchType": "esqlQuery",
13+
"timeWindowSize": 1,
14+
"timeWindowUnit": "m",
15+
"threshold": [0],
16+
"thresholdComparator": ">",
17+
"size": 100,
18+
"esqlQuery": {
19+
"esql": "FROM metrics-elastic_agent.*beat-default, *:metrics-elastic_agent.*beat-default*\n| WHERE data_stream.dataset LIKE \"elastic_agent.*beat\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS pipeline_queue_pct = MAX(beat.stats.libbeat.pipeline.queue.filled.pct) * 100 BY elastic_agent.id, component.id\n| WHERE pipeline_queue_pct >= 90"
20+
},
21+
"aggType": "count",
22+
"groupBy": "row",
23+
"termSize": 5,
24+
"sourceFields": [],
25+
"timeField": "@timestamp",
26+
"excludeHitsFromPreviousRun": true
27+
},
28+
"alertDelay": {
29+
"active": 1
30+
}
31+
},
32+
"coreMigrationVersion": "8.8.0",
33+
"typeMigrationVersion": "10.1.0"
34+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"id": "elastic-agent-output-errors",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[Elastic Agent] Output errors",
6+
"tags": ["Elastic Agent", "Pipeline and Queues"],
7+
"ruleTypeId": ".es-query",
8+
"schedule": {
9+
"interval": "1m"
10+
},
11+
"params": {
12+
"searchType": "esqlQuery",
13+
"timeWindowSize": 3,
14+
"timeWindowUnit": "m",
15+
"threshold": [0],
16+
"thresholdComparator": ">",
17+
"size": 100,
18+
"esqlQuery": {
19+
"esql": "FROM metrics-elastic_agent.*beat-default*, *:metrics-elastic_agent.*beat-default*\n| WHERE data_stream.dataset LIKE \"elastic_agent.*beat\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS \n max_errors = MAX(TO_LONG(beat.stats.libbeat.output.write.errors)),\n min_errors = MIN(TO_LONG(beat.stats.libbeat.output.write.errors)) \n BY time_bucket = DATE_TRUNC(1 minute, @timestamp), elastic_agent.id, component.id\n| EVAL errors_count = max_errors - min_errors \n| WHERE errors_count > 5 \n| STATS MAX(errors_count) BY elastic_agent.id, component.id"
20+
},
21+
"aggType": "count",
22+
"groupBy": "row",
23+
"termSize": 5,
24+
"sourceFields": [],
25+
"timeField": "@timestamp",
26+
"excludeHitsFromPreviousRun": true
27+
},
28+
"alertDelay": {
29+
"active": 1
30+
}
31+
},
32+
"coreMigrationVersion": "8.8.0",
33+
"typeMigrationVersion": "10.1.0"
34+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"id": "elastic-agent-unhealthy-status",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[Elastic Agent] Unhealthy status",
6+
"tags": ["Elastic Agent"],
7+
"ruleTypeId": ".es-query",
8+
"schedule": {
9+
"interval": "1m"
10+
},
11+
"params": {
12+
"searchType": "esqlQuery",
13+
"timeWindowSize": 5,
14+
"timeWindowUnit": "m",
15+
"threshold": [0],
16+
"thresholdComparator": ">",
17+
"size": 100,
18+
"esqlQuery": {
19+
"esql": "FROM logs-elastic_agent.status_change-default, *:logs-elastic_agent.status_change-default\n| WHERE data_stream.dataset == \"elastic_agent.status_change\" and agentless == false and status == \"error\""
20+
},
21+
"aggType": "count",
22+
"groupBy": "row",
23+
"termSize": 5,
24+
"sourceFields": [],
25+
"timeField": "@timestamp",
26+
"excludeHitsFromPreviousRun": true
27+
},
28+
"alertDelay": {
29+
"active": 1
30+
}
31+
},
32+
"coreMigrationVersion": "8.8.0",
33+
"typeMigrationVersion": "10.1.0"
34+
}

packages/elastic_agent/manifest.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
name: elastic_agent
22
title: Elastic Agent
3-
version: 2.6.3
3+
version: 2.6.4
44
description: Collect logs and metrics from Elastic Agents.
55
type: integration
6-
format_version: 3.1.4
6+
format_version: 3.5.0
77
categories: ["elastic_stack"]
88
conditions:
99
kibana:

0 commit comments

Comments
 (0)