Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 8 additions & 13 deletions opensearch-mixin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,9 @@ and the following alerts:
- OpenSearchRedCluster
- OpenSearchUnstableShardReallocation
- OpenSearchUnstableShardUnassigned
- OpenSearchModerateNodeDiskUsage
- OpenSearchHighNodeDiskUsage
- OpenSearchModerateNodeCPUUsage
- OpenSearchHighNodeCPUUsage
- OpenSearchModerateNodeMemoryUsage
- OpenSearchHighNodeMemoryUsage
- OpenSearchHighNodeDiskUsage (warning and critical)
- OpenSearchHighNodeCpuUsage (warning and critical)
- OpenSearchHighNodeMemoryUsage (warning and critical)
- OpenSearchModerateRequestLatency
- OpenSearchModerateIndexLatency

Expand Down Expand Up @@ -85,22 +82,20 @@ The OpenSearch search and index overview dashboard provides details on request p

## Alerts Overview


| Alert | Summary |
|-------------------------------------|---------------------------------------------------------------------------------|
| OpenSearchYellowCluster | At least one of the clusters is reporting a yellow status. |
| OpenSearchRedCluster | At least one of the clusters is reporting a red status. |
| OpenSearchUnstableShardReallocation | A node has gone offline or has been disconnected triggering shard reallocation. |
| OpenSearchUnstableShardUnassigned | There are shards that have been detected as unassigned. |
| OpenSearchModerateNodeDiskUsage | The node disk usage has exceeded the warning threshold. |
| OpenSearchHighNodeDiskUsage | The node disk usage has exceeded the critical threshold. |
| OpenSearchModerateNodeCpuUsage | The node CPU usage has exceeded the warning threshold. |
| OpenSearchHighNodeCpuUsage | The node CPU usage has exceeded the critical threshold. |
| OpenSearchModerateNodeMemoryUsage | The node memory usage has exceeded the warning threshold. |
| OpenSearchHighNodeMemoryUsage | The node memory usage has exceeded the critical threshold. |
| OpenSearchHighNodeDiskUsage | The node disk usage has exceeded the configured threshold (warning or critical). |
| OpenSearchHighNodeCpuUsage | The node CPU usage has exceeded the configured threshold (warning or critical). |
| OpenSearchHighNodeMemoryUsage | The node memory usage has exceeded the configured threshold (warning or critical). |
| OpenSearchModerateRequestLatency | The request latency has exceeded the warning threshold. |
| OpenSearchModerateIndexLatency | The index latency has exceeded the warning threshold. |

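Node resource alerts (disk, CPU, memory) use the same alert name for both the warning and critical severity levels; the two levels are distinguished by the `severity` label. This supports the Alertmanager inhibition pattern: when an `inhibit_rules` entry matching on the alert name is configured in Alertmanager, the warning alert is suppressed while the corresponding critical alert is firing. Without such a rule, both alerts are delivered.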

Default thresholds can be configured in `config.libsonnet`:

```js
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
new(this): {
groups+: [
{
name: $._config.uid + '-alerts',
name: this.config.uid + '-alerts',
rules: [
{
alert: 'OpenSearchYellowCluster',
expr: |||
opensearch_cluster_status{%(filteringSelector)s} == 1
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -18,14 +18,14 @@
description:
(
'{{$labels.cluster}} health status is yellow over the last 5 minutes'
) % $._config,
) % this.config,
},
},
{
alert: 'OpenSearchRedCluster',
expr: |||
opensearch_cluster_status{%(filteringSelector)s} == 2
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -35,14 +35,14 @@
description:
(
'{{$labels.cluster}} health status is red over the last 5 minutes'
) % $._config,
) % this.config,
},
},
{
alert: 'OpenSearchUnstableShardReallocation',
expr: |||
sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="relocating"}) > %(alertsWarningShardReallocations)s
||| % $._config,
||| % this.config,
'for': '1m',
labels: {
severity: 'warning',
Expand All @@ -51,14 +51,14 @@
summary: 'A node has gone offline or has been disconnected triggering shard reallocation.',
description: |||
{{$labels.cluster}} has had {{ printf "%%.0f" $value }} shard reallocation over the last 1m which is above the threshold of %(alertsWarningShardReallocations)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchUnstableShardUnassigned',
expr: |||
sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="unassigned"}) > %(alertsWarningShardUnassigned)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -67,14 +67,14 @@
summary: 'There are shards that have been detected as unassigned.',
description: |||
{{$labels.cluster}} has had {{ printf "%%.0f" $value }} shard unassigned over the last 5m which is above the threshold of %(alertsWarningShardUnassigned)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeDiskUsage',
expr: |||
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes{%(filteringSelector)s}) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsWarningDiskUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -83,14 +83,14 @@
summary: 'The node disk usage has exceeded the warning threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }} disk usage over the last 5m which is above the threshold of %(alertsWarningDiskUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeDiskUsage',
expr: |||
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes{%(filteringSelector)s}) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsCriticalDiskUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -99,14 +99,14 @@
summary: 'The node disk usage has exceeded the critical threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% disk usage over the last 5m which is above the threshold of %(alertsCriticalDiskUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeCpuUsage',
expr: |||
sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsWarningCPUUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -115,14 +115,14 @@
summary: 'The node CPU usage has exceeded the warning threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% CPU usage over the last 5m which is above the threshold of %(alertsWarningCPUUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeCpuUsage',
expr: |||
sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsCriticalCPUUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -131,14 +131,14 @@
summary: 'The node CPU usage has exceeded the critical threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% CPU usage over the last 5m which is above the threshold of %(alertsCriticalCPUUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeMemoryUsage',
expr: |||
sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsWarningMemoryUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -147,14 +147,14 @@
summary: 'The node memory usage has exceeded the warning threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% memory usage over the last 5m which is above the threshold of %(alertsWarningMemoryUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeMemoryUsage',
expr: |||
sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsCriticalMemoryUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -163,14 +163,14 @@
summary: 'The node memory usage has exceeded the critical threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% memory usage over the last 5m which is above the threshold of %(alertsCriticalMemoryUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchModerateRequestLatency',
expr: |||
sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{%(filteringSelector)s, context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > %(alertsWarningRequestLatency)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -179,14 +179,14 @@
summary: 'The request latency has exceeded the warning threshold.',
description: |||
{{$labels.index}} has had {{ printf "%%.0f" $value }}s of request latency over the last 5m which is above the threshold of %(alertsWarningRequestLatency)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchModerateIndexLatency',
expr: |||
sum without(context) (increase(opensearch_index_indexing_index_time_seconds{%(filteringSelector)s, context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > %(alertsWarningIndexLatency)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -195,7 +195,7 @@
summary: 'The index latency has exceeded the warning threshold.',
description: |||
{{$labels.index}} has had {{ printf "%%.0f" $value }}s of index latency over the last 5m which is above the threshold of %(alertsWarningIndexLatency)s.
||| % $._config,
||| % this.config,
},
},
],
Expand Down
63 changes: 38 additions & 25 deletions opensearch-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,31 +1,44 @@
{
_config+:: {
enableMultiCluster: false,
// extra static selector to apply to all templated variables and alerts
filteringSelector: if self.enableMultiCluster then 'cluster!="",opensearch_cluster!=""' else 'opensearch_cluster!=""',
groupLabels: if self.enableMultiCluster then ['job', 'cluster', 'opensearch_cluster'] else ['job', 'opensearch_cluster'],
instanceLabels: ['node'],
dashboardTags: ['opensearch-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardNamePrefix: '',
local this = self,
filteringSelector: 'job="integrations/opensearch"',
groupLabels: ['job', 'cluster', 'opensearch_cluster'],
logLabels: ['job', 'cluster', 'opensearch_cluster'],
instanceLabels: ['instance'],

// prefix dashboards uids
uid: 'opensearch',
uid: 'opensearch',
dashboardTags: [self.uid],
dashboardNamePrefix: 'OpenSearch',
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
metricsSource: 'prometheus', // metrics source for signals

// alerts thresholds
alertsWarningShardReallocations: 0,
alertsWarningShardUnassigned: 0,
alertsWarningDiskUsage: 60,
alertsCriticalDiskUsage: 80,
alertsWarningCPUUsage: 70,
alertsCriticalCPUUsage: 85,
alertsWarningMemoryUsage: 70,
alertsCriticalMemoryUsage: 85,
alertsWarningRequestLatency: 0.5, // seconds
alertsWarningIndexLatency: 0.5, // seconds
// Logging configuration
enableLokiLogs: true,
extraLogLabels: ['level', 'severity'], // Required by logs-lib
logsVolumeGroupBy: 'level',
showLogsVolume: true,

enableLokiLogs: true,
// Agg Lists
groupAggList: std.join(',', this.groupLabels),
groupAggListWithInstance: std.join(',', this.groupLabels + this.instanceLabels),

// Alerts configuration
alertsWarningShardReallocations: 0, // count
alertsWarningShardUnassigned: 0, // count
alertsWarningDiskUsage: 60, // %
alertsCriticalDiskUsage: 80, // %
alertsWarningCPUUsage: 70, // %
alertsCriticalCPUUsage: 85, // %
alertsWarningMemoryUsage: 70, // %
alertsCriticalMemoryUsage: 85, // %
alertsWarningRequestLatency: 0.5, // seconds
alertsWarningIndexLatency: 0.5, // seconds

// Signals configuration
signals+: {
clusterOverview: (import './signals/cluster-overview.libsonnet')(this),
nodeOverview: (import './signals/node-overview.libsonnet')(this),
searchAndIndexOverview: (import './signals/search-and-index-overview.libsonnet')(this),
},
}
Loading
Loading