Skip to content

Commit e5d392a

Browse files
authored
Refactor opensearch to use as submixin (modular observ lib) (#1045)
* Add filteringSelector to opensearch to use as submixin * Format jsonnet * Add filtering selector to alerts * Add uid prefix to all dashboards/alerts * Add commonlib * Add variables file * Add grafonnet * Fix variables file * Fix cluster dashboard * Update config * Refactor dashboard constructor to new grafonnet * Update node dashboard * Update node dashboards * Update search and index * Add prefix * Add os role panels * Fmt * Fix nodeIO panel * jsonnetfmt * Fix io panel * Lint exclusions * lint
1 parent 2df707c commit e5d392a

10 files changed

+4748
-4591
lines changed

opensearch-mixin/.lint

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
exclusions:
2+
panel-datasource-rule:
3+
reason: Uses --mixed-- (generated from grafonnet)
4+
entries:
5+
- panel: 'OpenSearch cluster overview'
6+
- panel: 'Roles'
27
panel-units-rule:
38
reason: "Custom units are used for better user experience in these panels"
49
entries:
@@ -27,6 +32,8 @@ exclusions:
2732
- panel: "Merge count"
2833
- panel: "Shard count"
2934
- panel: "Node open connections"
35+
- panel: 'OpenSearch cluster overview'
36+
- panel: 'Roles'
3037
template-instance-rule:
3138
reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
3239
target-instance-rule:

opensearch-mixin/alerts/alerts.libsonnet

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
prometheusAlerts+:: {
33
groups+: [
44
{
5-
name: 'opensearch',
5+
name: $._config.uid + '-alerts',
66
rules: [
77
{
88
alert: 'OpenSearchYellowCluster',
99
expr: |||
10-
opensearch_cluster_status == 1
10+
opensearch_cluster_status{%(filteringSelector)s} == 1
1111
||| % $._config,
1212
'for': '5m',
1313
labels: {
@@ -24,7 +24,7 @@
2424
{
2525
alert: 'OpenSearchRedCluster',
2626
expr: |||
27-
opensearch_cluster_status == 2
27+
opensearch_cluster_status{%(filteringSelector)s} == 2
2828
||| % $._config,
2929
'for': '5m',
3030
labels: {
@@ -41,7 +41,7 @@
4141
{
4242
alert: 'OpenSearchUnstableShardReallocation',
4343
expr: |||
44-
sum without(type) (opensearch_cluster_shards_number{type="relocating"}) > %(alertsWarningShardReallocations)s
44+
sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="relocating"}) > %(alertsWarningShardReallocations)s
4545
||| % $._config,
4646
'for': '1m',
4747
labels: {
@@ -57,7 +57,7 @@
5757
{
5858
alert: 'OpenSearchUnstableShardUnassigned',
5959
expr: |||
60-
sum without(type) (opensearch_cluster_shards_number{type="unassigned"}) > %(alertsWarningShardUnassigned)s
60+
sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="unassigned"}) > %(alertsWarningShardUnassigned)s
6161
||| % $._config,
6262
'for': '5m',
6363
labels: {
@@ -73,7 +73,7 @@
7373
{
7474
alert: 'OpenSearchModerateNodeDiskUsage',
7575
expr: |||
76-
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes - opensearch_fs_path_free_bytes) / opensearch_fs_path_total_bytes) > %(alertsWarningDiskUsage)s
76+
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes{%(filteringSelector)s}) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsWarningDiskUsage)s
7777
||| % $._config,
7878
'for': '5m',
7979
labels: {
@@ -89,7 +89,7 @@
8989
{
9090
alert: 'OpenSearchHighNodeDiskUsage',
9191
expr: |||
92-
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes - opensearch_fs_path_free_bytes) / opensearch_fs_path_total_bytes) > %(alertsCriticalDiskUsage)s
92+
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsCriticalDiskUsage)s
9393
||| % $._config,
9494
'for': '5m',
9595
labels: {
@@ -105,7 +105,7 @@
105105
{
106106
alert: 'OpenSearchModerateNodeCpuUsage',
107107
expr: |||
108-
sum without(nodeid) (opensearch_os_cpu_percent) > %(alertsWarningCPUUsage)s
108+
sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsWarningCPUUsage)s
109109
||| % $._config,
110110
'for': '5m',
111111
labels: {
@@ -121,7 +121,7 @@
121121
{
122122
alert: 'OpenSearchHighNodeCpuUsage',
123123
expr: |||
124-
sum without(nodeid) (opensearch_os_cpu_percent) > %(alertsCriticalCPUUsage)s
124+
sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsCriticalCPUUsage)s
125125
||| % $._config,
126126
'for': '5m',
127127
labels: {
@@ -137,7 +137,7 @@
137137
{
138138
alert: 'OpenSearchModerateNodeMemoryUsage',
139139
expr: |||
140-
sum without(nodeid) (opensearch_os_mem_used_percent) > %(alertsWarningMemoryUsage)s
140+
sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsWarningMemoryUsage)s
141141
||| % $._config,
142142
'for': '5m',
143143
labels: {
@@ -153,7 +153,7 @@
153153
{
154154
alert: 'OpenSearchHighNodeMemoryUsage',
155155
expr: |||
156-
sum without(nodeid) (opensearch_os_mem_used_percent) > %(alertsCriticalMemoryUsage)s
156+
sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsCriticalMemoryUsage)s
157157
||| % $._config,
158158
'for': '5m',
159159
labels: {
@@ -169,7 +169,7 @@
169169
{
170170
alert: 'OpenSearchModerateRequestLatency',
171171
expr: |||
172-
sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > %(alertsWarningRequestLatency)s
172+
sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{%(filteringSelector)s, context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > %(alertsWarningRequestLatency)s
173173
||| % $._config,
174174
'for': '5m',
175175
labels: {
@@ -185,7 +185,7 @@
185185
{
186186
alert: 'OpenSearchModerateIndexLatency',
187187
expr: |||
188-
sum without(context) (increase(opensearch_index_indexing_index_time_seconds{context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > %(alertsWarningIndexLatency)s
188+
sum without(context) (increase(opensearch_index_indexing_index_time_seconds{%(filteringSelector)s, context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > %(alertsWarningIndexLatency)s
189189
||| % $._config,
190190
'for': '5m',
191191
labels: {

opensearch-mixin/config.libsonnet

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,17 @@
11
{
22
_config+:: {
3+
// extra static selector to apply to all templated variables and alerts
4+
filteringSelector: 'cluster!=""',
5+
groupLabels: ['job', 'cluster'],
6+
instanceLabels: ['node'],
37
dashboardTags: ['opensearch-mixin'],
48
dashboardPeriod: 'now-1h',
59
dashboardTimezone: 'default',
610
dashboardRefresh: '1m',
11+
dashboardNamePrefix: '',
12+
13+
// prefix dashboards uids
14+
uid: 'opensearch',
715

816
// alerts thresholds
917
alertsWarningShardReallocations: 0,

0 commit comments

Comments
 (0)