Skip to content

Commit d2fe0ae

Browse files
feat: add serviceId label to csp-mixin alerts (#1404)
* feat: add serviceId label to alerts * feat: add serviceId label to alerts for azure * fix label * Update rows and dashboards to new collapsible format * Fix linter error --------- Co-authored-by: Vitaly Zhuravlev <[email protected]>
1 parent b843f6e commit d2fe0ae

File tree

9 files changed

+677
-525
lines changed

9 files changed

+677
-525
lines changed

csp-mixin/alerts/azure-alerts.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ groups:
99
labels:
1010
severity: critical
1111
service: 'Azure Virtual Machines'
12+
serviceId: 'microsoft.compute/virtualmachines'
1213
namespace: cloud-provider-azure
1314
annotations:
1415
summary: 'VM CPU utilization is too high.'
@@ -23,6 +24,7 @@ groups:
2324
labels:
2425
severity: critical
2526
service: 'Azure Virtual Machines'
27+
serviceId: 'microsoft.compute/virtualmachines'
2628
namespace: cloud-provider-azure
2729
annotations:
2830
summary: 'VM unavailable.'
@@ -51,6 +53,7 @@ groups:
5153
labels:
5254
severity: critical
5355
service: 'Azure SQL database'
56+
serviceId: 'microsoft.sql/servers/databases'
5457
namespace: cloud-provider-azure
5558
annotations:
5659
summary: 'High database Storage usage.'
@@ -65,6 +68,7 @@ groups:
6568
labels:
6669
severity: info
6770
service: 'Azure SQL database'
71+
serviceId: 'microsoft.sql/servers/databases'
6872
namespace: cloud-provider-azure
6973
annotations:
7074
summary: 'High database Deadlock count.'
@@ -79,6 +83,7 @@ groups:
7983
labels:
8084
severity: warning
8185
service: 'Azure SQL database'
86+
serviceId: 'microsoft.sql/servers/databases'
8287
namespace: cloud-provider-azure
8388
annotations:
8489
summary: 'High database User CPU usage.'
@@ -93,6 +98,7 @@ groups:
9398
labels:
9499
severity: warning
95100
service: 'Azure SQL database'
101+
serviceId: 'microsoft.sql/servers/databases'
96102
namespace: cloud-provider-azure
97103
annotations:
98104
summary: 'High number of database System Failed connections.'
@@ -107,6 +113,7 @@ groups:
107113
labels:
108114
severity: warning
109115
service: 'Azure SQL database'
116+
serviceId: 'microsoft.sql/servers/databases'
110117
namespace: cloud-provider-azure
111118
annotations:
112119
summary: 'High number of database User Failed connections.'
@@ -121,6 +128,7 @@ groups:
121128
labels:
122129
severity: critical
123130
service: 'Azure SQL database'
131+
serviceId: 'microsoft.sql/servers/databases'
124132
namespace: cloud-provider-azure
125133
annotations:
126134
summary: 'High database worker usage.'
@@ -135,6 +143,7 @@ groups:
135143
labels:
136144
severity: info
137145
service: 'Azure SQL database'
146+
serviceId: 'microsoft.sql/servers/databases'
138147
namespace: cloud-provider-azure
139148
annotations:
140149
summary: 'High database data IO usage.'
@@ -149,6 +158,7 @@ groups:
149158
labels:
150159
severity: critical
151160
service: 'Azure SQL database'
161+
serviceId: 'microsoft.sql/servers/databases'
152162
namespace: cloud-provider-azure
153163
annotations:
154164
summary: 'Low database tempdb log space.'

csp-mixin/alerts/gcp-alerts.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ groups:
99
labels:
1010
severity: critical
1111
service: 'Compute Engine'
12+
serviceId: 'compute'
1213
namespace: cloud-provider-gcp
1314
annotations:
1415
summary: 'VM CPU utilization is too high.'
@@ -23,6 +24,7 @@ groups:
2324
labels:
2425
severity: critical
2526
service: 'Compute Engine'
27+
serviceId: 'compute'
2628
namespace: cloud-provider-gcp
2729
annotations:
2830
summary: 'VM IO latency is too high.'
@@ -37,6 +39,7 @@ groups:
3739
labels:
3840
severity: critical
3941
service: 'Cloud SQL'
42+
serviceId: 'cloudsql'
4043
namespace: cloud-provider-gcp
4144
annotations:
4245
summary: 'Database CPU utilization is too high.'
@@ -51,6 +54,7 @@ groups:
5154
labels:
5255
severity: critical
5356
service: 'Cloud SQL'
57+
serviceId: 'cloudsql'
5458
namespace: cloud-provider-gcp
5559
annotations:
5660
summary: 'Database memory utilization is too high.'
@@ -65,6 +69,7 @@ groups:
6569
labels:
6670
severity: critical
6771
service: 'Cloud SQL'
72+
serviceId: 'cloudsql'
6873
namespace: cloud-provider-gcp
6974
annotations:
7075
summary: 'Database disk utilization is too high.'
@@ -79,6 +84,7 @@ groups:
7984
labels:
8085
severity: critical
8186
service: 'Cloud SQL'
87+
serviceId: 'cloudsql'
8288
namespace: cloud-provider-gcp
8389
annotations:
8490
summary: 'Too many database active connections.'
@@ -93,6 +99,7 @@ groups:
9399
labels:
94100
severity: critical
95101
service: 'Cloud SQL'
102+
serviceId: 'cloudsql'
96103
namespace: cloud-provider-gcp
97104
annotations:
98105
summary: 'More than 5 MySQL failed connections in 5 minutes.'
@@ -107,6 +114,7 @@ groups:
107114
labels:
108115
severity: warning
109116
service: 'Cloud SQL'
117+
serviceId: 'cloudsql'
110118
namespace: cloud-provider-gcp
111119
annotations:
112120
summary: 'More than 5 seconds lag between database read replica and primary.'
@@ -121,6 +129,7 @@ groups:
121129
labels:
122130
severity: warning
123131
service: 'Pub/Sub'
132+
serviceId: 'pubsub'
124133
namespace: cloud-provider-gcp
125134
annotations:
126135
summary: 'More than 1000 unacknowledged messages for a PubSub subscription.'
@@ -135,6 +144,7 @@ groups:
135144
labels:
136145
severity: warning
137146
service: 'Pub/Sub'
147+
serviceId: 'pubsub'
138148
namespace: cloud-provider-gcp
139149
annotations:
140150
summary: 'Unacknowledged messages for more than 60 seconds for a PubSub subscription.'

csp-mixin/dashboards.libsonnet

Lines changed: 82 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@ local commonlib = import 'common-lib/common/main.libsonnet';
1919
for v in variables
2020
])
2121
+ g.dashboard.withPanels(
22-
g.util.grid.wrapPanels(
23-
csplib.grafana.rows.overview
24-
+ csplib.grafana.rows.api
25-
+ csplib.grafana.rows.network,
22+
g.util.panel.resolveCollapsedFlagOnRows(
23+
g.util.grid.wrapPanels(
24+
[
25+
csplib.grafana.rows.overview,
26+
csplib.grafana.rows.api,
27+
csplib.grafana.rows.network,
28+
]
29+
)
2630
)
2731
),
2832
}
@@ -46,10 +50,14 @@ local commonlib = import 'common-lib/common/main.libsonnet';
4650
for v in variables
4751
])
4852
+ g.dashboard.withPanels(
49-
g.util.grid.wrapPanels(
50-
csplib.grafana.rows.glb_requests
51-
+ csplib.grafana.rows.glb_latency
52-
+ csplib.grafana.rows.glb_traffic_metrics
53+
g.util.panel.resolveCollapsedFlagOnRows(
54+
g.util.grid.wrapPanels(
55+
[
56+
csplib.grafana.rows.glb_requests,
57+
csplib.grafana.rows.glb_latency,
58+
csplib.grafana.rows.glb_traffic_metrics,
59+
]
60+
)
5361
)
5462
),
5563
[csplib.config.uid + '-computeengine.json']:
@@ -62,9 +70,13 @@ local commonlib = import 'common-lib/common/main.libsonnet';
6270
+ g.dashboard.timepicker.withTimeOptions(csplib.config.dashboardPeriod)
6371
+ g.dashboard.withVariables(variables)
6472
+ g.dashboard.withPanels(
65-
g.util.grid.wrapPanels(
66-
csplib.grafana.rows.gce_overview +
67-
csplib.grafana.rows.gce_instance
73+
g.util.panel.resolveCollapsedFlagOnRows(
74+
g.util.grid.wrapPanels(
75+
[
76+
csplib.grafana.rows.gce_overview,
77+
csplib.grafana.rows.gce_instance,
78+
]
79+
)
6880
)
6981
),
7082

@@ -83,10 +95,14 @@ local commonlib = import 'common-lib/common/main.libsonnet';
8395
for v in variables
8496
])
8597
+ g.dashboard.withPanels(
86-
g.util.grid.wrapPanels(
87-
csplib.grafana.rows.gcpvpc_overview +
88-
csplib.grafana.rows.gcpvpc_service +
89-
csplib.grafana.rows.gcpvpc_tunnel,
98+
g.util.panel.resolveCollapsedFlagOnRows(
99+
g.util.grid.wrapPanels(
100+
[
101+
csplib.grafana.rows.gcpvpc_overview,
102+
csplib.grafana.rows.gcpvpc_service,
103+
csplib.grafana.rows.gcpvpc_tunnel,
104+
]
105+
)
90106
),
91107
),
92108
} else {}
@@ -103,9 +119,13 @@ local commonlib = import 'common-lib/common/main.libsonnet';
103119
+ g.dashboard.timepicker.withTimeOptions(csplib.config.dashboardPeriod)
104120
+ g.dashboard.withVariables(variables)
105121
+ g.dashboard.withPanels(
106-
g.util.grid.wrapPanels(
107-
csplib.grafana.rows.aep_storage +
108-
csplib.grafana.rows.aep_resources
122+
g.util.panel.resolveCollapsedFlagOnRows(
123+
g.util.grid.wrapPanels(
124+
[
125+
csplib.grafana.rows.aep_storage,
126+
csplib.grafana.rows.aep_resources,
127+
]
128+
)
109129
)
110130
),
111131

@@ -119,9 +139,13 @@ local commonlib = import 'common-lib/common/main.libsonnet';
119139
+ g.dashboard.timepicker.withTimeOptions(csplib.config.dashboardPeriod)
120140
+ g.dashboard.withVariables(variables)
121141
+ g.dashboard.withPanels(
122-
g.util.grid.wrapPanels(
123-
csplib.grafana.rows.asql_connections +
124-
csplib.grafana.rows.asql_resources
142+
g.util.panel.resolveCollapsedFlagOnRows(
143+
g.util.grid.wrapPanels(
144+
[
145+
csplib.grafana.rows.asql_connections,
146+
csplib.grafana.rows.asql_resources,
147+
]
148+
)
125149
)
126150
),
127151

@@ -135,10 +159,14 @@ local commonlib = import 'common-lib/common/main.libsonnet';
135159
+ g.dashboard.timepicker.withTimeOptions(csplib.config.dashboardPeriod)
136160
+ g.dashboard.withVariables(variables)
137161
+ g.dashboard.withPanels(
138-
g.util.grid.wrapPanels(
139-
csplib.grafana.rows.alb_summary +
140-
csplib.grafana.rows.alb_details +
141-
csplib.grafana.rows.alb_loadbalancers
162+
g.util.panel.resolveCollapsedFlagOnRows(
163+
g.util.grid.wrapPanels(
164+
[
165+
csplib.grafana.rows.alb_summary,
166+
csplib.grafana.rows.alb_details,
167+
csplib.grafana.rows.alb_loadbalancers,
168+
]
169+
)
142170
)
143171
),
144172

@@ -153,9 +181,13 @@ local commonlib = import 'common-lib/common/main.libsonnet';
153181
+ g.dashboard.withVariables(variables)
154182
+ g.dashboard.withPanels(
155183
g.util.grid.wrapPanels(
156-
csplib.grafana.rows.vn_overview +
157-
csplib.grafana.rows.vn_bytes +
158-
csplib.grafana.rows.vn_packets
184+
g.util.panel.resolveCollapsedFlagOnRows(
185+
[
186+
csplib.grafana.rows.vn_overview,
187+
csplib.grafana.rows.vn_bytes,
188+
csplib.grafana.rows.vn_packets,
189+
]
190+
)
159191
)
160192
),
161193

@@ -178,9 +210,13 @@ local commonlib = import 'common-lib/common/main.libsonnet';
178210
for v in variables
179211
])
180212
+ g.dashboard.withPanels(
181-
g.util.grid.wrapPanels(
182-
csplib.grafana.rows.avm_overview +
183-
csplib.grafana.rows.avm_instance
213+
g.util.panel.resolveCollapsedFlagOnRows(
214+
g.util.grid.wrapPanels(
215+
[
216+
csplib.grafana.rows.avm_overview,
217+
csplib.grafana.rows.avm_instance,
218+
]
219+
)
184220
)
185221
),
186222

@@ -203,9 +239,13 @@ local commonlib = import 'common-lib/common/main.libsonnet';
203239
for v in variables
204240
])
205241
+ g.dashboard.withPanels(
206-
g.util.grid.wrapPanels(
207-
csplib.grafana.rows.afd_overview
208-
+ csplib.grafana.rows.afd_endpoints,
242+
g.util.panel.resolveCollapsedFlagOnRows(
243+
g.util.grid.wrapPanels(
244+
[
245+
csplib.grafana.rows.afd_overview,
246+
csplib.grafana.rows.afd_endpoints,
247+
]
248+
)
209249
)
210250
),
211251

@@ -219,10 +259,14 @@ local commonlib = import 'common-lib/common/main.libsonnet';
219259
+ g.dashboard.timepicker.withTimeOptions(csplib.config.dashboardPeriod)
220260
+ g.dashboard.withVariables(variables)
221261
+ g.dashboard.withPanels(
222-
g.util.grid.wrapPanels(
223-
csplib.grafana.rows.azqueuestore_overview
224-
+ csplib.grafana.rows.azqueuestore_api
225-
+ csplib.grafana.rows.azqueuestore_network,
262+
g.util.panel.resolveCollapsedFlagOnRows(
263+
g.util.grid.wrapPanels(
264+
[
265+
csplib.grafana.rows.azqueuestore_overview,
266+
csplib.grafana.rows.azqueuestore_api,
267+
csplib.grafana.rows.azqueuestore_network,
268+
]
269+
)
226270
)
227271
),
228272
} else {},

csp-mixin/panels/azureelasticpool.libsonnet

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -69,21 +69,13 @@ local commonlib = import 'common-lib/common/main.libsonnet';
6969
]),
7070

7171
aep_storage:
72-
this.signals.azureelasticpool.storageAllocTbl.common
72+
this.signals.azureelasticpool.storageAllocTbl.common(type='table')
7373
+ commonlib.panels.generic.table.base.new(
7474
'Elastic pool storage',
7575
[
76-
this.signals.azureelasticpool.storageAllocTbl.asTarget()
77-
+ g.query.prometheus.withFormat('table')
78-
+ g.query.prometheus.withInstant(true),
79-
80-
this.signals.azureelasticpool.storageUsedTbl.asTarget()
81-
+ g.query.prometheus.withFormat('table')
82-
+ g.query.prometheus.withInstant(true),
83-
84-
this.signals.azureelasticpool.storageLimitTbl.asTarget()
85-
+ g.query.prometheus.withFormat('table')
86-
+ g.query.prometheus.withInstant(true),
76+
this.signals.azureelasticpool.storageAllocTbl.asTableTarget(),
77+
this.signals.azureelasticpool.storageUsedTbl.asTableTarget(),
78+
this.signals.azureelasticpool.storageLimitTbl.asTableTarget(),
8779
],
8880
'Storage overview per elasticpool.'
8981
) + self._aep_tableCommon(),

0 commit comments

Comments
 (0)