@@ -80,14 +80,14 @@ spec:
8080 be cleaned up or removed in order to provide a consistent customer experience.
8181
8282 - alert : CinderVolumeInDeleting
83- expr : ' openstack_cinder_volume_status == 7'
84- for : 10m
85- labels :
86- severity : warning
87- annotations :
88- summary : " [`{{`{{$labels.id}}`}}`] Volume in deleting state"
89- description : >
90- The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes"
83+ expr : ' openstack_cinder_volume_status == 7'
84+ for : 10m
85+ labels :
86+ severity : warning
87+ annotations :
88+ summary : " [`{{`{{$labels.id}}`}}`] Volume in deleting state"
89+ description : >
90+ The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes
9191
9292 - alert : CinderVolumeInCreating
9393 expr : ' openstack_cinder_volume_status == 0'
@@ -197,22 +197,22 @@ spec:
197197 as quickly as possible.
198198
199199 - alert : NovaInstanceInError
200- for : 15m
200+ for : 10m
201201 expr : |
202202 openstack_nova_server_status{status="ERROR"}
203203 labels :
204- severity : warning
204+ severity : critical
205205 annotations :
206206 summary : " [`{{`{{$labels.id}}`}}`] Instance in ERROR state"
207207 description : >
208- The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 15 minutes. It must
208+ The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 10 minutes. It must
209209 be cleaned up or removed in order to provide a consistent customer experience.
210210
211211 - alert : NovaInstanceInBuilding
212212 for : 15m
213213 expr : ' openstack_nova_server_status == 1'
214214 labels :
215- severity : warning
215+ severity : critical
216216 annotations :
217217 summary : " [`{{`{{$labels.id}}`}}`] Instance in BUILD state"
218218 description : >
@@ -222,7 +222,7 @@ spec:
222222 for : 15m
223223 expr : ' openstack_nova_server_status == 10'
224224 labels :
225- severity : warning
225+ severity : critical
226226 annotations :
227227 summary : " [`{{`{{$labels.id}}`}}`] Instance in RESIZE state"
228228 description : >
@@ -232,7 +232,7 @@ spec:
232232 for : 15m
233233 expr : ' openstack_nova_server_status == 13'
234234 labels :
235- severity : warning
235+ severity : critical
236236 annotations :
237237 summary : " [`{{`{{$labels.id}}`}}`] Instance in UNKNOWN state"
238238 description : >
@@ -242,21 +242,21 @@ spec:
242242 for : 15m
243243 expr : ' openstack_nova_server_status == 14'
244244 labels :
245- severity : warning
245+ severity : critical
246246 annotations :
247247 summary : " [`{{`{{$labels.id}}`}}`] Instance in VERIFY_RESIZE state"
248248 description : >
249249 The instance `{{`{{$labels.id}}`}}` has been in VERIFY_RESIZE state for over 15 minutes.
250250
251251 - alert : NovaInstanceInMIGRATING
252- for : 15m
252+ for : 30m
253253 expr : ' openstack_nova_server_status == 15'
254254 labels :
255- severity : warning
255+ severity : critical
256256 annotations :
257257 summary : " [`{{`{{$labels.id}}`}}`] Instance in MIGRATING state"
258258 description : >
259- The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 15 minutes.
259+ The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 30 minutes.
260260
261261 - alert : NovaFailureRisk
262262 for : 6h
@@ -272,7 +272,7 @@ spec:
272272 failures occur. Please ensure that adequate amount of infrastructure is assigned to this
273273 deployment to prevent this.
274274
275- - alert : NovaCapacity
275+ - alert : NovaCapacityNearFull
276276 for : 6h
277277 expr : |
278278 sum (
@@ -286,9 +286,59 @@ spec:
286286 ) * 100 > 75
287287 labels :
288288 severity : warning
289+ annotations :
290+ summary : " [nova] near full Capacity risk"
291+ description : >
292+ The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running
293+ out of capacity due to the timeline required to add new nodes. Please ensure that adequate
294+ amount of infrastructure is assigned to this deployment to prevent this.
295+
296+ - alert : NovaCapacityFull
297+ for : 6h
298+ expr : |
299+ sum (
300+ openstack_nova_memory_used_bytes
301+ + on(hostname) group_left(adminState)
302+ (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
303+ ) / sum (
304+ openstack_nova_memory_available_bytes
305+ + on(hostname) group_left(adminState)
306+ (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
307+ ) * 100 > 85
308+ labels :
309+ severity : critical
289310 annotations :
290311 summary : " [nova] Capacity risk"
291312 description : >
292313 The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running
293314 out of capacity due to the timeline required to add new nodes. Please ensure that adequate
294315 amount of infrastructure is assigned to this deployment to prevent this.
316+
317+ - name : octavia
318+ rules :
319+ - alert : LoadbalancerDown
320+ for : 5m
321+ expr : ' openstack_loadbalancer_up != 1'
322+ labels :
323+ severity : critical
324+ annotations :
325+ summary : " OpenStack loadbalancer service down"
326+ description : " OpenStack loadbalancer service down"
327+
328+ - alert : LoadbalancerNotActive
329+ for : 5m
330+ expr : openstack_loadbalancer_loadbalancer_status{provisioning_status!="ACTIVE"}
331+ labels :
332+ severity : critical
333+ annotations :
334+ summary : " OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
335+ description : " OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
336+
337+ - alert : LoadbalancerPoolNotActive
338+ for : 5m
339+ expr : openstack_loadbalancer_pool_status{provisioning_status!="ACTIVE"}
340+ labels :
341+ severity : critical
342+ annotations :
343+ summary : " OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
344+ description : " OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE"
0 commit comments