Skip to content

Commit 9fb56f1

Browse files
authored
Merge branch 'master' into jl-contrib-kafka
2 parents 8b14f5c + b42789a commit 9fb56f1

File tree

16 files changed

+347
-124
lines changed

16 files changed

+347
-124
lines changed

.github/workflows/build.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
cache-hit: ${{ steps.tools-cache.outputs.cache-hit }}
2020
steps:
2121
- name: Checkout
22-
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
22+
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
2323
with:
2424
fetch-depth: 0
2525
persist-credentials: false
@@ -45,7 +45,7 @@ jobs:
4545
runs-on: ubuntu-latest
4646
steps:
4747
- name: Checkout
48-
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
48+
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
4949
with:
5050
fetch-depth: 0
5151
persist-credentials: false
@@ -69,7 +69,7 @@ jobs:
6969
runs-on: ubuntu-latest
7070
steps:
7171
- name: Checkout
72-
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
72+
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
7373
with:
7474
fetch-depth: 0
7575
persist-credentials: false

.github/workflows/test-mixins.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
cache-hit: ${{ steps.tools-cache.outputs.cache-hit }}
1919
steps:
2020
- name: Checkout
21-
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
21+
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
2222
with:
2323
fetch-depth: 0
2424
persist-credentials: false
@@ -46,7 +46,7 @@ jobs:
4646
changed-mixins: ${{ steps.changed-mixins.outputs.all_changed_files }}
4747
steps:
4848
- name: Checkout
49-
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
49+
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
5050
with:
5151
fetch-depth: 0
5252
persist-credentials: false
@@ -91,7 +91,7 @@ jobs:
9191
GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin"
9292
steps:
9393
- name: Checkout
94-
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
94+
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
9595
with:
9696
fetch-depth: 0
9797
persist-credentials: false

csp-mixin/alerts/azure-alerts.yml

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,3 +199,50 @@ groups:
199199
summary: 'Azure VNet Peering Connection Issues.'
200200
description: 'The success rate of Virtual Network {{ $labels.resourceName }} peering connections has fallen below the defined threshold, indicating possible connectivity failures. Investigate potential misconfigurations, network latency, or service disruptions affecting peering communication.'
201201
dashboard_url: '/a/grafana-csp-app/azure/dashboards/vnet'
202+
203+
- name: azure.flexibleserverspostgresql
204+
rules:
205+
- alert: AzureFlexibleServerHighCpuUtilization
206+
expr: |
207+
azure_microsoft_dbforpostgresql_flexibleservers_cpu_percent_average_percent{resourceName!=""} > 90
208+
for: 5m
209+
keep_firing_for: 10m
210+
labels:
211+
severity: critical
212+
service: 'Azure PostgreSQL Flexible Servers'
213+
serviceId: 'microsoft.dbforpostgresql/flexibleservers'
214+
namespace: cloud-provider-azure
215+
annotations:
216+
summary: 'Flexible Server CPU utilization is too high.'
217+
description: 'The Flexible Server {{ $labels.resourceName }} is under heavy load and may become unresponsive.'
218+
dashboard_url: '/a/grafana-csp-app/azure/dashboards/postgresql-flexible-servers'
219+
220+
- alert: AzureFlexibleServerHighMemoryUsage
221+
expr: |
222+
azure_microsoft_dbforpostgresql_flexibleservers_memory_percent_average_percent{resourceName!=""} > 90
223+
for: 5m
224+
keep_firing_for: 10m
225+
labels:
226+
severity: critical
227+
service: 'Azure PostgreSQL Flexible Servers'
228+
serviceId: 'microsoft.dbforpostgresql/flexibleservers'
229+
namespace: cloud-provider-azure
230+
annotations:
231+
summary: 'Flexible Server Memory usage is too high.'
232+
description: 'The Flexible Server {{ $labels.resourceName }} is experiencing high memory usage, which may lead to performance degradation.'
233+
dashboard_url: '/a/grafana-csp-app/azure/dashboards/postgresql-flexible-servers'
234+
235+
- alert: AzureFlexibleServerHighStorageUsage
236+
expr: |
237+
azure_microsoft_dbforpostgresql_flexibleservers_storage_percent_maximum_percent{resourceName!=""} > 90
238+
for: 5m
239+
keep_firing_for: 10m
240+
labels:
241+
severity: critical
242+
service: 'Azure PostgreSQL Flexible Servers'
243+
serviceId: 'microsoft.dbforpostgresql/flexibleservers'
244+
namespace: cloud-provider-azure
245+
annotations:
246+
summary: 'Flexible Server Storage usage is too high.'
247+
description: 'The Flexible Server {{ $labels.resourceName }} is running low on storage space, which may impact database operations.'
248+
dashboard_url: '/a/grafana-csp-app/azure/dashboards/postgresql-flexible-servers'

csp-mixin/prometheus_rules_out/prometheus_alerts.yaml

Lines changed: 44 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

grafana-builder/grafana.libsonnet

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -508,35 +508,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
508508
native: template % { metricQuery: nativeClassicQuery.native, label: statusLabelName },
509509
classic: template % { metricQuery: nativeClassicQuery.classic, label: statusLabelName },
510510
},
511-
fieldConfig+: {
512-
defaults+: {
513-
custom+: {
514-
lineWidth: 0,
515-
fillOpacity: 100, // Get solid fill.
516-
stacking: {
517-
mode: 'normal',
518-
group: 'A',
519-
},
520-
},
521-
unit: 'reqps',
522-
min: 0,
523-
},
524-
overrides+: [{
525-
matcher: {
526-
id: 'byName',
527-
options: status,
528-
},
529-
properties: [
530-
{
531-
id: 'color',
532-
value: {
533-
mode: 'fixed',
534-
fixedColor: $.httpStatusColors[status],
535-
},
536-
},
537-
],
538-
} for status in std.objectFieldsAll($.httpStatusColors)],
539-
},
511+
aliasColors: $.httpStatusColors,
540512
targets: [
541513
{
542514
expr: utils.showClassicHistogramQuery(sumByStatus(utils.ncHistogramCountRate(metricName, selector))),

openstack-mixin/alerts.libsonnet

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,13 @@
6262
},
6363
},
6464
{
65-
alert: 'OpenStackPlacementHighMemoryUsageWarning',
65+
alert: 'OpenStackPlacementHighMemoryUsage',
6666
expr: |||
67-
100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"})
67+
(100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"})
6868
/
69-
(sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0)
69+
(openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}
70+
*
71+
openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0)
7072
> %(alertsWarningPlacementHighMemoryUsage)s
7173
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
7274
'for': '5m',
@@ -75,19 +77,21 @@
7577
severity: 'warning',
7678
},
7779
annotations: {
78-
summary: 'OpenStack is using a significant percentage of its allocated memory.',
80+
summary: 'OpenStack node is using a significant percentage of its allocated memory.',
7981
description: |||
80-
OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated memory,
82+
OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated memory,
8183
which is above the threshold of %(alertsWarningPlacementHighMemoryUsage)s percent.
82-
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
84+
||| % this.config { nodeLabel: this.config.nodeLabel },
8385
},
8486
},
8587
{
86-
alert: 'OpenStackNovaAgentDown',
88+
alert: 'OpenStackPlacementHighMemoryUsage',
8789
expr: |||
88-
100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"})
90+
(100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"})
8991
/
90-
(sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0)
92+
(openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}
93+
*
94+
openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0)
9195
> %(alertsCriticalPlacementHighMemoryUsage)s
9296
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
9397
'for': '5m',
@@ -96,19 +100,21 @@
96100
severity: 'critical',
97101
},
98102
annotations: {
99-
summary: 'OpenStack is using a large percentage of its allocated memory, consider allocating more resources.',
103+
summary: 'OpenStack node is using a large percentage of its allocated memory, consider allocating more resources.',
100104
description: |||
101-
OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated memory,
105+
OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated memory,
102106
which is above the threshold of %(alertsCriticalPlacementHighMemoryUsage)s percent.
103-
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
107+
||| % this.config { nodeLabel: this.config.nodeLabel },
104108
},
105109
},
106110
{
107-
alert: 'OpenStackPlacementHighVCPUUsageWarning',
111+
alert: 'OpenStackPlacementHighVCPUUsage',
108112
expr: |||
109-
100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"})
113+
(100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"})
110114
/
111-
(sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}) > 0)
115+
(openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}
116+
*
117+
openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="VCPU"}) > 0)
112118
> %(alertsWarningPlacementHighVCPUUsage)s
113119
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
114120
'for': '5m',
@@ -117,20 +123,21 @@
117123
severity: 'warning',
118124
},
119125
annotations: {
120-
summary: 'OpenStack is using a significant percentage of its allocated vCPU.',
126+
summary: 'OpenStack node is using a significant percentage of its allocated vCPU.',
121127
description: |||
122-
OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU,
128+
OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU,
123129
which is above the threshold of %(alertsWarningPlacementHighVCPUUsage)s percent.
124-
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
130+
||| % this.config { nodeLabel: this.config.nodeLabel },
125131
},
126132
},
127133
{
128-
alert: 'OpenStackPlacementHighVCPUUsageCritical',
129-
134+
alert: 'OpenStackPlacementHighVCPUUsage',
130135
expr: |||
131-
100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"})
136+
(100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"})
132137
/
133-
(sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}) > 0)
138+
(openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}
139+
*
140+
openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="VCPU"}) > 0)
134141
> %(alertsCriticalPlacementHighVCPUUsage)s
135142
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
136143
'for': '5m',
@@ -139,15 +146,15 @@
139146
severity: 'critical',
140147
},
141148
annotations: {
142-
summary: 'OpenStack is using a large percentage of its allocated vCPU, consider allocating more resources.',
149+
summary: 'OpenStack node is using a large percentage of its allocated vCPU, consider allocating more resources.',
143150
description: |||
144-
OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU,
151+
OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU,
145152
which is above the threshold of %(alertsCriticalPlacementHighVCPUUsage)s percent.
146-
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
153+
||| % this.config { nodeLabel: this.config.nodeLabel },
147154
},
148155
},
149156
{
150-
alert: 'OpenStackNeutronHighIPsUsageWarning',
157+
alert: 'OpenStackNeutronHighIPsUsage',
151158
expr: |||
152159
100 *
153160
sum by (%(agg)s, network_name) (openstack_neutron_network_ip_availabilities_used{%(filteringSelector)s, network_name=~"%(alertsIPutilizationNetworksMatcher)s"})
@@ -171,7 +178,7 @@
171178
},
172179
},
173180
{
174-
alert: 'OpenStackNeutronHighIPsUsageCritical',
181+
alert: 'OpenStackNeutronHighIPsUsage',
175182
expr: |||
176183
100 *
177184
sum by (%(agg)s, network_name) (openstack_neutron_network_ip_availabilities_used{%(filteringSelector)s, network_name=~"%(alertsIPutilizationNetworksMatcher)s"})
@@ -265,6 +272,26 @@
265272
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
266273
},
267274
},
275+
{
276+
alert: 'OpenStackNovaTooManyVMsNotRunning',
277+
expr: |||
278+
count by (%(agg)s, hypervisor_hostname, availability_zone) (openstack_nova_server_status{%(filteringSelector)s, status=~"SHUTOFF|ERROR", hypervisor_hostname!=""})/
279+
(count by (%(agg)s, hypervisor_hostname, availability_zone) (openstack_nova_server_status{%(filteringSelector)s}) > %(alertsCriticalVMsNotRunningInstanceMin)s) * 100 > %(alertsCriticalVMsNotRunningPercent)s
280+
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
281+
'for': '15m',
282+
labels: {
283+
severity: 'critical',
284+
},
285+
annotations: {
286+
summary: 'Too many VMs are in SHUTOFF or ERROR states on the single hypervisor.',
287+
description: |||
288+
There are too many VMs in `SHUTOFF` or `ERROR` states on the hypervisor {{ $labels.hypervisor_hostname }}: {{ printf "%%.0f" $value }} percent,
289+
which is above the threshold of %(alertsCriticalVMsNotRunningPercent)s percent.
290+
291+
Please check if the hypervisor was rebooted and if instances need to be started manually.
292+
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
293+
},
294+
},
268295
],
269296
},
270297
{

openstack-mixin/config.libsonnet

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
groupLabels: ['job'],
44
// instance of openstack cluster
55
instanceLabels: ['instance'],
6-
6+
nodeLabel: 'hostname',
77
uid: 'openstack',
88
dashboardTags: [self.uid],
99
dashboardPeriod: 'now-30m',
@@ -23,7 +23,10 @@
2323
alertsWarningCinderHighBackupMemoryUsage: 80, // %
2424
alertsWarningCinderHighVolumeMemoryUsage: 80, // %
2525
alertsWarningCinderHighPoolCapacityUsage: 80, // %
26-
26+
// alert when more than this percent of the VMs on a single hypervisor are not running (SHUTOFF or ERROR),
27+
// provided that hypervisor hosts more than this number of instances in total.
28+
alertsCriticalVMsNotRunningPercent: 75, // %
29+
alertsCriticalVMsNotRunningInstanceMin: 10,
2730

2831
// regex to match network names where we should track IP address utilization:
2932
alertsIPutilizationNetworksMatcher: '.+',

openstack-mixin/dashboards_out/cinder

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,7 @@
553553
"label": "Job",
554554
"multi": true,
555555
"name": "job",
556-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\"}, job)",
556+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\"}, job)",
557557
"refresh": 2,
558558
"sort": 1,
559559
"type": "query"
@@ -568,7 +568,7 @@
568568
"label": "Instance",
569569
"multi": true,
570570
"name": "instance",
571-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\",job=~\"$job\"}, instance)",
571+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\",job=~\"$job\"}, instance)",
572572
"refresh": 2,
573573
"sort": 1,
574574
"type": "query"

0 commit comments

Comments
 (0)