diff --git a/csp-mixin/.lint b/csp-mixin/.lint index 1d76545b9..3652e0e64 100644 --- a/csp-mixin/.lint +++ b/csp-mixin/.lint @@ -71,3 +71,9 @@ exclusions: panel-title-description-rule: entries: - dashboard: Azure Service Bus + - dashboard: GCP Compute Engine + - dashboard: Azure Virtual Machines + target-rate-interval-rule: + entries: + - dashboard: GCP Compute Engine + diff --git a/csp-mixin/alerts.libsonnet b/csp-mixin/alerts.libsonnet deleted file mode 100644 index b207ecef7..000000000 --- a/csp-mixin/alerts.libsonnet +++ /dev/null @@ -1,52 +0,0 @@ -{ - new(this): { - groups+: [ - { - name: this.config.uid, - rules: - [ - { - alert: 'AzureVMHighCpuUtilization', - expr: 'avg by (%s) (%s) > 85' % - [ - std.join(',', this.config.groupLabels + this.config.instanceLabels), - this.signals.azurevm.cpuUtilization.asRuleExpression(), - ], - 'for': '5m', - keep_firing_for: '10m', - labels: { - severity: 'critical', - service: 'Azure Virtual Machines', - namespace: 'cloud-provider-' + this.config.uid, - }, - annotations: { - summary: 'CPU utilization is too high', - description: 'The VM {{$labels.resourceName}} is under heavy load and may become unresponsive.', - dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e', - }, - }, - { - alert: 'AzureVMUnavailable', - expr: 'avg by (%s) (%s) != 1' % - [ - std.join(',', this.config.groupLabels + this.config.instanceLabels), - this.signals.azurevmOverview.vmAvailability.asRuleExpression(), - ], - 'for': '5m', - keep_firing_for: '10m', - labels: { - severity: 'critical', - service: 'Azure Virtual Machines', - namespace: 'cloud-provider-' + this.config.uid, - }, - annotations: { - summary: 'VM unavailable', - description: 'The VM {{$labels.resourceName}} is not functioning or crashed, which may require immediate action.', - dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e', - }, - }, - ], - }, - ], - }, -} diff --git a/csp-mixin/alerts/azure-alerts.yml b/csp-mixin/alerts/azure-alerts.yml new file mode 100644 index 000000000..ed3d73b04 
--- /dev/null +++ b/csp-mixin/alerts/azure-alerts.yml @@ -0,0 +1,30 @@ +groups: + - name: azure + rules: + - alert: AzureVMHighCpuUtilization + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_compute_virtualmachines_percentage_cpu_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 85 + for: 5m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure Virtual Machines' + namespace: cloud-provider-azure + annotations: + summary: 'CPU utilization is too high.' + description: 'The VM {{ $labels.resourceName }} is under heavy load and may become unresponsive.' + dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e' + + - alert: AzureVMUnavailable + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_compute_virtualmachines_vmavailabilitymetric_average_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) != 1 + for: 5m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure Virtual Machines' + namespace: cloud-provider-azure + annotations: + summary: 'VM unavailable.' + description: 'The VM {{ $labels.resourceName }} is not functioning or crashed, which may require immediate action.' + dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e' diff --git a/csp-mixin/alerts/gcp-alerts.yml b/csp-mixin/alerts/gcp-alerts.yml new file mode 100644 index 000000000..494e540fc --- /dev/null +++ b/csp-mixin/alerts/gcp-alerts.yml @@ -0,0 +1,16 @@ +groups: + - name: gcp + rules: + - alert: GcpCEHighCpuUtilization + expr: | + 100 * avg by (job,project_id,instance_name) (stackdriver_gce_instance_compute_googleapis_com_instance_cpu_utilization{job=~".+",project_id=~".+",instance_name=~".+"}) > 85 + for: 5m + keep_firing_for: 10m + labels: + severity: critical + service: 'Compute Engine' + namespace: cloud-provider-gcp + annotations: + summary: 'CPU utilization is too high.' 
+ description: 'The VM {{ $labels.instance_name }} is under heavy load and may become unresponsive.' + dashboard_uid: 'f115fe73641347c43415535d77e2dc0f' diff --git a/csp-mixin/azureconfig.libsonnet b/csp-mixin/azureconfig.libsonnet index b16f3ec3b..510fbb11b 100644 --- a/csp-mixin/azureconfig.libsonnet +++ b/csp-mixin/azureconfig.libsonnet @@ -19,5 +19,13 @@ groupLabels: ['job', 'resourceGroup', 'subscriptionName'], instanceLabels: ['resourceName'], metricsSource: 'azuremonitor', + + local importRules(rules) = { + groups+: std.parseYaml(rules).groups, + }, + + prometheus: { + alerts: importRules(importstr 'alerts/azure-alerts.yml'), + }, }, } diff --git a/csp-mixin/gcpconfig.libsonnet b/csp-mixin/gcpconfig.libsonnet index fac342add..14084823d 100644 --- a/csp-mixin/gcpconfig.libsonnet +++ b/csp-mixin/gcpconfig.libsonnet @@ -19,5 +19,12 @@ groupLabels: ['job'], instanceLabels: ['bucket_name'], metricsSource: 'stackdriver', + local importRules(rules) = { + groups+: std.parseYaml(rules).groups, + }, + + prometheus: { + alerts: importRules(importstr 'alerts/gcp-alerts.yml'), + }, }, } diff --git a/csp-mixin/main.libsonnet b/csp-mixin/main.libsonnet index e5c247a62..c23155217 100644 --- a/csp-mixin/main.libsonnet +++ b/csp-mixin/main.libsonnet @@ -15,7 +15,7 @@ local commonlib = import 'common-lib/common/main.libsonnet'; dashboards: (import './dashboards.libsonnet').new(this), }, prometheus: { - alerts: (import './alerts.libsonnet').new(this), + alerts: this.config.prometheus.alerts, recordingRules: {}, }, asMonitoringMixin(): {