Skip to content
Merged
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 128 additions & 2 deletions csp-mixin/alerts/azure-alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ groups:
service: 'Azure Virtual Machines'
namespace: cloud-provider-azure
annotations:
summary: 'CPU utilization is too high.'
summary: 'VM CPU utilization is too high.'
description: 'The VM {{ $labels.resourceName }} is under heavy load and may become unresponsive.'
dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e'

Expand All @@ -22,9 +22,135 @@ groups:
keep_firing_for: 10m
labels:
severity: critical
service: 'Azure Virtual Machines.'
service: 'Azure Virtual Machines'
namespace: cloud-provider-azure
annotations:
summary: 'VM unavailable.'
description: 'The VM {{ $labels.resourceName }} is not functioning or crashed, which may require immediate action.'
dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e'

# AzureDatabaseHighDtuConsumption
# Condition: the average of the DTU-consumption-percent metric, grouped by
# job, resourceGroup, subscriptionName and resourceName, is above 90.
# The =~".+" matchers only select series where each of those labels is non-empty.
# Timing: the condition must hold for 10m before the alert fires, and the alert
# keeps firing for 10m after the condition clears (keep_firing_for).
# Routing: severity=critical; dashboard_uid is the same value used by the other
# Azure SQL database rules in this file.
- alert: AzureDatabaseHighDtuConsumption
expr: |
avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_dtu_consumption_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90
for: 10m
keep_firing_for: 10m
labels:
severity: critical
service: 'Azure SQL database'
namespace: cloud-provider-azure
annotations:
summary: 'High database DTU consumption.'
description: 'Check active queries and optimize indexes or consider scaling up DTUs to handle load in {{ $labels.resourceName }} database.'
dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'

# AzureDatabaseHighStorageUsage
# Condition: the average of the storage-percent (maximum aggregation) metric,
# grouped by job, resourceGroup, subscriptionName and resourceName, is above 95.
# The =~".+" matchers only select series where each of those labels is non-empty.
# Timing: must hold for 15m before firing; keeps firing for 10m after clearing.
# Routing: severity=critical.
- alert: AzureDatabaseHighStorageUsage
expr: |
avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_storage_percent_maximum_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 95
for: 15m
keep_firing_for: 10m
labels:
severity: critical
service: 'Azure SQL database'
namespace: cloud-provider-azure
annotations:
summary: 'High database Storage usage.'
description: 'Archive or delete old data, or scale up storage capacity in {{ $labels.resourceName }} database.'
dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'

# AzureDatabaseHighDeadlockCount
# Condition: the summed 5m per-second rate() of the deadlock total-count metric,
# grouped by job, resourceGroup, subscriptionName and resourceName, is above 5.
# Timing: must hold for 10m before firing; keeps firing for 10m after clearing.
# Routing: severity=info (the lowest severity used in this file).
# NOTE(review): rate() is intended for monotonically increasing counters. Confirm
# that the exporter publishes this metric as a counter and not as a per-interval
# gauge; on a gauge, every decrease is treated as a counter reset.
# NOTE(review): the threshold is compared against a per-second rate, not a raw
# count - confirm "> 5" is the intended unit.
- alert: AzureDatabaseHighDeadlockCount
expr: |
sum by (job,resourceGroup,subscriptionName,resourceName) (rate(azure_microsoft_sql_servers_databases_deadlock_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}[5m])) > 5
for: 10m
keep_firing_for: 10m
labels:
severity: info
service: 'Azure SQL database'
namespace: cloud-provider-azure
annotations:
summary: 'High database Deadlock count.'
description: 'Check {{ $labels.resourceName }} database logs for deadlock details and optimize affected queries.'
dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'

# AzureDatabaseHighUserCpuUsage
# Condition: the average of the cpu_percent metric, grouped by job,
# resourceGroup, subscriptionName and resourceName, is above 90.
# Timing: must hold for 10m before firing; keeps firing for 10m after clearing.
# Routing: severity=warning.
# NOTE(review): the alert name and summary say "User CPU", but the selected
# metric is the generic cpu_percent series; nothing in the expression restricts
# it to user-mode CPU - confirm the naming is intended.
- alert: AzureDatabaseHighUserCpuUsage
expr: |
avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_cpu_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90
for: 10m
keep_firing_for: 10m
labels:
severity: warning
service: 'Azure SQL database'
namespace: cloud-provider-azure
annotations:
summary: 'High database User CPU usage.'
description: 'Identify high CPU queries on {{ $labels.resourceName }} database and optimize them.'
dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'

# AzureDatabaseHighSystemFailedConnections
# Condition: the sum of the connection_failed total-count metric, grouped by
# job, resourceGroup, subscriptionName and resourceName, is above 10.
# Timing: must hold for 5m before firing; keeps firing for 10m after clearing.
# Routing: severity=warning.
# NOTE(review): this rule compares the raw metric value, whereas the deadlock
# rule in this file wraps a similarly named *_total_count metric in rate().
# Confirm which treatment matches how the exporter publishes these series.
- alert: AzureDatabaseHighSystemFailedConnections
expr: |
sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_connection_failed_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 10
for: 5m
keep_firing_for: 10m
labels:
severity: warning
service: 'Azure SQL database'
namespace: cloud-provider-azure
annotations:
summary: 'High number of database System Failed connections.'
description: 'Check network problems, firewall restrictions or high resource consumption affecting application access to the {{ $labels.resourceName }} database.'
dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'

# AzureDatabaseHighUserFailedConnections
# Condition: the sum of the connection_failed_user_error total-count metric,
# grouped by job, resourceGroup, subscriptionName and resourceName, is above 10.
# Timing: must hold for 15m before firing (longer than the 5m used by the
# system-failed-connections rule); keeps firing for 10m after clearing.
# Routing: severity=warning.
# NOTE(review): compares the raw metric value rather than a rate(); confirm this
# matches how the exporter publishes the series.
- alert: AzureDatabaseHighUserFailedConnections
expr: |
sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_connection_failed_user_error_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 10
for: 15m
keep_firing_for: 10m
labels:
severity: warning
service: 'Azure SQL database'
namespace: cloud-provider-azure
annotations:
summary: 'High number of database User Failed connections.'
description: 'Check for authentication problems, network configuration errors, firewall issues, or resource constraints, affecting database accessibility for users on database {{ $labels.resourceName }}.'
dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'

# AzureDatabaseHighWorkerUsage
# Condition: the average of the workers_percent metric, grouped by job,
# resourceGroup, subscriptionName and resourceName, is above 60.
# Timing: must hold for 5m before firing; keeps firing for 10m after clearing.
# Routing: severity=critical.
# NOTE(review): this rule is critical at a threshold of 60, while the CPU rule
# in this file is only warning at 90 - confirm the threshold/severity pairing.
# NOTE(review): the description reads "deadlocks into the ... database";
# "in the" is probably what was meant.
- alert: AzureDatabaseHighWorkerUsage
expr: |
avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_workers_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 60
for: 5m
keep_firing_for: 10m
labels:
severity: critical
service: 'Azure SQL database'
namespace: cloud-provider-azure
annotations:
summary: 'High database worker usage.'
description: 'Look for long execution queries, review the number of concurrent queries and requests being sent to the database or check if there are any blocking sessions or deadlocks into the {{ $labels.resourceName }} database.'
dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'

# AzureDatabaseHighDataIoUsage
# Condition: the average of the physical_data_read_percent metric, grouped by
# job, resourceGroup, subscriptionName and resourceName, is above 90.
# Timing: must hold for 15m before firing; keeps firing for 10m after clearing.
# Routing: severity=info.
# NOTE(review): the selected metric name only mentions physical data *read*,
# while the description also talks about write activity - confirm whether the
# metric covers writes or the description should be narrowed.
- alert: AzureDatabaseHighDataIoUsage
expr: |
avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_physical_data_read_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90
for: 15m
keep_firing_for: 10m
labels:
severity: info
service: 'Azure SQL database'
namespace: cloud-provider-azure
annotations:
summary: 'High database data IO usage.'
description: 'Review queries with high read or write activity, check if there are missing indexes or inefficient indexes that result in full table scans and assess the volume of transactions into the {{ $labels.resourceName }} database.'
dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'

# AzureDatabaseLowTempdbLogSpace
# Condition: the average of the tempdb_log_used_percent metric, grouped by job,
# resourceGroup, subscriptionName and resourceName, is above 60.
# The rule is named for "low space" but is expressed on the *used* percentage,
# so it fires when usage is high (above 60), not when a free-space value is low.
# Timing: must hold for 5m before firing; keeps firing for 10m after clearing.
# Routing: severity=critical.
# NOTE(review): critical at 60 percent used after only 5m is the most sensitive
# pairing in this file - confirm the threshold is intended.
- alert: AzureDatabaseLowTempdbLogSpace
expr: |
avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_tempdb_log_used_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 60
for: 5m
keep_firing_for: 10m
labels:
severity: critical
service: 'Azure SQL database'
namespace: cloud-provider-azure
annotations:
summary: 'Low database tempdb log space.'
description: 'Look for active sessions that might be using TempDB intensively, identify stored procedures or queries that create temporary tables or objects, and also look for long-running or memory-intensive queries that rely heavily on TempDB into the {{ $labels.resourceName }} database.'
dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'
Loading