# Patch for csp-mixin/alerts/azure-alerts.yml (Prometheus alerting rules for Azure).
#
# This text is preamble placed ahead of the `diff --git` header; the patch
# payload below is unchanged.
#
# Hunk 1 (@@ -11,7 +11,7 @@): rewords the summary annotation of an existing
#   VM alert from 'CPU utilization is too high.' to
#   'VM CPU utilization is too high.'.
# Hunk 2 (@@ -22,9 +22,135 @@): removes the trailing period from the label
#   service: 'Azure Virtual Machines.' and appends nine Azure SQL database
#   alerts. All nine share keep_firing_for: 10m, service 'Azure SQL database',
#   namespace cloud-provider-azure and dashboard_uid
#   '82c5b6cf30db5b601c5cc3f5d8d4284d', and aggregate by
#   (job,resourceGroup,subscriptionName,resourceName):
#
#   alert                                      condition                                   for   severity
#   AzureDatabaseHighDtuConsumption            avg dtu_consumption_percent > 90            10m   critical
#   AzureDatabaseHighStorageUsage              avg storage_percent (maximum) > 95          15m   critical
#   AzureDatabaseHighDeadlockCount             sum rate(deadlock_total_count[5m]) > 5      10m   info
#   AzureDatabaseHighUserCpuUsage              avg cpu_percent > 90                        10m   warning
#   AzureDatabaseHighSystemFailedConnections   sum rate(connection_failed[5m]) > 10        5m    warning
#   AzureDatabaseHighUserFailedConnections     sum rate(connection_failed_user_error[5m]) > 10  15m  warning
#   AzureDatabaseHighWorkerUsage               avg workers_percent > 60                    5m    critical
#   AzureDatabaseHighDataIoUsage               avg physical_data_read_percent > 90         15m   info
#   AzureDatabaseLowTempdbLogSpace             avg tempdb_log_used_percent > 60            5m    critical
#
# NOTE(review): in this copy the patch's line breaks appear to have been
#   collapsed into spaces (file header, hunk headers and hunk bodies share
#   physical lines) -- confirm against the original patch before applying.
# NOTE(review): three rules apply rate() to *_total_count series; confirm the
#   exporter publishes these as monotonically increasing counters rather than
#   per-interval totals -- TODO verify.
# Do not add or remove lines inside a hunk without updating the @@ counts.
diff --git a/csp-mixin/alerts/azure-alerts.yml b/csp-mixin/alerts/azure-alerts.yml index ed3d73b04..058ee1f30 100644 --- a/csp-mixin/alerts/azure-alerts.yml +++ b/csp-mixin/alerts/azure-alerts.yml @@ -11,7 +11,7 @@ groups: service: 'Azure Virtual Machines' namespace: cloud-provider-azure annotations: - summary: 'CPU utilization is too high.' + summary: 'VM CPU utilization is too high.' description: 'The VM {{ $labels.resourceName }} is under heavy load and may become unresponsive.' dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e' @@ -22,9 +22,135 @@ groups: keep_firing_for: 10m labels: severity: critical - service: 'Azure Virtual Machines.' + service: 'Azure Virtual Machines' namespace: cloud-provider-azure annotations: summary: 'VM unavailable.' description: 'The VM {{ $labels.resourceName }} is not functioning or crashed, which may require immediate action.' dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e' + + - alert: AzureDatabaseHighDtuConsumption + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_dtu_consumption_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90 + for: 10m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High database DTU consumption.' + description: 'Check active queries and optimize indexes or consider scaling up DTUs to handle load in {{ $labels.resourceName }} database.' 
+ dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureDatabaseHighStorageUsage + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_storage_percent_maximum_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 95 + for: 15m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High database Storage usage.' + description: 'Archive or delete old data, or scale up storage capacity in {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureDatabaseHighDeadlockCount + expr: | + sum by (job,resourceGroup,subscriptionName,resourceName) (rate(azure_microsoft_sql_servers_databases_deadlock_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}[5m])) > 5 + for: 10m + keep_firing_for: 10m + labels: + severity: info + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High database Deadlock count.' + description: 'Check {{ $labels.resourceName }} database logs for deadlock details and optimize affected queries.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureDatabaseHighUserCpuUsage + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_cpu_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90 + for: 10m + keep_firing_for: 10m + labels: + severity: warning + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High database User CPU usage.' + description: 'Identify high CPU queries on {{ $labels.resourceName }} database and optimize them.' 
+ dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureDatabaseHighSystemFailedConnections + expr: | + sum by (job,resourceGroup,subscriptionName,resourceName) (rate(azure_microsoft_sql_servers_databases_connection_failed_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}[5m])) > 10 + for: 5m + keep_firing_for: 10m + labels: + severity: warning + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High number of database System Failed connections.' + description: 'Check network problems, firewall restrictions or high resource consumption affecting application access to the {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureDatabaseHighUserFailedConnections + expr: | + sum by (job,resourceGroup,subscriptionName,resourceName) (rate(azure_microsoft_sql_servers_databases_connection_failed_user_error_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}[5m])) > 10 + for: 15m + keep_firing_for: 10m + labels: + severity: warning + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High number of database User Failed connections.' + description: 'Check for authentication problems, network configuration errors, firewall issues, or resource constraints, affecting database accessibility for users on database {{ $labels.resourceName }}.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureDatabaseHighWorkerUsage + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_workers_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 60 + for: 5m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High database worker usage.' 
+ description: 'Look for long execution queries, review the number of concurrent queries and requests being sent to the database or check if there are any blocking sessions or deadlocks into the {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureDatabaseHighDataIoUsage + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_physical_data_read_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90 + for: 15m + keep_firing_for: 10m + labels: + severity: info + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High database data IO usage.' + description: 'Review queries with high read or write activity, check if there are missing indexes or inefficient indexes that result in full table scans and assess the volume of transactions into the {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureDatabaseLowTempdbLogSpace + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_tempdb_log_used_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 60 + for: 5m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'Low database tempdb log space.' + description: 'Look for active sessions that might be using TempDB intensively, identify stored procedures or queries that create temporary tables or objects, and also look for long-running or memory-intensive queries that rely heavily on TempDB into the {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d'