From 75e80e701819a74df8a8b6162249f396edb5a38c Mon Sep 17 00:00:00 2001 From: yduartep Date: Thu, 14 Nov 2024 14:10:35 +0100 Subject: [PATCH 1/7] add new azure sql database alerts --- csp-mixin/alerts/azure-alerts.yml | 128 +++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 1 deletion(-) diff --git a/csp-mixin/alerts/azure-alerts.yml b/csp-mixin/alerts/azure-alerts.yml index ed3d73b04..c26057f85 100644 --- a/csp-mixin/alerts/azure-alerts.yml +++ b/csp-mixin/alerts/azure-alerts.yml @@ -22,9 +22,135 @@ groups: keep_firing_for: 10m labels: severity: critical - service: 'Azure Virtual Machines.' + service: 'Azure Virtual Machines' namespace: cloud-provider-azure annotations: summary: 'VM unavailable.' description: 'The VM {{ $labels.resourceName }} is not functioning or crashed, which may require immediate action.' dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e' + + - alert: AzureHighDtuConsumption + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_dtu_consumption_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90 + for: 10m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High DTU consumption.' + description: 'Check active queries and optimize indexes or consider scaling up DTUs to handle load in {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureHighStorageUsage + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_storage_percent_maximum_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 85 + for: 10m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High Storage usage.' + description: 'Archive or delete old data, or scale up storage capacity in {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureHighDeadlockCount + expr: | + sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_deadlock_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 5 + for: 10m + keep_firing_for: 10m + labels: + severity: info + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High Deadlock count.' + description: 'Check {{ $labels.resourceName }} database logs for deadlock details and optimize affected queries.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureHighUserCpuUsage + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_cpu_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 80 + for: 10m + keep_firing_for: 10m + labels: + severity: warning + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High User CPU usage.' + description: 'Identify high CPU queries on {{ $labels.resourceName }} database and optimize them.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureHighSystemFailedConnections + expr: | + sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_connection_failed_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 10 + for: 5m + keep_firing_for: 10m + labels: + severity: warning + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High number of System Failed connections.' + description: 'Check network problems, firewall restrictions or high resource consumption affecting application access to the database {{ $labels.resourceName }}.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureHighUserFailedConnections + expr: | + sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_connection_failed_user_error_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 10 + for: 5m + keep_firing_for: 10m + labels: + severity: warning + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High number of User Failed connections.' + description: 'Check for authentication problems, network configuration errors, firewall issues, or resource constraints, affecting database accessibility for users on database {{ $labels.resourceName }}.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureHighWorkerUsage + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_workers_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 60 + for: 5m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High worker usage.' + description: 'Look for long execution queries, review the number of concurrent queries and requests being sent to the database or check if there are any blocking sessions or deadlocks into the {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureHighDataIoUsage + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_physical_data_read_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90 + for: 15m + keep_firing_for: 10m + labels: + severity: info + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'High data IO usage.' + description: 'Review queries with high read or write activity, check if there are missing indexes or inefficient indexes that result in full table scans and assess the volume of transactions into the {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' + + - alert: AzureLowTempdbLogSpace + expr: | + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_tempdb_log_used_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 60 + for: 5m + keep_firing_for: 10m + labels: + severity: critical + service: 'Azure SQL database' + namespace: cloud-provider-azure + annotations: + summary: 'Low tempdb log space.' + description: 'Look for active sessions that might be using TempDB intensively, identify stored procedures or queries that create temporary tables or objects, and also look for long-running or memory-intensive queries that rely heavily on TempDB into the {{ $labels.resourceName }} database.' + dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' From fab1be67f81b05ec0c1eada8d4568d19f523e7eb Mon Sep 17 00:00:00 2001 From: yduartep Date: Thu, 14 Nov 2024 14:41:28 +0100 Subject: [PATCH 2/7] add database to summaries and alert name --- csp-mixin/alerts/azure-alerts.yml | 38 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/csp-mixin/alerts/azure-alerts.yml b/csp-mixin/alerts/azure-alerts.yml index c26057f85..faa1d41e9 100644 --- a/csp-mixin/alerts/azure-alerts.yml +++ b/csp-mixin/alerts/azure-alerts.yml @@ -11,7 +11,7 @@ groups: service: 'Azure Virtual Machines' namespace: cloud-provider-azure annotations: - summary: 'CPU utilization is too high.' + summary: 'VM CPU utilization is too high.' description: 'The VM {{ $labels.resourceName }} is under heavy load and may become unresponsive.' dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e' @@ -29,7 +29,7 @@ groups: description: 'The VM {{ $labels.resourceName }} is not functioning or crashed, which may require immediate action.' dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e' - - alert: AzureHighDtuConsumption + - alert: AzureDatabaseHighDtuConsumption expr: | avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_dtu_consumption_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90 for: 10m @@ -39,11 +39,11 @@ groups: service: 'Azure SQL database' namespace: cloud-provider-azure annotations: - summary: 'High DTU consumption.' + summary: 'High database DTU consumption.' description: 'Check active queries and optimize indexes or consider scaling up DTUs to handle load in {{ $labels.resourceName }} database.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' - - alert: AzureHighStorageUsage + - alert: AzureDatabaseHighStorageUsage expr: | avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_storage_percent_maximum_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 85 for: 10m @@ -53,11 +53,11 @@ groups: service: 'Azure SQL database' namespace: cloud-provider-azure annotations: - summary: 'High Storage usage.' + summary: 'High database Storage usage.' description: 'Archive or delete old data, or scale up storage capacity in {{ $labels.resourceName }} database.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' - - alert: AzureHighDeadlockCount + - alert: AzureDatabaseHighDeadlockCount expr: | sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_deadlock_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 5 for: 10m @@ -67,11 +67,11 @@ groups: service: 'Azure SQL database' namespace: cloud-provider-azure annotations: - summary: 'High Deadlock count.' + summary: 'High database Deadlock count.' description: 'Check {{ $labels.resourceName }} database logs for deadlock details and optimize affected queries.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' - - alert: AzureHighUserCpuUsage + - alert: AzureDatabaseHighUserCpuUsage expr: | avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_cpu_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 80 for: 10m @@ -81,11 +81,11 @@ groups: service: 'Azure SQL database' namespace: cloud-provider-azure annotations: - summary: 'High User CPU usage.' + summary: 'High database User CPU usage.' description: 'Identify high CPU queries on {{ $labels.resourceName }} database and optimize them.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' - - alert: AzureHighSystemFailedConnections + - alert: AzureDatabaseHighSystemFailedConnections expr: | sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_connection_failed_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 10 for: 5m @@ -95,11 +95,11 @@ groups: service: 'Azure SQL database' namespace: cloud-provider-azure annotations: - summary: 'High number of System Failed connections.' + summary: 'High number of database System Failed connections.' description: 'Check network problems, firewall restrictions or high resource consumption affecting application access to the database {{ $labels.resourceName }}.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' - - alert: AzureHighUserFailedConnections + - alert: AzureDatabaseHighUserFailedConnections expr: | sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_connection_failed_user_error_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 10 for: 5m @@ -109,11 +109,11 @@ groups: service: 'Azure SQL database' namespace: cloud-provider-azure annotations: - summary: 'High number of User Failed connections.' + summary: 'High number of database User Failed connections.' description: 'Check for authentication problems, network configuration errors, firewall issues, or resource constraints, affecting database accessibility for users on database {{ $labels.resourceName }}.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' - - alert: AzureHighWorkerUsage + - alert: AzureDatabaseHighWorkerUsage expr: | avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_workers_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 60 for: 5m @@ -123,11 +123,11 @@ groups: service: 'Azure SQL database' namespace: cloud-provider-azure annotations: - summary: 'High worker usage.' + summary: 'High database worker usage.' description: 'Look for long execution queries, review the number of concurrent queries and requests being sent to the database or check if there are any blocking sessions or deadlocks into the {{ $labels.resourceName }} database.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' - - alert: AzureHighDataIoUsage + - alert: AzureDatabaseHighDataIoUsage expr: | avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_physical_data_read_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90 for: 15m @@ -137,11 +137,11 @@ groups: service: 'Azure SQL database' namespace: cloud-provider-azure annotations: - summary: 'High data IO usage.' + summary: 'High database data IO usage.' description: 'Review queries with high read or write activity, check if there are missing indexes or inefficient indexes that result in full table scans and assess the volume of transactions into the {{ $labels.resourceName }} database.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' - - alert: AzureLowTempdbLogSpace + - alert: AzureDatabaseLowTempdbLogSpace expr: | avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_tempdb_log_used_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 60 for: 5m @@ -151,6 +151,6 @@ groups: service: 'Azure SQL database' namespace: cloud-provider-azure annotations: - summary: 'Low tempdb log space.' + summary: 'Low database tempdb log space.' description: 'Look for active sessions that might be using TempDB intensively, identify stored procedures or queries that create temporary tables or objects, and also look for long-running or memory-intensive queries that rely heavily on TempDB into the {{ $labels.resourceName }} database.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' From 5f844080d1ad9fc0657d58f9965e01083b264cf7 Mon Sep 17 00:00:00 2001 From: yduartep Date: Tue, 19 Nov 2024 12:43:07 +0100 Subject: [PATCH 3/7] sync threshold and lookback period with azure --- csp-mixin/alerts/azure-alerts.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csp-mixin/alerts/azure-alerts.yml b/csp-mixin/alerts/azure-alerts.yml index faa1d41e9..269863e02 100644 --- a/csp-mixin/alerts/azure-alerts.yml +++ b/csp-mixin/alerts/azure-alerts.yml @@ -45,8 +45,8 @@ groups: - alert: AzureDatabaseHighStorageUsage expr: | - avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_storage_percent_maximum_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 85 - for: 10m + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_storage_percent_maximum_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 95 + for: 15m keep_firing_for: 10m labels: severity: critical @@ -73,7 +73,7 @@ groups: - alert: AzureDatabaseHighUserCpuUsage expr: | - avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_cpu_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 80 + avg by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_cpu_percent_average_percent{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 90 for: 10m keep_firing_for: 10m labels: @@ -102,7 +102,7 @@ groups: - alert: AzureDatabaseHighUserFailedConnections expr: | sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_connection_failed_user_error_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 10 - for: 5m + for: 15m keep_firing_for: 10m labels: severity: warning From a041aecfbb7df45bdda86b8818da00699b38ecab Mon Sep 17 00:00:00 2001 From: Yahima Duarte Date: Tue, 19 Nov 2024 16:25:25 +0100 Subject: [PATCH 4/7] Update csp-mixin/alerts/azure-alerts.yml Co-authored-by: Ana Ivanov <38096095+anaivanov@users.noreply.github.com> --- csp-mixin/alerts/azure-alerts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csp-mixin/alerts/azure-alerts.yml b/csp-mixin/alerts/azure-alerts.yml index 269863e02..0946ea989 100644 --- a/csp-mixin/alerts/azure-alerts.yml +++ b/csp-mixin/alerts/azure-alerts.yml @@ -96,7 +96,7 @@ groups: namespace: cloud-provider-azure annotations: summary: 'High number of database System Failed connections.' - description: 'Check network problems, firewall restrictions or high resource consumption affecting application access to the database {{ $labels.resourceName }}.' + description: 'Check network problems, firewall restrictions or high resource consumption affecting application access to the {{ $labels.resourceName }} database.' dashboard_uid: '82c5b6cf30db5b601c5cc3f5d8d4284d' - alert: AzureDatabaseHighUserFailedConnections From e894861e68f1b2ab163f94faff83b79d71bf3d4d Mon Sep 17 00:00:00 2001 From: yduartep Date: Tue, 19 Nov 2024 16:26:51 +0100 Subject: [PATCH 5/7] update deadlock alert to include rate --- csp-mixin/alerts/azure-alerts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csp-mixin/alerts/azure-alerts.yml b/csp-mixin/alerts/azure-alerts.yml index 0946ea989..aff08b753 100644 --- a/csp-mixin/alerts/azure-alerts.yml +++ b/csp-mixin/alerts/azure-alerts.yml @@ -59,7 +59,7 @@ groups: - alert: AzureDatabaseHighDeadlockCount expr: | - sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_deadlock_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 5 + sum by (job,resourceGroup,subscriptionName,resourceName) (rate(azure_microsoft_sql_servers_databases_deadlock_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}[$__rate_interval])) > 5 for: 10m keep_firing_for: 10m labels: From 1e802a8f085963defd4fc358a2490ba0afd94cf6 Mon Sep 17 00:00:00 2001 From: yduartep Date: Tue, 19 Nov 2024 16:42:30 +0100 Subject: [PATCH 6/7] fix lint --- csp-mixin/alerts/azure-alerts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csp-mixin/alerts/azure-alerts.yml b/csp-mixin/alerts/azure-alerts.yml index aff08b753..3bf0a2743 100644 --- a/csp-mixin/alerts/azure-alerts.yml +++ b/csp-mixin/alerts/azure-alerts.yml @@ -59,7 +59,7 @@ groups: - alert: AzureDatabaseHighDeadlockCount expr: | - sum by (job,resourceGroup,subscriptionName,resourceName) (rate(azure_microsoft_sql_servers_databases_deadlock_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}[$__rate_interval])) > 5 + sum by (job,resourceGroup,subscriptionName,resourceName) (rate(azure_microsoft_sql_servers_databases_deadlock_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}[5m])) > 5 for: 10m keep_firing_for: 10m labels: From 4c8ecd01435c657be687d39f375c524dd3aca45f Mon Sep 17 00:00:00 2001 From: yduartep Date: Thu, 21 Nov 2024 14:42:43 +0100 Subject: [PATCH 7/7] add rate to counter metrics --- csp-mixin/alerts/azure-alerts.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csp-mixin/alerts/azure-alerts.yml b/csp-mixin/alerts/azure-alerts.yml index 3bf0a2743..058ee1f30 100644 --- a/csp-mixin/alerts/azure-alerts.yml +++ b/csp-mixin/alerts/azure-alerts.yml @@ -87,7 +87,7 @@ groups: - alert: AzureDatabaseHighSystemFailedConnections expr: | - sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_connection_failed_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 10 + sum by (job,resourceGroup,subscriptionName,resourceName) (rate(azure_microsoft_sql_servers_databases_connection_failed_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}[5m])) > 10 for: 5m keep_firing_for: 10m labels: @@ -101,7 +101,7 @@ groups: - alert: AzureDatabaseHighUserFailedConnections expr: | - sum by (job,resourceGroup,subscriptionName,resourceName) (azure_microsoft_sql_servers_databases_connection_failed_user_error_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}) > 10 + sum by (job,resourceGroup,subscriptionName,resourceName) (rate(azure_microsoft_sql_servers_databases_connection_failed_user_error_total_count{job=~".+",resourceGroup=~".+",subscriptionName=~".+",resourceName=~".+"}[5m])) > 10 for: 15m keep_firing_for: 10m labels: