diff --git a/infrastructure/modules/container-app-job/alerts.tf b/infrastructure/modules/container-app-job/alerts.tf
new file mode 100644
index 00000000..fd3b891a
--- /dev/null
+++ b/infrastructure/modules/container-app-job/alerts.tf
@@ -0,0 +1,34 @@
+# Alerts when this Container App Job logs a "ProcessExited" system event with a
+# non-zero exit code. Only created when var.enable_alerting is true.
+resource "azurerm_monitor_scheduled_query_rules_alert" "job_failure" {
+  count = var.enable_alerting ? 1 : 0
+
+  name                = "${azurerm_container_app_job.this.name}-containerjob-failure-alert"
+  location            = var.location
+  resource_group_name = var.resource_group_name
+
+  action {
+    # NOTE(review): action_group_id defaults to null; presumably it must be set
+    # whenever enable_alerting is true - confirm against callers.
+    action_group = [var.action_group_id]
+  }
+  data_source_id = var.log_analytics_workspace_id
+  description    = "Alert when ${azurerm_container_app_job.this.name} job fails"
+  enabled        = true
+  # Backslashes are literal both in a Terraform heredoc and in a KQL verbatim
+  # string (@"..."), so the regex classes take a single backslash: \s and \d.
+  query = <<-KQL
+    ContainerAppSystemLogs_CL
+    | where JobName_s == "${azurerm_container_app_job.this.name}"
+    | where Reason_s == "ProcessExited"
+    | extend ExitCode = toint(extract(@"exit code:\s*(\d+)", 1, Log_s))
+    | where ExitCode != 0
+  KQL
+  severity    = 1
+  frequency   = var.alert_frequency
+  time_window = var.time_window
+  trigger {
+    threshold = 0
+    operator  = "GreaterThan"
+  }
+}
diff --git a/infrastructure/modules/container-app-job/tfdocs.md b/infrastructure/modules/container-app-job/tfdocs.md
index a6093ad8..1f29d1cd 100644
--- a/infrastructure/modules/container-app-job/tfdocs.md
+++ b/infrastructure/modules/container-app-job/tfdocs.md
@@ -16,6 +16,12 @@ Description: Docker image and tag. Format: /:
 
 Type: `string`
 
+### [log\_analytics\_workspace\_id](#input\_log\_analytics\_workspace\_id)
+
+Description: Log analytics workspace ID
+
+Type: `string`
+
 ### [name](#input\_name)
 
 Description: Name of the container app. Limited to 32 characters
@@ -48,6 +54,22 @@ Type: `string`
 
 Default: `null`
 
+### [action\_group\_id](#input\_action\_group\_id)
+
+Description: ID of the action group to notify.
+
+Type: `string`
+
+Default: `null`
+
+### [alert\_frequency](#input\_alert\_frequency)
+
+Description: Frequency (in minutes) at which rule condition should be evaluated. Values must be between 5 and 1440 (inclusive). Default is 15
+
+Type: `number`
+
+Default: `15`
+
 ### [app\_key\_vault\_id](#input\_app\_key\_vault\_id)
 
 Description: ID of the key vault to store app secrets. Each secret is mapped to an environment variable. Required when fetch\_secrets\_from\_app\_key\_vault is true.
@@ -80,6 +102,14 @@ Type: `string`
 
 Default: `null`
 
+### [enable\_alerting](#input\_enable\_alerting)
+
+Description: Whether monitoring and alerting is enabled for the container app job.
+
+Type: `bool`
+
+Default: `false`
+
 ### [environment\_variables](#input\_environment\_variables)
 
 Description: Environment variables to pass to the container app. Only non-secret variables. Secrets must be stored in key vault 'app\_key\_vault\_id'
@@ -154,6 +184,14 @@ Type: `map(string)`
 
 Default: `{}`
 
+### [time\_window](#input\_time\_window)
+
+Description: Time window for which data needs to be fetched for query (must be greater than or equal to frequency). Values must be between 5 and 2880 (inclusive). Default is 30
+
+Type: `number`
+
+Default: `30`
+
 ### [user\_assigned\_identity\_ids](#input\_user\_assigned\_identity\_ids)
 
 Description: List of user assigned identity IDs to assign to the container app.
@@ -190,4 +228,5 @@ Version:
 The following resources are used by this module:
 
 - [azurerm_container_app_job.this](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/container_app_job) (resource)
+- [azurerm_monitor_scheduled_query_rules_alert.job_failure](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert) (resource)
 - [azurerm_key_vault_secrets.app](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault_secrets) (data source)
diff --git a/infrastructure/modules/container-app-job/variables.tf b/infrastructure/modules/container-app-job/variables.tf
index df2e55d2..0450704d 100644
--- a/infrastructure/modules/container-app-job/variables.tf
+++ b/infrastructure/modules/container-app-job/variables.tf
@@ -125,6 +125,38 @@ variable "workload_profile_name" {
   nullable = false
 }
 
+# Inputs for the job failure alert defined in alerts.tf.
+variable "enable_alerting" {
+  description = "Whether monitoring and alerting is enabled for the container app job."
+  type        = bool
+  default     = false
+}
+
+variable "action_group_id" {
+  type        = string
+  description = "ID of the action group to notify."
+  default     = null
+}
+
+# NOTE(review): no default, so every caller must pass this even when
+# enable_alerting is false - consider whether it should default to null.
+variable "log_analytics_workspace_id" {
+  description = "Log analytics workspace ID"
+  type        = string
+}
+
+variable "alert_frequency" {
+  type        = number
+  description = "Frequency (in minutes) at which rule condition should be evaluated. Values must be between 5 and 1440 (inclusive). Default is 15"
+  default     = 15
+}
+
+variable "time_window" {
+  type        = number
+  description = "Time window for which data needs to be fetched for query (must be greater than or equal to frequency). Values must be between 5 and 2880 (inclusive). Default is 30"
+  default     = 30
+}
+
 locals {
   memory = "${var.memory}Gi"
   cpu    = var.memory / 2