Skip to content

Commit 4577c16

Browse files
committed
Add container app job alerts
1 parent 4eee769 commit 4577c16

File tree

3 files changed

+117
-0
lines changed

3 files changed

+117
-0
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Log-search alert that fires when an execution of this container app job exits
# with a non-zero exit code. Created only when var.enable_alerting is true.
resource "azurerm_monitor_scheduled_query_rules_alert" "job_failure" {
2+
count = var.enable_alerting ? 1 : 0
3+
4+
name = "${azurerm_container_app_job.this.name}-containerjob-failure-alert"
5+
location = var.location
6+
resource_group_name = var.resource_group_name
7+
8+
action {
9+
action_group = [ var.action_group_id ]
10+
}
11+
data_source_id = var.log_analytics_workspace_id
12+
description = "Alert when ${azurerm_container_app_job.this.name} job fails"
13+
enabled = true
14+
# The regex below uses single backslashes on purpose: Terraform heredocs pass
# backslashes through literally and KQL @"..." verbatim strings do not unescape
# them, so "\\s" / "\\d" would look for literal backslashes and never capture
# the exit code.
query = <<-KQL
15+
ContainerAppSystemLogs_CL
16+
| where ExecutionName_s contains "${azurerm_container_app_job.this.name}"
17+
| where Log_s !contains "JobCleanup: Pod"
18+
| where Reason_s == "ProcessExited"
19+
| extend ExitCode = toint(extract(@"exit code:\s*(\d+)", 1, Log_s))
20+
| where ExitCode != 0
21+
| summarize Failures = count() by ExecutionName_s
22+
KQL
23+
severity = 1
24+
frequency = var.alert_frequency
25+
time_window = var.time_window
26+
# Fire as soon as the query returns at least one row.
trigger {
27+
threshold = 0
28+
operator = "GreaterThan"
29+
}
30+
}

infrastructure/modules/container-app-job/tfdocs.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ Description: Docker image and tag. Format: <registry>/<repository>:<tag>
1616

1717
Type: `string`
1818

19+
### <a name="input_log_analytics_workspace_id"></a> [log\_analytics\_workspace\_id](#input\_log\_analytics\_workspace\_id)
20+
21+
Description: Log analytics workspace ID
22+
23+
Type: `string`
24+
1925
### <a name="input_name"></a> [name](#input\_name)
2026

2127
Description: Name of the container app. Limited to 32 characters
@@ -48,6 +54,30 @@ Type: `string`
4854

4955
Default: `null`
5056

57+
### <a name="input_action_group_id"></a> [action\_group\_id](#input\_action\_group\_id)
58+
59+
Description: ID of the action group to notify.
60+
61+
Type: `string`
62+
63+
Default: `null`
64+
65+
### <a name="input_alert_frequency"></a> [alert\_frequency](#input\_alert\_frequency)
66+
67+
Description: Frequency (in minutes) at which rule condition should be evaluated. Values must be between 5 and 1440 (inclusive). Default is 15
68+
69+
Type: `number`
70+
71+
Default: `15`
72+
73+
### <a name="input_alert_window_size"></a> [alert\_window\_size](#input\_alert\_window\_size)
74+
75+
Description: The period of time that is used to monitor alert activity e.g. PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H. The interval between checks is adjusted accordingly.
76+
77+
Type: `string`
78+
79+
Default: `"PT5M"`
80+
5181
### <a name="input_app_key_vault_id"></a> [app\_key\_vault\_id](#input\_app\_key\_vault\_id)
5282

5383
Description: ID of the key vault to store app secrets. Each secret is mapped to an environment variable. Required when fetch\_secrets\_from\_app\_key\_vault is true.
@@ -80,6 +110,14 @@ Type: `string`
80110

81111
Default: `null`
82112

113+
### <a name="input_enable_alerting"></a> [enable\_alerting](#input\_enable\_alerting)
114+
115+
Description: Whether monitoring and alerting is enabled for the container app job.
116+
117+
Type: `bool`
118+
119+
Default: `false`
120+
83121
### <a name="input_environment_variables"></a> [environment\_variables](#input\_environment\_variables)
84122

85123
Description: Environment variables to pass to the container app. Only non-secret variables. Secrets must be stored in key vault 'app\_key\_vault\_id'
@@ -154,6 +192,14 @@ Type: `map(string)`
154192

155193
Default: `{}`
156194

195+
### <a name="input_time_window"></a> [time\_window](#input\_time\_window)
196+
197+
Description: Time window for which data needs to be fetched for query (must be greater than or equal to frequency). Values must be between 5 and 2880 (inclusive). Default is 30
198+
199+
Type: `number`
200+
201+
Default: `30`
202+
157203
### <a name="input_user_assigned_identity_ids"></a> [user\_assigned\_identity\_ids](#input\_user\_assigned\_identity\_ids)
158204

159205
Description: List of user assigned identity IDs to assign to the container app.
@@ -190,4 +236,5 @@ Version:
190236
The following resources are used by this module:
191237

192238
- [azurerm_container_app_job.this](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/container_app_job) (resource)
239+
- [azurerm_monitor_scheduled_query_rules_alert.job_failure](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert) (resource)
193240
- [azurerm_key_vault_secrets.app](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault_secrets) (data source)

infrastructure/modules/container-app-job/variables.tf

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,46 @@ variable "workload_profile_name" {
125125
nullable = false
126126
}
127127

128+
# Feature flag for the job-failure alert: the alert resource is created only
# when this is true. Disabled by default.
variable "enable_alerting" {
129+
description = "Whether monitoring and alerting is enabled for the container app job."
130+
type = bool
131+
default = false
132+
}
133+
134+
# Action group that the job-failure alert notifies.
# NOTE(review): defaults to null, but the alert resource wraps this value in a
# list unconditionally — presumably callers must set it whenever
# enable_alerting is true; confirm.
variable "action_group_id" {
135+
type = string
136+
description = "ID of the action group to notify."
137+
default = null
138+
}
139+
140+
# ISO 8601 duration for an alert evaluation window, restricted to a fixed set
# of values by the validation block below.
# NOTE(review): the job_failure alert added alongside this variable uses
# var.time_window (minutes) and does not reference this variable — confirm it
# is consumed elsewhere or remove it.
variable "alert_window_size" {
141+
type = string
142+
nullable = false
143+
default = "PT5M"
144+
validation {
145+
condition = contains(["PT1M", "PT5M", "PT15M", "PT30M", "PT1H", "PT6H", "PT12H"], var.alert_window_size)
146+
error_message = "The alert_window_size must be one of: PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H"
147+
}
148+
description = "The period of time that is used to monitor alert activity e.g. PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H. The interval between checks is adjusted accordingly."
149+
}
150+
151+
# Workspace that the job-failure alert queries (passed to the alert as its
# data_source_id). Has no default, so every caller of the module must set it.
variable "log_analytics_workspace_id" {
152+
description = "Log analytics workspace ID"
153+
type = string
154+
}
155+
156+
# How often (in minutes) the job-failure alert query is evaluated.
# The documented 5-1440 range is enforced at plan time so an out-of-range
# value fails fast instead of being rejected by the provider or Azure.
variable "alert_frequency" {
156+
type = number
157+
description = "Frequency (in minutes) at which rule condition should be evaluated. Values must be between 5 and 1440 (inclusive). Default is 15"
158+
default = 15
# A null input falls back to the default, so the validation never sees null.
nullable = false
validation {
condition = var.alert_frequency >= 5 && var.alert_frequency <= 1440
error_message = "The alert_frequency must be between 5 and 1440 minutes (inclusive)."
}
159+
}
161+
162+
# Look-back period (in minutes) of log data that the job-failure alert query
# covers. The documented 5-2880 range is enforced at plan time; the
# "greater than or equal to frequency" rule is not checked here.
variable "time_window" {
163+
type = number
164+
description = "Time window for which data needs to be fetched for query (must be greater than or equal to frequency). Values must be between 5 and 2880 (inclusive). Default is 30"
165+
default = 30
# A null input falls back to the default, so the validation never sees null.
nullable = false
validation {
condition = var.time_window >= 5 && var.time_window <= 2880
error_message = "The time_window must be between 5 and 2880 minutes (inclusive)."
}
166+
}
167+
128168
locals {
129169
memory = "${var.memory}Gi"
130170
cpu = var.memory / 2

0 commit comments

Comments
 (0)