Skip to content

Commit b477fff

Browse files
authored
Merge pull request #5 from kabisa/burn-rate-alerts
Burn rate alerts
2 parents eaa25d4 + 22c9a97 commit b477fff

13 files changed

+336
-69
lines changed

README.md

Lines changed: 85 additions & 46 deletions
Large diffs are not rendered by default.

apdex.tf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ locals {
66
}
77

88
module "apdex" {
9-
source = "[email protected]:kabisa/terraform-datadog-generic-monitor.git?ref=0.7.0"
9+
source = "kabisa/generic-monitor/datadog"
10+
version = "0.7.4"
1011

1112
name = "APM - ${title(split(".", var.trace_span_name)[0])} - Apdex"
1213
query = "avg(${var.apdex_evaluation_period}):avg:trace.${var.trace_span_name}.apdex.by.service{${local.apdex_filter}} < ${var.apdex_critical}"

error-percentage.tf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ locals {
66
}
77

88
module "error_percentage" {
9-
source = "[email protected]:kabisa/terraform-datadog-generic-monitor.git?ref=0.7.0"
9+
source = "kabisa/generic-monitor/datadog"
10+
version = "0.7.4"
1011

1112
name = "APM - ${title(split(".", var.trace_span_name)[0])} - Error Percentage"
1213
query = "avg(${var.error_percentage_evaluation_period}):100 * (sum:trace.${var.trace_span_name}.errors{${local.error_percentage_filter}}.as_rate() / sum:trace.${var.trace_span_name}.hits{${local.error_percentage_filter}}.as_rate() ) > ${var.error_percentage_critical}"

errors-slo-variables.tf

Lines changed: 68 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,7 @@ variable "error_slo_alerting_enabled" {
3333
default = true
3434
}
3535

36-
variable "error_slo_status_ok_filter" {
37-
type = string
38-
description = "Filter string to select the non-errors for the SLO, Dont forget to include the comma or (AND or OR) keywords"
39-
default = ",status:ok"
40-
}
41-
42-
variable "error_slo_status_error_filter" {
36+
variable "error_slo_error_filter" {
4337
type = string
4438
description = "Filter string to select the errors for the SLO. Don't forget to include the comma or (AND or OR) keywords"
4539
default = ",status:error"
@@ -53,3 +47,70 @@ variable "error_slo_timeframe" {
5347
type = string
5448
default = "30d"
5549
}
50+
51+
variable "error_slo_numerator_override" {
52+
type = string
53+
default = ""
54+
}
55+
56+
variable "error_slo_denominator_override" {
57+
type = string
58+
default = ""
59+
}
60+
61+
variable "error_slo_burn_rate_notification_channel_override" {
62+
type = string
63+
default = ""
64+
}
65+
66+
variable "error_slo_burn_rate_enabled" {
67+
type = bool
68+
default = true
69+
}
70+
71+
variable "error_slo_burn_rate_alerting_enabled" {
72+
type = bool
73+
default = true
74+
}
75+
76+
variable "error_slo_burn_rate_priority" {
77+
description = "Number from 1 (high) to 5 (low)."
78+
79+
type = number
80+
default = 3
81+
}
82+
83+
variable "error_slo_burn_rate_warning" {
84+
type = number
85+
default = null
86+
}
87+
88+
variable "error_slo_burn_rate_critical" {
89+
type = number
90+
default = 10 # 10x burn rate
91+
}
92+
93+
variable "error_slo_burn_rate_note" {
94+
type = string
95+
default = ""
96+
}
97+
98+
variable "error_slo_burn_rate_docs" {
99+
type = string
100+
default = "Use burn rate alerts to measure how fast your error budget is being depleted relative to the time window of your SLO. For example, for a 30 day SLO if a burn rate of 1 is sustained, that means the error budget will be fully depleted in exactly 30 days, a burn rate of 2 means in exactly 15 days, etc. Therefore, you could use a burn rate alert to notify you if a burn rate of 10 is measured in the past hour. Burn rate alerts evaluate two time windows: a long window which you specify and a short window that is automatically calculated as 1/12 of your long window. The long window's purpose is to reduce alert flappiness, while the short window's purpose is to improve recovery time. If your threshold is violated in both windows, you will receive an alert."
101+
}
102+
103+
variable "error_slo_burn_rate_evaluation_period" {
104+
type = string
105+
default = "30d"
106+
}
107+
108+
variable "error_slo_burn_rate_short_window" {
109+
type = string
110+
default = "5m"
111+
}
112+
113+
variable "error_slo_burn_rate_long_window" {
114+
type = string
115+
default = "1h"
116+
}

errors-slo.tf

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,18 @@ locals {
33
var.error_slo_filter_override,
44
local.filter_str
55
)
6+
error_slo_numerator = coalesce(
7+
var.error_slo_numerator_override,
8+
"sum:trace.${var.trace_span_name}.hits{${local.error_slo_filter}}.as_count() - sum:trace.${var.trace_span_name}.hits{${local.error_slo_filter}${var.error_slo_error_filter}}.as_count()"
9+
)
10+
error_slo_denominator = coalesce(
11+
var.error_slo_denominator_override,
12+
"sum:trace.${var.trace_span_name}.hits{${local.error_slo_filter}}.as_count()"
13+
)
14+
error_slo_burn_rate_notification_channel = try(coalesce(
15+
var.error_slo_burn_rate_notification_channel_override,
16+
var.notification_channel
17+
), "")
618
}
719

820

@@ -19,9 +31,41 @@ resource "datadog_service_level_objective" "error_slo" {
1931
}
2032

2133
query {
22-
numerator = "sum:custom_trace.hits{${local.error_slo_filter}${var.error_slo_status_ok_filter}}.as_count()"
23-
denominator = "sum:custom_trace.hits{${local.error_slo_filter}${var.error_slo_status_ok_filter}}.as_count() + sum:custom_trace.hits{${local.error_slo_filter}${var.error_slo_status_error_filter}}.as_count()"
34+
numerator = local.error_slo_numerator
35+
denominator = local.error_slo_denominator
2436
}
2537

2638
tags = local.normalized_tags
2739
}
40+
41+
module "error_slo_burn_rate" {
42+
source = "kabisa/generic-monitor/datadog"
43+
version = "0.7.4"
44+
45+
name = "APM - Error SLO - Burn Rate"
46+
query = "burn_rate(\"${datadog_service_level_objective.error_slo[0].id}\").over(\"${var.error_slo_burn_rate_evaluation_period}\").long_window(\"${var.error_slo_burn_rate_long_window}\").short_window(\"${var.error_slo_burn_rate_short_window}\") > ${var.error_slo_burn_rate_critical}"
47+
48+
49+
alert_message = "${local.service_display_name} service is burning through its Error Budget. The percentage of 5XX status codes is {{threshold}}x higher than expected"
50+
recovery_message = "${local.service_display_name} service burn rate has recovered"
51+
type = "slo alert"
52+
53+
# monitor level vars
54+
enabled = var.error_slo_enabled && var.error_slo_burn_rate_enabled
55+
alerting_enabled = var.error_slo_burn_rate_alerting_enabled
56+
warning_threshold = var.error_slo_burn_rate_warning
57+
critical_threshold = var.error_slo_burn_rate_critical
58+
priority = var.error_slo_burn_rate_priority
59+
docs = var.error_slo_burn_rate_docs
60+
note = var.error_slo_burn_rate_note
61+
62+
# module level vars
63+
env = var.alert_env
64+
service = var.service
65+
service_display_name = var.service_display_name
66+
notification_channel = local.error_slo_burn_rate_notification_channel
67+
additional_tags = var.additional_tags
68+
locked = var.locked
69+
name_prefix = var.name_prefix
70+
name_suffix = var.name_suffix
71+
}

latency-p95-variables.tf

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,20 @@ variable "latency_p95_enabled" {
44
}
55

66
variable "latency_p95_warning" {
7-
type = number
8-
default = 0.9
7+
type = number
8+
default = 0.9 # 0.9 seconds
9+
description = "P95 Latency warning threshold in seconds."
910
}
1011

1112
variable "latency_p95_critical" {
12-
type = number
13-
default = 1.3
13+
type = number
14+
default = 1.3 # 1.3 seconds
15+
description = "P95 Latency critical threshold in seconds."
1416
}
1517

1618
variable "latency_p95_evaluation_period" {
1719
type = string
18-
default = "last_10m"
20+
default = "last_15m"
1921
}
2022

2123
variable "latency_p95_note" {
@@ -39,3 +41,8 @@ variable "latency_p95_priority" {
3941
type = number
4042
default = 3
4143
}
44+
45+
variable "latency_p95_notification_channel_override" {
46+
type = string
47+
default = ""
48+
}

latency-p95.tf

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,17 @@
1+
locals {
2+
latency_p95_notification_channel = try(coalesce(
3+
var.latency_p95_notification_channel_override,
4+
var.notification_channel
5+
), "")
6+
}
7+
18
module "latency_p95" {
2-
source = "[email protected]:kabisa/terraform-datadog-generic-monitor.git?ref=0.7.0"
9+
source = "kabisa/generic-monitor/datadog"
10+
version = "0.7.4"
311

412
name = "APM - ${title(split(".", var.trace_span_name)[0])} - Latency(p95)"
513
# using same filters as for avg latency
6-
query = "avg(${var.latency_p95_evaluation_period}):p95:trace.${var.trace_span_name}{${local.latency_filter}} > ${var.latency_p95_critical}"
14+
query = "percentile(${var.latency_p95_evaluation_period}):p95:trace.${var.trace_span_name}{${local.latency_filter}} > ${var.latency_p95_critical}"
715

816
alert_message = "The latency_p95 for service ${local.service_display_name} ({{value}}) has risen above {{threshold}}"
917
recovery_message = "The latency_p95 for service ${local.service_display_name} ({{value}}) has recovered"
@@ -21,7 +29,7 @@ module "latency_p95" {
2129
env = var.alert_env
2230
service = var.service
2331
service_display_name = var.service_display_name
24-
notification_channel = var.notification_channel
32+
notification_channel = local.latency_p95_notification_channel
2533
additional_tags = var.additional_tags
2634
locked = var.locked
2735
name_prefix = var.name_prefix

latency-slo-variables.tf

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,60 @@ variable "latency_slo_timeframe" {
5454
type = string
5555
default = "30d"
5656
}
57+
58+
variable "latency_slo_burn_rate_priority" {
59+
description = "Number from 1 (high) to 5 (low)."
60+
61+
type = number
62+
default = 3
63+
}
64+
65+
variable "latency_slo_burn_rate_warning" {
66+
type = number
67+
default = null
68+
}
69+
70+
variable "latency_slo_burn_rate_critical" {
71+
type = number
72+
default = 10 # 10x burn rate
73+
}
74+
75+
variable "latency_slo_burn_rate_note" {
76+
type = string
77+
default = ""
78+
}
79+
80+
variable "latency_slo_burn_rate_docs" {
81+
type = string
82+
default = "Use burn rate alerts to measure how fast your error budget is being depleted relative to the time window of your SLO. For example, for a 30 day SLO if a burn rate of 1 is sustained, that means the error budget will be fully depleted in exactly 30 days, a burn rate of 2 means in exactly 15 days, etc. Therefore, you could use a burn rate alert to notify you if a burn rate of 10 is measured in the past hour. Burn rate alerts evaluate two time windows: a long window which you specify and a short window that is automatically calculated as 1/12 of your long window. The long window's purpose is to reduce alert flappiness, while the short window's purpose is to improve recovery time. If your threshold is violated in both windows, you will receive an alert."
83+
}
84+
85+
variable "latency_slo_burn_rate_evaluation_period" {
86+
type = string
87+
default = "30d"
88+
}
89+
90+
variable "latency_slo_burn_rate_short_window" {
91+
type = string
92+
default = "5m"
93+
}
94+
95+
variable "latency_slo_burn_rate_long_window" {
96+
type = string
97+
default = "1h"
98+
}
99+
100+
variable "latency_slo_burn_rate_notification_channel_override" {
101+
type = string
102+
default = ""
103+
}
104+
105+
variable "latency_slo_burn_rate_enabled" {
106+
type = bool
107+
default = true
108+
}
109+
110+
variable "latency_slo_burn_rate_alerting_enabled" {
111+
type = bool
112+
default = true
113+
}

latency-slo.tf

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ locals {
33
var.latency_slo_filter_override,
44
local.filter_str
55
)
6+
latency_slo_burn_rate_notification_channel = try(coalesce(
7+
var.latency_slo_burn_rate_notification_channel_override,
8+
var.notification_channel
9+
), "")
610
}
711

812

@@ -25,3 +29,36 @@ resource "datadog_service_level_objective" "latency_slo" {
2529

2630
tags = local.normalized_tags
2731
}
32+
33+
34+
module "latency_slo_burn_rate" {
35+
source = "kabisa/generic-monitor/datadog"
36+
version = "0.7.4"
37+
38+
name = "APM - Latency SLO - Burn Rate"
39+
query = "burn_rate(\"${datadog_service_level_objective.latency_slo[0].id}\").over(\"${var.latency_slo_burn_rate_evaluation_period}\").long_window(\"${var.latency_slo_burn_rate_long_window}\").short_window(\"${var.latency_slo_burn_rate_short_window}\") > ${var.latency_slo_burn_rate_critical}"
40+
41+
42+
alert_message = "${local.service_display_name} service is burning through its Latency Budget. The percentage of slow requests is {{threshold}}x higher than expected"
43+
recovery_message = "${local.service_display_name} service burn rate has recovered"
44+
type = "slo alert"
45+
46+
# monitor level vars
47+
enabled = var.latency_slo_enabled && var.latency_slo_burn_rate_enabled
48+
alerting_enabled = var.latency_slo_burn_rate_alerting_enabled
49+
warning_threshold = var.latency_slo_burn_rate_warning
50+
critical_threshold = var.latency_slo_burn_rate_critical
51+
priority = var.latency_slo_burn_rate_priority
52+
docs = var.latency_slo_burn_rate_docs
53+
note = var.latency_slo_burn_rate_note
54+
55+
# module level vars
56+
env = var.alert_env
57+
service = var.service
58+
service_display_name = var.service_display_name
59+
notification_channel = local.latency_slo_burn_rate_notification_channel
60+
additional_tags = var.additional_tags
61+
locked = var.locked
62+
name_prefix = var.name_prefix
63+
name_suffix = var.name_suffix
64+
}

latency-variables.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,8 @@ variable "latency_priority" {
4444
type = number
4545
default = 3
4646
}
47+
48+
variable "latency_notification_channel_override" {
49+
type = string
50+
default = ""
51+
}

0 commit comments

Comments
 (0)