Skip to content

Commit 2ea0968

Browse files
committed
feat: alert_for
Provides alert tuning of `for:` Prometheus clauses. Note that it requires a tiny addition to the API structure to represent the new field. Reference: #787 Signed-off-by: Robin H. Johnson <rjohnson@coreweave.com>
1 parent 3757b66 commit 2ea0968

File tree

7 files changed

+284
-12
lines changed

7 files changed

+284
-12
lines changed

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ https://sloth.dev
5454
Release the Sloth!
5555

5656
```bash
57-
sloth generate -i ./examples/getting-started.yml
57+
sloth generate --slo-plugins-path=./examples/plugins -i ./examples/getting-started.yml
5858
```
5959

6060
```yaml
@@ -64,6 +64,9 @@ labels:
6464
owner: "myteam"
6565
repo: "myorg/myservice"
6666
tier: "2"
67+
slo_plugins:
68+
chain:
69+
- id: "sloth.dev/contrib/alert_for/v1"
6770
slos:
6871
# We allow failing (5xx and 429) 1 request every 1000 requests (99.9%).
6972
- name: "requests-availability"
@@ -83,10 +86,14 @@ slos:
8386
# Overwrite default Sloth SLO alert summary on ticket and page alerts.
8487
summary: "High error rate on 'myservice' requests responses"
8588
page_alert:
89+
# Requires `sloth.dev/contrib/alert_for/v1` from `--slo-plugins-path`.
90+
for: 5m
8691
labels:
8792
severity: "pageteam"
8893
routing_key: "myteam"
8994
ticket_alert:
95+
# Requires `sloth.dev/contrib/alert_for/v1` from `--slo-plugins-path`.
96+
for: 10m
9097
labels:
9198
severity: "slack"
9299
slack_channel: "#alerts-myteam"

examples/getting-started.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ labels:
44
owner: "myteam"
55
repo: "myorg/myservice"
66
tier: "2"
7+
slo_plugins:
8+
chain:
9+
- id: "sloth.dev/contrib/alert_for/v1"
710
slos:
811
# We allow failing (5xx and 429) 1 request every 1000 requests (99.9%).
912
- name: "requests-availability"
@@ -21,10 +24,12 @@ slos:
2124
# Overwrite default Sloth SLO alert summary on ticket and page alerts.
2225
summary: "High error rate on 'myservice' requests responses"
2326
page_alert:
27+
for: 5m
2428
labels:
2529
severity: pageteam
2630
routing_key: myteam
2731
ticket_alert:
32+
for: 10m
2833
labels:
2934
severity: "slack"
3035
slack_channel: "#alerts-myteam"
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# sloth.dev/contrib/alert_for/v1
2+
3+
This plugin sets Prometheus alert `for` durations from the Sloth `prometheus/v1` YAML spec fields:
4+
5+
- `slos[].alerting.page_alert.for`
6+
- `slos[].alerting.ticket_alert.for`
7+
8+
This plugin is required because the core plugins ignore `for` and always generate alerts without a pending time.
9+
10+
## Example
11+
12+
```yaml
13+
version: "prometheus/v1"
14+
service: "myservice"
15+
slo_plugins:
16+
chain:
17+
- id: "sloth.dev/contrib/alert_for/v1"
18+
slos:
19+
- name: "requests-availability"
20+
objective: 99.9
21+
sli:
22+
events:
23+
error_query: sum(rate(http_requests_total{code=~"5.."}[{{.window}}]))
24+
total_query: sum(rate(http_requests_total[{{.window}}]))
25+
alerting:
26+
name: MyServiceHighErrorRate
27+
page_alert:
28+
for: 5m
29+
ticket_alert:
30+
for: 10m
31+
```
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
package plugin
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
7+
prommodel "github.com/prometheus/common/model"
8+
9+
"github.com/slok/sloth/pkg/common/conventions"
10+
pluginslov1 "github.com/slok/sloth/pkg/prometheus/plugin/slo/v1"
11+
)
12+
13+
// Identification constants exported by the plugin package.
const (
	// PluginVersion is the version string this plugin declares.
	// NOTE(review): presumably the SLO plugin API version checked by the
	// plugin loader; confirm against the loader.
	PluginVersion = "prometheus/slo/v1"

	// PluginID is the identifier used to reference this plugin, e.g. in the
	// `slo_plugins.chain[].id` field of a Sloth spec.
	PluginID = "sloth.dev/contrib/alert_for/v1"
)
17+
18+
func NewPlugin(_ json.RawMessage, _ pluginslov1.AppUtils) (pluginslov1.Plugin, error) {
19+
return plugin{}, nil
20+
}
21+
22+
// plugin is the alert_for SLO plugin implementation; it has no fields and
// therefore holds no state or configuration.
type plugin struct{}
23+
24+
func (p plugin) ProcessSLO(_ context.Context, request *pluginslov1.Request, result *pluginslov1.Result) error {
25+
src := request.OriginalSource.SlothV1
26+
if src == nil {
27+
return nil
28+
}
29+
30+
var pageFor prommodel.Duration
31+
var ticketFor prommodel.Duration
32+
found := false
33+
for _, specSLO := range src.SLOs {
34+
if specSLO.Name != request.SLO.Name {
35+
continue
36+
}
37+
38+
pageFor = specSLO.Alerting.PageAlert.For
39+
ticketFor = specSLO.Alerting.TicketAlert.For
40+
found = true
41+
break
42+
}
43+
44+
if !found || (pageFor == 0 && ticketFor == 0) {
45+
return nil
46+
}
47+
48+
pageSeverity := request.MWMBAlertGroup.PageQuick.Severity.String()
49+
ticketSeverity := request.MWMBAlertGroup.TicketQuick.Severity.String()
50+
51+
for i := range result.SLORules.AlertRules.Rules {
52+
rule := &result.SLORules.AlertRules.Rules[i]
53+
if rule.Labels == nil {
54+
continue
55+
}
56+
57+
switch rule.Labels[conventions.PromSLOSeverityLabelName] {
58+
case pageSeverity:
59+
if pageFor != 0 {
60+
rule.For = pageFor
61+
}
62+
case ticketSeverity:
63+
if ticketFor != 0 {
64+
rule.For = ticketFor
65+
}
66+
}
67+
}
68+
69+
return nil
70+
}
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
package plugin_test
2+
3+
import (
4+
"testing"
5+
"time"
6+
7+
prommodel "github.com/prometheus/common/model"
8+
"github.com/prometheus/prometheus/model/rulefmt"
9+
"github.com/stretchr/testify/assert"
10+
11+
plugin "github.com/slok/sloth/internal/plugin/slo/contrib/alert_for_v1"
12+
"github.com/slok/sloth/pkg/common/conventions"
13+
"github.com/slok/sloth/pkg/common/model"
14+
prometheusv1 "github.com/slok/sloth/pkg/prometheus/api/v1"
15+
pluginslov1 "github.com/slok/sloth/pkg/prometheus/plugin/slo/v1"
16+
pluginslov1testing "github.com/slok/sloth/pkg/prometheus/plugin/slo/v1/testing"
17+
)
18+
19+
func TestPlugin(t *testing.T) {
20+
tests := map[string]struct {
21+
pluginFactory func(t *testing.T) (pluginslov1.Plugin, error)
22+
req pluginslov1.Request
23+
res pluginslov1.Result
24+
expRes pluginslov1.Result
25+
}{
26+
"Using the plugin as embedded yaegi plugin, it should set page and ticket `for` durations.": {
27+
pluginFactory: func(t *testing.T) (pluginslov1.Plugin, error) {
28+
return pluginslov1testing.NewTestPlugin(t.Context(), pluginslov1testing.TestPluginConfig{})
29+
},
30+
req: pluginslov1.Request{
31+
SLO: model.PromSLO{Name: "requests-availability"},
32+
MWMBAlertGroup: model.MWMBAlertGroup{
33+
PageQuick: model.MWMBAlert{Severity: model.PageAlertSeverity},
34+
TicketQuick: model.MWMBAlert{Severity: model.TicketAlertSeverity},
35+
},
36+
OriginalSource: model.PromSLOGroupSource{
37+
SlothV1: &prometheusv1.Spec{
38+
Version: prometheusv1.Version,
39+
Service: "myservice",
40+
SLOs: []prometheusv1.SLO{
41+
{
42+
Name: "requests-availability",
43+
Objective: 99.9,
44+
SLI: prometheusv1.SLI{Raw: &prometheusv1.SLIRaw{ErrorRatioQuery: "1"}},
45+
Alerting: prometheusv1.Alerting{
46+
Name: "MyServiceHighErrorRate",
47+
PageAlert: prometheusv1.Alert{For: prommodel.Duration(5 * time.Minute)},
48+
TicketAlert: prometheusv1.Alert{For: prommodel.Duration(10 * time.Minute)},
49+
},
50+
},
51+
},
52+
},
53+
},
54+
},
55+
res: pluginslov1.Result{
56+
SLORules: model.PromSLORules{
57+
AlertRules: model.PromRuleGroup{
58+
Rules: []rulefmt.Rule{
59+
{Alert: "MyServiceHighErrorRate", Labels: map[string]string{conventions.PromSLOSeverityLabelName: "page"}},
60+
{Alert: "MyServiceHighErrorRate", Labels: map[string]string{conventions.PromSLOSeverityLabelName: "ticket"}},
61+
},
62+
},
63+
},
64+
},
65+
expRes: pluginslov1.Result{
66+
SLORules: model.PromSLORules{
67+
AlertRules: model.PromRuleGroup{
68+
Rules: []rulefmt.Rule{
69+
{Alert: "MyServiceHighErrorRate", For: prommodel.Duration(5 * time.Minute), Labels: map[string]string{conventions.PromSLOSeverityLabelName: "page"}},
70+
{Alert: "MyServiceHighErrorRate", For: prommodel.Duration(10 * time.Minute), Labels: map[string]string{conventions.PromSLOSeverityLabelName: "ticket"}},
71+
},
72+
},
73+
},
74+
},
75+
},
76+
77+
"Using the plugin as compiled Go plugin, it should set page and ticket `for` durations.": {
78+
pluginFactory: func(t *testing.T) (pluginslov1.Plugin, error) {
79+
return plugin.NewPlugin(nil, pluginslov1.AppUtils{})
80+
},
81+
req: pluginslov1.Request{
82+
SLO: model.PromSLO{Name: "requests-availability"},
83+
MWMBAlertGroup: model.MWMBAlertGroup{
84+
PageQuick: model.MWMBAlert{Severity: model.PageAlertSeverity},
85+
TicketQuick: model.MWMBAlert{Severity: model.TicketAlertSeverity},
86+
},
87+
OriginalSource: model.PromSLOGroupSource{
88+
SlothV1: &prometheusv1.Spec{
89+
Version: prometheusv1.Version,
90+
Service: "myservice",
91+
SLOs: []prometheusv1.SLO{
92+
{
93+
Name: "requests-availability",
94+
Objective: 99.9,
95+
SLI: prometheusv1.SLI{Raw: &prometheusv1.SLIRaw{ErrorRatioQuery: "1"}},
96+
Alerting: prometheusv1.Alerting{
97+
Name: "MyServiceHighErrorRate",
98+
PageAlert: prometheusv1.Alert{For: prommodel.Duration(5 * time.Minute)},
99+
TicketAlert: prometheusv1.Alert{For: prommodel.Duration(10 * time.Minute)},
100+
},
101+
},
102+
},
103+
},
104+
},
105+
},
106+
res: pluginslov1.Result{
107+
SLORules: model.PromSLORules{
108+
AlertRules: model.PromRuleGroup{
109+
Rules: []rulefmt.Rule{
110+
{Alert: "MyServiceHighErrorRate", Labels: map[string]string{conventions.PromSLOSeverityLabelName: "page"}},
111+
{Alert: "MyServiceHighErrorRate", Labels: map[string]string{conventions.PromSLOSeverityLabelName: "ticket"}},
112+
},
113+
},
114+
},
115+
},
116+
expRes: pluginslov1.Result{
117+
SLORules: model.PromSLORules{
118+
AlertRules: model.PromRuleGroup{
119+
Rules: []rulefmt.Rule{
120+
{Alert: "MyServiceHighErrorRate", For: prommodel.Duration(5 * time.Minute), Labels: map[string]string{conventions.PromSLOSeverityLabelName: "page"}},
121+
{Alert: "MyServiceHighErrorRate", For: prommodel.Duration(10 * time.Minute), Labels: map[string]string{conventions.PromSLOSeverityLabelName: "ticket"}},
122+
},
123+
},
124+
},
125+
},
126+
},
127+
}
128+
129+
for name, test := range tests {
130+
t.Run(name, func(t *testing.T) {
131+
assert := assert.New(t)
132+
133+
p, err := test.pluginFactory(t)
134+
assert.NoError(err)
135+
136+
res := test.res
137+
err = p.ProcessSLO(t.Context(), &test.req, &res)
138+
if assert.NoError(err) {
139+
assert.Equal(test.expRes, res)
140+
}
141+
})
142+
}
143+
}

0 commit comments

Comments
 (0)