diff --git a/README.md b/README.md index 2a73355b..c65b33d1 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ https://sloth.dev Release the Sloth! ```bash -sloth generate -i ./examples/getting-started.yml +sloth generate --slo-plugins-path=./examples/plugins -i ./examples/getting-started.yml ``` ```yaml @@ -64,6 +64,9 @@ labels: owner: "myteam" repo: "myorg/myservice" tier: "2" +slo_plugins: + chain: + - id: "sloth.dev/contrib/alert_for/v1" slos: # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%). - name: "requests-availability" @@ -83,10 +86,14 @@ slos: # Overwrite default Sloth SLO alert summmary on ticket and page alerts. summary: "High error rate on 'myservice' requests responses" page_alert: + # Requires `sloth.dev/contrib/alert_for/v1` from `--slo-plugins-path`. + for: 5m labels: severity: "pageteam" routing_key: "myteam" ticket_alert: + # Requires `sloth.dev/contrib/alert_for/v1` from `--slo-plugins-path`. + for: 10m labels: severity: "slack" slack_channel: "#alerts-myteam" diff --git a/examples/getting-started.yml b/examples/getting-started.yml index c1ba5e78..eaf2e8dc 100644 --- a/examples/getting-started.yml +++ b/examples/getting-started.yml @@ -4,6 +4,9 @@ labels: owner: "myteam" repo: "myorg/myservice" tier: "2" +slo_plugins: + chain: + - id: "sloth.dev/contrib/alert_for/v1" slos: # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%). - name: "requests-availability" @@ -21,10 +24,12 @@ slos: # Overwrite default Sloth SLO alert summmary on ticket and page alerts. 
summary: "High error rate on 'myservice' requests responses" page_alert: + for: 5m labels: severity: pageteam routing_key: myteam ticket_alert: + for: 10m labels: severity: "slack" slack_channel: "#alerts-myteam" diff --git a/internal/plugin/slo/contrib/alert_for_v1/README.md b/internal/plugin/slo/contrib/alert_for_v1/README.md new file mode 100644 index 00000000..b0043cbf --- /dev/null +++ b/internal/plugin/slo/contrib/alert_for_v1/README.md @@ -0,0 +1,31 @@ +# sloth.dev/contrib/alert_for/v1 + +This plugin sets Prometheus alert `for` durations from the Sloth `prometheus/v1` YAML spec fields: + +- `slos[].alerting.page_alert.for` +- `slos[].alerting.ticket_alert.for` + +This plugin is required because the core plugins ignore `for` and always generate alerts without a pending time. + +## Example + +```yaml +version: "prometheus/v1" +service: "myservice" +slo_plugins: + chain: + - id: "sloth.dev/contrib/alert_for/v1" +slos: + - name: "requests-availability" + objective: 99.9 + sli: + events: + error_query: sum(rate(http_requests_total{code=~"5.."}[{{.window}}])) + total_query: sum(rate(http_requests_total[{{.window}}])) + alerting: + name: MyServiceHighErrorRate + page_alert: + for: 5m + ticket_alert: + for: 10m +``` diff --git a/internal/plugin/slo/contrib/alert_for_v1/plugin.go b/internal/plugin/slo/contrib/alert_for_v1/plugin.go new file mode 100644 index 00000000..a3f4d977 --- /dev/null +++ b/internal/plugin/slo/contrib/alert_for_v1/plugin.go @@ -0,0 +1,70 @@ +package plugin + +import ( + "context" + "encoding/json" + + prommodel "github.com/prometheus/common/model" + + "github.com/slok/sloth/pkg/common/conventions" + pluginslov1 "github.com/slok/sloth/pkg/prometheus/plugin/slo/v1" +) + +const ( + PluginVersion = "prometheus/slo/v1" + PluginID = "sloth.dev/contrib/alert_for/v1" +) + +func NewPlugin(_ json.RawMessage, _ pluginslov1.AppUtils) (pluginslov1.Plugin, error) { + return plugin{}, nil +} + +type plugin struct{} + +func (p plugin) ProcessSLO(_ 
context.Context, request *pluginslov1.Request, result *pluginslov1.Result) error { + src := request.OriginalSource.SlothV1 + if src == nil { + return nil + } + + var pageFor prommodel.Duration + var ticketFor prommodel.Duration + found := false + for _, specSLO := range src.SLOs { + if specSLO.Name != request.SLO.Name { + continue + } + + pageFor = specSLO.Alerting.PageAlert.For + ticketFor = specSLO.Alerting.TicketAlert.For + found = true + break + } + + if !found || (pageFor == 0 && ticketFor == 0) { + return nil + } + + pageSeverity := request.MWMBAlertGroup.PageQuick.Severity.String() + ticketSeverity := request.MWMBAlertGroup.TicketQuick.Severity.String() + + for i := range result.SLORules.AlertRules.Rules { + rule := &result.SLORules.AlertRules.Rules[i] + if rule.Labels == nil { + continue + } + + switch rule.Labels[conventions.PromSLOSeverityLabelName] { + case pageSeverity: + if pageFor != 0 { + rule.For = pageFor + } + case ticketSeverity: + if ticketFor != 0 { + rule.For = ticketFor + } + } + } + + return nil +} diff --git a/internal/plugin/slo/contrib/alert_for_v1/plugin_test.go b/internal/plugin/slo/contrib/alert_for_v1/plugin_test.go new file mode 100644 index 00000000..c7a8716d --- /dev/null +++ b/internal/plugin/slo/contrib/alert_for_v1/plugin_test.go @@ -0,0 +1,143 @@ +package plugin_test + +import ( + "testing" + "time" + + prommodel "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/rulefmt" + "github.com/stretchr/testify/assert" + + plugin "github.com/slok/sloth/internal/plugin/slo/contrib/alert_for_v1" + "github.com/slok/sloth/pkg/common/conventions" + "github.com/slok/sloth/pkg/common/model" + prometheusv1 "github.com/slok/sloth/pkg/prometheus/api/v1" + pluginslov1 "github.com/slok/sloth/pkg/prometheus/plugin/slo/v1" + pluginslov1testing "github.com/slok/sloth/pkg/prometheus/plugin/slo/v1/testing" +) + +func TestPlugin(t *testing.T) { + tests := map[string]struct { + pluginFactory func(t *testing.T) 
(pluginslov1.Plugin, error) + req pluginslov1.Request + res pluginslov1.Result + expRes pluginslov1.Result + }{ + "Using the plugin as embedded yaegi plugin, it should set page and ticket `for` durations.": { + pluginFactory: func(t *testing.T) (pluginslov1.Plugin, error) { + return pluginslov1testing.NewTestPlugin(t.Context(), pluginslov1testing.TestPluginConfig{}) + }, + req: pluginslov1.Request{ + SLO: model.PromSLO{Name: "requests-availability"}, + MWMBAlertGroup: model.MWMBAlertGroup{ + PageQuick: model.MWMBAlert{Severity: model.PageAlertSeverity}, + TicketQuick: model.MWMBAlert{Severity: model.TicketAlertSeverity}, + }, + OriginalSource: model.PromSLOGroupSource{ + SlothV1: &prometheusv1.Spec{ + Version: prometheusv1.Version, + Service: "myservice", + SLOs: []prometheusv1.SLO{ + { + Name: "requests-availability", + Objective: 99.9, + SLI: prometheusv1.SLI{Raw: &prometheusv1.SLIRaw{ErrorRatioQuery: "1"}}, + Alerting: prometheusv1.Alerting{ + Name: "MyServiceHighErrorRate", + PageAlert: prometheusv1.Alert{For: prommodel.Duration(5 * time.Minute)}, + TicketAlert: prometheusv1.Alert{For: prommodel.Duration(10 * time.Minute)}, + }, + }, + }, + }, + }, + }, + res: pluginslov1.Result{ + SLORules: model.PromSLORules{ + AlertRules: model.PromRuleGroup{ + Rules: []rulefmt.Rule{ + {Alert: "MyServiceHighErrorRate", Labels: map[string]string{conventions.PromSLOSeverityLabelName: "page"}}, + {Alert: "MyServiceHighErrorRate", Labels: map[string]string{conventions.PromSLOSeverityLabelName: "ticket"}}, + }, + }, + }, + }, + expRes: pluginslov1.Result{ + SLORules: model.PromSLORules{ + AlertRules: model.PromRuleGroup{ + Rules: []rulefmt.Rule{ + {Alert: "MyServiceHighErrorRate", For: prommodel.Duration(5 * time.Minute), Labels: map[string]string{conventions.PromSLOSeverityLabelName: "page"}}, + {Alert: "MyServiceHighErrorRate", For: prommodel.Duration(10 * time.Minute), Labels: map[string]string{conventions.PromSLOSeverityLabelName: "ticket"}}, + }, + }, + }, + }, + }, + + 
"Using the plugin as compiled Go plugin, it should set page and ticket `for` durations.": { + pluginFactory: func(t *testing.T) (pluginslov1.Plugin, error) { + return plugin.NewPlugin(nil, pluginslov1.AppUtils{}) + }, + req: pluginslov1.Request{ + SLO: model.PromSLO{Name: "requests-availability"}, + MWMBAlertGroup: model.MWMBAlertGroup{ + PageQuick: model.MWMBAlert{Severity: model.PageAlertSeverity}, + TicketQuick: model.MWMBAlert{Severity: model.TicketAlertSeverity}, + }, + OriginalSource: model.PromSLOGroupSource{ + SlothV1: &prometheusv1.Spec{ + Version: prometheusv1.Version, + Service: "myservice", + SLOs: []prometheusv1.SLO{ + { + Name: "requests-availability", + Objective: 99.9, + SLI: prometheusv1.SLI{Raw: &prometheusv1.SLIRaw{ErrorRatioQuery: "1"}}, + Alerting: prometheusv1.Alerting{ + Name: "MyServiceHighErrorRate", + PageAlert: prometheusv1.Alert{For: prommodel.Duration(5 * time.Minute)}, + TicketAlert: prometheusv1.Alert{For: prommodel.Duration(10 * time.Minute)}, + }, + }, + }, + }, + }, + }, + res: pluginslov1.Result{ + SLORules: model.PromSLORules{ + AlertRules: model.PromRuleGroup{ + Rules: []rulefmt.Rule{ + {Alert: "MyServiceHighErrorRate", Labels: map[string]string{conventions.PromSLOSeverityLabelName: "page"}}, + {Alert: "MyServiceHighErrorRate", Labels: map[string]string{conventions.PromSLOSeverityLabelName: "ticket"}}, + }, + }, + }, + }, + expRes: pluginslov1.Result{ + SLORules: model.PromSLORules{ + AlertRules: model.PromRuleGroup{ + Rules: []rulefmt.Rule{ + {Alert: "MyServiceHighErrorRate", For: prommodel.Duration(5 * time.Minute), Labels: map[string]string{conventions.PromSLOSeverityLabelName: "page"}}, + {Alert: "MyServiceHighErrorRate", For: prommodel.Duration(10 * time.Minute), Labels: map[string]string{conventions.PromSLOSeverityLabelName: "ticket"}}, + }, + }, + }, + }, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + assert := assert.New(t) + + p, err := test.pluginFactory(t) + assert.NoError(err) + + 
res := test.res + err = p.ProcessSLO(t.Context(), &test.req, &res) + if assert.NoError(err) { + assert.Equal(test.expRes, res) + } + }) + } +} diff --git a/pkg/prometheus/api/v1/README.md b/pkg/prometheus/api/v1/README.md index 9c9d239a..542ef8bc 100755 --- a/pkg/prometheus/api/v1/README.md +++ b/pkg/prometheus/api/v1/README.md @@ -87,7 +87,7 @@ const Version = "prometheus/v1" ``` -## type [Alert]() +## type [Alert]() Alert configures specific SLO alert. @@ -96,6 +96,12 @@ type Alert struct { // Disable disables the alert and makes Sloth not generating this alert. This // can be helpful for example to disable ticket(warning) alerts. Disable bool `json:"disable,omitempty"` + // For is the alerting time window that a rule must be active before firing. + // Default is 0m, which means no pending time. + // + // Sloth core plugins ignore this field. Use an output plugin (SLO plugin) that applies + // it on the generated Prometheus rules, like the `sloth.dev/contrib/alert_for/v1` plugin. + For prommodel.Duration `json:"for,omitempty"` // Labels are the Prometheus labels for the specific alert. For example can be // useful to route the Page alert to specific Slack channel. Labels map[string]string `json:"labels,omitempty"` @@ -105,7 +111,7 @@ type Alert struct { ``` -## type [Alerting]() +## type [Alerting]() Alerting wraps all the configuration required by the SLO alerts. @@ -126,7 +132,7 @@ type Alerting struct { ``` -## type [SLI]() +## type [SLI]() SLI will tell what is good or bad for the SLO. All SLIs will be get based on time windows, that's why Sloth needs the queries to use \`\{\{.window\}\}\` template variable. @@ -144,7 +150,7 @@ type SLI struct { ``` -## type [SLIEvents]() +## type [SLIEvents]() SLIEvents is an SLI that is calculated as the division of bad events and total events, giving a ratio SLI. Normally this is the most common ratio type. 
@@ -162,7 +168,7 @@ type SLIEvents struct { ``` -## type [SLIPlugin]() +## type [SLIPlugin]() SLIPlugin will use the SLI returned by the SLI plugin selected along with the options. @@ -176,7 +182,7 @@ type SLIPlugin struct { ``` -## type [SLIRaw]() +## type [SLIRaw]() SLIRaw is a error ratio SLI already calculated. Normally this will be used when the SLI is already calculated by other recording rule, system... @@ -188,7 +194,7 @@ type SLIRaw struct { ``` -## type [SLO]() +## type [SLO]() SLO is the configuration/declaration of the service level objective of a service. @@ -216,7 +222,7 @@ type SLO struct { ``` -## type [SLOPlugin]() +## type [SLOPlugin]() SLOPlugin is a plugin that will be used on the chain of plugins for the SLO generation. @@ -237,7 +243,7 @@ type SLOPlugin struct { ``` -## type [SLOPlugins]() +## type [SLOPlugins]() SLOPlugins are the list plugins that will be used on the process of SLOs for the rules generation. @@ -255,7 +261,7 @@ type SLOPlugins struct { ``` -## type [Spec]() +## type [Spec]() Spec represents the root type of the SLOs declaration specification. diff --git a/pkg/prometheus/api/v1/v1.go b/pkg/prometheus/api/v1/v1.go index e18b4b9f..3b7e639f 100644 --- a/pkg/prometheus/api/v1/v1.go +++ b/pkg/prometheus/api/v1/v1.go @@ -54,7 +54,11 @@ // disable: true package v1 -import "encoding/json" +import ( + "encoding/json" + + prommodel "github.com/prometheus/common/model" +) const Version = "prometheus/v1" @@ -160,6 +164,12 @@ type Alert struct { // Disable disables the alert and makes Sloth not generating this alert. This // can be helpful for example to disable ticket(warning) alerts. Disable bool `json:"disable,omitempty"` + // For is the alerting time window that a rule must be active before firing. + // Default is 0m, which means no pending time. + // + // Sloth core plugins ignore this field. Use an output plugin (SLO plugin) that applies + // it on the generated Prometheus rules, like the `sloth.dev/contrib/alert_for/v1` plugin. 
+ For prommodel.Duration `json:"for,omitempty"` // Labels are the Prometheus labels for the specific alert. For example can be // useful to route the Page alert to specific Slack channel. Labels map[string]string `json:"labels,omitempty"`