Skip to content

Commit d21a242

Browse files
Health Rules
1 parent ab9ae57 commit d21a242

25 files changed

+917
-392
lines changed

api/flowcollector/v1beta2/flowcollector_alert_types.go

Lines changed: 74 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -10,64 +10,88 @@ import (
1010
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1111
)
1212

13-
type AlertTemplate string
14-
type AlertGroupBy string
13+
type HealthRuleTemplate string
14+
type HealthRuleGroupBy string
15+
type HealthRuleMode string
16+
17+
// Backward-compatibility type aliases for the disableAlerts field
18+
type AlertTemplate = HealthRuleTemplate
19+
type AlertGroupBy = HealthRuleGroupBy
1520

1621
const (
17-
AlertNoFlows AlertTemplate = "NetObservNoFlows"
18-
AlertLokiError AlertTemplate = "NetObservLokiError"
19-
AlertPacketDropsByKernel AlertTemplate = "PacketDropsByKernel"
20-
AlertPacketDropsByDevice AlertTemplate = "PacketDropsByDevice"
21-
AlertIPsecErrors AlertTemplate = "IPsecErrors"
22-
AlertNetpolDenied AlertTemplate = "NetpolDenied"
23-
AlertLatencyHighTrend AlertTemplate = "LatencyHighTrend"
24-
AlertDNSErrors AlertTemplate = "DNSErrors"
25-
AlertDNSNxDomain AlertTemplate = "DNSNxDomain"
26-
AlertExternalEgressHighTrend AlertTemplate = "ExternalEgressHighTrend"
27-
AlertExternalIngressHighTrend AlertTemplate = "ExternalIngressHighTrend"
28-
GroupByNode AlertGroupBy = "Node"
29-
GroupByNamespace AlertGroupBy = "Namespace"
30-
GroupByWorkload AlertGroupBy = "Workload"
22+
// Alert-only templates (cannot be used as recording rules)
23+
AlertNoFlows AlertTemplate = "NetObservNoFlows"
24+
AlertLokiError AlertTemplate = "NetObservLokiError"
25+
26+
// Health rule templates (can be either alerts or recording rules depending on mode)
27+
HealthRulePacketDropsByKernel HealthRuleTemplate = "PacketDropsByKernel"
28+
HealthRulePacketDropsByDevice HealthRuleTemplate = "PacketDropsByDevice"
29+
HealthRuleIPsecErrors HealthRuleTemplate = "IPsecErrors"
30+
HealthRuleNetpolDenied HealthRuleTemplate = "NetpolDenied"
31+
HealthRuleLatencyHighTrend HealthRuleTemplate = "LatencyHighTrend"
32+
HealthRuleDNSErrors HealthRuleTemplate = "DNSErrors"
33+
HealthRuleDNSNxDomain HealthRuleTemplate = "DNSNxDomain"
34+
HealthRuleExternalEgressHighTrend HealthRuleTemplate = "ExternalEgressHighTrend"
35+
HealthRuleExternalIngressHighTrend HealthRuleTemplate = "ExternalIngressHighTrend"
36+
37+
GroupByNode HealthRuleGroupBy = "Node"
38+
GroupByNamespace HealthRuleGroupBy = "Namespace"
39+
GroupByWorkload HealthRuleGroupBy = "Workload"
40+
41+
ModeAlert HealthRuleMode = "Alert"
42+
ModeMetricOnly HealthRuleMode = "MetricOnly"
3143
)
3244

33-
type FLPAlert struct {
34-
// Alert template name.
45+
type FLPHealthRule struct {
46+
// Health rule template name.
3547
// Possible values are: `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`,
3648
// `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`.
37-
// More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md
49+
// Note: `NetObservNoFlows` and `NetObservLokiError` are alert-only and cannot be used as health rules.
50+
// More information on health rules: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md
3851
// +kubebuilder:validation:Enum:="PacketDropsByKernel";"PacketDropsByDevice";"IPsecErrors";"NetpolDenied";"LatencyHighTrend";"DNSErrors";"DNSNxDomain";"ExternalEgressHighTrend";"ExternalIngressHighTrend"
3952
// +required
40-
Template AlertTemplate `json:"template,omitempty"`
53+
Template HealthRuleTemplate `json:"template,omitempty"`
54+
55+
// Mode defines whether this health rule should be generated as an alert or a recording rule.
56+
// Possible values are: `Alert` (default), `MetricOnly`.
57+
// Violations of `MetricOnly` rules are visible in the Network Health dashboard without generating any Prometheus alert.
58+
// This provides an alternative way of getting health information for SREs and cluster admins who may find
59+
// many new alerts burdensome.
60+
// +kubebuilder:validation:Enum:="Alert";"MetricOnly"
61+
// +kubebuilder:default:="Alert"
62+
// +optional
63+
Mode HealthRuleMode `json:"mode,omitempty"`
4164

4265
// A list of variants for this template
4366
// +required
44-
Variants []AlertVariant `json:"variants,omitempty"`
67+
Variants []HealthRuleVariant `json:"variants,omitempty"`
4568
}
4669

47-
type AlertVariant struct {
70+
type HealthRuleVariant struct {
4871
// The low volume threshold allows ignoring metrics with too low a volume of traffic, in order to improve the signal-to-noise ratio.
4972
// It is provided as an absolute rate (bytes per second or packets per second, depending on the context).
5073
// When provided, it must be parsable as a float.
5174
LowVolumeThreshold string `json:"lowVolumeThreshold,omitempty"`
5275

53-
// Thresholds of the alert per severity.
76+
// Thresholds of the health rule per severity.
5477
// They are expressed as a percentage of errors above which the alert is triggered. They must be parsable as floats.
78+
// Required for both `Alert` and `MetricOnly` modes.
5579
// +required
56-
Thresholds AlertThresholds `json:"thresholds,omitempty"`
80+
Thresholds HealthRuleThresholds `json:"thresholds,omitempty"`
5781

5882
// Optional grouping criteria, possible values are: `Node`, `Namespace`, `Workload`.
5983
// +kubebuilder:validation:Enum:="";"Node";"Namespace";"Workload"
6084
// +optional
61-
GroupBy AlertGroupBy `json:"groupBy,omitempty"`
85+
GroupBy HealthRuleGroupBy `json:"groupBy,omitempty"`
6286

63-
// For trending alerts, the time offset for baseline comparison. For example, "1d" means comparing against yesterday. Defaults to 1d.
87+
// For trending health rules, the time offset for baseline comparison. For example, "1d" means comparing against yesterday. Defaults to 1d.
6488
TrendOffset *metav1.Duration `json:"trendOffset,omitempty"`
6589

66-
// For trending alerts, the duration interval for baseline comparison. For example, "2h" means comparing against a 2-hours average. Defaults to 2h.
90+
// For trending health rules, the duration interval for baseline comparison. For example, "2h" means comparing against a 2-hours average. Defaults to 2h.
6791
TrendDuration *metav1.Duration `json:"trendDuration,omitempty"`
6892
}
6993

70-
type AlertThresholds struct {
94+
type HealthRuleThresholds struct {
7195
// Threshold for severity `info`. Leave empty to not generate an Info alert.
7296
Info string `json:"info,omitempty"`
7397

@@ -123,19 +147,19 @@ func removeMetricsByPattern(list []string, search string) []string {
123147
return filtered
124148
}
125149

126-
func (s *FlowCollectorSpec) GetFLPAlerts() []FLPAlert {
127-
var ret []FLPAlert
128-
var templates []AlertTemplate // for reproducible ordering
150+
func (s *FlowCollectorSpec) GetFLPHealthRules() []FLPHealthRule {
151+
var rules []FLPHealthRule
152+
var templates []HealthRuleTemplate // for reproducible ordering
129153

130-
tplMap := make(map[AlertTemplate]FLPAlert)
131-
for _, group := range DefaultAlerts {
154+
tplMap := make(map[HealthRuleTemplate]FLPHealthRule)
155+
for _, group := range DefaultHealthRules {
132156
if !slices.Contains(s.Processor.Metrics.DisableAlerts, group.Template) {
133157
tplMap[group.Template] = group
134158
templates = append(templates, group.Template)
135159
}
136160
}
137-
if s.Processor.Metrics.Alerts != nil {
138-
for _, group := range *s.Processor.Metrics.Alerts {
161+
if s.Processor.Metrics.HealthRules != nil {
162+
for _, group := range *s.Processor.Metrics.HealthRules {
139163
if !slices.Contains(s.Processor.Metrics.DisableAlerts, group.Template) {
140164
// A group defined in FC overrides the default group
141165
tplMap[group.Template] = group
@@ -149,42 +173,42 @@ func (s *FlowCollectorSpec) GetFLPAlerts() []FLPAlert {
149173
for _, name := range templates {
150174
tpl := tplMap[name]
151175
if ok, _ := tpl.IsAllowed(s); ok {
152-
ret = append(ret, tpl)
176+
rules = append(rules, tpl)
153177
}
154178
}
155179

156-
return ret
180+
return rules
157181
}
158182

159-
func (g *FLPAlert) IsAllowed(spec *FlowCollectorSpec) (bool, string) {
183+
func (g *FLPHealthRule) IsAllowed(spec *FlowCollectorSpec) (bool, string) {
160184
switch g.Template {
161-
case AlertPacketDropsByKernel:
185+
case HealthRulePacketDropsByKernel:
162186
if !spec.Agent.EBPF.IsPktDropEnabled() {
163-
return false, fmt.Sprintf("Alert %s requires the %s agent feature to be enabled", g.Template, PacketDrop)
187+
return false, fmt.Sprintf("HealthRule %s requires the %s agent feature to be enabled", g.Template, PacketDrop)
164188
}
165-
case AlertIPsecErrors:
189+
case HealthRuleIPsecErrors:
166190
if !spec.Agent.EBPF.IsIPSecEnabled() {
167-
return false, fmt.Sprintf("Alert %s requires the %s agent feature to be enabled", g.Template, IPSec)
191+
return false, fmt.Sprintf("HealthRule %s requires the %s agent feature to be enabled", g.Template, IPSec)
168192
}
169-
case AlertDNSErrors, AlertDNSNxDomain:
193+
case HealthRuleDNSErrors, HealthRuleDNSNxDomain:
170194
if !spec.Agent.EBPF.IsDNSTrackingEnabled() {
171-
return false, fmt.Sprintf("Alert %s requires the %s agent feature to be enabled", g.Template, DNSTracking)
195+
return false, fmt.Sprintf("HealthRule %s requires the %s agent feature to be enabled", g.Template, DNSTracking)
172196
}
173-
case AlertLatencyHighTrend:
197+
case HealthRuleLatencyHighTrend:
174198
if !spec.Agent.EBPF.IsFlowRTTEnabled() {
175-
return false, fmt.Sprintf("Alert %s requires the %s agent feature to be enabled", g.Template, FlowRTT)
199+
return false, fmt.Sprintf("HealthRule %s requires the %s agent feature to be enabled", g.Template, FlowRTT)
176200
}
177-
case AlertNetpolDenied:
201+
case HealthRuleNetpolDenied:
178202
if !spec.Agent.EBPF.IsNetworkEventsEnabled() {
179-
return false, fmt.Sprintf("Alert %s requires the %s agent feature to be enabled", g.Template, NetworkEvents)
203+
return false, fmt.Sprintf("HealthRule %s requires the %s agent feature to be enabled", g.Template, NetworkEvents)
180204
}
181-
case AlertNoFlows, AlertLokiError, AlertPacketDropsByDevice, AlertExternalEgressHighTrend, AlertExternalIngressHighTrend:
205+
case AlertNoFlows, AlertLokiError, HealthRulePacketDropsByDevice, HealthRuleExternalEgressHighTrend, HealthRuleExternalIngressHighTrend:
182206
return true, ""
183207
}
184208
return true, ""
185209
}
186210

187-
func (v *AlertVariant) GetTrendParams() (string, string) {
211+
func (v *HealthRuleVariant) GetTrendParams() (string, string) {
188212
offset := metav1.Duration{Duration: 24 * time.Hour}
189213
if v.TrendOffset != nil {
190214
offset = *v.TrendOffset

api/flowcollector/v1beta2/flowcollector_defaults.go

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -45,20 +45,20 @@ var (
4545
"node_ipsec_flows_total",
4646
"node_to_node_ingress_flows_total",
4747
}
48-
DefaultAlerts = []FLPAlert{
48+
DefaultHealthRules = []FLPHealthRule{
4949
{
50-
Template: AlertPacketDropsByKernel,
51-
Variants: []AlertVariant{
50+
Template: HealthRulePacketDropsByKernel,
51+
Variants: []HealthRuleVariant{
5252
{
53-
Thresholds: AlertThresholds{
53+
Thresholds: HealthRuleThresholds{
5454
Info: "10",
5555
Warning: "20",
5656
},
5757
LowVolumeThreshold: "5",
5858
GroupBy: GroupByNamespace,
5959
},
6060
{
61-
Thresholds: AlertThresholds{
61+
Thresholds: HealthRuleThresholds{
6262
Info: "5",
6363
Warning: "10",
6464
},
@@ -67,42 +67,42 @@ var (
6767
},
6868
},
6969
{
70-
Template: AlertPacketDropsByDevice,
71-
Variants: []AlertVariant{
70+
Template: HealthRulePacketDropsByDevice,
71+
Variants: []HealthRuleVariant{
7272
{
73-
Thresholds: AlertThresholds{
73+
Thresholds: HealthRuleThresholds{
7474
Warning: "5",
7575
},
7676
GroupBy: GroupByNode,
7777
},
7878
},
7979
},
8080
{
81-
Template: AlertIPsecErrors,
82-
Variants: []AlertVariant{
81+
Template: HealthRuleIPsecErrors,
82+
Variants: []HealthRuleVariant{
8383
{
84-
Thresholds: AlertThresholds{
84+
Thresholds: HealthRuleThresholds{
8585
Critical: "2",
8686
},
8787
},
8888
{
89-
Thresholds: AlertThresholds{
89+
Thresholds: HealthRuleThresholds{
9090
Critical: "2",
9191
},
9292
GroupBy: GroupByNode,
9393
},
9494
},
9595
},
9696
{
97-
Template: AlertDNSErrors,
98-
Variants: []AlertVariant{
97+
Template: HealthRuleDNSErrors,
98+
Variants: []HealthRuleVariant{
9999
{
100-
Thresholds: AlertThresholds{
100+
Thresholds: HealthRuleThresholds{
101101
Warning: "5",
102102
},
103103
},
104104
{
105-
Thresholds: AlertThresholds{
105+
Thresholds: HealthRuleThresholds{
106106
Info: "5",
107107
Warning: "10",
108108
},
@@ -111,10 +111,10 @@ var (
111111
},
112112
},
113113
{
114-
Template: AlertDNSNxDomain,
115-
Variants: []AlertVariant{
114+
Template: HealthRuleDNSNxDomain,
115+
Variants: []HealthRuleVariant{
116116
{
117-
Thresholds: AlertThresholds{
117+
Thresholds: HealthRuleThresholds{
118118
Info: "10",
119119
Warning: "80",
120120
},
@@ -123,10 +123,10 @@ var (
123123
},
124124
},
125125
{
126-
Template: AlertNetpolDenied,
127-
Variants: []AlertVariant{
126+
Template: HealthRuleNetpolDenied,
127+
Variants: []HealthRuleVariant{
128128
{
129-
Thresholds: AlertThresholds{
129+
Thresholds: HealthRuleThresholds{
130130
Info: "5",
131131
Warning: "10",
132132
},
@@ -135,10 +135,10 @@ var (
135135
},
136136
},
137137
{
138-
Template: AlertLatencyHighTrend,
139-
Variants: []AlertVariant{
138+
Template: HealthRuleLatencyHighTrend,
139+
Variants: []HealthRuleVariant{
140140
{
141-
Thresholds: AlertThresholds{
141+
Thresholds: HealthRuleThresholds{
142142
Info: "100",
143143
},
144144
GroupBy: GroupByNamespace,
@@ -148,18 +148,18 @@ var (
148148
},
149149
},
150150
{
151-
Template: AlertExternalEgressHighTrend,
152-
Variants: []AlertVariant{
151+
Template: HealthRuleExternalEgressHighTrend,
152+
Variants: []HealthRuleVariant{
153153
{
154-
Thresholds: AlertThresholds{
154+
Thresholds: HealthRuleThresholds{
155155
Warning: "200",
156156
},
157157
GroupBy: GroupByNode,
158158
TrendOffset: &v1.Duration{Duration: 24 * time.Hour},
159159
TrendDuration: &v1.Duration{Duration: time.Hour},
160160
},
161161
{
162-
Thresholds: AlertThresholds{
162+
Thresholds: HealthRuleThresholds{
163163
Info: "100",
164164
Warning: "500",
165165
},
@@ -170,18 +170,18 @@ var (
170170
},
171171
},
172172
{
173-
Template: AlertExternalIngressHighTrend,
174-
Variants: []AlertVariant{
173+
Template: HealthRuleExternalIngressHighTrend,
174+
Variants: []HealthRuleVariant{
175175
{
176-
Thresholds: AlertThresholds{
176+
Thresholds: HealthRuleThresholds{
177177
Warning: "200",
178178
},
179179
GroupBy: GroupByNode,
180180
TrendOffset: &v1.Duration{Duration: 24 * time.Hour},
181181
TrendDuration: &v1.Duration{Duration: time.Hour},
182182
},
183183
{
184-
Thresholds: AlertThresholds{
184+
Thresholds: HealthRuleThresholds{
185185
Info: "100",
186186
Warning: "500",
187187
},

api/flowcollector/v1beta2/flowcollector_types.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -585,12 +585,13 @@ type FLPMetrics struct {
585585
// `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`.
586586
// More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md
587587
// +optional
588-
DisableAlerts []AlertTemplate `json:"disableAlerts"`
588+
DisableAlerts []HealthRuleTemplate `json:"disableAlerts"`
589589

590-
// `alerts` is a list of alerts to be created for Prometheus AlertManager, organized by templates and variants.
591-
// More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md
590+
// `healthRules` is a list of health rules to be created for Prometheus, organized by templates and variants.
591+
// Each health rule can be configured to generate either alerts or recording rules based on the mode field.
592+
// More information on health rules: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md
592593
// +optional
593-
Alerts *[]FLPAlert `json:"alerts"`
594+
HealthRules *[]FLPHealthRule `json:"healthRules"`
594595
}
595596

596597
type FLPLogTypes string

0 commit comments

Comments
 (0)