@@ -10,64 +10,88 @@ import (
1010 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1111)
1212
13- type AlertTemplate string
14- type AlertGroupBy string
13+ type HealthRuleTemplate string
14+ type HealthRuleGroupBy string
15+ type HealthRuleMode string
16+
17+ // Backward-compatibility type aliases, kept for the `disableAlerts` field.
18+ type AlertTemplate = HealthRuleTemplate
19+ type AlertGroupBy = HealthRuleGroupBy
1520
1621const (
17- AlertNoFlows AlertTemplate = "NetObservNoFlows"
18- AlertLokiError AlertTemplate = "NetObservLokiError"
19- AlertPacketDropsByKernel AlertTemplate = "PacketDropsByKernel"
20- AlertPacketDropsByDevice AlertTemplate = "PacketDropsByDevice"
21- AlertIPsecErrors AlertTemplate = "IPsecErrors"
22- AlertNetpolDenied AlertTemplate = "NetpolDenied"
23- AlertLatencyHighTrend AlertTemplate = "LatencyHighTrend"
24- AlertDNSErrors AlertTemplate = "DNSErrors"
25- AlertDNSNxDomain AlertTemplate = "DNSNxDomain"
26- AlertExternalEgressHighTrend AlertTemplate = "ExternalEgressHighTrend"
27- AlertExternalIngressHighTrend AlertTemplate = "ExternalIngressHighTrend"
28- GroupByNode AlertGroupBy = "Node"
29- GroupByNamespace AlertGroupBy = "Namespace"
30- GroupByWorkload AlertGroupBy = "Workload"
22+ // Alert-only templates (cannot be used as recording rules)
23+ AlertNoFlows AlertTemplate = "NetObservNoFlows"
24+ AlertLokiError AlertTemplate = "NetObservLokiError"
25+
26+ // Health rule templates (can be either alerts or recording rules depending on mode)
27+ HealthRulePacketDropsByKernel HealthRuleTemplate = "PacketDropsByKernel"
28+ HealthRulePacketDropsByDevice HealthRuleTemplate = "PacketDropsByDevice"
29+ HealthRuleIPsecErrors HealthRuleTemplate = "IPsecErrors"
30+ HealthRuleNetpolDenied HealthRuleTemplate = "NetpolDenied"
31+ HealthRuleLatencyHighTrend HealthRuleTemplate = "LatencyHighTrend"
32+ HealthRuleDNSErrors HealthRuleTemplate = "DNSErrors"
33+ HealthRuleDNSNxDomain HealthRuleTemplate = "DNSNxDomain"
34+ HealthRuleExternalEgressHighTrend HealthRuleTemplate = "ExternalEgressHighTrend"
35+ HealthRuleExternalIngressHighTrend HealthRuleTemplate = "ExternalIngressHighTrend"
36+
37+ GroupByNode HealthRuleGroupBy = "Node"
38+ GroupByNamespace HealthRuleGroupBy = "Namespace"
39+ GroupByWorkload HealthRuleGroupBy = "Workload"
40+
41+ ModeAlert HealthRuleMode = "Alert"
42+ ModeMetricOnly HealthRuleMode = "MetricOnly"
3143)
3244
33- type FLPAlert struct {
34- // Alert template name.
45+ type FLPHealthRule struct {
46+ // Health rule template name.
3547 // Possible values are: `PacketDropsByKernel`, `PacketDropsByDevice`, `IPsecErrors`, `NetpolDenied`,
3648 // `LatencyHighTrend`, `DNSErrors`, `DNSNxDomain`, `ExternalEgressHighTrend`, `ExternalIngressHighTrend`.
37- // More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md
49+ // Note: `NetObservNoFlows` and `NetObservLokiError` are alert-only and cannot be used as health rules.
50+ // More information on health rules: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md
3851 // +kubebuilder:validation:Enum:="PacketDropsByKernel";"PacketDropsByDevice";"IPsecErrors";"NetpolDenied";"LatencyHighTrend";"DNSErrors";"DNSNxDomain";"ExternalEgressHighTrend";"ExternalIngressHighTrend"
3952 // +required
40- Template AlertTemplate `json:"template,omitempty"`
53+ Template HealthRuleTemplate `json:"template,omitempty"`
54+
55+ // Mode defines whether this health rule should be generated as an alert or a recording rule.
56+ // Possible values are: `Alert` (default), `MetricOnly`.
57+ // Violations of `MetricOnly` rules are visible in the Network Health dashboard without generating any Prometheus alert.
58+ // This provides an alternative way of getting Health information for SRE and cluster admins who may find
59+ // many new alerts burdensome.
60+ // +kubebuilder:validation:Enum:="Alert";"MetricOnly"
61+ // +kubebuilder:default:="Alert"
62+ // +optional
63+ Mode HealthRuleMode `json:"mode,omitempty"`
4164
4265 // A list of variants for this template
4366 // +required
44- Variants []AlertVariant `json:"variants,omitempty"`
67+ Variants []HealthRuleVariant `json:"variants,omitempty"`
4568}
4669
47- type AlertVariant struct {
70+ type HealthRuleVariant struct {
4871 // The low volume threshold allows ignoring metrics whose traffic volume is too low, in order to improve the signal-to-noise ratio.
4972 // It is provided as an absolute rate (bytes per second or packets per second, depending on the context).
5073 // When provided, it must be parsable as a float.
5174 LowVolumeThreshold string `json:"lowVolumeThreshold,omitempty"`
5275
53- // Thresholds of the alert per severity.
76+ // Thresholds of the health rule per severity.
5477 // They are expressed as a percentage of errors above which the alert is triggered. They must be parsable as floats.
78+ // Required in both `Alert` and `MetricOnly` modes.
5579 // +required
56- Thresholds AlertThresholds `json:"thresholds,omitempty"`
80+ Thresholds HealthRuleThresholds `json:"thresholds,omitempty"`
5781
5882 // Optional grouping criteria, possible values are: `Node`, `Namespace`, `Workload`.
5983 // +kubebuilder:validation:Enum:="";"Node";"Namespace";"Workload"
6084 // +optional
61- GroupBy AlertGroupBy `json:"groupBy,omitempty"`
85+ GroupBy HealthRuleGroupBy `json:"groupBy,omitempty"`
6286
63- // For trending alerts , the time offset for baseline comparison. For example, "1d" means comparing against yesterday. Defaults to 1d.
87+ // For trending health rules, the time offset for baseline comparison. For example, "1d" means comparing against yesterday. Defaults to 1d.
6488 TrendOffset * metav1.Duration `json:"trendOffset,omitempty"`
6589
66- // For trending alerts , the duration interval for baseline comparison. For example, "2h" means comparing against a 2-hours average. Defaults to 2h.
90+ // For trending health rules, the duration interval for baseline comparison. For example, "2h" means comparing against a 2-hour average. Defaults to 2h.
6791 TrendDuration * metav1.Duration `json:"trendDuration,omitempty"`
6892}
6993
70- type AlertThresholds struct {
94+ type HealthRuleThresholds struct {
7195 // Threshold for severity `info`. Leave empty to not generate an Info alert.
7296 Info string `json:"info,omitempty"`
7397
@@ -123,19 +147,19 @@ func removeMetricsByPattern(list []string, search string) []string {
123147 return filtered
124148}
125149
126- func (s * FlowCollectorSpec ) GetFLPAlerts () []FLPAlert {
127- var ret []FLPAlert
128- var templates []AlertTemplate // for reproducible ordering
150+ func (s * FlowCollectorSpec ) GetFLPHealthRules () []FLPHealthRule {
151+ var rules []FLPHealthRule
152+ var templates []HealthRuleTemplate // for reproducible ordering
129153
130- tplMap := make (map [AlertTemplate ] FLPAlert )
131- for _ , group := range DefaultAlerts {
154+ tplMap := make (map [HealthRuleTemplate ] FLPHealthRule )
155+ for _ , group := range DefaultHealthRules {
132156 if ! slices .Contains (s .Processor .Metrics .DisableAlerts , group .Template ) {
133157 tplMap [group .Template ] = group
134158 templates = append (templates , group .Template )
135159 }
136160 }
137- if s .Processor .Metrics .Alerts != nil {
138- for _ , group := range * s .Processor .Metrics .Alerts {
161+ if s .Processor .Metrics .HealthRules != nil {
162+ for _ , group := range * s .Processor .Metrics .HealthRules {
139163 if ! slices .Contains (s .Processor .Metrics .DisableAlerts , group .Template ) {
140164 // A group defined in FC overrides the default group
141165 tplMap [group .Template ] = group
@@ -149,42 +173,42 @@ func (s *FlowCollectorSpec) GetFLPAlerts() []FLPAlert {
149173 for _ , name := range templates {
150174 tpl := tplMap [name ]
151175 if ok , _ := tpl .IsAllowed (s ); ok {
152- ret = append (ret , tpl )
176+ rules = append (rules , tpl )
153177 }
154178 }
155179
156- return ret
180+ return rules
157181}
158182
159- func (g * FLPAlert ) IsAllowed (spec * FlowCollectorSpec ) (bool , string ) {
183+ func (g * FLPHealthRule ) IsAllowed (spec * FlowCollectorSpec ) (bool , string ) {
160184 switch g .Template {
161- case AlertPacketDropsByKernel :
185+ case HealthRulePacketDropsByKernel :
162186 if ! spec .Agent .EBPF .IsPktDropEnabled () {
163- return false , fmt .Sprintf ("Alert %s requires the %s agent feature to be enabled" , g .Template , PacketDrop )
187+ return false , fmt .Sprintf ("HealthRule %s requires the %s agent feature to be enabled" , g .Template , PacketDrop )
164188 }
165- case AlertIPsecErrors :
189+ case HealthRuleIPsecErrors :
166190 if ! spec .Agent .EBPF .IsIPSecEnabled () {
167- return false , fmt .Sprintf ("Alert %s requires the %s agent feature to be enabled" , g .Template , IPSec )
191+ return false , fmt .Sprintf ("HealthRule %s requires the %s agent feature to be enabled" , g .Template , IPSec )
168192 }
169- case AlertDNSErrors , AlertDNSNxDomain :
193+ case HealthRuleDNSErrors , HealthRuleDNSNxDomain :
170194 if ! spec .Agent .EBPF .IsDNSTrackingEnabled () {
171- return false , fmt .Sprintf ("Alert %s requires the %s agent feature to be enabled" , g .Template , DNSTracking )
195+ return false , fmt .Sprintf ("HealthRule %s requires the %s agent feature to be enabled" , g .Template , DNSTracking )
172196 }
173- case AlertLatencyHighTrend :
197+ case HealthRuleLatencyHighTrend :
174198 if ! spec .Agent .EBPF .IsFlowRTTEnabled () {
175- return false , fmt .Sprintf ("Alert %s requires the %s agent feature to be enabled" , g .Template , FlowRTT )
199+ return false , fmt .Sprintf ("HealthRule %s requires the %s agent feature to be enabled" , g .Template , FlowRTT )
176200 }
177- case AlertNetpolDenied :
201+ case HealthRuleNetpolDenied :
178202 if ! spec .Agent .EBPF .IsNetworkEventsEnabled () {
179- return false , fmt .Sprintf ("Alert %s requires the %s agent feature to be enabled" , g .Template , NetworkEvents )
203+ return false , fmt .Sprintf ("HealthRule %s requires the %s agent feature to be enabled" , g .Template , NetworkEvents )
180204 }
181- case AlertNoFlows , AlertLokiError , AlertPacketDropsByDevice , AlertExternalEgressHighTrend , AlertExternalIngressHighTrend :
205+ case AlertNoFlows , AlertLokiError , HealthRulePacketDropsByDevice , HealthRuleExternalEgressHighTrend , HealthRuleExternalIngressHighTrend :
182206 return true , ""
183207 }
184208 return true , ""
185209}
186210
187- func (v * AlertVariant ) GetTrendParams () (string , string ) {
211+ func (v * HealthRuleVariant ) GetTrendParams () (string , string ) {
188212 offset := metav1.Duration {Duration : 24 * time .Hour }
189213 if v .TrendOffset != nil {
190214 offset = * v .TrendOffset
0 commit comments