Skip to content

Commit 91777b1

Browse files
fixes
1 parent b1fae84 commit 91777b1

File tree

3 files changed

+134
-26
lines changed

3 files changed

+134
-26
lines changed

internal/pkg/metrics/alerts/builder.go

Lines changed: 76 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -36,50 +36,102 @@ type ruleBuilder struct {
3636
// BuildRules is the main entry point that builds both alerts and recording rules
3737
// based on each health rule's mode configuration
3838
func BuildRules(ctx context.Context, fc *flowslatest.FlowCollectorSpec) []monitoringv1.Rule {
39-
log := log.FromContext(ctx)
4039
rules := []monitoringv1.Rule{}
4140

4241
if fc.HasExperimentalAlertsHealth() {
4342
healthRules := fc.GetHealthRules()
4443
metrics := fc.GetIncludeList()
4544

46-
for _, healthRule := range healthRules {
47-
if ok, _ := healthRule.IsAllowed(fc); !ok {
48-
continue
45+
// Build health rules and track system rules mode
46+
addNoFlowsRecording, addLokiErrorRecording := buildHealthRules(ctx, fc, healthRules, metrics, &rules)
47+
48+
// Add system health rules based on mode
49+
addSystemHealthRules(fc, addNoFlowsRecording, addLokiErrorRecording, &rules)
50+
} else {
51+
// If experimental alerts are not enabled, add default alert rules
52+
addDefaultSystemAlerts(fc, &rules)
53+
}
54+
55+
return rules
56+
}
57+
58+
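// buildHealthRules appends to rules the alert or recording rules generated from every variant of each allowed health rule (conversion errors are logged and skipped), and reports, in that order, whether NoFlows and LokiError are configured in recording-rule mode.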
59+
func buildHealthRules(ctx context.Context, fc *flowslatest.FlowCollectorSpec, healthRules []flowslatest.HealthRule, metrics []string, rules *[]monitoringv1.Rule) (bool, bool) {
60+
log := log.FromContext(ctx)
61+
addNoFlowsRecording := false
62+
addLokiErrorRecording := false
63+
64+
for _, healthRule := range healthRules {
65+
if ok, _ := healthRule.IsAllowed(fc); !ok {
66+
continue
67+
}
68+
69+
// Check if NoFlows or LokiError are configured as recording rules
70+
if healthRule.Mode == flowslatest.HealthRuleModeRecordingRule {
71+
if healthRule.Template == flowslatest.AlertNoFlows {
72+
addNoFlowsRecording = true
73+
}
74+
if healthRule.Template == flowslatest.AlertLokiError {
75+
addLokiErrorRecording = true
4976
}
77+
}
5078

51-
for _, variant := range healthRule.Variants {
52-
var r []monitoringv1.Rule
53-
var err error
79+
for _, variant := range healthRule.Variants {
80+
var r []monitoringv1.Rule
81+
var err error
5482

55-
// Decide whether to build alert or recording rule based on mode
56-
if healthRule.Mode == flowslatest.HealthRuleModeRecordingRule {
57-
r, err = convertToRecordingRules(healthRule.Template, &variant, metrics)
58-
} else {
59-
// Default to alert mode
60-
r, err = convertToRules(healthRule.Template, &variant, metrics)
61-
}
83+
// Decide whether to build alert or recording rule based on mode
84+
if healthRule.Mode == flowslatest.HealthRuleModeRecordingRule {
85+
r, err = convertToRecordingRules(healthRule.Template, &variant, metrics)
86+
} else {
87+
// Default to alert mode
88+
r, err = convertToRules(healthRule.Template, &variant, metrics)
89+
}
6290

63-
if err != nil {
64-
log.Error(err, "unable to configure a health rule", "template", healthRule.Template, "mode", healthRule.Mode)
65-
} else if len(r) > 0 {
66-
rules = append(rules, r...)
67-
}
91+
if err != nil {
92+
log.Error(err, "unable to configure a health rule", "template", healthRule.Template, "mode", healthRule.Mode)
93+
} else if len(r) > 0 {
94+
*rules = append(*rules, r...)
6895
}
6996
}
7097
}
7198

72-
// Add system health rules (NoFlows and LokiError)
99+
return addNoFlowsRecording, addLokiErrorRecording
100+
}
101+
102+
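// addSystemHealthRules appends to rules, for each of NoFlows and LokiError that is not listed in DisableAlerts, its recording rule when the matching flag is true and its alert rule otherwise.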
103+
func addSystemHealthRules(fc *flowslatest.FlowCollectorSpec, addNoFlowsRecording, addLokiErrorRecording bool, rules *[]monitoringv1.Rule) {
104+
// Add system health recording rules (NoFlows and LokiError) if configured
105+
if addNoFlowsRecording && !slices.Contains(fc.Processor.Metrics.DisableAlerts, flowslatest.AlertNoFlows) {
106+
r := RecordingNoFlows()
107+
*rules = append(*rules, *r)
108+
}
109+
if addLokiErrorRecording && !slices.Contains(fc.Processor.Metrics.DisableAlerts, flowslatest.AlertLokiError) {
110+
r := RecordingLokiError()
111+
*rules = append(*rules, *r)
112+
}
113+
114+
// Add system health alert rules (NoFlows and LokiError) - only if not configured as recording rules
115+
if !addNoFlowsRecording && !slices.Contains(fc.Processor.Metrics.DisableAlerts, flowslatest.AlertNoFlows) {
116+
r := alertNoFlows()
117+
*rules = append(*rules, *r)
118+
}
119+
if !addLokiErrorRecording && !slices.Contains(fc.Processor.Metrics.DisableAlerts, flowslatest.AlertLokiError) {
120+
r := alertLokiError()
121+
*rules = append(*rules, *r)
122+
}
123+
}
124+
125+
// addDefaultSystemAlerts appends the NoFlows and LokiError alert rules to rules, skipping any listed in DisableAlerts; it is used when experimental alerts are not enabled.
126+
func addDefaultSystemAlerts(fc *flowslatest.FlowCollectorSpec, rules *[]monitoringv1.Rule) {
73127
if !slices.Contains(fc.Processor.Metrics.DisableAlerts, flowslatest.AlertNoFlows) {
74128
r := alertNoFlows()
75-
rules = append(rules, *r)
129+
*rules = append(*rules, *r)
76130
}
77131
if !slices.Contains(fc.Processor.Metrics.DisableAlerts, flowslatest.AlertLokiError) {
78132
r := alertLokiError()
79-
rules = append(rules, *r)
133+
*rules = append(*rules, *r)
80134
}
81-
82-
return rules
83135
}
84136

85137
func convertToRules(template flowslatest.AlertTemplate, alert *flowslatest.HealthRuleVariant, enabledMetrics []string) ([]monitoringv1.Rule, error) {

internal/pkg/metrics/alerts/recording.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ func (rb *ruleBuilder) latencyTrendRecording() (*monitoringv1.Rule, error) {
215215
}, nil
216216
}
217217

218-
func recordingNoFlows() *monitoringv1.Rule {
218+
func RecordingNoFlows() *monitoringv1.Rule {
219219
return &monitoringv1.Rule{
220220
Record: "netobserv:health:no_flows:rate1m",
221221
Expr: intstr.FromString("sum(rate(netobserv_ingest_flows_processed[1m]))"),
@@ -226,7 +226,7 @@ func recordingNoFlows() *monitoringv1.Rule {
226226
}
227227
}
228228

229-
func recordingLokiError() *monitoringv1.Rule {
229+
func RecordingLokiError() *monitoringv1.Rule {
230230
return &monitoringv1.Rule{
231231
Record: "netobserv:health:loki_errors:rate1m",
232232
Expr: intstr.FromString("sum(rate(netobserv_loki_dropped_entries_total[1m]))"),

internal/pkg/metrics/alerts/recording_test.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,59 @@ func TestRecordingRuleLabels(t *testing.T) {
207207
assert.Equal(t, "Workload", labels["health_groupby"])
208208
assert.Equal(t, "Dst", labels["health_side"])
209209
}
210+
211+
func TestBuildRules_SystemRecordingRules(t *testing.T) {
212+
ctx := context.Background()
213+
214+
// Create health rules with NoFlows and LokiError as recording rules
215+
healthRules := []flowslatest.HealthRule{
216+
{
217+
Template: flowslatest.AlertNoFlows,
218+
Mode: flowslatest.HealthRuleModeRecordingRule,
219+
Variants: []flowslatest.HealthRuleVariant{{}},
220+
},
221+
{
222+
Template: flowslatest.AlertLokiError,
223+
Mode: flowslatest.HealthRuleModeRecordingRule,
224+
Variants: []flowslatest.HealthRuleVariant{{}},
225+
},
226+
}
227+
228+
spec := &flowslatest.FlowCollectorSpec{
229+
Processor: flowslatest.FlowCollectorFLP{
230+
Metrics: flowslatest.FLPMetrics{
231+
HealthRules: &healthRules,
232+
},
233+
Advanced: &flowslatest.AdvancedProcessorConfig{
234+
Env: map[string]string{
235+
"EXPERIMENTAL_ALERTS_HEALTH": "true",
236+
},
237+
},
238+
},
239+
}
240+
241+
rules := BuildRules(ctx, spec)
242+
243+
// Should have recording rules for NoFlows and LokiError
244+
var noFlowsRecording, lokiErrorRecording bool
245+
for _, rule := range rules {
246+
if rule.Record == "netobserv:health:no_flows:rate1m" {
247+
noFlowsRecording = true
248+
assert.Equal(t, "health", rule.Labels["netobserv"])
249+
assert.Equal(t, "NetObservNoFlows", rule.Labels["health_template"])
250+
assert.Contains(t, rule.Expr.StrVal, "netobserv_ingest_flows_processed")
251+
}
252+
if rule.Record == "netobserv:health:loki_errors:rate1m" {
253+
lokiErrorRecording = true
254+
assert.Equal(t, "health", rule.Labels["netobserv"])
255+
assert.Equal(t, "NetObservLokiError", rule.Labels["health_template"])
256+
assert.Contains(t, rule.Expr.StrVal, "netobserv_loki_dropped_entries_total")
257+
}
258+
// Should not have alert versions
259+
assert.NotEqual(t, "NetObservNoFlows", rule.Alert)
260+
assert.NotEqual(t, "NetObservLokiError", rule.Alert)
261+
}
262+
263+
assert.True(t, noFlowsRecording, "should have NoFlows recording rule")
264+
assert.True(t, lokiErrorRecording, "should have LokiError recording rule")
265+
}

0 commit comments

Comments
 (0)