4 changes: 4 additions & 0 deletions Makefile
@@ -7,6 +7,8 @@ endif

# Tool binaries
GOLANGCI_LINT := $(GOBIN)/golangci-lint
YQ := $(GOBIN)/yq
PROMTOOL := $(GOBIN)/promtool

# Include integration testing targets
include test.mk
@@ -33,6 +35,7 @@ lint: $(GOLANGCI_LINT)
$(GOLANGCI_LINT):
./hack/install-golangci-lint.sh


# ----------------
# Test
# ----------------
@@ -98,3 +101,4 @@ undeploy:
## precommit> run linting and unit tests
.PHONY: precommit
precommit: lint test

73 changes: 47 additions & 26 deletions cmd/simulate/simulate.go
@@ -23,24 +23,26 @@ import (

func must(err error) {
if err != nil {
panic(err)
panic(err)
}
}

var outputFile = "cluster-health-analyzer-openmetrics.txt"
var scenarioFile string
var alertsOnly bool

var SimulateCmd = &cobra.Command{
Use: "simulate",
Short: "Generate simulated data in openmetrics format",
Run: func(cmd *cobra.Command, args []string) {
simulate(outputFile, scenarioFile)
simulate(outputFile, scenarioFile, alertsOnly)
},
}

func init() {
SimulateCmd.Flags().StringVarP(&outputFile, "output", "o", outputFile, "output file")
SimulateCmd.Flags().StringVarP(&scenarioFile, "scenario", "s", "", "CSV file with the scenario to simulate")
SimulateCmd.Flags().BoolVar(&alertsOnly, "alerts-only", false, "Only output ALERTS metrics (for integration testing)")
}

var defaultRelativeIntervals = []utils.RelativeInterval{
@@ -354,8 +356,17 @@ func parseIntervalsFromCSV(file io.Reader) ([]utils.RelativeInterval, error) {
return intervals, nil
}

// endTimeBuffer adds extra time to ensure alerts are still "firing" when queried.
// Without this buffer, alerts end exactly at time.Now() and immediately start
// becoming stale in Prometheus (5-minute staleness window). This caused integration
// tests to only see ~80% of alerts because the processor runs after a delay and
// queries at a later time when some alerts have already gone stale.
const endTimeBuffer = 10 * time.Minute

func buildAlertIntervals(scenarioFile string) ([]processor.Interval, error) {
end := model.TimeFromUnixNano(time.Now().UnixNano())
// Add buffer to end time to ensure alerts are still active when the processor runs.
// This compensates for delays in test setup and processor polling intervals.
end := model.TimeFromUnixNano(time.Now().Add(endTimeBuffer).UnixNano())
intervals := defaultRelativeIntervals
if scenarioFile != "" {
csvIntervals, err := readIntervalsFromCSV(scenarioFile)
@@ -404,7 +415,7 @@ func fmtInterval(
return nil
}

func simulate(outputFile, scenarioFile string) {
func simulate(outputFile, scenarioFile string, alertsOnly bool) {
// Build sample intervals.
intervals, err := buildAlertIntervals(scenarioFile)
must(err)
@@ -422,25 +433,6 @@ func simulate(outputFile, scenarioFile string) {
}
}

startToIntervals := make(map[model.Time][]processor.Interval)

// Group them by time.
for _, i := range intervals {
startToIntervals[i.Start] = append(startToIntervals[i.Start], i)
}

// Prepare the changeset
changes := make(processor.ChangeSet, len(startToIntervals))
for t, intervals := range startToIntervals {
changes = append(changes, processor.Change{
Timestamp: t,
Intervals: intervals,
})
}
sort.Slice(changes, func(i, j int) bool {
return changes[i].Timestamp.Before(changes[j].Timestamp)
})

f, err := os.Create(outputFile)
must(err)
defer f.Close() // nolint:errcheck
@@ -456,6 +448,15 @@ func simulate(outputFile, scenarioFile string) {
must(err)
}

// When alertsOnly is set, skip the processed metrics - they should be
// computed by the cluster-health-analyzer being tested.
if alertsOnly {
_, err = fmt.Fprint(w, "# EOF")
must(err)
slog.Info("Openmetrics file saved (alerts only)", "output", outputFile)
return
}

// Output cluster_health_components
fprintln(w, "# HELP cluster_health_components Cluster health components ranking")
fprintln(w, "# TYPE cluster_health_components gauge")
@@ -468,6 +469,25 @@ func simulate(outputFile, scenarioFile string) {
must(err)
}

startToIntervals := make(map[model.Time][]processor.Interval)

// Group them by time.
for _, i := range intervals {
startToIntervals[i.Start] = append(startToIntervals[i.Start], i)
}

// Prepare the changeset
changes := make(processor.ChangeSet, len(startToIntervals))
for t, intervals := range startToIntervals {
changes = append(changes, processor.Change{
Timestamp: t,
Intervals: intervals,
})
}
sort.Slice(changes, func(i, j int) bool {
return changes[i].Timestamp.Before(changes[j].Timestamp)
})

gc := &processor.GroupsCollection{}
var groupedIntervalsSet []processor.GroupedInterval

@@ -476,7 +496,7 @@ func simulate(outputFile, scenarioFile string) {
groupedIntervalsSet = append(groupedIntervalsSet, groupedIntervals...)
}

// Output cluster_health_components:map
// Output cluster_health_components_map
fprintln(w, "# HELP cluster_health_components_map Cluster health components mapping")
fprintln(w, "# TYPE cluster_health_components_map gauge")

@@ -489,8 +509,6 @@ func simulate(outputFile, scenarioFile string) {
err := fmtInterval(w, "cluster_health_components_map", healthMap.Labels(), gi.Start, gi.End, step, float64(healthMap.Health))
must(err)
}
_, err = fmt.Fprint(w, "# EOF")
must(err)

groups := make(map[string][]processor.GroupedInterval)
for _, gi := range groupedIntervalsSet {
@@ -516,6 +534,9 @@ func simulate(outputFile, scenarioFile string) {

slog.Info("Generated incidents", "num", len(groups))

_, err = fmt.Fprint(w, "# EOF")
must(err)

slog.Info("Openmetrics file saved", "output", outputFile)
}
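A minimal standalone sketch of the two ideas above (the `--alerts-only` mode and the end-time buffer); it is not the project's actual emitter, and the label values, step, and scenario length are made up for the example:

```go
package main

import (
	"fmt"
	"time"
)

// Mirrors the endTimeBuffer constant added in simulate.go: samples extend past
// "now" so the newest points are still inside Prometheus' 5-minute staleness
// window when the processor queries them later.
const endTimeBuffer = 10 * time.Minute

func main() {
	start := time.Now().Add(-30 * time.Minute)
	end := time.Now().Add(endTimeBuffer)
	step := time.Minute

	// Alerts-only output: emit just the raw ALERTS series and close the
	// OpenMetrics stream, leaving the derived health metrics to the analyzer
	// under test.
	for t := start; !t.After(end); t = t.Add(step) {
		fmt.Printf("ALERTS{alertname=%q,alertstate=%q,severity=%q} 1 %d\n",
			"KubePodCrashLooping", "firing", "warning", t.Unix())
	}
	fmt.Println("# EOF")
}
```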

2 changes: 1 addition & 1 deletion test.mk
@@ -52,7 +52,7 @@ undeploy-integration:
## test-integration> run integration tests (assumes deployment exists)
.PHONY: test-integration
test-integration:
$(GINKGO) -v ./test/integration/...
$(GINKGO) -v --label-filter="!stress&&!stress-simulate" ./test/integration/...

## help-integration> show integration testing workflow and related targets
.PHONY: help-integration
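The new label filter assumes the stress specs opt in via Ginkgo v2 labels; a hypothetical spec (spec text and body are illustrative, not from this PR) that the default test-integration target would now skip:

```go
package integration

import (
	. "github.com/onsi/ginkgo/v2"
)

// Specs carrying either label are excluded by
// --label-filter="!stress&&!stress-simulate".
var _ = Describe("incident detection under sustained load", Label("stress"), func() {
	It("groups a large burst of simulated alerts", Label("stress-simulate"), func() {
		// long-running scenario omitted
	})
})
```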
7 changes: 7 additions & 0 deletions test/integration/fixtures/labels.go
@@ -0,0 +1,7 @@
package fixtures

// Common test labels used for resource identification and cleanup.
const (
StressTestLabel = "test-type=stress"
CrashLoopTestLabel = "test-type=crashloop"
)
57 changes: 31 additions & 26 deletions test/integration/fixtures/templates.go
@@ -9,45 +9,50 @@ import (
//go:embed testdata/*.yaml
var testDataFS embed.FS

// Template paths relative to testdata directory.
const (
DeploymentFile = "testdata/crashing_deployment.yaml"
RuleFile = "testdata/crashloop_prometheusrule.yaml"
DeploymentTemplate = "testdata/crashing_deployment.yaml"
PrometheusRuleTemplate = "testdata/crashloop_prometheusrule.yaml"
)

// DeploymentReplacements returns the standard replacements for a deployment template.
func DeploymentReplacements(name, namespace, testType string) map[string]string {
return map[string]string{
"{{NAME}}": name,
"{{NAMESPACE}}": namespace,
"{{TEST_TYPE}}": testType,
}
}

// RuleReplacements returns the standard replacements for a PrometheusRule template.
func RuleReplacements(ruleName, alertName, podPrefix, testType string) map[string]string {
return map[string]string{
"{{RULE_NAME}}": ruleName,
"{{ALERT_NAME}}": alertName,
"{{POD_PREFIX}}": podPrefix,
"{{TEST_TYPE}}": testType,
}
}

// PodSelector returns the label selector for pods created by a deployment.
func PodSelector(deploymentName string) string {
return "app=" + deploymentName
}

// RenderDeployment loads the embedded deployment template and renders it with the given parameters.
func RenderDeployment(name, namespace, testType string) (string, error) {
content, err := testDataFS.ReadFile(DeploymentFile)
content, err := testDataFS.ReadFile(DeploymentTemplate)
if err != nil {
return "", err
}

data := map[string]string{
"Name": name,
"Namespace": namespace,
"TestType": testType,
}

return framework.Render("deployment", content, data)
return framework.RenderTemplate(string(content), DeploymentReplacements(name, namespace, testType)), nil
}

// RenderRule loads the embedded PrometheusRule template and renders it with the given parameters.
func RenderRule(ruleName, alertName, podPrefix, testType string) (string, error) {
content, err := testDataFS.ReadFile(RuleFile)
content, err := testDataFS.ReadFile(PrometheusRuleTemplate)
if err != nil {
return "", err
}

data := map[string]string{
"RuleName": ruleName,
"AlertName": alertName,
"PodPrefix": podPrefix,
"TestType": testType,
}

return framework.Render("rule", content, data)
}

// PodSelector returns the label selector for pods created by a deployment.
func PodSelector(deploymentName string) string {
return "app=" + deploymentName
return framework.RenderTemplate(string(content), RuleReplacements(ruleName, alertName, podPrefix, testType)), nil
}
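The body of framework.RenderTemplate is not part of this diff; the placeholder maps above suggest literal string substitution rather than text/template execution. A sketch under that assumption:

```go
package main

import (
	"fmt"
	"strings"
)

// renderTemplate is a guess at what framework.RenderTemplate does: replace each
// {{PLACEHOLDER}} occurrence literally, with no template parsing or escaping.
func renderTemplate(content string, replacements map[string]string) string {
	for placeholder, value := range replacements {
		content = strings.ReplaceAll(content, placeholder, value)
	}
	return content
}

func main() {
	manifest := "metadata:\n  name: {{NAME}}\n  namespace: {{NAMESPACE}}\n  labels:\n    test-type: {{TEST_TYPE}}\n"
	fmt.Print(renderTemplate(manifest, map[string]string{
		"{{NAME}}":      "crashloop-demo",
		"{{NAMESPACE}}": "e2e-tests",
		"{{TEST_TYPE}}": "crashloop",
	}))
}
```

Plain replacement keeps the YAML templates readable as near-valid YAML and sidesteps text/template escaping, at the cost of no check that every placeholder was filled.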
1 change: 1 addition & 0 deletions test/integration/framework/cluster.go
@@ -41,6 +41,7 @@ func (c *Cluster) DeleteByLabel(ctx context.Context, resourceType, labelSelector
return c.run(ctx, "delete", resourceType, "-l", labelSelector, "--ignore-not-found")
}

// IsGone checks if a resource no longer exists.
// Use this with Eventually to wait for deletion to complete.
func (c *Cluster) IsGone(ctx context.Context, resourceType, name string) (bool, error) {
err := c.run(ctx, "get", resourceType, name)
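The comment added to IsGone suggests pairing it with Gomega's Eventually; a hypothetical teardown helper combining it with DeleteByLabel and the new label constants. The package/import preamble is omitted; it assumes a Gomega dot-import plus this repo's framework and fixtures packages, and the resource type names are guesses:

```go
// Teardown sketch: delete everything carrying the stress label, then poll
// until the named deployment has actually disappeared before the next spec.
func cleanupStress(ctx context.Context, cluster *framework.Cluster, deploymentName string) {
	Expect(cluster.DeleteByLabel(ctx, "deployments", fixtures.StressTestLabel)).To(Succeed())
	Expect(cluster.DeleteByLabel(ctx, "prometheusrules", fixtures.StressTestLabel)).To(Succeed())

	// Eventually retries the probe; the error return must stay nil and the
	// bool must become true within the timeout.
	Eventually(func() (bool, error) {
		return cluster.IsGone(ctx, "deployment", deploymentName)
	}).WithTimeout(2 * time.Minute).Should(BeTrue())
}
```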
18 changes: 9 additions & 9 deletions test/integration/framework/matchers.go
@@ -77,7 +77,7 @@ type validIncidentMatcher struct {
missing []string
empty []string
invalid []string
incident Incident
incident *Incident
}

var requiredIncidentLabels = []string{
@@ -90,9 +90,9 @@ var requiredIncidentLabels = []string{
}

func (m *validIncidentMatcher) Match(actual interface{}) (bool, error) {
incident, ok := actual.(Incident)
incident, ok := actual.(*Incident)
if !ok {
return false, fmt.Errorf("BeValidIncident expects Incident, got %T", actual)
return false, fmt.Errorf("BeValidIncident expects *Incident, got %T", actual)
}

if incident == nil {
@@ -105,7 +105,7 @@ func (m *validIncidentMatcher) Match(actual interface{}) (bool, error) {
m.invalid = nil

for _, label := range requiredIncidentLabels {
value, exists := incident[label]
value, exists := incident.Labels[label]
if !exists {
m.missing = append(m.missing, label)
} else if value == "" {
@@ -114,14 +114,14 @@ func (m *validIncidentMatcher) Match(actual interface{}) (bool, error) {
}

// Validate layer is a known OpenShift layer
if layer, exists := incident["layer"]; exists && layer != "" {
if layer, exists := incident.Labels["layer"]; exists && layer != "" {
if _, ok := ValidLayers[layer]; !ok {
m.invalid = append(m.invalid, fmt.Sprintf("layer=%q", layer))
}
}

// Validate component is a known OpenShift component
if component, exists := incident["component"]; exists && component != "" {
if component, exists := incident.Labels["component"]; exists && component != "" {
if _, ok := ValidComponents[component]; !ok {
m.invalid = append(m.invalid, fmt.Sprintf("component=%q", component))
}
@@ -147,19 +147,19 @@ func (m *validIncidentMatcher) FailureMessage(actual interface{}) string {
parts = append(parts, fmt.Sprintf("invalid values: %v", m.invalid))
}

alertname := m.incident["src_alertname"]
alertname := m.incident.Labels["src_alertname"]
if alertname == "" {
alertname = "(unknown)"
}

return fmt.Sprintf("Expected incident %s to be valid, but: %s\nAll labels: %v",
alertname, strings.Join(parts, "; "), m.incident)
alertname, strings.Join(parts, "; "), m.incident.Labels)
}

func (m *validIncidentMatcher) NegatedFailureMessage(actual interface{}) string {
alertname := "(unknown)"
if m.incident != nil {
if name := m.incident["src_alertname"]; name != "" {
if name := m.incident.Labels["src_alertname"]; name != "" {
alertname = name
}
}
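The matcher now takes a pointer and reads labels through a field, which implies Incident changed from a plain label map to a struct; a guess at the minimal shape (the real definition lives elsewhere in the framework package and may carry more fields):

```go
// Assumed shape only: enough to support the incident.Labels["src_alertname"]
// lookups used by the matcher above.
type Incident struct {
	Labels map[string]string
}
```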