4 changes: 4 additions & 0 deletions Makefile
@@ -7,6 +7,8 @@ endif

# Tool binaries
GOLANGCI_LINT := $(GOBIN)/golangci-lint
YQ := $(GOBIN)/yq
PROMTOOL := $(GOBIN)/promtool

# Include integration testing targets
include test.mk
@@ -33,6 +35,7 @@ lint: $(GOLANGCI_LINT)
$(GOLANGCI_LINT):
./hack/install-golangci-lint.sh


# ----------------
# Test
# ----------------
@@ -98,3 +101,4 @@ undeploy:
## precommit> run linting and unit tests
.PHONY: precommit
precommit: lint test

73 changes: 47 additions & 26 deletions cmd/simulate/simulate.go
@@ -23,24 +23,26 @@ import (

func must(err error) {
if err != nil {
panic(err)
panic(err)
}
}

var outputFile = "cluster-health-analyzer-openmetrics.txt"
var scenarioFile string
var alertsOnly bool

var SimulateCmd = &cobra.Command{
Use: "simulate",
Short: "Generate simulated data in openmetrics format",
Run: func(cmd *cobra.Command, args []string) {
simulate(outputFile, scenarioFile)
simulate(outputFile, scenarioFile, alertsOnly)
},
}

func init() {
SimulateCmd.Flags().StringVarP(&outputFile, "output", "o", outputFile, "output file")
SimulateCmd.Flags().StringVarP(&scenarioFile, "scenario", "s", "", "CSV file with the scenario to simulate")
SimulateCmd.Flags().BoolVar(&alertsOnly, "alerts-only", false, "Only output ALERTS metrics (for integration testing)")
}

var defaultRelativeIntervals = []utils.RelativeInterval{
@@ -354,8 +356,17 @@ func parseIntervalsFromCSV(file io.Reader) ([]utils.RelativeInterval, error) {
return intervals, nil
}

// endTimeBuffer adds extra time to ensure alerts are still "firing" when queried.
// Without this buffer, alerts end exactly at time.Now() and immediately start
// becoming stale in Prometheus (5-minute staleness window). This caused integration
// tests to only see ~80% of alerts because the processor runs after a delay and
// queries at a later time when some alerts have already gone stale.
const endTimeBuffer = 10 * time.Minute

func buildAlertIntervals(scenarioFile string) ([]processor.Interval, error) {
end := model.TimeFromUnixNano(time.Now().UnixNano())
// Add buffer to end time to ensure alerts are still active when the processor runs.
// This compensates for delays in test setup and processor polling intervals.
end := model.TimeFromUnixNano(time.Now().Add(endTimeBuffer).UnixNano())
intervals := defaultRelativeIntervals
if scenarioFile != "" {
csvIntervals, err := readIntervalsFromCSV(scenarioFile)
@@ -404,7 +415,7 @@ func fmtInterval(
return nil
}

func simulate(outputFile, scenarioFile string) {
func simulate(outputFile, scenarioFile string, alertsOnly bool) {
// Build sample intervals.
intervals, err := buildAlertIntervals(scenarioFile)
must(err)
@@ -422,25 +433,6 @@ func simulate(outputFile, scenarioFile string) {
}
}

startToIntervals := make(map[model.Time][]processor.Interval)

// Group them by time.
for _, i := range intervals {
startToIntervals[i.Start] = append(startToIntervals[i.Start], i)
}

// Prepare the changeset
changes := make(processor.ChangeSet, len(startToIntervals))
for t, intervals := range startToIntervals {
changes = append(changes, processor.Change{
Timestamp: t,
Intervals: intervals,
})
}
sort.Slice(changes, func(i, j int) bool {
return changes[i].Timestamp.Before(changes[j].Timestamp)
})

f, err := os.Create(outputFile)
must(err)
defer f.Close() // nolint:errcheck
@@ -456,6 +448,15 @@ func simulate(outputFile, scenarioFile string) {
must(err)
}

// When alertsOnly is set, skip the processed metrics - they should be
// computed by the cluster-health-analyzer being tested.
if alertsOnly {
_, err = fmt.Fprint(w, "# EOF")
must(err)
slog.Info("Openmetrics file saved (alerts only)", "output", outputFile)
return
}

// Output cluster_health_components
fprintln(w, "# HELP cluster_health_components Cluster health components ranking")
fprintln(w, "# TYPE cluster_health_components gauge")
@@ -468,6 +469,25 @@ func simulate(outputFile, scenarioFile string) {
must(err)
}

startToIntervals := make(map[model.Time][]processor.Interval)

// Group them by time.
for _, i := range intervals {
startToIntervals[i.Start] = append(startToIntervals[i.Start], i)
}

// Prepare the changeset
changes := make(processor.ChangeSet, len(startToIntervals))
for t, intervals := range startToIntervals {
changes = append(changes, processor.Change{
Timestamp: t,
Intervals: intervals,
})
}
sort.Slice(changes, func(i, j int) bool {
return changes[i].Timestamp.Before(changes[j].Timestamp)
})

gc := &processor.GroupsCollection{}
var groupedIntervalsSet []processor.GroupedInterval

@@ -476,7 +496,7 @@ func simulate(outputFile, scenarioFile string) {
groupedIntervalsSet = append(groupedIntervalsSet, groupedIntervals...)
}

// Output cluster_health_components:map
// Output cluster_health_components_map
fprintln(w, "# HELP cluster_health_components_map Cluster health components mapping")
fprintln(w, "# TYPE cluster_health_components_map gauge")

@@ -489,8 +509,6 @@ func simulate(outputFile, scenarioFile string) {
err := fmtInterval(w, "cluster_health_components_map", healthMap.Labels(), gi.Start, gi.End, step, float64(healthMap.Health))
must(err)
}
_, err = fmt.Fprint(w, "# EOF")
must(err)

groups := make(map[string][]processor.GroupedInterval)
for _, gi := range groupedIntervalsSet {
@@ -516,6 +534,9 @@ func simulate(outputFile, scenarioFile string) {

slog.Info("Generated incidents", "num", len(groups))

_, err = fmt.Fprint(w, "# EOF")
must(err)

slog.Info("Openmetrics file saved", "output", outputFile)
}
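A minimal standalone sketch of the two ideas above (the `--alerts-only` mode and the end-time buffer); it is not the project's actual emitter, and the label values, step, and scenario length are made up for the example:

```go
package main

import (
	"fmt"
	"time"
)

// Mirrors the endTimeBuffer constant added in simulate.go: samples extend past
// "now" so the newest points are still inside Prometheus' 5-minute staleness
// window when the processor queries them later.
const endTimeBuffer = 10 * time.Minute

func main() {
	start := time.Now().Add(-30 * time.Minute)
	end := time.Now().Add(endTimeBuffer)
	step := time.Minute

	// Alerts-only output: emit just the raw ALERTS series and close the
	// OpenMetrics stream, leaving the derived health metrics to the analyzer
	// under test.
	for t := start; !t.After(end); t = t.Add(step) {
		fmt.Printf("ALERTS{alertname=%q,alertstate=%q,severity=%q} 1 %d\n",
			"KubePodCrashLooping", "firing", "warning", t.Unix())
	}
	fmt.Println("# EOF")
}
```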

2 changes: 1 addition & 1 deletion test.mk
@@ -52,7 +52,7 @@ undeploy-integration:
## test-integration> run integration tests (assumes deployment exists)
.PHONY: test-integration
test-integration:
$(GINKGO) -v ./test/integration/...
$(GINKGO) -v --label-filter="!stress&&!stress-simulate" ./test/integration/...

## help-integration> show integration testing workflow and related targets
.PHONY: help-integration
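The new label filter assumes the stress specs opt in via Ginkgo v2 labels; a hypothetical spec (spec text and body are illustrative, not from this PR) that the default test-integration target would now skip:

```go
package integration

import (
	. "github.com/onsi/ginkgo/v2"
)

// Specs carrying either label are excluded by
// --label-filter="!stress&&!stress-simulate".
var _ = Describe("incident detection under sustained load", Label("stress"), func() {
	It("groups a large burst of simulated alerts", Label("stress-simulate"), func() {
		// long-running scenario omitted
	})
})
```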
7 changes: 7 additions & 0 deletions test/integration/fixtures/labels.go
@@ -0,0 +1,7 @@
package fixtures

// Common test labels used for resource identification and cleanup.
const (
StressTestLabel = "test-type=stress"
CrashLoopTestLabel = "test-type=crashloop"
)
57 changes: 31 additions & 26 deletions test/integration/fixtures/templates.go
@@ -9,45 +9,50 @@ import (
//go:embed testdata/*.yaml
var testDataFS embed.FS

// Template paths relative to testdata directory.
const (
DeploymentFile = "testdata/crashing_deployment.yaml"
RuleFile = "testdata/crashloop_prometheusrule.yaml"
DeploymentTemplate = "testdata/crashing_deployment.yaml"
PrometheusRuleTemplate = "testdata/crashloop_prometheusrule.yaml"
)

// DeploymentReplacements returns the standard replacements for a deployment template.
func DeploymentReplacements(name, namespace, testType string) map[string]string {
return map[string]string{
"{{NAME}}": name,
"{{NAMESPACE}}": namespace,
"{{TEST_TYPE}}": testType,
}
}

// RuleReplacements returns the standard replacements for a PrometheusRule template.
func RuleReplacements(ruleName, alertName, podPrefix, testType string) map[string]string {
return map[string]string{
"{{RULE_NAME}}": ruleName,
"{{ALERT_NAME}}": alertName,
"{{POD_PREFIX}}": podPrefix,
"{{TEST_TYPE}}": testType,
}
}

// PodSelector returns the label selector for pods created by a deployment.
func PodSelector(deploymentName string) string {
return "app=" + deploymentName
}

// RenderDeployment loads the embedded deployment template and renders it with the given parameters.
func RenderDeployment(name, namespace, testType string) (string, error) {
content, err := testDataFS.ReadFile(DeploymentFile)
content, err := testDataFS.ReadFile(DeploymentTemplate)
if err != nil {
return "", err
}

data := map[string]string{
"Name": name,
"Namespace": namespace,
"TestType": testType,
}

return framework.Render("deployment", content, data)
return framework.RenderTemplate(string(content), DeploymentReplacements(name, namespace, testType)), nil
}

// RenderRule loads the embedded PrometheusRule template and renders it with the given parameters.
func RenderRule(ruleName, alertName, podPrefix, testType string) (string, error) {
content, err := testDataFS.ReadFile(RuleFile)
content, err := testDataFS.ReadFile(PrometheusRuleTemplate)
if err != nil {
return "", err
}

data := map[string]string{
"RuleName": ruleName,
"AlertName": alertName,
"PodPrefix": podPrefix,
"TestType": testType,
}

return framework.Render("rule", content, data)
}

// PodSelector returns the label selector for pods created by a deployment.
func PodSelector(deploymentName string) string {
return "app=" + deploymentName
return framework.RenderTemplate(string(content), RuleReplacements(ruleName, alertName, podPrefix, testType)), nil
}
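The body of framework.RenderTemplate is not part of this diff; the placeholder maps above suggest literal string substitution rather than text/template execution. A sketch under that assumption:

```go
package main

import (
	"fmt"
	"strings"
)

// renderTemplate is a guess at what framework.RenderTemplate does: replace each
// {{PLACEHOLDER}} occurrence literally, with no template parsing or escaping.
func renderTemplate(content string, replacements map[string]string) string {
	for placeholder, value := range replacements {
		content = strings.ReplaceAll(content, placeholder, value)
	}
	return content
}

func main() {
	manifest := "metadata:\n  name: {{NAME}}\n  namespace: {{NAMESPACE}}\n  labels:\n    test-type: {{TEST_TYPE}}\n"
	fmt.Print(renderTemplate(manifest, map[string]string{
		"{{NAME}}":      "crashloop-demo",
		"{{NAMESPACE}}": "e2e-tests",
		"{{TEST_TYPE}}": "crashloop",
	}))
}
```

Plain replacement keeps the YAML templates readable as near-valid YAML and sidesteps text/template escaping, at the cost of no check that every placeholder was filled.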
1 change: 1 addition & 0 deletions test/integration/framework/cluster.go
@@ -41,6 +41,7 @@ func (c *Cluster) DeleteByLabel(ctx context.Context, resourceType, labelSelector
return c.run(ctx, "delete", resourceType, "-l", labelSelector, "--ignore-not-found")
}

// IsGone checks if a resource no longer exists.
// Use this with Eventually to wait for deletion to complete.
func (c *Cluster) IsGone(ctx context.Context, resourceType, name string) (bool, error) {
err := c.run(ctx, "get", resourceType, name)
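The comment added to IsGone suggests pairing it with Gomega's Eventually; a hypothetical teardown helper combining it with DeleteByLabel and the new label constants. The package/import preamble is omitted; it assumes a Gomega dot-import plus this repo's framework and fixtures packages, and the resource type names are guesses:

```go
// Teardown sketch: delete everything carrying the stress label, then poll
// until the named deployment has actually disappeared before the next spec.
func cleanupStress(ctx context.Context, cluster *framework.Cluster, deploymentName string) {
	Expect(cluster.DeleteByLabel(ctx, "deployments", fixtures.StressTestLabel)).To(Succeed())
	Expect(cluster.DeleteByLabel(ctx, "prometheusrules", fixtures.StressTestLabel)).To(Succeed())

	// Eventually retries the probe; the error return must stay nil and the
	// bool must become true within the timeout.
	Eventually(func() (bool, error) {
		return cluster.IsGone(ctx, "deployment", deploymentName)
	}).WithTimeout(2 * time.Minute).Should(BeTrue())
}
```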
18 changes: 9 additions & 9 deletions test/integration/framework/matchers.go
@@ -77,7 +77,7 @@ type validIncidentMatcher struct {
missing []string
empty []string
invalid []string
incident Incident
incident *Incident
}

var requiredIncidentLabels = []string{
@@ -90,9 +90,9 @@ var requiredIncidentLabels = []string{
}

func (m *validIncidentMatcher) Match(actual interface{}) (bool, error) {
incident, ok := actual.(Incident)
incident, ok := actual.(*Incident)
if !ok {
return false, fmt.Errorf("BeValidIncident expects Incident, got %T", actual)
return false, fmt.Errorf("BeValidIncident expects *Incident, got %T", actual)
}

if incident == nil {
@@ -105,7 +105,7 @@ func (m *validIncidentMatcher) Match(actual interface{}) (bool, error) {
m.invalid = nil

for _, label := range requiredIncidentLabels {
value, exists := incident[label]
value, exists := incident.Labels[label]
if !exists {
m.missing = append(m.missing, label)
} else if value == "" {
@@ -114,14 +114,14 @@ func (m *validIncidentMatcher) Match(actual interface{}) (bool, error) {
}

// Validate layer is a known OpenShift layer
if layer, exists := incident["layer"]; exists && layer != "" {
if layer, exists := incident.Labels["layer"]; exists && layer != "" {
if _, ok := ValidLayers[layer]; !ok {
m.invalid = append(m.invalid, fmt.Sprintf("layer=%q", layer))
}
}

// Validate component is a known OpenShift component
if component, exists := incident["component"]; exists && component != "" {
if component, exists := incident.Labels["component"]; exists && component != "" {
if _, ok := ValidComponents[component]; !ok {
m.invalid = append(m.invalid, fmt.Sprintf("component=%q", component))
}
@@ -147,19 +147,19 @@ func (m *validIncidentMatcher) FailureMessage(actual interface{}) string {
parts = append(parts, fmt.Sprintf("invalid values: %v", m.invalid))
}

alertname := m.incident["src_alertname"]
alertname := m.incident.Labels["src_alertname"]
if alertname == "" {
alertname = "(unknown)"
}

return fmt.Sprintf("Expected incident %s to be valid, but: %s\nAll labels: %v",
alertname, strings.Join(parts, "; "), m.incident)
alertname, strings.Join(parts, "; "), m.incident.Labels)
}

func (m *validIncidentMatcher) NegatedFailureMessage(actual interface{}) string {
alertname := "(unknown)"
if m.incident != nil {
if name := m.incident["src_alertname"]; name != "" {
if name := m.incident.Labels["src_alertname"]; name != "" {
alertname = name
}
}
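The matcher now takes a pointer and reads labels through a field, which implies Incident changed from a plain label map to a struct; a guess at the minimal shape (the real definition lives elsewhere in the framework package and may carry more fields):

```go
// Assumed shape only: enough to support the incident.Labels["src_alertname"]
// lookups used by the matcher above.
type Incident struct {
	Labels map[string]string
}
```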