Merge pull request kubernetes#127154 from macsko/check_if_inflight_events_empty_in_testcase_end_scheduler_perf

k8s-ci-robot · web-flow · commit 0c5e832aa2bb · 2024-09-09T20:43:33.000+01:00
Check if InFlightEvents is empty after scheduler_perf workload
diff --git a/pkg/scheduler/framework/events.go b/pkg/scheduler/framework/events.go
@@ -105,6 +105,42 @@ var (
 	WildCardEvent = ClusterEvent{Resource: WildCard, ActionType: All, Label: "WildCardEvent"}
 	// UnschedulableTimeout is the event when a pod stays in unschedulable for longer than timeout.
 	UnschedulableTimeout = ClusterEvent{Resource: WildCard, ActionType: All, Label: "UnschedulableTimeout"}
+	// AllEvents contains all events defined above; an event added above should also be listed here.
+	AllEvents = []ClusterEvent{
+		AssignedPodAdd,
+		NodeAdd,
+		NodeDelete,
+		AssignedPodUpdate,
+		UnscheduledPodAdd,
+		UnscheduledPodUpdate,
+		UnscheduledPodDelete,
+		assignedPodOtherUpdate,
+		AssignedPodDelete,
+		PodRequestScaledDown,
+		PodLabelChange,
+		PodTolerationChange,
+		PodSchedulingGateEliminatedChange,
+		NodeSpecUnschedulableChange,
+		NodeAllocatableChange,
+		NodeLabelChange,
+		NodeAnnotationChange,
+		NodeTaintChange,
+		NodeConditionChange,
+		PvAdd,
+		PvUpdate,
+		PvcAdd,
+		PvcUpdate,
+		StorageClassAdd,
+		StorageClassUpdate,
+		CSINodeAdd,
+		CSINodeUpdate,
+		CSIDriverAdd,
+		CSIDriverUpdate,
+		CSIStorageCapacityAdd,
+		CSIStorageCapacityUpdate,
+		WildCardEvent,
+		UnschedulableTimeout,
+	}
 )
 
 // PodSchedulingPropertiesChange interprets the update of a pod and returns corresponding UpdatePodXYZ event(s).
diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go
@@ -52,10 +52,13 @@ import (
 	featuregatetesting "k8s.io/component-base/featuregate/testing"
 	logsapi "k8s.io/component-base/logs/api/v1"
 	"k8s.io/component-base/metrics/legacyregistry"
+	"k8s.io/component-base/metrics/testutil"
 	"k8s.io/klog/v2"
+	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config/scheme"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
+	schedframework "k8s.io/kubernetes/pkg/scheduler/framework"
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
 	frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
 	"k8s.io/kubernetes/pkg/scheduler/metrics"
@@ -927,6 +930,13 @@ func RunBenchmarkPerfScheduling(b *testing.B, outOfTreePluginRegistry frameworkr
 						}
 					}
 
+					if tc.FeatureGates[features.SchedulerQueueingHints] {
+						// In any case, we should make sure InFlightEvents is empty after running the scenario.
+						if err = checkEmptyInFlightEvents(); err != nil {
+							tCtx.Errorf("%s: %s", w.Name, err)
+						}
+					}
+
 					// Reset metrics to prevent metrics generated in current workload gets
 					// carried over to the next workload.
 					legacyregistry.Reset()
@@ -1027,6 +1037,23 @@ func compareMetricWithThreshold(items []DataItem, threshold float64, metricSelec
 	return nil
 }
 
+// checkEmptyInFlightEvents errors if the InFlightEvents gauge is unreadable or non-zero for PodPoppedInFlightEvent or any AllEvents label.
+func checkEmptyInFlightEvents() error {
+	labels := []string{metrics.PodPoppedInFlightEvent}
+	for _, event := range schedframework.AllEvents {
+		labels = append(labels, event.Label)
+	}
+	for _, label := range labels {
+		switch value, err := testutil.GetGaugeMetricValue(metrics.InFlightEvents.WithLabelValues(label)); {
+		case err != nil:
+			return fmt.Errorf("failed to get InFlightEvents metric for label %s: %w", label, err)
+		case value > 0:
+			return fmt.Errorf("InFlightEvents for label %s should be empty, but has %v items", label, value)
+		}
+	}
+	return nil
+}
+
 func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory) []DataItem {
 	b, benchmarking := tCtx.TB().(*testing.B)
 	if benchmarking {
@@ -1139,7 +1166,10 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact
 				for _, collector := range collectors {
 					// Need loop-local variable for function below.
 					collector := collector
-					collector.init()
+					err = collector.init()
+					if err != nil {
+						tCtx.Fatalf("op %d: Failed to initialize data collector: %v", opIndex, err)
+					}
 					collectorWG.Add(1)
 					go func() {
 						defer collectorWG.Done()
@@ -1205,13 +1235,6 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact
 				}()
 			}
 
-			if !concreteOp.SkipWaitToCompletion {
-				// SkipWaitToCompletion=false indicates this step has waited for the Pods to be scheduled.
-				// So we reset the metrics in global registry; otherwise metrics gathered in this step
-				// will be carried over to next step.
-				legacyregistry.Reset()
-			}
-
 		case *churnOp:
 			var namespace string
 			if concreteOp.Namespace != nil {
@@ -1376,7 +1399,7 @@ func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsP
 }
 
 type testDataCollector interface {
-	init()
+	init() error
 	run(tCtx ktesting.TContext)
 	collect() []DataItem
 }
diff --git a/test/integration/scheduler_perf/scheduler_test.go b/test/integration/scheduler_perf/scheduler_test.go
@@ -18,6 +18,9 @@ package benchmark
 
 import (
 	"testing"
+
+	"k8s.io/component-base/metrics/legacyregistry"
+	"k8s.io/kubernetes/pkg/features"
 )
 
 func TestScheduling(t *testing.T) {
@@ -43,6 +46,17 @@ func TestScheduling(t *testing.T) {
 					informerFactory, tCtx := setupTestCase(t, tc, nil, nil)
 
 					runWorkload(tCtx, tc, w, informerFactory)
+
+					if tc.FeatureGates[features.SchedulerQueueingHints] {
+						// In any case, we should make sure InFlightEvents is empty after running the scenario.
+						if err = checkEmptyInFlightEvents(); err != nil {
+							tCtx.Errorf("%s: %s", w.Name, err)
+						}
+					}
+
+					// Reset metrics so that metrics generated in the current workload are not
+					// carried over to the next workload.
+					legacyregistry.Reset()
 				})
 			}
 		})
diff --git a/test/integration/scheduler_perf/util.go b/test/integration/scheduler_perf/util.go
@@ -263,9 +263,19 @@ func newMetricsCollector(config *metricsCollectorConfig, labels map[string]strin
 	}
 }
 
-func (mc *metricsCollector) init() {
+func (mc *metricsCollector) init() error {
 	// Reset the metrics so that the measurements do not interfere with those collected during the previous steps.
-	legacyregistry.Reset()
+	m, err := legacyregistry.DefaultGatherer.Gather()
+	if err != nil {
+		return fmt.Errorf("failed to gather metrics to reset: %w", err)
+	}
+	for _, mFamily := range m {
+		// Reset only metrics defined in the collector. NOTE(review): Gather() presumably returns snapshot copies, so Reset() may not clear the registered metrics — TODO confirm.
+		if _, ok := mc.Metrics[mFamily.GetName()]; ok {
+			mFamily.Reset()
+		}
+	}
+	return nil
 }
 
 func (*metricsCollector) run(tCtx ktesting.TContext) {
@@ -381,7 +391,8 @@ func newThroughputCollector(podInformer coreinformers.PodInformer, labels map[st
 	}
 }
 
-func (tc *throughputCollector) init() {
+func (tc *throughputCollector) init() error {
+	return nil
 }
 
 func (tc *throughputCollector) run(tCtx ktesting.TContext) {

Original file line number	Diff line number	Diff line change
`@@ -263,9 +263,19 @@ func newMetricsCollector(config *metricsCollectorConfig, labels map[string]strin`
`263`	`263`	`}`
`264`	`264`	`}`
`265`	`265`
`266`		`-func (mc *metricsCollector) init() {`
	`266`	`+func (mc *metricsCollector) init() error {`
`267`	`267`	`// Reset the metrics so that the measurements do not interfere with those collected during the previous steps.`
`268`		`- legacyregistry.Reset()`
	`268`	`+ m, err := legacyregistry.DefaultGatherer.Gather()`
	`269`	`+ if err != nil {`
	`270`	`+ return fmt.Errorf("failed to gather metrics to reset: %w", err)`
	`271`	`+ }`
	`272`	`+ for _, mFamily := range m {`
	`273`	`+ // Reset only metrics defined in the collector.`
	`274`	`+ if _, ok := mc.Metrics[mFamily.GetName()]; ok {`
	`275`	`+ mFamily.Reset()`
	`276`	`+ }`
	`277`	`+ }`
	`278`	`+ return nil`
`269`	`279`	`}`
`270`	`280`
`271`	`281`	`func (*metricsCollector) run(tCtx ktesting.TContext) {`
`@@ -381,7 +391,8 @@ func newThroughputCollector(podInformer coreinformers.PodInformer, labels map[st`
`381`	`391`	`}`
`382`	`392`	`}`
`383`	`393`
`384`		`-func (tc *throughputCollector) init() {`
	`394`	`+func (tc *throughputCollector) init() error {`
	`395`	`+ return nil`
`385`	`396`	`}`
`386`	`397`
`387`	`398`	`func (tc *throughputCollector) run(tCtx ktesting.TContext) {`