Merge pull request kubernetes#128482 from sanposhiho/scheduler-perf-ff

k8s-ci-robot · web-flow · commit 2bb886ce2a6a · 2024-11-05T12:15:30.000Z
fix: register QHint metrics only when SchedulerQueueingHints is enabled
diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go
@@ -160,21 +160,24 @@ var (
 					values: metrics.ExtentionPoints,
 				},
 			},
-			"scheduler_queueing_hint_execution_duration_seconds": {
-				{
-					label:  pluginLabelName,
-					values: PluginNames,
-				},
-				{
-					label:  eventLabelName,
-					values: schedframework.AllClusterEventLabels(),
-				},
+		},
+	}
+
+	qHintMetrics = map[string][]*labelValues{
+		"scheduler_queueing_hint_execution_duration_seconds": {
+			{
+				label:  pluginLabelName,
+				values: PluginNames,
 			},
-			"scheduler_event_handling_duration_seconds": {
-				{
-					label:  eventLabelName,
-					values: schedframework.AllClusterEventLabels(),
-				},
+			{
+				label:  eventLabelName,
+				values: schedframework.AllClusterEventLabels(),
+			},
+		},
+		"scheduler_event_handling_duration_seconds": {
+			{
+				label:  eventLabelName,
+				values: schedframework.AllClusterEventLabels(),
 			},
 		},
 	}
@@ -245,6 +248,18 @@ func InitTests() error {
 	return logsapi.ValidateAndApply(LoggingConfig, LoggingFeatureGate)
 }
 
+func registerQHintMetrics() {
+	for k, v := range qHintMetrics {
+		defaultMetricsCollectorConfig.Metrics[k] = v
+	}
+}
+
+func unregisterQHintMetrics() {
+	for k := range qHintMetrics {
+		delete(defaultMetricsCollectorConfig.Metrics, k)
+	}
+}
+
 // testCase defines a set of test cases that intends to test the performance of
 // similar workloads of varying sizes with shared overall settings such as
 // feature gates and metrics collected.
@@ -1056,7 +1071,6 @@ func setupTestCase(t testing.TB, tc *testCase, featureGates map[featuregate.Feat
 		if err := logsapi.ValidateAndApplyWithOptions(LoggingConfig, opts, LoggingFeatureGate); err != nil {
 			t.Fatalf("Failed to apply the per-test logging configuration: %v", err)
 		}
-
 	}
 
 	// Ensure that there are no leaked
@@ -1080,6 +1094,13 @@ func setupTestCase(t testing.TB, tc *testCase, featureGates map[featuregate.Feat
 	timeout := 30 * time.Minute
 	tCtx = ktesting.WithTimeout(tCtx, timeout, fmt.Sprintf("timed out after the %s per-test timeout", timeout))
 
+	if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
+		registerQHintMetrics()
+		t.Cleanup(func() {
+			unregisterQHintMetrics()
+		})
+	}
+
 	return setupClusterForWorkload(tCtx, tc.SchedulerConfigPath, featureGates, outOfTreePluginRegistry)
 }
 
@@ -1135,6 +1156,14 @@ func RunBenchmarkPerfScheduling(b *testing.B, configFile string, topicName strin
 					featureGates := featureGatesMerge(tc.FeatureGates, w.FeatureGates)
 					informerFactory, tCtx := setupTestCase(b, tc, featureGates, output, outOfTreePluginRegistry)
 
+					// TODO(#93795): make sure each workload within a test case has a unique
+					// name? The name is used to identify the stats in benchmark reports.
+					// TODO(#94404): check for unused template parameters? Probably a typo.
+					err := w.isValid(tc.MetricsCollectorConfig)
+					if err != nil {
+						b.Fatalf("workload %s is not valid: %v", w.Name, err)
+					}
+
 					results := runWorkload(tCtx, tc, w, informerFactory)
 					dataItems.DataItems = append(dataItems.DataItems, results...)
 
@@ -1228,6 +1257,10 @@ func RunIntegrationPerfScheduling(t *testing.T, configFile string) {
 					}
 					featureGates := featureGatesMerge(tc.FeatureGates, w.FeatureGates)
 					informerFactory, tCtx := setupTestCase(t, tc, featureGates, nil, nil)
+					err := w.isValid(tc.MetricsCollectorConfig)
+					if err != nil {
+						t.Fatalf("workload %s is not valid: %v", w.Name, err)
+					}
 
 					runWorkload(tCtx, tc, w, informerFactory)
 
@@ -2145,15 +2178,6 @@ func validateTestCases(testCases []*testCase) error {
 		if !tc.collectsMetrics() {
 			return fmt.Errorf("%s: no op in the workload template collects metrics", tc.Name)
 		}
-		// TODO(#93795): make sure each workload within a test case has a unique
-		// name? The name is used to identify the stats in benchmark reports.
-		// TODO(#94404): check for unused template parameters? Probably a typo.
-		for _, w := range tc.Workloads {
-			err := w.isValid(tc.MetricsCollectorConfig)
-			if err != nil {
-				return err
-			}
-		}
 	}
 	return nil
 }