scheduler_perf: track and visualize progress over time

pohly · pohly · commit d100768d940c · 2024-09-25T11:09:15.000+02:00
This is useful to see whether pod scheduling happens in bursts and how it
behaves over time, which is relevant in particular for dynamic resource
allocation where it may become harder at the end to find the node which still
has resources available.

Besides "pods scheduled" it's also useful to know how many attempts were
needed, so schedule_attempts_total also gets sampled and stored.

To visualize the result of one or more test runs, use:

     gnuplot.sh *.dat
diff --git a/staging/src/k8s.io/component-base/metrics/testutil/metrics.go b/staging/src/k8s.io/component-base/metrics/testutil/metrics.go
@@ -258,12 +258,8 @@ func GetHistogramVecFromGatherer(gatherer metrics.Gatherer, metricName string, l
 	if err != nil {
 		return nil, err
 	}
-	for _, mFamily := range m {
-		if mFamily.GetName() == metricName {
-			metricFamily = mFamily
-			break
-		}
-	}
+
+	metricFamily = findMetricFamily(m, metricName)
 
 	if metricFamily == nil {
 		return nil, fmt.Errorf("metric %q not found", metricName)
@@ -433,3 +429,47 @@ func LabelsMatch(metric *dto.Metric, labelFilter map[string]string) bool {
 
 	return true
 }
+
+// GetCounterVecFromGatherer collects a counter that matches the given name
+// from a gatherer implementing k8s.io/component-base/metrics.Gatherer interface.
+// It returns all counter values that had a label with a certain name in a map
+// that uses the label value as keys.
+//
+// Used only for testing purposes where we need to gather metrics directly from a running binary (without metrics endpoint).
+func GetCounterValuesFromGatherer(gatherer metrics.Gatherer, metricName string, lvMap map[string]string, labelName string) (map[string]float64, error) {
+	m, err := gatherer.Gather()
+	if err != nil {
+		return nil, err
+	}
+
+	metricFamily := findMetricFamily(m, metricName)
+	if metricFamily == nil {
+		return nil, fmt.Errorf("metric %q not found", metricName)
+	}
+	if len(metricFamily.GetMetric()) == 0 {
+		return nil, fmt.Errorf("metric %q is empty", metricName)
+	}
+
+	values := make(map[string]float64)
+	for _, metric := range metricFamily.GetMetric() {
+		if LabelsMatch(metric, lvMap) {
+			if counter := metric.GetCounter(); counter != nil {
+				for _, labelPair := range metric.Label {
+					if labelPair.GetName() == labelName {
+						values[labelPair.GetValue()] = counter.GetValue()
+					}
+				}
+			}
+		}
+	}
+	return values, nil
+}
+
+func findMetricFamily(metricFamilies []*dto.MetricFamily, metricName string) *dto.MetricFamily {
+	for _, mFamily := range metricFamilies {
+		if mFamily.GetName() == metricName {
+			return mFamily
+		}
+	}
+	return nil
+}
diff --git a/staging/src/k8s.io/component-base/metrics/testutil/metrics_test.go b/staging/src/k8s.io/component-base/metrics/testutil/metrics_test.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"math"
 	"reflect"
+	"strings"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -591,3 +592,104 @@ func TestGetHistogramVecFromGatherer(t *testing.T) {
 		})
 	}
 }
+
+func TestGetCounterValuesFromGatherer(t *testing.T) {
+	namespace := "namespace"
+	subsystem := "subsystem"
+	name := "metric_test_name"
+	metricName := fmt.Sprintf("%s_%s_%s", namespace, subsystem, name)
+
+	tests := map[string]struct {
+		metricName string // Empty is replaced with valid name.
+		lvMap      map[string]string
+		labelName  string
+
+		wantCounterValues map[string]float64
+		wantErr           string
+	}{
+		"wrong-metric": {
+			metricName: "no-such-metric",
+			wantErr:    `metric "no-such-metric" not found`,
+		},
+
+		"none": {
+			metricName: metricName,
+			lvMap:      map[string]string{"no-such-label": "a"},
+
+			wantCounterValues: map[string]float64{},
+		},
+
+		"value1-0": {
+			metricName: metricName,
+			lvMap:      map[string]string{"label1": "value1-0"},
+			labelName:  "label2",
+
+			wantCounterValues: map[string]float64{"value2-0": 1.5, "value2-1": 2.5},
+		},
+
+		"value1-1": {
+			metricName: metricName,
+			lvMap:      map[string]string{"label1": "value1-1"},
+			labelName:  "label2",
+
+			wantCounterValues: map[string]float64{"value2-0": 3.5, "value2-1": 4.5},
+		},
+
+		"value1-1-value2-0-none": {
+			metricName: metricName,
+			lvMap:      map[string]string{"label1": "value1-1", "label2": "value2-0"},
+			labelName:  "none",
+
+			wantCounterValues: map[string]float64{},
+		},
+
+		"value1-0-value2-0-one": {
+			metricName: metricName,
+			lvMap:      map[string]string{"label1": "value1-0", "label2": "value2-0"},
+			labelName:  "label2",
+
+			wantCounterValues: map[string]float64{"value2-0": 1.5},
+		},
+	}
+	for name, tt := range tests {
+		t.Run(name, func(t *testing.T) {
+			// CounterVec has two labels defined.
+			labels := []string{"label1", "label2"}
+			counterOpts := &metrics.CounterOpts{
+				Namespace: "namespace",
+				Name:      "metric_test_name",
+				Subsystem: "subsystem",
+				Help:      "counter help message",
+			}
+			vec := metrics.NewCounterVec(counterOpts, labels)
+			// Use local registry
+			var registry = metrics.NewKubeRegistry()
+			var gather metrics.Gatherer = registry
+			registry.MustRegister(vec)
+			// Observe two metrics with same value for label1 but different value of label2.
+			vec.WithLabelValues("value1-0", "value2-0").Add(1.5)
+			vec.WithLabelValues("value1-0", "value2-1").Add(2.5)
+			vec.WithLabelValues("value1-1", "value2-0").Add(3.5)
+			vec.WithLabelValues("value1-1", "value2-1").Add(4.5)
+
+			// The check for empty metric apparently cannot be tested: registering
+			// a NewCounterVec with no values has the affect that it doesn't get
+			// returned, leading to "not found".
+
+			counterValues, err := GetCounterValuesFromGatherer(gather, tt.metricName, tt.lvMap, tt.labelName)
+			if err != nil {
+				if tt.wantErr != "" && !strings.Contains(err.Error(), tt.wantErr) {
+					t.Errorf("expected error %q, got instead: %v", tt.wantErr, err)
+				}
+				return
+			}
+			if tt.wantErr != "" {
+				t.Fatalf("expected error %q, got none", tt.wantErr)
+			}
+
+			if diff := cmp.Diff(tt.wantCounterValues, counterValues); diff != "" {
+				t.Errorf("Got unexpected HistogramVec (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
diff --git a/test/integration/scheduler_perf/README.md b/test/integration/scheduler_perf/README.md
@@ -175,3 +175,22 @@ the ci-benchmark-scheduler-perf periodic job will fail with an error log such as
 This allows to analyze which workload failed. Make sure that the failure is not an outlier 
 by checking multiple runs of the job. If the failures are not related to any regression, 
 but to an incorrect threshold setting, it is reasonable to decrease it.
+
+### Visualization
+
+Some support for visualizing progress over time is built into the
+benchmarks. The measurement operation which creates pods writes .dat files like
+this:
+
+     test/integration/scheduler_perf/SchedulingBasic_5000Nodes_2023-03-17T14:52:09Z.dat
+
+This file is in a text format that [gnuplot](http://www.gnuplot.info/) can
+read. A wrapper script selects some suitable parameters:
+
+     test/integration/scheduler_perf/gnuplot.sh test/integration/scheduler_perf/*.dat
+
+It plots in an interactive window by default. To write into a file, use
+
+    test/integration/scheduler_perf/gnuplot.sh \
+       -e 'set term png; set output "<output>.png"' \
+       test/integration/scheduler_perf/*.dat
diff --git a/test/integration/scheduler_perf/gnuplot.sh b/test/integration/scheduler_perf/gnuplot.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+
+# Copyright 2024 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Invoke this script with a list of *.dat and it'll plot them with gnuplot.
+# Any non-file parameter is passed through to gnuplot. By default,
+# an X11 window is used to display the result. To write into a file,
+# use
+#  -e "set term png; set output <output>.png"
+
+files=()
+args=( -e "set term x11 persist" )
+
+for i in "$@"; do
+    if [ -f "$i" ]; then
+        files+=("$i")
+    else
+        args+=("$i")
+    fi
+done
+
+(
+    cat <<EOF
+set ytics autofreq nomirror tc lt 1
+set xlabel 'measurement runtime [seconds]'
+set ylabel 'scheduling rate [pods/second]' tc lt 1
+set y2tics autofreq nomirror tc lt 2
+set y2label 'scheduling attempts per pod' tc lt 2
+
+# Derivative from https://stackoverflow.com/questions/15751226/how-can-i-plot-the-derivative-of-a-graph-in-gnuplot.
+d2(x,y) = (\$0 == 0) ? (x1 = x, y1 = y, 1/0) : (x2 = x1, x1 = x, y2 = y1, y1 = y, (y1-y2)/(x1-x2))
+dx = 0.25
+
+EOF
+    echo -n "plot "
+    for file in "${files[@]}"; do
+        echo -n "'${file}' using (\$1 - dx):(d2(\$1, \$2)) with linespoints title '$(basename "$file" .dat | sed -e 's/_/ /g') metric rate' axis x1y1, "
+        echo -n "'${file}' using (\$1 - dx):(d2(\$1, \$4)) with linespoints title '$(basename "$file" .dat | sed -e 's/_/ /g') observed rate' axis x1y1, "
+        echo -n "'${file}' using 1:(\$3/\$2) with linespoints title '$(basename "$file" .dat | sed -e 's/_/ /g') attempts' axis x1y2, "
+    done
+    echo
+) | tee /dev/stderr | gnuplot "${args[@]}" -
diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go
@@ -1095,6 +1095,30 @@ func RunBenchmarkPerfScheduling(b *testing.B, outOfTreePluginRegistry frameworkr
 					// Reset metrics to prevent metrics generated in current workload gets
 					// carried over to the next workload.
 					legacyregistry.Reset()
+
+					// Exactly one result is expected to contain the progress information.
+					for _, item := range results {
+						if len(item.progress) == 0 {
+							continue
+						}
+
+						destFile, err := dataFilename(strings.ReplaceAll(fmt.Sprintf("%s_%s_%s.dat", tc.Name, w.Name, runID), "/", "_"))
+						if err != nil {
+							b.Fatalf("prepare data file: %v", err)
+						}
+						f, err := os.Create(destFile)
+						if err != nil {
+							b.Fatalf("create data file: %v", err)
+						}
+
+						// Print progress over time.
+						for _, sample := range item.progress {
+							fmt.Fprintf(f, "%.1fs %d %d %d %f\n", sample.ts.Sub(item.start).Seconds(), sample.completed, sample.attempts, sample.observedTotal, sample.observedRate)
+						}
+						if err := f.Close(); err != nil {
+							b.Fatalf("closing data file: %v", err)
+						}
+					}
 				})
 			}
 		})
diff --git a/test/integration/scheduler_perf/util.go b/test/integration/scheduler_perf/util.go
@@ -63,6 +63,8 @@ const (
 
 var dataItemsDir = flag.String("data-items-dir", "", "destination directory for storing generated data items for perf dashboard")
 
+var runID = time.Now().Format(dateFormat)
+
 func newDefaultComponentConfig() (*config.KubeSchedulerConfiguration, error) {
 	gvk := kubeschedulerconfigv1.SchemeGroupVersion.WithKind("KubeSchedulerConfiguration")
 	cfg := config.KubeSchedulerConfiguration{}
@@ -180,6 +182,10 @@ type DataItem struct {
 	Unit string `json:"unit"`
 	// Labels is the labels of the data item.
 	Labels map[string]string `json:"labels,omitempty"`
+
+	// progress contains number of scheduled pods over time.
+	progress []podScheduling
+	start    time.Time
 }
 
 // DataItems is the data point set. It is the struct that perf dashboard expects.
@@ -188,6 +194,14 @@ type DataItems struct {
 	DataItems []DataItem `json:"dataItems"`
 }
 
+type podScheduling struct {
+	ts            time.Time
+	attempts      int
+	completed     int
+	observedTotal int
+	observedRate  float64
+}
+
 // makeBasePod creates a Pod object to be used as a template.
 func makeBasePod() *v1.Pod {
 	basePod := &v1.Pod{
@@ -241,6 +255,17 @@ func dataItems2JSONFile(dataItems DataItems, namePrefix string) error {
 	return os.WriteFile(destFile, formatted.Bytes(), 0644)
 }
 
+func dataFilename(destFile string) (string, error) {
+	if *dataItemsDir != "" {
+		// Ensure the "dataItemsDir" path is valid.
+		if err := os.MkdirAll(*dataItemsDir, 0750); err != nil {
+			return "", fmt.Errorf("dataItemsDir path %v does not exist and cannot be created: %w", *dataItemsDir, err)
+		}
+		destFile = path.Join(*dataItemsDir, destFile)
+	}
+	return destFile, nil
+}
+
 type labelValues struct {
 	label  string
 	values []string
@@ -383,6 +408,9 @@ type throughputCollector struct {
 	labels                map[string]string
 	namespaces            sets.Set[string]
 	errorMargin           float64
+
+	progress []podScheduling
+	start    time.Time
 }
 
 func newThroughputCollector(podInformer coreinformers.PodInformer, labels map[string]string, namespaces []string, errorMargin float64) *throughputCollector {
@@ -492,6 +520,7 @@ func (tc *throughputCollector) run(tCtx ktesting.TContext) {
 				// sampling and creating pods get started independently.
 				lastScheduledCount = scheduled
 				lastSampleTime = now
+				tc.start = now
 				continue
 			}
 
@@ -524,6 +553,20 @@ func (tc *throughputCollector) run(tCtx ktesting.TContext) {
 			for i := 0; i <= skipped; i++ {
 				tc.schedulingThroughputs = append(tc.schedulingThroughputs, throughput)
 			}
+
+			// Record the metric sample.
+			counters, err := testutil.GetCounterValuesFromGatherer(legacyregistry.DefaultGatherer, "scheduler_schedule_attempts_total", map[string]string{"profile": "default-scheduler"}, "result")
+			if err != nil {
+				klog.Error(err)
+			}
+			tc.progress = append(tc.progress, podScheduling{
+				ts:            now,
+				attempts:      int(counters["unschedulable"] + counters["error"] + counters["scheduled"]),
+				completed:     int(counters["scheduled"]),
+				observedTotal: scheduled,
+				observedRate:  throughput,
+			})
+
 			lastScheduledCount = scheduled
 			klog.Infof("%d pods have been scheduled successfully", lastScheduledCount)
 			skipped = 0
@@ -533,7 +576,11 @@ func (tc *throughputCollector) run(tCtx ktesting.TContext) {
 }
 
 func (tc *throughputCollector) collect() []DataItem {
-	throughputSummary := DataItem{Labels: tc.labels}
+	throughputSummary := DataItem{
+		Labels:   tc.labels,
+		progress: tc.progress,
+		start:    tc.start,
+	}
 	if length := len(tc.schedulingThroughputs); length > 0 {
 		sort.Float64s(tc.schedulingThroughputs)
 		sum := 0.0