Skip to content

Commit 5fc4e71

Browse files
authored
Merge pull request kubernetes#127499 from pohly/scheduler-perf-updates
scheduler_perf: updates to enhance performance testing of DRA
2 parents 75214d1 + d100768 commit 5fc4e71

File tree

9 files changed

+871
-94
lines changed

9 files changed

+871
-94
lines changed

staging/src/k8s.io/component-base/metrics/testutil/metrics.go

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -258,12 +258,8 @@ func GetHistogramVecFromGatherer(gatherer metrics.Gatherer, metricName string, l
258258
if err != nil {
259259
return nil, err
260260
}
261-
for _, mFamily := range m {
262-
if mFamily.GetName() == metricName {
263-
metricFamily = mFamily
264-
break
265-
}
266-
}
261+
262+
metricFamily = findMetricFamily(m, metricName)
267263

268264
if metricFamily == nil {
269265
return nil, fmt.Errorf("metric %q not found", metricName)
@@ -433,3 +429,47 @@ func LabelsMatch(metric *dto.Metric, labelFilter map[string]string) bool {
433429

434430
return true
435431
}
432+
433+
// GetCounterVecFromGatherer collects a counter that matches the given name
434+
// from a gatherer implementing k8s.io/component-base/metrics.Gatherer interface.
435+
// It returns all counter values that had a label with a certain name in a map
436+
// that uses the label value as keys.
437+
//
438+
// Used only for testing purposes where we need to gather metrics directly from a running binary (without metrics endpoint).
439+
func GetCounterValuesFromGatherer(gatherer metrics.Gatherer, metricName string, lvMap map[string]string, labelName string) (map[string]float64, error) {
440+
m, err := gatherer.Gather()
441+
if err != nil {
442+
return nil, err
443+
}
444+
445+
metricFamily := findMetricFamily(m, metricName)
446+
if metricFamily == nil {
447+
return nil, fmt.Errorf("metric %q not found", metricName)
448+
}
449+
if len(metricFamily.GetMetric()) == 0 {
450+
return nil, fmt.Errorf("metric %q is empty", metricName)
451+
}
452+
453+
values := make(map[string]float64)
454+
for _, metric := range metricFamily.GetMetric() {
455+
if LabelsMatch(metric, lvMap) {
456+
if counter := metric.GetCounter(); counter != nil {
457+
for _, labelPair := range metric.Label {
458+
if labelPair.GetName() == labelName {
459+
values[labelPair.GetValue()] = counter.GetValue()
460+
}
461+
}
462+
}
463+
}
464+
}
465+
return values, nil
466+
}
467+
468+
func findMetricFamily(metricFamilies []*dto.MetricFamily, metricName string) *dto.MetricFamily {
469+
for _, mFamily := range metricFamilies {
470+
if mFamily.GetName() == metricName {
471+
return mFamily
472+
}
473+
}
474+
return nil
475+
}

staging/src/k8s.io/component-base/metrics/testutil/metrics_test.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"fmt"
2121
"math"
2222
"reflect"
23+
"strings"
2324
"testing"
2425

2526
"github.com/google/go-cmp/cmp"
@@ -591,3 +592,104 @@ func TestGetHistogramVecFromGatherer(t *testing.T) {
591592
})
592593
}
593594
}
595+
596+
func TestGetCounterValuesFromGatherer(t *testing.T) {
597+
namespace := "namespace"
598+
subsystem := "subsystem"
599+
name := "metric_test_name"
600+
metricName := fmt.Sprintf("%s_%s_%s", namespace, subsystem, name)
601+
602+
tests := map[string]struct {
603+
metricName string // Empty is replaced with valid name.
604+
lvMap map[string]string
605+
labelName string
606+
607+
wantCounterValues map[string]float64
608+
wantErr string
609+
}{
610+
"wrong-metric": {
611+
metricName: "no-such-metric",
612+
wantErr: `metric "no-such-metric" not found`,
613+
},
614+
615+
"none": {
616+
metricName: metricName,
617+
lvMap: map[string]string{"no-such-label": "a"},
618+
619+
wantCounterValues: map[string]float64{},
620+
},
621+
622+
"value1-0": {
623+
metricName: metricName,
624+
lvMap: map[string]string{"label1": "value1-0"},
625+
labelName: "label2",
626+
627+
wantCounterValues: map[string]float64{"value2-0": 1.5, "value2-1": 2.5},
628+
},
629+
630+
"value1-1": {
631+
metricName: metricName,
632+
lvMap: map[string]string{"label1": "value1-1"},
633+
labelName: "label2",
634+
635+
wantCounterValues: map[string]float64{"value2-0": 3.5, "value2-1": 4.5},
636+
},
637+
638+
"value1-1-value2-0-none": {
639+
metricName: metricName,
640+
lvMap: map[string]string{"label1": "value1-1", "label2": "value2-0"},
641+
labelName: "none",
642+
643+
wantCounterValues: map[string]float64{},
644+
},
645+
646+
"value1-0-value2-0-one": {
647+
metricName: metricName,
648+
lvMap: map[string]string{"label1": "value1-0", "label2": "value2-0"},
649+
labelName: "label2",
650+
651+
wantCounterValues: map[string]float64{"value2-0": 1.5},
652+
},
653+
}
654+
for name, tt := range tests {
655+
t.Run(name, func(t *testing.T) {
656+
// CounterVec has two labels defined.
657+
labels := []string{"label1", "label2"}
658+
counterOpts := &metrics.CounterOpts{
659+
Namespace: "namespace",
660+
Name: "metric_test_name",
661+
Subsystem: "subsystem",
662+
Help: "counter help message",
663+
}
664+
vec := metrics.NewCounterVec(counterOpts, labels)
665+
// Use local registry
666+
var registry = metrics.NewKubeRegistry()
667+
var gather metrics.Gatherer = registry
668+
registry.MustRegister(vec)
669+
// Observe two metrics with same value for label1 but different value of label2.
670+
vec.WithLabelValues("value1-0", "value2-0").Add(1.5)
671+
vec.WithLabelValues("value1-0", "value2-1").Add(2.5)
672+
vec.WithLabelValues("value1-1", "value2-0").Add(3.5)
673+
vec.WithLabelValues("value1-1", "value2-1").Add(4.5)
674+
675+
// The check for empty metric apparently cannot be tested: registering
676+
// a NewCounterVec with no values has the affect that it doesn't get
677+
// returned, leading to "not found".
678+
679+
counterValues, err := GetCounterValuesFromGatherer(gather, tt.metricName, tt.lvMap, tt.labelName)
680+
if err != nil {
681+
if tt.wantErr != "" && !strings.Contains(err.Error(), tt.wantErr) {
682+
t.Errorf("expected error %q, got instead: %v", tt.wantErr, err)
683+
}
684+
return
685+
}
686+
if tt.wantErr != "" {
687+
t.Fatalf("expected error %q, got none", tt.wantErr)
688+
}
689+
690+
if diff := cmp.Diff(tt.wantCounterValues, counterValues); diff != "" {
691+
t.Errorf("Got unexpected HistogramVec (-want +got):\n%s", diff)
692+
}
693+
})
694+
}
695+
}

test/integration/scheduler_perf/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,22 @@ the ci-benchmark-scheduler-perf periodic job will fail with an error log such as
175175
This makes it possible to analyze which workload failed. Make sure that the failure is not an outlier
176176
by checking multiple runs of the job. If the failures are not related to any regression,
177177
but to an incorrect threshold setting, it is reasonable to decrease it.
178+
179+
### Visualization
180+
181+
Some support for visualizing progress over time is built into the
182+
benchmarks. The measurement operation which creates pods writes .dat files like
183+
this:
184+
185+
test/integration/scheduler_perf/SchedulingBasic_5000Nodes_2023-03-17T14:52:09Z.dat
186+
187+
This file is in a text format that [gnuplot](http://www.gnuplot.info/) can
188+
read. A wrapper script selects some suitable parameters:
189+
190+
test/integration/scheduler_perf/gnuplot.sh test/integration/scheduler_perf/*.dat
191+
192+
It plots in an interactive window by default. To write into a file, use
193+
194+
test/integration/scheduler_perf/gnuplot.sh \
195+
-e 'set term png; set output "<output>.png"' \
196+
test/integration/scheduler_perf/*.dat

test/integration/scheduler_perf/config/performance-config.yaml

Lines changed: 122 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1167,7 +1167,9 @@
11671167
maxClaimsPerNode: 20
11681168

11691169
# SchedulingWithResourceClaimTemplateStructured uses a ResourceClaimTemplate
1170-
# and dynamically creates ResourceClaim instances for each pod.
1170+
# and dynamically creates ResourceClaim instances for each pod. Node, pod and
1171+
# device counts are chosen so that the cluster gets filled up completely.
1172+
#
11711173
# The driver uses structured parameters.
11721174
- name: SchedulingWithResourceClaimTemplateStructured
11731175
featureGates:
@@ -1234,6 +1236,125 @@
12341236
measurePods: 2500
12351237
maxClaimsPerNode: 10
12361238

1239+
# SteadyStateClusterResourceClaimTemplateStructured uses a ResourceClaimTemplate and
1240+
# dynamically creates ResourceClaim instances for each pod. It creates ten
1241+
# pods, waits for them to be scheduled, deletes them, and starts again,
1242+
# so the cluster remains at the same level of utilization.
1243+
#
1244+
# The number of already allocated claims can be varied, thus simulating
1245+
# various degrees of pre-existing resource utilization.
1246+
#
1247+
# The driver uses structured parameters.
1248+
- name: SteadyStateClusterResourceClaimTemplateStructured
1249+
featureGates:
1250+
DynamicResourceAllocation: true
1251+
# SchedulerQueueingHints: true
1252+
workloadTemplate:
1253+
- opcode: createNodes
1254+
countParam: $nodesWithoutDRA
1255+
- opcode: createNodes
1256+
nodeTemplatePath: config/dra/node-with-dra-test-driver.yaml
1257+
countParam: $nodesWithDRA
1258+
- opcode: createResourceDriver
1259+
driverName: test-driver.cdi.k8s.io
1260+
nodes: scheduler-perf-dra-*
1261+
maxClaimsPerNodeParam: $maxClaimsPerNode
1262+
structuredParameters: true
1263+
- opcode: createAny
1264+
templatePath: config/dra/deviceclass-structured.yaml
1265+
- opcode: createAny
1266+
templatePath: config/dra/resourceclaim-structured.yaml
1267+
countParam: $initClaims
1268+
namespace: init
1269+
- opcode: allocResourceClaims
1270+
namespace: init
1271+
- opcode: createAny
1272+
templatePath: config/dra/resourceclaimtemplate-structured.yaml
1273+
namespace: test
1274+
- opcode: createPods
1275+
namespace: test
1276+
count: 10
1277+
steadyState: true
1278+
durationParam: $duration
1279+
podTemplatePath: config/dra/pod-with-claim-template.yaml
1280+
collectMetrics: true
1281+
workloads:
1282+
- name: fast
1283+
labels: [integration-test, fast, short]
1284+
params:
1285+
# This testcase runs through all code paths without
1286+
# taking too long overall.
1287+
nodesWithDRA: 1
1288+
nodesWithoutDRA: 1
1289+
initClaims: 0
1290+
maxClaimsPerNode: 10
1291+
duration: 2s
1292+
- name: empty_100nodes
1293+
params:
1294+
nodesWithDRA: 100
1295+
nodesWithoutDRA: 0
1296+
initClaims: 0
1297+
maxClaimsPerNode: 10
1298+
duration: 10s
1299+
- name: empty_200nodes
1300+
params:
1301+
nodesWithDRA: 200
1302+
nodesWithoutDRA: 0
1303+
initClaims: 0
1304+
maxClaimsPerNode: 10
1305+
duration: 10s
1306+
- name: empty_500nodes
1307+
params:
1308+
nodesWithDRA: 500
1309+
nodesWithoutDRA: 0
1310+
initClaims: 0
1311+
maxClaimsPerNode: 10
1312+
duration: 10s
1313+
# In the "half" scenarios, half of the devices are in use.
1314+
- name: half_100nodes
1315+
params:
1316+
nodesWithDRA: 100
1317+
nodesWithoutDRA: 0
1318+
initClaims: 500
1319+
maxClaimsPerNode: 10
1320+
duration: 10s
1321+
- name: half_200nodes
1322+
params:
1323+
nodesWithDRA: 200
1324+
nodesWithoutDRA: 0
1325+
initClaims: 1000
1326+
maxClaimsPerNode: 10
1327+
duration: 10s
1328+
- name: half_500nodes
1329+
params:
1330+
nodesWithDRA: 500
1331+
nodesWithoutDRA: 0
1332+
initClaims: 2500
1333+
maxClaimsPerNode: 10
1334+
duration: 10s
1335+
# In the "full" scenarios, the cluster can accommodate exactly 10 additional pods.
1336+
- name: full_100nodes
1337+
params:
1338+
nodesWithDRA: 100
1339+
nodesWithoutDRA: 0
1340+
initClaims: 990
1341+
maxClaimsPerNode: 10
1342+
duration: 10s
1343+
- name: full_200nodes
1344+
params:
1345+
nodesWithDRA: 200
1346+
nodesWithoutDRA: 0
1347+
initClaims: 1990
1348+
maxClaimsPerNode: 10
1349+
duration: 10s
1350+
- name: full_500nodes
1351+
params:
1352+
nodesWithDRA: 500
1353+
nodesWithoutDRA: 0
1354+
initClaims: 4990
1355+
maxClaimsPerNode: 10
1356+
duration: 10s
1357+
12371358
# SchedulingWithResourceClaimTemplate uses ResourceClaims
12381359
# with deterministic names that are shared between pods.
12391360
# There is a fixed ratio of 1:5 between claims and pods.

test/integration/scheduler_perf/create.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,6 @@ type createAny struct {
5656
var _ runnableOp = &createAny{}
5757

5858
func (c *createAny) isValid(allowParameterization bool) error {
59-
if c.Opcode != createAnyOpcode {
60-
return fmt.Errorf("invalid opcode %q; expected %q", c.Opcode, createAnyOpcode)
61-
}
6259
if c.TemplatePath == "" {
6360
return fmt.Errorf("TemplatePath must be set")
6461
}

0 commit comments

Comments
 (0)