
Commit b95f9c3

Merge pull request kubernetes#126282 from macsko/fix_scheduler_perf_tests_taking_too_long
Init etcd and apiserver per test case in scheduler_perf integration tests
2 parents: 6ac2067 + c15cdf7

File tree: 3 files changed (+33, -194 lines)

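The gist of the change: previously the benchmark and the integration test started etcd (and, for the test, a shared apiserver per scheduler configuration) once and relied on explicit cleanup code between workloads; now every test case/workload initializes its own etcd and apiserver through a new setupTestCase helper and lets the testing framework discard the state afterwards. A minimal, self-contained sketch of that pattern using only the standard library is below; fakeBackend and startBackend are hypothetical stand-ins for framework.StartEtcd plus the apiserver setup, not code from this commit.

```go
package example

import "testing"

// fakeBackend is a hypothetical stand-in for the etcd + apiserver pair that
// scheduler_perf now starts per test case.
type fakeBackend struct{ running bool }

func (b *fakeBackend) stop() { b.running = false }

// startBackend mirrors the shape of the new setupTestCase helper: start the
// expensive dependency for this (sub)test only and let the testing framework
// tear it down when the test finishes, so the next case sees fresh state.
func startBackend(tb testing.TB) *fakeBackend {
    b := &fakeBackend{running: true}
    tb.Cleanup(b.stop) // replaces explicit cross-test cleanup code
    return b
}

func TestPerCaseSetup(t *testing.T) {
    for _, name := range []string{"case-a", "case-b"} {
        t.Run(name, func(t *testing.T) {
            backend := startBackend(t) // per-test-case init, as setupTestCase does
            if !backend.running {
                t.Fatal("backend should be running for this test case")
            }
            // ... run the workload against the fresh backend ...
        })
    }
}
```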

test/integration/scheduler_perf/create.go

Lines changed: 0 additions & 13 deletions
@@ -24,7 +24,6 @@ import (
     "os"
     "time"
 
-    apierrors "k8s.io/apimachinery/pkg/api/errors"
     "k8s.io/apimachinery/pkg/api/meta"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -142,18 +141,6 @@ func (c *createAny) create(tCtx ktesting.TContext, env map[string]any) {
         }
         _, err = resourceClient.Create(tCtx, obj, options)
     }
-    if err == nil && shouldCleanup(tCtx) {
-        tCtx.CleanupCtx(func(tCtx ktesting.TContext) {
-            del := resourceClient.Delete
-            if mapping.Scope.Name() != meta.RESTScopeNameNamespace {
-                del = resourceClient.Namespace(c.Namespace).Delete
-            }
-            err := del(tCtx, obj.GetName(), metav1.DeleteOptions{})
-            if !apierrors.IsNotFound(err) {
-                tCtx.ExpectNoError(err, fmt.Sprintf("deleting %s.%s %s", obj.GetKind(), obj.GetAPIVersion(), klog.KObj(obj)))
-            }
-        })
-    }
     return err
 }
 // Retry, some errors (like CRD just created and type not ready for use yet) are temporary.

test/integration/scheduler_perf/scheduler_perf.go

Lines changed: 24 additions & 116 deletions
@@ -31,8 +31,6 @@ import (
     "testing"
     "time"
 
-    "github.com/google/go-cmp/cmp"
-
     v1 "k8s.io/api/core/v1"
     apierrors "k8s.io/apimachinery/pkg/api/errors"
     "k8s.io/apimachinery/pkg/api/meta"
@@ -764,31 +762,34 @@ func initTestOutput(tb testing.TB) io.Writer {
     return output
 }
 
-type cleanupKeyType struct{}
+var perfSchedulingLabelFilter = flag.String("perf-scheduling-label-filter", "performance", "comma-separated list of labels which a testcase must have (no prefix or +) or must not have (-), used by BenchmarkPerfScheduling")
 
-var cleanupKey = cleanupKeyType{}
+func setupTestCase(t testing.TB, tc *testCase, output io.Writer, outOfTreePluginRegistry frameworkruntime.Registry) (informers.SharedInformerFactory, ktesting.TContext) {
+    tCtx := ktesting.Init(t, initoption.PerTestOutput(*useTestingLog))
 
-// shouldCleanup returns true if a function should clean up resource in the
-// apiserver when the test is done. This is true for unit tests (etcd and
-// apiserver get reused) and false for benchmarks (each benchmark starts with a
-// clean state, so cleaning up just wastes time).
-//
-// The default if not explicitly set in the context is true.
-func shouldCleanup(ctx context.Context) bool {
-    val := ctx.Value(cleanupKey)
-    if enabled, ok := val.(bool); ok {
-        return enabled
+    // Ensure that there are no leaked
+    // goroutines. They could influence
+    // performance of the next benchmark.
+    // This must *after* RedirectKlog
+    // because then during cleanup, the
+    // test will wait for goroutines to
+    // quit *before* restoring klog settings.
+    framework.GoleakCheck(t)
+
+    // Now that we are ready to run, start
+    // etcd.
+    framework.StartEtcd(t, output)
+
+    for feature, flag := range tc.FeatureGates {
+        featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, feature, flag)
     }
-    return true
-}
 
-// withCleanup sets whether cleaning up resources in the apiserver
-// should be done. The default is true.
-func withCleanup(tCtx ktesting.TContext, enabled bool) ktesting.TContext {
-    return ktesting.WithValue(tCtx, cleanupKey, enabled)
-}
+    // 30 minutes should be plenty enough even for the 5000-node tests.
+    timeout := 30 * time.Minute
+    tCtx = ktesting.WithTimeout(tCtx, timeout, fmt.Sprintf("timed out after the %s per-test timeout", timeout))
 
-var perfSchedulingLabelFilter = flag.String("perf-scheduling-label-filter", "performance", "comma-separated list of labels which a testcase must have (no prefix or +) or must not have (-), used by BenchmarkPerfScheduling")
+    return setupClusterForWorkload(tCtx, tc.SchedulerConfigPath, tc.FeatureGates, outOfTreePluginRegistry)
+}
 
 // RunBenchmarkPerfScheduling runs the scheduler performance tests.
 //
@@ -821,33 +822,8 @@ func RunBenchmarkPerfScheduling(b *testing.B, outOfTreePluginRegistry frameworkr
             if !enabled(*perfSchedulingLabelFilter, append(tc.Labels, w.Labels...)...) {
                 b.Skipf("disabled by label filter %q", *perfSchedulingLabelFilter)
             }
-            tCtx := ktesting.Init(b, initoption.PerTestOutput(*useTestingLog))
-
-            // Ensure that there are no leaked
-            // goroutines. They could influence
-            // performance of the next benchmark.
-            // This must *after* RedirectKlog
-            // because then during cleanup, the
-            // test will wait for goroutines to
-            // quit *before* restoring klog settings.
-            framework.GoleakCheck(b)
-
-            // Now that we are ready to run, start
-            // etcd.
-            framework.StartEtcd(b, output)
-
-            // 30 minutes should be plenty enough even for the 5000-node tests.
-            timeout := 30 * time.Minute
-            tCtx = ktesting.WithTimeout(tCtx, timeout, fmt.Sprintf("timed out after the %s per-test timeout", timeout))
-
-            for feature, flag := range tc.FeatureGates {
-                featuregatetesting.SetFeatureGateDuringTest(b, utilfeature.DefaultFeatureGate, feature, flag)
-            }
-            informerFactory, tCtx := setupClusterForWorkload(tCtx, tc.SchedulerConfigPath, tc.FeatureGates, outOfTreePluginRegistry)
 
-            // No need to clean up, each benchmark testcase starts with an empty
-            // etcd database.
-            tCtx = withCleanup(tCtx, false)
+            informerFactory, tCtx := setupTestCase(b, tc, output, outOfTreePluginRegistry)
 
             results := runWorkload(tCtx, tc, w, informerFactory)
             dataItems.DataItems = append(dataItems.DataItems, results...)
@@ -889,16 +865,6 @@ func RunBenchmarkPerfScheduling(b *testing.B, outOfTreePluginRegistry frameworkr
 
 var testSchedulingLabelFilter = flag.String("test-scheduling-label-filter", "integration-test", "comma-separated list of labels which a testcase must have (no prefix or +) or must not have (-), used by TestScheduling")
 
-type schedulerConfig struct {
-    schedulerConfigPath string
-    featureGates        map[featuregate.Feature]bool
-}
-
-func (c schedulerConfig) equals(tc *testCase) bool {
-    return c.schedulerConfigPath == tc.SchedulerConfigPath &&
-        cmp.Equal(c.featureGates, tc.FeatureGates)
-}
-
 func loadSchedulerConfig(file string) (*config.KubeSchedulerConfiguration, error) {
     data, err := os.ReadFile(file)
     if err != nil {
@@ -997,7 +963,6 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact
             b.ReportMetric(duration.Seconds(), "runtime_seconds")
         })
     }
-    cleanup := shouldCleanup(tCtx)
 
     // Disable error checking of the sampling interval length in the
     // throughput collector by default. When running benchmarks, report
@@ -1028,11 +993,6 @@
     // All namespaces listed in numPodsScheduledPerNamespace will be cleaned up.
     numPodsScheduledPerNamespace := make(map[string]int)
 
-    if cleanup {
-        // This must run before controllers get shut down.
-        defer cleanupWorkload(tCtx, tc, numPodsScheduledPerNamespace)
-    }
-
    for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) {
        realOp, err := op.realOp.patchParams(w)
        if err != nil {
@@ -1052,13 +1012,6 @@
            if err := nodePreparer.PrepareNodes(tCtx, nextNodeIndex); err != nil {
                tCtx.Fatalf("op %d: %v", opIndex, err)
            }
-            if cleanup {
-                defer func() {
-                    if err := nodePreparer.CleanupNodes(tCtx); err != nil {
-                        tCtx.Fatalf("failed to clean up nodes, error: %v", err)
-                    }
-                }()
-            }
            nextNodeIndex += concreteOp.Count
 
        case *createNamespacesOp:
@@ -1333,51 +1286,6 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact
     return dataItems
 }
 
-// cleanupWorkload ensures that everything is removed from the API server that
-// might have been created by runWorkload. This must be done before starting
-// the next workload because otherwise it might stumble over previously created
-// objects. For example, the namespaces are the same in different workloads, so
-// not deleting them would cause the next one to fail with "cannot create
-// namespace: already exists".
-//
-// Calling cleanupWorkload can be skipped if it is known that the next workload
-// will run with a fresh etcd instance.
-func cleanupWorkload(tCtx ktesting.TContext, tc *testCase, numPodsScheduledPerNamespace map[string]int) {
-    deleteNow := *metav1.NewDeleteOptions(0)
-    for namespace := range numPodsScheduledPerNamespace {
-        // Pods have to be deleted explicitly, with no grace period. Normally
-        // kubelet will set the DeletionGracePeriodSeconds to zero when it's okay
-        // to remove a deleted pod, but we don't run kubelet...
-        if err := tCtx.Client().CoreV1().Pods(namespace).DeleteCollection(tCtx, deleteNow, metav1.ListOptions{}); err != nil {
-            tCtx.Fatalf("failed to delete pods in namespace %q: %v", namespace, err)
-        }
-        if err := tCtx.Client().CoreV1().Namespaces().Delete(tCtx, namespace, deleteNow); err != nil {
-            tCtx.Fatalf("Deleting Namespace %q in numPodsScheduledPerNamespace: %v", namespace, err)
-        }
-    }
-
-    // We need to wait here because even with deletion timestamp set,
-    // actually removing a namespace can take some time (garbage collecting
-    // other generated object like secrets, etc.) and we don't want to
-    // start the next workloads while that cleanup is still going on.
-    if err := wait.PollUntilContextTimeout(tCtx, time.Second, 5*time.Minute, false, func(ctx context.Context) (bool, error) {
-        namespaces, err := tCtx.Client().CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
-        if err != nil {
-            return false, err
-        }
-        for _, namespace := range namespaces.Items {
-            if _, ok := numPodsScheduledPerNamespace[namespace.Name]; ok {
-                // A namespace created by the workload, need to wait.
-                return false, nil
-            }
-        }
-        // All namespaces gone.
-        return true, nil
-    }); err != nil {
-        tCtx.Fatalf("failed while waiting for namespace removal: %v", err)
-    }
-}
-
 func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) {
     if _, ok := (*podsPerNamespace)[namespace]; !ok {
         // The namespace has not created yet.
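For readability, here is the new setupTestCase helper with the lines added in the second hunk above reassembled into a single block. Indentation is approximated and the comment text is rewrapped; the package, imports, and types are those of scheduler_perf.go and are not repeated here.

```go
var perfSchedulingLabelFilter = flag.String("perf-scheduling-label-filter", "performance", "comma-separated list of labels which a testcase must have (no prefix or +) or must not have (-), used by BenchmarkPerfScheduling")

func setupTestCase(t testing.TB, tc *testCase, output io.Writer, outOfTreePluginRegistry frameworkruntime.Registry) (informers.SharedInformerFactory, ktesting.TContext) {
    tCtx := ktesting.Init(t, initoption.PerTestOutput(*useTestingLog))

    // Ensure that there are no leaked goroutines. They could influence
    // performance of the next benchmark. This must *after* RedirectKlog
    // because then during cleanup, the test will wait for goroutines to
    // quit *before* restoring klog settings.
    framework.GoleakCheck(t)

    // Now that we are ready to run, start etcd.
    framework.StartEtcd(t, output)

    for feature, flag := range tc.FeatureGates {
        featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, feature, flag)
    }

    // 30 minutes should be plenty enough even for the 5000-node tests.
    timeout := 30 * time.Minute
    tCtx = ktesting.WithTimeout(tCtx, timeout, fmt.Sprintf("timed out after the %s per-test timeout", timeout))

    return setupClusterForWorkload(tCtx, tc.SchedulerConfigPath, tc.FeatureGates, outOfTreePluginRegistry)
}
```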

test/integration/scheduler_perf/scheduler_test.go

Lines changed: 9 additions & 65 deletions
@@ -18,11 +18,6 @@ package benchmark
 
 import (
     "testing"
-
-    utilfeature "k8s.io/apiserver/pkg/util/feature"
-    featuregatetesting "k8s.io/component-base/featuregate/testing"
-    "k8s.io/kubernetes/test/integration/framework"
-    "k8s.io/kubernetes/test/utils/ktesting"
 )
 
 func TestScheduling(t *testing.T) {
@@ -34,69 +29,18 @@ func TestScheduling(t *testing.T) {
         t.Fatal(err)
     }
 
-    // Check for leaks at the very end.
-    framework.GoleakCheck(t)
-
-    // All integration test cases share the same etcd, similar to
-    // https://github.com/kubernetes/kubernetes/blob/18d05b646d09b2971dc5400bc288062b0414e8cf/test/integration/framework/etcd.go#L186-L222.
-    framework.StartEtcd(t, nil)
-
-    // Workloads with the same configuration share the same apiserver. For that
-    // we first need to determine what those different configs are.
-    var configs []schedulerConfig
     for _, tc := range testCases {
-        tcEnabled := false
-        for _, w := range tc.Workloads {
-            if enabled(*testSchedulingLabelFilter, append(tc.Labels, w.Labels...)...) {
-                tcEnabled = true
-                break
-            }
-        }
-        if !tcEnabled {
-            continue
-        }
-        exists := false
-        for _, config := range configs {
-            if config.equals(tc) {
-                exists = true
-                break
-            }
-        }
-        if !exists {
-            configs = append(configs, schedulerConfig{schedulerConfigPath: tc.SchedulerConfigPath, featureGates: tc.FeatureGates})
-        }
-    }
-    for _, config := range configs {
-        // Not a sub test because we don't have a good name for it.
-        func() {
-            tCtx := ktesting.Init(t)
-
-            // No timeout here because the `go test -timeout` will ensure that
-            // the test doesn't get stuck forever.
-
-            for feature, flag := range config.featureGates {
-                featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, feature, flag)
-            }
-            informerFactory, tCtx := setupClusterForWorkload(tCtx, config.schedulerConfigPath, config.featureGates, nil)
-
-            for _, tc := range testCases {
-                if !config.equals(tc) {
-                    // Runs with some other config.
-                    continue
-                }
-
-                t.Run(tc.Name, func(t *testing.T) {
-                    for _, w := range tc.Workloads {
-                        t.Run(w.Name, func(t *testing.T) {
-                            if !enabled(*testSchedulingLabelFilter, append(tc.Labels, w.Labels...)...) {
-                                t.Skipf("disabled by label filter %q", *testSchedulingLabelFilter)
-                            }
-                            tCtx := ktesting.WithTB(tCtx, t)
-                            runWorkload(tCtx, tc, w, informerFactory)
-                        })
+        t.Run(tc.Name, func(t *testing.T) {
+            for _, w := range tc.Workloads {
+                t.Run(w.Name, func(t *testing.T) {
+                    if !enabled(*testSchedulingLabelFilter, append(tc.Labels, w.Labels...)...) {
+                        t.Skipf("disabled by label filter %q", *testSchedulingLabelFilter)
                     }
+                    informerFactory, tCtx := setupTestCase(t, tc, nil, nil)
+
+                    runWorkload(tCtx, tc, w, informerFactory)
                 })
             }
-        }()
+        })
     }
 }
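Likewise, here is the new body of the test-case loop in TestScheduling, reassembled from the added and unchanged lines of the hunk above (indentation approximated). Note that setupTestCase is called inside the innermost t.Run, so each workload that is not skipped by the label filter gets its own etcd and apiserver:

```go
    for _, tc := range testCases {
        t.Run(tc.Name, func(t *testing.T) {
            for _, w := range tc.Workloads {
                t.Run(w.Name, func(t *testing.T) {
                    if !enabled(*testSchedulingLabelFilter, append(tc.Labels, w.Labels...)...) {
                        t.Skipf("disabled by label filter %q", *testSchedulingLabelFilter)
                    }
                    informerFactory, tCtx := setupTestCase(t, tc, nil, nil)

                    runWorkload(tCtx, tc, w, informerFactory)
                })
            }
        })
    }
```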
