
Commit 916b187

Merge pull request #8910 from ykakarap/pr-scale-e2e-improvements
🌱 add improvements to scale e2e
2 parents 2b63ecf + dfb8de3 commit 916b187

4 files changed: +187 -25 lines changed

test/e2e/config/docker.yaml

Lines changed: 4 additions & 0 deletions
@@ -316,3 +316,7 @@ intervals:
   kcp-remediation/wait-machines: ["5m", "10s"]
   kcp-remediation/check-machines-stable: ["30s", "5s"]
   kcp-remediation/wait-machine-provisioned: ["5m", "10s"]
+  # Giving a bit more time during scale tests, we analyze independently if everything works quickly enough.
+  scale/wait-cluster: ["10m", "10s"]
+  scale/wait-control-plane: ["20m", "10s"]
+  scale/wait-worker-nodes: ["20m", "10s"]
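For reference, each interval pair is consumed as a (timeout, polling interval) pair by the spec's wait helpers; a minimal standalone sketch of what a value like ["20m", "10s"] amounts to (assuming the usual Gomega Eventually semantics used by the e2e framework):

package main

import (
	"fmt"
	"time"
)

// Minimal sketch of what an interval pair like ["20m", "10s"] amounts to,
// assuming the usual Eventually semantics (first value = timeout,
// second value = polling interval) used by the e2e wait helpers.
func main() {
	timeout, _ := time.ParseDuration("20m")
	poll, _ := time.ParseDuration("10s")
	fmt.Printf("wait up to %s, polling every %s (~%d checks at most)\n", timeout, poll, int(timeout/poll))
}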

test/e2e/scale.go

Lines changed: 163 additions & 23 deletions
@@ -20,10 +20,12 @@ import (
 	"bytes"
 	"context"
 	"fmt"
+	"math"
 	"os"
 	"path/filepath"
 	"strconv"
 	"sync"
+	"time"
 
 	. "github.com/onsi/ginkgo/v2"
 	"github.com/onsi/ginkgo/v2/types"
@@ -32,7 +34,9 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime"
 	kerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/klog/v2"
 	"k8s.io/utils/pointer"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
@@ -48,6 +52,7 @@ const (
 	scaleConcurrency              = "CAPI_SCALE_CONCURRENCY"
 	scaleControlPlaneMachineCount = "CAPI_SCALE_CONTROL_PLANE_MACHINE_COUNT"
 	scaleWorkerMachineCount       = "CAPI_SCALE_WORKER_MACHINE_COUNT"
+	scaleMachineDeploymentCount   = "CAPI_SCALE_MACHINE_DEPLOYMENT_COUNT"
 )
 
 // scaleSpecInput is the input for scaleSpec.
@@ -66,6 +71,7 @@ type scaleSpecInput struct {
 
 	// Flavor, if specified is the template flavor used to create the cluster for testing.
 	// If not specified, the default flavor for the selected infrastructure provider is used.
+	// The ClusterTopology of this flavor should have exactly one MachineDeployment.
 	Flavor *string
 
 	// ClusterCount is the number of target workload clusters.
@@ -83,11 +89,23 @@ type scaleSpecInput struct {
 	// Can be overridden by variable CAPI_SCALE_CONTROLPLANE_MACHINE_COUNT.
 	ControlPlaneMachineCount *int64
 
-	// WorkerMachineCount defines number of worker machines to be added to each workload cluster.
+	// WorkerMachineCount defines number of worker machines per machine deployment of the workload cluster.
 	// If not specified, 1 will be used.
 	// Can be overridden by variable CAPI_SCALE_WORKER_MACHINE_COUNT.
+	// The resulting number of worker nodes for each of the workload cluster will
+	// be MachineDeploymentCount*WorkerMachineCount (CAPI_SCALE_MACHINE_DEPLOYMENT_COUNT*CAPI_SCALE_WORKER_MACHINE_COUNT).
 	WorkerMachineCount *int64
 
+	// MachineDeploymentCount defines the number of MachineDeployments to be used per workload cluster.
+	// If not specified, 1 will be used.
+	// Can be overridden by variable CAPI_SCALE_MACHINE_DEPLOYMENT_COUNT.
+	// Note: This assumes that the cluster template of the specified flavor has exactly one machine deployment.
+	// It uses this machine deployment to create additional copies.
+	// Names of the MachineDeployments will be overridden to "md-1", "md-2", etc.
+	// The resulting number of worker nodes for each of the workload cluster will
+	// be MachineDeploymentCount*WorkerMachineCount (CAPI_SCALE_MACHINE_DEPLOYMENT_COUNT*CAPI_SCALE_WORKER_MACHINE_COUNT).
+	MachineDeploymentCount *int64
+
 	// FailFast if set to true will return immediately after the first cluster operation fails.
 	// If set to false, the test suite will not exit immediately after the first cluster operation fails.
 	// Example: When creating clusters from c1 to c20 consider c6 fails creation. If FailFast is set to true
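The worker-count arithmetic spelled out in the doc comments above, as a quick standalone check (the counts below are hypothetical, not defaults of this change):

package main

import "fmt"

// Worked example of the arithmetic described above; the counts are
// hypothetical, not defaults used by this change.
func main() {
	machineDeploymentCount := int64(5) // CAPI_SCALE_MACHINE_DEPLOYMENT_COUNT
	workerMachineCount := int64(3)     // CAPI_SCALE_WORKER_MACHINE_COUNT
	fmt.Println(machineDeploymentCount*workerMachineCount, "worker nodes per workload cluster")
}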
@@ -96,6 +114,15 @@ type scaleSpecInput struct {
 	// Note: Please note that the test suit will still fail since c6 creation failed. FailFast will determine
 	// if the test suit should fail as soon as c6 fails or if it should fail after all cluster creations are done.
 	FailFast bool
+
+	// SkipCleanup if set to true will skip deleting the workload clusters.
+	SkipCleanup bool
+
+	// SkipWaitForCreation defines if the test should wait for the workload clusters to be fully provisioned
+	// before moving on.
+	// If set to true, the test will create the workload clusters and immediately continue without waiting
+	// for the clusters to be fully provisioned.
+	SkipWaitForCreation bool
 }
 
 // scaleSpec implements a scale test.
@@ -118,7 +145,17 @@ func scaleSpec(ctx context.Context, inputGetter func() scaleSpecInput) {
 		Expect(input.E2EConfig.Variables).To(HaveKey(KubernetesVersion))
 
 		// Setup a Namespace where to host objects for this spec and create a watcher for the namespace events.
-		namespace, cancelWatches = setupSpecNamespace(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder)
+		// We are pinning the namespace for the test to help with debugging and testing.
+		// Example: Queries to look up state of the clusters can be re-used.
+		// Since we don't run multiple instances of this test concurrently on a management cluster it is okay to pin the namespace.
+		Byf("Creating a namespace for hosting the %q test spec", specName)
+		namespace, cancelWatches = framework.CreateNamespaceAndWatchEvents(ctx, framework.CreateNamespaceAndWatchEventsInput{
+			Creator:             input.BootstrapClusterProxy.GetClient(),
+			ClientSet:           input.BootstrapClusterProxy.GetClientSet(),
+			Name:                specName,
+			LogFolder:           filepath.Join(input.ArtifactFolder, "clusters", input.BootstrapClusterProxy.GetName()),
+			IgnoreAlreadyExists: true,
+		})
 	})
 
 	It("Should create and delete workload clusters", func() {
@@ -156,6 +193,18 @@ func scaleSpec(ctx context.Context, inputGetter func() scaleSpecInput) {
 			workerMachineCount = pointer.Int64(int64(workerMachineCountInt))
 		}
 
+		machineDeploymentCount := pointer.Int64(1)
+		if input.MachineDeploymentCount != nil {
+			machineDeploymentCount = input.MachineDeploymentCount
+		}
+		// If variable is defined that will take precedence.
+		if input.E2EConfig.HasVariable(scaleMachineDeploymentCount) {
+			machineDeploymentCountStr := input.E2EConfig.GetVariable(scaleMachineDeploymentCount)
+			machineDeploymentCountInt, err := strconv.Atoi(machineDeploymentCountStr)
+			Expect(err).NotTo(HaveOccurred())
+			machineDeploymentCount = pointer.Int64(int64(machineDeploymentCountInt))
+		}
+
 		clusterCount := int64(10)
 		if input.ClusterCount != nil {
 			clusterCount = *input.ClusterCount
@@ -211,33 +260,55 @@ func scaleSpec(ctx context.Context, inputGetter func() scaleSpecInput) {
 		log.Logf("Extract ClusterClass and Cluster from template YAML")
 		clusterClassYAML, baseClusterTemplateYAML := extractClusterClassAndClusterFromTemplate(baseWorkloadClusterTemplate)
 
-		// Apply the ClusterClass.
-		log.Logf("Create ClusterClass")
-		Eventually(func() error {
-			return input.BootstrapClusterProxy.Apply(ctx, clusterClassYAML, "-n", namespace.Name)
-		}).Should(Succeed())
+		// Modify the baseClusterTemplateYAML so that it has the desired number of machine deployments.
+		modifiedBaseTemplateYAML := modifyMachineDeployments(baseClusterTemplateYAML, int(*machineDeploymentCount))
+
+		if len(clusterClassYAML) > 0 {
+			// Apply the ClusterClass.
+			log.Logf("Create ClusterClass")
+			Eventually(func() error {
+				return input.BootstrapClusterProxy.Apply(ctx, clusterClassYAML)
+			}).Should(Succeed())
+		} else {
+			log.Logf("ClusterClass already exists. Skipping creation.")
+		}
 
 		By("Create workload clusters concurrently")
 		// Create multiple clusters concurrently from the same base cluster template.
 
 		clusterNames := make([]string, 0, clusterCount)
+		clusterNameDigits := 1 + int(math.Log10(float64(clusterCount)))
 		for i := int64(1); i <= clusterCount; i++ {
-			name := fmt.Sprintf("%s-%d", specName, i)
+			// This ensures we always have the right number of leading zeros in our cluster names, e.g.
+			// clusterCount=1000 will lead to cluster names like scale-0001, scale-0002, ... .
+			// This makes it possible to have correct ordering of clusters in diagrams in tools like Grafana.
+			name := fmt.Sprintf("%s-%0*d", specName, clusterNameDigits, i)
 			clusterNames = append(clusterNames, name)
 		}
 
+		// Use the correct creator function for creating the workload clusters.
+		// Default to using the "create and wait" creator function. If SkipWaitForCreation=true then
+		// use the "create only" creator function.
+		creator := getClusterCreateAndWaitFn(clusterctl.ApplyCustomClusterTemplateAndWaitInput{
+			ClusterProxy:                 input.BootstrapClusterProxy,
+			Namespace:                    namespace.Name,
+			WaitForClusterIntervals:      input.E2EConfig.GetIntervals(specName, "wait-cluster"),
+			WaitForControlPlaneIntervals: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
+			WaitForMachineDeployments:    input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
+		})
+		if input.SkipWaitForCreation {
+			if !input.SkipCleanup {
+				log.Logf("WARNING! Using SkipWaitForCreation=true while SkipCleanup=false can lead to workload clusters getting deleted before they are fully provisioned.")
+			}
+			creator = getClusterCreateFn(input.BootstrapClusterProxy, namespace.Name)
+		}
+
 		clusterCreateResults, err := workConcurrentlyAndWait(ctx, workConcurrentlyAndWaitInput{
 			ClusterNames: clusterNames,
 			Concurrency:  concurrency,
 			FailFast:     input.FailFast,
 			WorkerFunc: func(ctx context.Context, inputChan chan string, resultChan chan workResult, wg *sync.WaitGroup) {
-				createClusterAndWaitWorker(ctx, inputChan, resultChan, wg, baseClusterTemplateYAML, baseClusterName, clusterctl.ApplyCustomClusterTemplateAndWaitInput{
-					ClusterProxy:                 input.BootstrapClusterProxy,
-					Namespace:                    namespace.Name,
-					WaitForClusterIntervals:      input.E2EConfig.GetIntervals(specName, "wait-cluster"),
-					WaitForControlPlaneIntervals: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
-					WaitForMachineDeployments:    input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
-				})
+				createClusterWorker(ctx, inputChan, resultChan, wg, modifiedBaseTemplateYAML, baseClusterName, creator)
 			},
 		})
 		if err != nil {
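The padding logic introduced in the hunk above can be checked in isolation; a minimal standalone sketch:

package main

import (
	"fmt"
	"math"
)

// Standalone check of the padding used above: clusterCount=1000 gives a width
// of 4, so names sort correctly (scale-0001 ... scale-1000) in dashboards.
func main() {
	clusterCount := int64(1000)
	clusterNameDigits := 1 + int(math.Log10(float64(clusterCount)))
	fmt.Printf("%s-%0*d\n", "scale", clusterNameDigits, int64(1))     // scale-0001
	fmt.Printf("%s-%0*d\n", "scale", clusterNameDigits, clusterCount) // scale-1000
}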
@@ -259,6 +330,10 @@ func scaleSpec(ctx context.Context, inputGetter func() scaleSpecInput) {
 			clusterNamesToDelete = append(clusterNamesToDelete, result.clusterName)
 		}
 
+		if input.SkipCleanup {
+			return
+		}
+
 		By("Delete the workload clusters concurrently")
 		// Now delete all the workload clusters.
 		_, err = workConcurrentlyAndWait(ctx, workConcurrentlyAndWaitInput{
@@ -396,7 +471,42 @@ outer:
 	return results, kerrors.NewAggregate(errs)
 }
 
-func createClusterAndWaitWorker(ctx context.Context, inputChan <-chan string, resultChan chan<- workResult, wg *sync.WaitGroup, baseTemplate []byte, baseClusterName string, input clusterctl.ApplyCustomClusterTemplateAndWaitInput) {
+type clusterCreator func(ctx context.Context, clusterName string, clusterTemplateYAML []byte)
+
+func getClusterCreateAndWaitFn(input clusterctl.ApplyCustomClusterTemplateAndWaitInput) clusterCreator {
+	return func(ctx context.Context, clusterName string, clusterTemplateYAML []byte) {
+		clusterResources := &clusterctl.ApplyCustomClusterTemplateAndWaitResult{}
+		// Nb. We cannot directly modify and use `input` in this closure function because this function
+		// will be called multiple times and this closure will keep modifying the same `input` multiple
+		// times. It is safer to pass the values explicitly into `ApplyCustomClusterTemplateAndWait`.
+		clusterctl.ApplyCustomClusterTemplateAndWait(ctx, clusterctl.ApplyCustomClusterTemplateAndWaitInput{
+			ClusterProxy:                 input.ClusterProxy,
+			CustomTemplateYAML:           clusterTemplateYAML,
+			ClusterName:                  clusterName,
+			Namespace:                    input.Namespace,
+			CNIManifestPath:              input.CNIManifestPath,
+			WaitForClusterIntervals:      input.WaitForClusterIntervals,
+			WaitForControlPlaneIntervals: input.WaitForControlPlaneIntervals,
+			WaitForMachineDeployments:    input.WaitForMachineDeployments,
+			WaitForMachinePools:          input.WaitForMachinePools,
+			Args:                         input.Args,
+			PreWaitForCluster:            input.PreWaitForCluster,
+			PostMachinesProvisioned:      input.PostMachinesProvisioned,
+			ControlPlaneWaiters:          input.ControlPlaneWaiters,
+		}, clusterResources)
+	}
+}
+
+func getClusterCreateFn(clusterProxy framework.ClusterProxy, namespace string) clusterCreator {
+	return func(ctx context.Context, clusterName string, clusterTemplateYAML []byte) {
+		log.Logf("Applying the cluster template yaml of cluster %s", klog.KRef(namespace, clusterName))
+		Eventually(func() error {
+			return clusterProxy.Apply(ctx, clusterTemplateYAML)
+		}, 10*time.Second).Should(Succeed(), "Failed to apply the cluster template of cluster %s", klog.KRef(namespace, clusterName))
+	}
+}
+
+func createClusterWorker(ctx context.Context, inputChan <-chan string, resultChan chan<- workResult, wg *sync.WaitGroup, baseTemplate []byte, baseClusterName string, create clusterCreator) {
 	defer wg.Done()
 
 	for {
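The Nb. comment inside getClusterCreateAndWaitFn is the key design point: the returned closure is shared by all concurrent workers, so per-cluster values are passed as arguments instead of being written to the captured input. A toy sketch of that safe pattern (not the real clusterctl types):

package main

import (
	"fmt"
	"sync"
)

// Toy illustration of the pattern the Nb. comment describes: each worker gets
// its per-cluster name as an argument (one copy per goroutine) rather than by
// mutating a single captured input struct, which would race across workers.
func apply(name string) { fmt.Println("applying template for", name) }

func main() {
	var wg sync.WaitGroup
	for i := 1; i <= 3; i++ {
		wg.Add(1)
		go func(name string) {
			defer wg.Done()
			apply(name)
		}(fmt.Sprintf("scale-%d", i))
	}
	wg.Wait()
}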
@@ -425,13 +535,7 @@ func createClusterAndWaitWorker(ctx context.Context, inputChan <-chan string, re
 
 			// Create the cluster template YAML with the target cluster name.
 			clusterTemplateYAML := bytes.Replace(baseTemplate, []byte(baseClusterName), []byte(clusterName), -1)
-			// Nb. Input is passed as a copy therefore we can safely update the value here, and it won't affect other
-			// workers.
-			input.CustomTemplateYAML = clusterTemplateYAML
-			input.ClusterName = clusterName
-
-			clusterResources := &clusterctl.ApplyCustomClusterTemplateAndWaitResult{}
-			clusterctl.ApplyCustomClusterTemplateAndWait(ctx, input, clusterResources)
+			create(ctx, clusterName, clusterTemplateYAML)
 			return false
 		}()
@@ -495,3 +599,39 @@ type workResult struct {
 	clusterName string
 	err         any
 }
+
+func modifyMachineDeployments(baseClusterTemplateYAML []byte, count int) []byte {
+	Expect(baseClusterTemplateYAML).NotTo(BeEmpty(), "Invalid argument. baseClusterTemplateYAML cannot be empty when calling modifyMachineDeployments")
+	Expect(count).To(BeNumerically(">=", 0), "Invalid argument. count cannot be less than 0 when calling modifyMachineDeployments")
+
+	objs, err := yaml.ToUnstructured(baseClusterTemplateYAML)
+	Expect(err).NotTo(HaveOccurred())
+	Expect(objs).To(HaveLen(1), "Unexpected number of objects found in baseClusterTemplateYAML")
+
+	scheme := runtime.NewScheme()
+	framework.TryAddDefaultSchemes(scheme)
+	cluster := &clusterv1.Cluster{}
+	Expect(scheme.Convert(&objs[0], cluster, nil)).Should(Succeed())
+	// Verify the Cluster Topology.
+	Expect(cluster.Spec.Topology).NotTo(BeNil(), "Should be a ClusterClass based Cluster")
+	Expect(cluster.Spec.Topology.Workers).NotTo(BeNil(), "ClusterTopology should have exactly one MachineDeployment. Cannot be empty")
+	Expect(cluster.Spec.Topology.Workers.MachineDeployments).To(HaveLen(1), "ClusterTopology should have exactly one MachineDeployment")
+
+	baseMD := cluster.Spec.Topology.Workers.MachineDeployments[0]
+	allMDs := make([]clusterv1.MachineDeploymentTopology, count)
+	allMDDigits := 1 + int(math.Log10(float64(count)))
+	for i := 1; i <= count; i++ {
+		md := baseMD.DeepCopy()
+		// This ensures we always have the right number of leading zeros in our machine deployment names, e.g.
+		// count=1000 will lead to machine deployment names like md-0001, md-0002, so on.
+		md.Name = fmt.Sprintf("md-%0*d", allMDDigits, i)
+		allMDs[i-1] = *md
+	}
+	cluster.Spec.Topology.Workers.MachineDeployments = allMDs
+	u := &unstructured.Unstructured{}
+	Expect(scheme.Convert(cluster, u, nil)).To(Succeed())
+	modifiedClusterYAML, err := yaml.FromUnstructured([]unstructured.Unstructured{*u})
+	Expect(err).NotTo(HaveOccurred())
+
+	return modifiedClusterYAML
+}
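To make the effect of modifyMachineDeployments concrete without a full cluster template, here is a toy sketch of its copy-and-rename step (mdTopology is a stand-in struct, not the real clusterv1.MachineDeploymentTopology, and the names are hypothetical):

package main

import (
	"fmt"
	"math"
)

// Toy stand-in for the copy-and-rename performed by modifyMachineDeployments:
// the single templated MachineDeployment is duplicated count times and renamed
// md-1 ... md-N using the same zero-padding scheme as above.
type mdTopology struct {
	Name     string
	Class    string
	Replicas int32
}

func main() {
	base := mdTopology{Name: "default-worker", Class: "default-worker", Replicas: 3}
	count := 3
	digits := 1 + int(math.Log10(float64(count)))
	all := make([]mdTopology, count)
	for i := 1; i <= count; i++ {
		md := base // value copy; the real code uses DeepCopy on the API type
		md.Name = fmt.Sprintf("md-%0*d", digits, i)
		all[i-1] = md
	}
	fmt.Printf("%+v\n", all) // names come out as md-1, md-2, md-3
}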

test/e2e/scale_test.go

Lines changed: 2 additions & 0 deletions
@@ -36,7 +36,9 @@ var _ = Describe("When scale testing using in-memory provider [Scale]", func() {
 			Concurrency:              pointer.Int64(5),
 			Flavor:                   pointer.String(""),
 			ControlPlaneMachineCount: pointer.Int64(1),
+			MachineDeploymentCount:   pointer.Int64(1),
 			WorkerMachineCount:       pointer.Int64(3),
+			SkipCleanup:              skipCleanup,
 		}
 	})
 })
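For comparison, a hypothetical input for a larger run using the fields added in this change; values are illustrative only, and the required wiring (E2EConfig, BootstrapClusterProxy, ArtifactFolder, ...) is elided and would be set exactly as in the spec above:

// Hypothetical scaleSpecInput for a larger run (illustrative values only).
largeRun := scaleSpecInput{
	ClusterCount:             pointer.Int64(100),
	Concurrency:              pointer.Int64(10),
	Flavor:                   pointer.String(""),
	ControlPlaneMachineCount: pointer.Int64(1),
	MachineDeploymentCount:   pointer.Int64(5),
	WorkerMachineCount:       pointer.Int64(3),
	SkipWaitForCreation:      true, // apply the templates only, do not block on provisioning
	SkipCleanup:              true, // keep the clusters around for inspection
}
_ = largeRun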

test/framework/namespace_helpers.go

Lines changed: 18 additions & 2 deletions
@@ -43,6 +43,11 @@ import (
 type CreateNamespaceInput struct {
 	Creator Creator
 	Name    string
+
+	// IgnoreAlreadyExists if set to true will ignore the "AlreadyExists" error if the function
+	// is trying to create a namespace that already exists.
+	// If false, it will error and cause test failure.
+	IgnoreAlreadyExists bool
 }
 
 // CreateNamespace is used to create a namespace object.
@@ -61,7 +66,13 @@ func CreateNamespace(ctx context.Context, input CreateNamespaceInput, intervals
 	}
 	log.Logf("Creating namespace %s", input.Name)
 	Eventually(func() error {
-		return input.Creator.Create(ctx, ns)
+		if err := input.Creator.Create(ctx, ns); err != nil {
+			if input.IgnoreAlreadyExists && apierrors.IsAlreadyExists(err) {
+				return nil
+			}
+			return err
+		}
+		return nil
 	}, intervals...).Should(Succeed(), "Failed to create namespace %s", input.Name)
 
 	return ns
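A short sketch of the intended call pattern for the new flag (the namespace name and client wiring here are assumptions, mirroring how the scale spec pins its namespace):

// Sketch of the idempotent behavior enabled by the new field: with
// IgnoreAlreadyExists set, a re-run against the same management cluster reuses
// the pinned namespace instead of failing on the AlreadyExists error.
ns := framework.CreateNamespace(ctx, framework.CreateNamespaceInput{
	Creator:             bootstrapClusterProxy.GetClient(),
	Name:                "scale",
	IgnoreAlreadyExists: true,
}, "40s", "10s")
_ = ns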
@@ -172,6 +183,11 @@ type CreateNamespaceAndWatchEventsInput struct {
 	ClientSet *kubernetes.Clientset
 	Name      string
 	LogFolder string
+
+	// IgnoreAlreadyExists if set to true will ignore the "AlreadyExists" error if the function
+	// is trying to create a namespace that already exists.
+	// If false, it will error and cause test failure.
+	IgnoreAlreadyExists bool
 }
 
 // CreateNamespaceAndWatchEvents creates a namespace and setups a watch for the namespace events.
@@ -182,7 +198,7 @@ func CreateNamespaceAndWatchEvents(ctx context.Context, input CreateNamespaceAnd
 	Expect(input.Name).ToNot(BeEmpty(), "Invalid argument. input.Name can't be empty when calling ClientSet")
 	Expect(os.MkdirAll(input.LogFolder, 0750)).To(Succeed(), "Invalid argument. input.LogFolder can't be created in CreateNamespaceAndWatchEvents")
 
-	namespace := CreateNamespace(ctx, CreateNamespaceInput{Creator: input.Creator, Name: input.Name}, "40s", "10s")
+	namespace := CreateNamespace(ctx, CreateNamespaceInput{Creator: input.Creator, Name: input.Name, IgnoreAlreadyExists: input.IgnoreAlreadyExists}, "40s", "10s")
 	Expect(namespace).ToNot(BeNil(), "Failed to create namespace %q", input.Name)
 
 	log.Logf("Creating event watcher for namespace %q", input.Name)
