diff --git a/clusterloader2/go.mod b/clusterloader2/go.mod index aa0fed00b9..9aafe51b2a 100644 --- a/clusterloader2/go.mod +++ b/clusterloader2/go.mod @@ -36,6 +36,9 @@ replace ( require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.1 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.10.1 + github.com/aws/aws-sdk-go-v2 v1.36.3 + github.com/aws/aws-sdk-go-v2/config v1.29.14 + github.com/aws/aws-sdk-go-v2/service/autoscaling v1.53.0 github.com/go-errors/errors v1.5.1 github.com/google/go-cmp v0.7.0 github.com/google/safetext v0.0.0-20230106111101-7156a760e523 @@ -66,6 +69,17 @@ require ( cloud.google.com/go/compute/metadata v0.3.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1 // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 // indirect + github.com/aws/aws-sdk-go-v2/credentials v1.17.67 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 // indirect + github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.33.19 // indirect + github.com/aws/smithy-go v1.22.2 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect diff --git a/clusterloader2/go.sum b/clusterloader2/go.sum index d500ce7934..6a51a4be41 100644 --- a/clusterloader2/go.sum +++ b/clusterloader2/go.sum @@ -111,6 +111,34 @@ github.com/aws/aws-sdk-go v1.27.0/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN github.com/aws/aws-sdk-go v1.34.28/go.mod 
h1:H7NKnBqNVzoTJpGfLrQkkD+ytBA93eiDYi/+8rV9s48= github.com/aws/aws-sdk-go v1.38.3/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= github.com/aws/aws-sdk-go-v2 v0.18.0/go.mod h1:JWVYvqSMppoMJC0x5wdwiImzgXTI9FuZwxzkQq9wy+g= +github.com/aws/aws-sdk-go-v2 v1.36.3 h1:mJoei2CxPutQVxaATCzDUjcZEjVRdpsiiXi2o38yqWM= +github.com/aws/aws-sdk-go-v2 v1.36.3/go.mod h1:LLXuLpgzEbD766Z5ECcRmi8AzSwfZItDtmABVkRLGzg= +github.com/aws/aws-sdk-go-v2/config v1.29.14 h1:f+eEi/2cKCg9pqKBoAIwRGzVb70MRKqWX4dg1BDcSJM= +github.com/aws/aws-sdk-go-v2/config v1.29.14/go.mod h1:wVPHWcIFv3WO89w0rE10gzf17ZYy+UVS1Geq8Iei34g= +github.com/aws/aws-sdk-go-v2/credentials v1.17.67 h1:9KxtdcIA/5xPNQyZRgUSpYOE6j9Bc4+D7nZua0KGYOM= +github.com/aws/aws-sdk-go-v2/credentials v1.17.67/go.mod h1:p3C44m+cfnbv763s52gCqrjaqyPikj9Sg47kUVaNZQQ= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 h1:x793wxmUWVDhshP8WW2mlnXuFrO4cOd3HLBroh1paFw= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30/go.mod h1:Jpne2tDnYiFascUEs2AWHJL9Yp7A5ZVy3TNyxaAjD6M= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 h1:ZK5jHhnrioRkUNOc+hOgQKlUL5JeC3S6JgLxtQ+Rm0Q= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34/go.mod h1:p4VfIceZokChbA9FzMbRGz5OV+lekcVtHlPKEO0gSZY= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 h1:SZwFm17ZUNNg5Np0ioo/gq8Mn6u9w19Mri8DnJ15Jf0= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34/go.mod h1:dFZsC0BLo346mvKQLWmoJxT+Sjp+qcVR1tRVHQGOH9Q= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo= +github.com/aws/aws-sdk-go-v2/service/autoscaling v1.53.0 h1:uYhWKm7FhOKF5chyd2QSVXWqchI+ikht+aIkDJUIg9U= +github.com/aws/aws-sdk-go-v2/service/autoscaling v1.53.0/go.mod h1:CDqMoc3KRdZJ8qziW96J35lKH01Wq3B2aihtHj2JbRs= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 
h1:eAh2A4b5IzM/lum78bZ590jy36+d/aFLgKF/4Vd1xPE= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3/go.mod h1:0yKJC/kb8sAnmlYa6Zs3QVYqaC8ug2AbnNChv5Ox3uA= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 h1:dM9/92u2F1JbDaGooxTq18wmmFzbJRfXfVfy96/1CXM= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15/go.mod h1:SwFBy2vjtA0vZbjjaFtfN045boopadnoVPhu4Fv66vY= +github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 h1:1Gw+9ajCV1jogloEv1RRnvfRFia2cL6c9cuKV2Ps+G8= +github.com/aws/aws-sdk-go-v2/service/sso v1.25.3/go.mod h1:qs4a9T5EMLl/Cajiw2TcbNt2UNo/Hqlyp+GiuG4CFDI= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 h1:hXmVKytPfTy5axZ+fYbR5d0cFmC3JvwLm5kM83luako= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1/go.mod h1:MlYRNmYu/fGPoxBQVvBYr9nyr948aY/WLUvwBMBJubs= +github.com/aws/aws-sdk-go-v2/service/sts v1.33.19 h1:1XuUZ8mYJw9B6lzAkXhqHlJd/XvaX32evhproijJEZY= +github.com/aws/aws-sdk-go-v2/service/sts v1.33.19/go.mod h1:cQnB8CUnxbMU82JvlqjKR2HBOm3fe9pWorWBza6MBJ4= +github.com/aws/smithy-go v1.22.2 h1:6D9hW43xKFrRx/tXXfAlIZc4JI+yQe6snnWcQyxSyLQ= +github.com/aws/smithy-go v1.22.2/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= diff --git a/clusterloader2/pkg/measurement/common/scale_nodes.go b/clusterloader2/pkg/measurement/common/scale_nodes.go new file mode 100644 index 0000000000..d3ad887d21 --- /dev/null +++ b/clusterloader2/pkg/measurement/common/scale_nodes.go @@ -0,0 +1,142 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package common + +import ( + "context" + "errors" + "fmt" + "time" + + "k8s.io/klog/v2" + "k8s.io/perf-tests/clusterloader2/pkg/measurement" + measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util" + "k8s.io/perf-tests/clusterloader2/pkg/provider" + "k8s.io/perf-tests/clusterloader2/pkg/util" +) + +const ( + scaleNodesMeasurementName = "ScaleNodes" + defaultScalingTimeout = 30 * time.Minute + nodeCountCheckInterval = 30 * time.Second +) + +type scaleNodesMeasurement struct{} + +func init() { + if err := measurement.Register(scaleNodesMeasurementName, createScaleNodesMeasurement); err != nil { + klog.Fatalf("Cannot register %s: %v", scaleNodesMeasurementName, err) + } +} + +func createScaleNodesMeasurement() measurement.Measurement { + return &scaleNodesMeasurement{} +} + +// Execute performs the node scaling operation with the specified parameters +func (n *scaleNodesMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { + // Get parameters from config.Params + providerName, err := util.GetString(config.Params, "provider") + if err != nil { + return nil, err + } + region, err := util.GetString(config.Params, "region") + if err != nil { + return nil, err + } + clusterName, err := util.GetString(config.Params, "clusterName") + if err != nil { + return nil, err + } + batchSize, err := util.GetInt(config.Params, "batchSize") + if err != nil { + return nil, err + } + intervalSeconds, err := util.GetInt(config.Params, "intervalSeconds") + if err != nil { + return nil, err + } + targetNodeCount, err := util.GetInt(config.Params, 
"targetNodeCount") + if err != nil { + return nil, err + } + + // Get timeout with default value if not specified + timeout, err := util.GetDurationOrDefault(config.Params, "timeout", defaultScalingTimeout) + if err != nil { + return nil, err + } + + // Initialize provider specific scaler + scaler, err := provider.CreateNodeScaler(providerName, region, clusterName) + if err != nil { + return nil, fmt.Errorf("failed to create node scaler: %v", err) + } + + // Start scaling operation + klog.Infof("Starting node scaling: target=%d, batchSize=%d/interval, interval=%ds, timeout=%v", + targetNodeCount, batchSize, intervalSeconds, timeout) + + // Start the scaling operation in a goroutine + errCh := make(chan error) + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + go func() { + errCh <- scaler.ScaleNodes(ctx, batchSize, intervalSeconds, targetNodeCount) + }() + + // Create stop channel for timeout + stopCh := make(chan struct{}) + time.AfterFunc(timeout, func() { + close(stopCh) + }) + + // Set up options for waiting on nodes + options := &measurementutil.WaitForNodeOptions{ + Selector: util.NewObjectSelector(), + MinDesiredNodeCount: targetNodeCount, + MaxDesiredNodeCount: targetNodeCount, + CallerName: n.String(), + WaitForNodesInterval: nodeCountCheckInterval, + } + + // Wait for either the scaling operation to fail or nodes to be ready + select { + case err := <-errCh: + if err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return nil, fmt.Errorf("scaling operation timed out after %v", timeout) + } + return nil, fmt.Errorf("failed to scale nodes: %v", err) + } + // Scaling operation completed, now wait for nodes to be ready + if err := measurementutil.WaitForNodes(config.ClusterFramework.GetClientSets().GetClient(), stopCh, options); err != nil { + return nil, err + } + return nil, nil + case <-stopCh: + return nil, fmt.Errorf("timeout while waiting for scaling operation to complete after %v", timeout) + } +} + +// 
Dispose cleans up after the measurement. +func (*scaleNodesMeasurement) Dispose() {} + +// String returns string representation of this measurement. +func (*scaleNodesMeasurement) String() string { + return scaleNodesMeasurementName +} diff --git a/clusterloader2/pkg/provider/node_scaling.go b/clusterloader2/pkg/provider/node_scaling.go new file mode 100644 index 0000000000..2b84754abe --- /dev/null +++ b/clusterloader2/pkg/provider/node_scaling.go @@ -0,0 +1,34 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package provider + +import ( + "fmt" + + "k8s.io/perf-tests/clusterloader2/pkg/provider/scalers" + "k8s.io/perf-tests/clusterloader2/pkg/provider/scalers/aws" +) + +// CreateNodeScaler creates a NodeScaler for the given provider +func CreateNodeScaler(providerName, region, clusterName string) (scalers.NodeScaler, error) { + switch providerName { + case "aws": + return aws.NewNodeScaler(region, clusterName) + default: + return nil, fmt.Errorf("unsupported provider for node scaling: %s", providerName) + } +} diff --git a/clusterloader2/pkg/provider/scalers/aws/node_scaler.go b/clusterloader2/pkg/provider/scalers/aws/node_scaler.go new file mode 100644 index 0000000000..507c45a39e --- /dev/null +++ b/clusterloader2/pkg/provider/scalers/aws/node_scaler.go @@ -0,0 +1,254 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package aws
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"time"
+
+	awssdk "github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/config"
+	"github.com/aws/aws-sdk-go-v2/service/autoscaling"
+	"github.com/aws/aws-sdk-go-v2/service/autoscaling/types"
+	"k8s.io/klog/v2"
+	"k8s.io/perf-tests/clusterloader2/pkg/provider/scalers"
+)
+
+const (
+	// clusterTagPrefix is the prefix of the tag key that marks an ASG as
+	// belonging to a cluster; the cluster name is appended to it.
+	clusterTagPrefix = "kubernetes.io/cluster/"
+	// asgPollInterval is how often waitForASGCapacity re-describes the ASG.
+	asgPollInterval = 5 * time.Second
+)
+
+// nodeScaler scales the Auto Scaling Groups tagged for one cluster.
+type nodeScaler struct {
+	region      string
+	clusterName string
+	asgClient   *autoscaling.Client
+}
+
+// NewNodeScaler creates a new AWS specific NodeScaler implementation.
+// It loads the default AWS SDK configuration for region and returns an error
+// when clusterName is empty or the configuration cannot be loaded.
+func NewNodeScaler(region, clusterName string) (scalers.NodeScaler, error) {
+	// An empty name can never match a cluster tag, so fail early and clearly.
+	if clusterName == "" {
+		return nil, fmt.Errorf("cluster name must not be empty")
+	}
+
+	cfg, err := config.LoadDefaultConfig(context.Background(),
+		config.WithRegion(region),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("unable to load AWS SDK config: %w", err)
+	}
+
+	return &nodeScaler{
+		region:      region,
+		clusterName: clusterName,
+		asgClient:   autoscaling.NewFromConfig(cfg),
+	}, nil
+}
+
+// ScaleNodes moves the summed desired capacity of the cluster's ASGs to
+// targetNodes, changing at most batchSize nodes per interval of
+// intervalSeconds seconds.
+//
+// Each iteration re-reads the ASGs, picks the smallest one (scale out) or the
+// largest one (scale in), updates its desired, minimum and maximum size to the
+// new value, and waits for the ASG to reach it. It returns nil once the summed
+// desired capacity equals targetNodes. It returns an error when an argument is
+// out of range, no ASG is tagged for the cluster, an AWS call fails, a batch
+// does not complete within the interval, or ctx is done. Errors are wrapped
+// with %w so callers can test for context errors with errors.Is.
+func (s *nodeScaler) ScaleNodes(ctx context.Context, batchSize, intervalSeconds, targetNodes int) error {
+	// A non-positive batch would never converge, and values beyond int32
+	// would silently wrap in the conversions below.
+	if batchSize <= 0 || batchSize > math.MaxInt32 {
+		return fmt.Errorf("batchSize must be in [1, %d], got %d", math.MaxInt32, batchSize)
+	}
+	if intervalSeconds <= 0 {
+		return fmt.Errorf("intervalSeconds must be positive, got %d", intervalSeconds)
+	}
+	if targetNodes < 0 || targetNodes > math.MaxInt32 {
+		return fmt.Errorf("targetNodes must be in [0, %d], got %d", math.MaxInt32, targetNodes)
+	}
+	batch, target := int32(batchSize), int32(targetNodes)
+	interval := time.Duration(intervalSeconds) * time.Second
+
+	for {
+		// Get fresh ASG information on every iteration.
+		asgs, err := s.getClusterASGs(ctx)
+		if err != nil {
+			return fmt.Errorf("failed to refresh ASG information: %w", err)
+		}
+		if len(asgs) == 0 {
+			return fmt.Errorf("no ASGs found for cluster %s", s.clusterName)
+		}
+
+		// Calculate total current nodes; a missing capacity counts as zero.
+		var total int32
+		for i := range asgs {
+			total += awssdk.ToInt32(asgs[i].DesiredCapacity)
+		}
+		klog.Infof("Current total nodes: %d, target nodes: %d", total, targetNodes)
+		if total == target {
+			klog.Infof("Target node count reached: current (%d) == target (%d)", total, targetNodes)
+			return nil
+		}
+
+		start := time.Now()
+		scaleOut := total < target
+		selected := pickASG(asgs, scaleOut)
+		asgName := awssdk.ToString(selected.AutoScalingGroupName)
+		newSize, delta := nextASGSize(awssdk.ToInt32(selected.DesiredCapacity), total, target, batch)
+		if scaleOut {
+			klog.Infof("Scaling OUT ASG %s to %d nodes (adding %d nodes)", asgName, newSize, delta)
+		} else {
+			klog.Infof("Scaling IN ASG %s to %d nodes (removing %d nodes)", asgName, newSize, delta)
+		}
+
+		input := &autoscaling.UpdateAutoScalingGroupInput{
+			AutoScalingGroupName: selected.AutoScalingGroupName,
+			DesiredCapacity:      awssdk.Int32(newSize),
+			MinSize:              awssdk.Int32(newSize),
+			MaxSize:              awssdk.Int32(newSize),
+		}
+		if _, err := s.asgClient.UpdateAutoScalingGroup(ctx, input); err != nil {
+			return fmt.Errorf("failed to update ASG %s capacity: %w", asgName, err)
+		}
+
+		// Wait for ASG to reach desired capacity with interval as timeout
+		if err := s.waitForASGCapacity(ctx, asgName, newSize, interval); err != nil {
+			return fmt.Errorf("failed to scale at requested batch size: %w", err)
+		}
+
+		elapsed := time.Since(start)
+		if elapsed > interval {
+			return fmt.Errorf("scaling operation took %v which exceeds the interval of %v - cannot maintain requested batch size of %d nodes per %d seconds",
+				elapsed, interval, batchSize, intervalSeconds)
+		}
+
+		// The batch just applied was the last one: skip the pause and let the
+		// next iteration confirm the target against fresh ASG data.
+		if (scaleOut && total+delta == target) || (!scaleOut && total-delta == target) {
+			continue
+		}
+
+		// Wait for the remainder of the interval before next scaling operation
+		if remaining := interval - elapsed; remaining > 0 {
+			klog.Infof("Scaling operation completed in %v. Waiting %v before next operation.",
+				elapsed, remaining)
+			if err := sleepWithContext(ctx, remaining); err != nil {
+				return fmt.Errorf("interrupted while waiting for the next scaling interval: %w", err)
+			}
+		}
+	}
+}
+
+// pickASG returns a pointer to the element of asgs with the smallest desired
+// capacity when scaleOut is true, or the largest otherwise. Ties go to the
+// earliest element. It returns nil for an empty slice.
+func pickASG(asgs []types.AutoScalingGroup, scaleOut bool) *types.AutoScalingGroup {
+	if len(asgs) == 0 {
+		return nil
+	}
+	best := 0
+	bestSize := awssdk.ToInt32(asgs[0].DesiredCapacity)
+	for i := 1; i < len(asgs); i++ {
+		size := awssdk.ToInt32(asgs[i].DesiredCapacity)
+		if (scaleOut && size < bestSize) || (!scaleOut && size > bestSize) {
+			best, bestSize = i, size
+		}
+	}
+	// Index into the slice instead of taking the address of a range variable.
+	return &asgs[best]
+}
+
+// nextASGSize computes the new size of an ASG currently at current nodes,
+// given the cluster-wide total and target, and the number of nodes changed.
+// The change never exceeds batch or the distance to target, and a scale-in
+// never takes the ASG below zero.
+func nextASGSize(current, total, target, batch int32) (newSize, delta int32) {
+	delta = batch
+	if total < target {
+		if remaining := target - total; delta > remaining {
+			delta = remaining
+		}
+		return current + delta, delta
+	}
+	if remaining := total - target; delta > remaining {
+		delta = remaining
+	}
+	if delta > current {
+		// Don't go below zero
+		delta = current
+	}
+	return current - delta, delta
+}
+
+// sleepWithContext blocks for d, or until ctx is done, in which case it
+// returns ctx.Err().
+func sleepWithContext(ctx context.Context, d time.Duration) error {
+	timer := time.NewTimer(d)
+	defer timer.Stop()
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-timer.C:
+		return nil
+	}
+}
+
+// waitForASGCapacity polls the ASG every asgPollInterval until its number of
+// InService instances equals desiredCapacity.
+//
+// The count must match exactly, so a scale-in waits for the surplus instances
+// to leave InService instead of returning on the first poll. The wait ends
+// with an error when timeout elapses, ctx is done, the ASG cannot be described
+// or the ASG no longer exists. Only the ctx case wraps a context error; the
+// per-call timeout is reported as a plain error so callers do not mistake it
+// for their own deadline.
+func (s *nodeScaler) waitForASGCapacity(ctx context.Context, asgName string, desiredCapacity int32, timeout time.Duration) error {
+	klog.Infof("Waiting for ASG %s to reach capacity %d within interval of %v", asgName, desiredCapacity, timeout)
+
+	startTime := time.Now()
+	waitCtx, cancel := context.WithTimeout(ctx, timeout)
+	defer cancel()
+
+	ticker := time.NewTicker(asgPollInterval)
+	defer ticker.Stop()
+
+	input := &autoscaling.DescribeAutoScalingGroupsInput{
+		AutoScalingGroupNames: []string{asgName},
+	}
+
+	// stopErr explains why waitCtx ended: the caller's ctx or the local timeout.
+	stopErr := func() error {
+		if err := ctx.Err(); err != nil {
+			return fmt.Errorf("context cancelled or timed out after %v while waiting for ASG to reach capacity: %w", time.Since(startTime), err)
+		}
+		return fmt.Errorf("ASG %s did not reach capacity %d within %v", asgName, desiredCapacity, timeout)
+	}
+
+	for {
+		select {
+		case <-waitCtx.Done():
+			return stopErr()
+		case <-ticker.C:
+			output, err := s.asgClient.DescribeAutoScalingGroups(waitCtx, input)
+			if err != nil {
+				// A call aborted by the wait deadline is a timeout, not an API failure.
+				if waitCtx.Err() != nil {
+					return stopErr()
+				}
+				return fmt.Errorf("failed to describe ASG %s: %w", asgName, err)
+			}
+			if len(output.AutoScalingGroups) == 0 {
+				return fmt.Errorf("ASG %s not found", asgName)
+			}
+
+			// Count instances that are InService
+			var inServiceCount int32
+			for _, instance := range output.AutoScalingGroups[0].Instances {
+				if instance.LifecycleState == types.LifecycleStateInService {
+					inServiceCount++
+				}
+			}
+
+			waitTime := time.Since(startTime)
+			klog.V(2).Infof("ASG %s - Current InService: %d, Desired: %d (waiting for %v)",
+				asgName, inServiceCount, desiredCapacity, waitTime)
+
+			if inServiceCount == desiredCapacity {
+				klog.Infof("ASG %s reached desired capacity of %d after %v",
+					asgName, desiredCapacity, waitTime)
+				return nil
+			}
+		}
+	}
+}
+
+// getClusterASGs lists every Auto Scaling Group visible to the client and
+// returns those carrying the tag key "kubernetes.io/cluster/{cluster-name}".
+// The result is empty, not an error, when no group is tagged for the cluster.
+func (s *nodeScaler) getClusterASGs(ctx context.Context) ([]types.AutoScalingGroup, error) {
+	clusterTagKey := clusterTagPrefix + s.clusterName
+	var clusterASGs []types.AutoScalingGroup
+
+	paginator := autoscaling.NewDescribeAutoScalingGroupsPaginator(s.asgClient, &autoscaling.DescribeAutoScalingGroupsInput{})
+	for paginator.HasMorePages() {
+		output, err := paginator.NextPage(ctx)
+		if err != nil {
+			return nil, fmt.Errorf("failed to get ASGs: %w", err)
+		}
+		for i := range output.AutoScalingGroups {
+			if hasTagKey(output.AutoScalingGroups[i].Tags, clusterTagKey) {
+				clusterASGs = append(clusterASGs, output.AutoScalingGroups[i])
+			}
+		}
+	}
+
+	return clusterASGs, nil
+}
+
+// hasTagKey reports whether any tag in tags has the given key. Tags without a
+// key are skipped.
+func hasTagKey(tags []types.TagDescription, key string) bool {
+	for i := range tags {
+		if awssdk.ToString(tags[i].Key) == key {
+			return true
+		}
+	}
+	return false
+}
diff --git a/clusterloader2/pkg/provider/scalers/types.go b/clusterloader2/pkg/provider/scalers/types.go
new file mode 100644
index 0000000000..7f9e583837
--- /dev/null
+++ b/clusterloader2/pkg/provider/scalers/types.go
@@ -0,0 +1,28 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scalers
+
+import "context"
+
+// NodeScaler scales the number of nodes in a cluster at a controlled rate.
+type NodeScaler interface {
+	// ScaleNodes grows or shrinks the cluster until it has targetNodes nodes.
+	// batchSize: number of nodes to add or remove per interval
+	// intervalSeconds: length, in seconds, of each scaling interval
+	// targetNodes: desired final number of nodes
+	ScaleNodes(ctx context.Context, batchSize, intervalSeconds, targetNodes int) error
+}
diff --git a/clusterloader2/testing/node-scaling/config.yaml b/clusterloader2/testing/node-scaling/config.yaml
new file mode 100644
index 0000000000..acb76ea37e
--- /dev/null
+++ b/clusterloader2/testing/node-scaling/config.yaml
@@ -0,0 +1,49 @@
+# Node scaling test configuration
+# This test scales the cluster's nodes at a configured, rate-limited pace
+
+{{$NODE_SCALE_RATE := DefaultParam .CL2_NODE_SCALE_RATE 20}}
+{{$NODE_SCALE_INTERVAL := DefaultParam .CL2_NODE_SCALE_INTERVAL 60}}
+{{$TARGET_NODE_COUNT := DefaultParam .CL2_TARGET_NODE_COUNT 100}}
+{{$POD_STARTUP_LATENCY_THRESHOLD := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "5s"}}
+{{$PROVIDER_NAME := DefaultParam .CL2_PROVIDER_NAME "aws"}}
+{{$PROVIDER_REGION := DefaultParam .CL2_PROVIDER_REGION ""}}
+{{$CLUSTER_NAME := DefaultParam .CL2_CLUSTER_NAME ""}}
+
+name: node-scaling-test
+namespace:
+  number: 1
+
+steps:
+- name: Starting measurements
+  measurements:
+  - Identifier: APIResponsivenessPrometheus
+    Method: APIResponsivenessPrometheus
+    Params:
+      action: start
+  - Identifier: PodStartupLatency
+    Method: PodStartupLatency
+    Params:
+      action: start
+      labelSelector: group = latency
+      threshold: {{$POD_STARTUP_LATENCY_THRESHOLD}}
+
+- module:
+    path: modules/node-scaling.yaml
+    params:
+      CL2_NODE_SCALE_RATE: {{$NODE_SCALE_RATE}}
+      CL2_NODE_SCALE_INTERVAL: {{$NODE_SCALE_INTERVAL}}
+      CL2_TARGET_NODE_COUNT: {{$TARGET_NODE_COUNT}}
+      CL2_PROVIDER_NAME: {{$PROVIDER_NAME}}
+      CL2_PROVIDER_REGION: {{$PROVIDER_REGION}}
+
CL2_CLUSTER_NAME: {{$CLUSTER_NAME}} + +- name: Gathering measurements + measurements: + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: gather + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather \ No newline at end of file diff --git a/clusterloader2/testing/node-scaling/modules/node-scaling.yaml b/clusterloader2/testing/node-scaling/modules/node-scaling.yaml new file mode 100644 index 0000000000..3c0745ba27 --- /dev/null +++ b/clusterloader2/testing/node-scaling/modules/node-scaling.yaml @@ -0,0 +1,15 @@ +## Node scaling module for ClusterLoader2 + +steps: +- name: "Scale cluster nodes" + measurements: + - Identifier: ScaleNodes + Method: ScaleNodes + Params: + provider: {{.CL2_PROVIDER_NAME}} + region: {{.CL2_PROVIDER_REGION}} + clusterName: {{.CL2_CLUSTER_NAME}} + batchSize: {{.CL2_NODE_SCALE_RATE}} + intervalSeconds: {{.CL2_NODE_SCALE_INTERVAL}} + targetNodeCount: {{.CL2_TARGET_NODE_COUNT}} + timeout: {{AddInt (MultiplyInt .CL2_NODE_SCALE_INTERVAL (DivideInt .CL2_TARGET_NODE_COUNT .CL2_NODE_SCALE_RATE)) 10}}m \ No newline at end of file