Skip to content

Commit 274aac0

Browse files
hanwen-cluster and hanwen-pcluste
authored and committed
[integ-tests] Improve ParallelCluster cfn custom resource integration tests
1. Extract the cluster configuration from `cluster_custom_resource` into individual pcluster.config.yaml files. As a result, the tests are more similar to (and as extensible as) other integration tests, and the cluster configuration can be easily modified for each test.
2. Let test_cluster_create and test_cluster_update use 50 queues. This tests most potential limits (e.g. the EventBridge size limit).
3. Simplify test_cluster_create_invalid and test_cluster_update_invalid. This saves cost and time in the integration tests.
4. Reorder some fixtures, add a wait_for_rollback, and add resource cleanup to fix sporadic failures and leftover resources.

Signed-off-by: Hanwen <[email protected]>
1 parent 4a0d7d6 commit 274aac0

File tree

16 files changed

+405
-156
lines changed

16 files changed

+405
-156
lines changed

tests/integration-tests/cfn_stacks_factory.py

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -236,7 +236,16 @@ def delete_stack(self, name, region):
236236
wait_fixed=5000,
237237
retry_on_exception=lambda exception: isinstance(exception, ClientError),
238238
)
239-
def update_stack(self, name, region, parameters, stack_is_under_test=False, tags=None, wait_for_rollback=False):
239+
def update_stack(
240+
self,
241+
name,
242+
region,
243+
parameters,
244+
stack_is_under_test=False,
245+
tags=None,
246+
template_body=None,
247+
wait_for_rollback=False,
248+
):
240249
"""Update a created cfn stack."""
241250
with aws_credential_provider(region, self.__credentials):
242251
internal_id = self.__get_stack_internal_id(name, region)
@@ -245,12 +254,11 @@ def update_stack(self, name, region, parameters, stack_is_under_test=False, tags
245254
try:
246255
stack = self.__created_stacks[internal_id]
247256
cfn_client = boto3.client("cloudformation", region_name=stack.region)
257+
template_args = {"TemplateBody": template_body} if template_body else {"UsePreviousTemplate": True}
248258
if tags is not None:
249-
cfn_client.update_stack(
250-
StackName=stack.name, UsePreviousTemplate=True, Parameters=parameters, Tags=tags
251-
)
259+
cfn_client.update_stack(StackName=stack.name, Parameters=parameters, Tags=tags, **template_args)
252260
else:
253-
cfn_client.update_stack(StackName=stack.name, UsePreviousTemplate=True, Parameters=parameters)
261+
cfn_client.update_stack(StackName=stack.name, Parameters=parameters, **template_args)
254262

255263
if wait_for_rollback:
256264
final_status = self.__wait_for_stack_update_rollback(stack.cfn_stack_id, cfn_client)
@@ -322,11 +330,11 @@ def __wait_for_stack_update(self, name, cfn_client):
322330
return self.__get_stack_status(name, cfn_client)
323331

324332
@retry(
325-
stop_max_attempt_number=15,
333+
stop_max_attempt_number=30,
326334
retry_on_result=lambda result: result == "UPDATE_ROLLBACK_IN_PROGRESS"
327335
or result == "UPDATE_IN_PROGRESS"
328336
or result == "UPDATE_FAILED",
329-
wait_fixed=5000,
337+
wait_fixed=10000,
330338
retry_on_exception=lambda exception: isinstance(exception, ClientError) and "Rate exceeded" in str(exception),
331339
)
332340
def __wait_for_stack_update_rollback(self, name, cfn_client):

tests/integration-tests/configs/develop.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -105,31 +105,39 @@ test-suites:
105105
dimensions:
106106
- oss: ["alinux2"]
107107
regions: ["us-east-2"]
108+
instances: {{ common.INSTANCES_DEFAULT_X86 }}
108109
test_cluster_custom_resource.py::test_cluster_create_invalid:
109110
dimensions:
110111
- oss: ["alinux2"]
111112
regions: ["us-east-2"]
113+
instances: {{ common.INSTANCES_DEFAULT_X86 }}
112114
test_cluster_custom_resource.py::test_cluster_update:
113115
dimensions:
114116
- oss: ["alinux2"]
115117
regions: ["us-east-2"]
118+
instances: {{ common.INSTANCES_DEFAULT_X86 }}
116119
test_cluster_custom_resource.py::test_cluster_update_invalid:
117120
dimensions:
118121
- oss: ["alinux2"]
119122
regions: ["us-east-2"]
123+
instances: {{ common.INSTANCES_DEFAULT_X86 }}
120124
test_cluster_custom_resource.py::test_cluster_update_tag_propagation:
121125
dimensions:
122126
- oss: [ "alinux2" ]
123127
regions: ["us-east-2"]
128+
instances: {{ common.INSTANCES_DEFAULT_X86 }}
124129
test_cluster_custom_resource.py::test_cluster_delete_out_of_band:
125130
dimensions:
126131
- oss: ["alinux2"]
127132
regions: ["us-east-2"]
133+
instances: {{ common.INSTANCES_DEFAULT_X86 }}
128134
test_cluster_custom_resource.py::test_cluster_delete_retain:
129135
dimensions:
130136
- oss: ["alinux2"]
131137
regions: ["us-east-2"]
138+
instances: {{ common.INSTANCES_DEFAULT_X86 }}
132139
test_cluster_custom_resource.py::test_cluster_create_with_custom_policies:
133140
dimensions:
134141
- oss: ["alinux2"]
135142
regions: ["us-east-2"]
143+
instances: {{ common.INSTANCES_DEFAULT_X86 }}

tests/integration-tests/resources/cluster_custom_resource.yaml

Lines changed: 0 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -5,89 +5,16 @@ Parameters:
55
ClusterName:
66
Description: Name of cluster. Note this must be different than the stack name.
77
Type: String
8-
HeadNodeSubnet:
9-
Description: Subnet for the HeadNode
10-
Type: String
11-
ComputeNodeSubnet:
12-
Description: Subnet for the ComputeNode
13-
Type: String
14-
ComputeInstanceMax:
15-
Description: Maximum number of compute instances
16-
Type: Number
17-
Default: 16
188
ServiceToken:
199
Description: ARN of Lambda Function backing the Cluster Resource
2010
Type: String
21-
Os:
22-
Description: Operating system for nodes
23-
Type: String
24-
Default: 'alinux2'
25-
OnNodeConfigured:
26-
Description: Script to run on HeadNode configured
27-
Type: String
28-
Default: ''
29-
CustomBucketAccess:
30-
Description: Name of a bucket to provide access to on the HeadNode
31-
Type: String
32-
Default: ''
33-
DeletionPolicy:
34-
Type: String
35-
Default: Delete
36-
AllowedValues:
37-
- Delete
38-
- Retain
39-
Description: Enter Retain or Delete to define the operation when the stack is deleted. Default is to Delete.
40-
41-
Conditions:
42-
OnNodeConfiguredCondition: !Not [!Equals [!Ref OnNodeConfigured, '']]
43-
CustomBucketCondition: !Not [!Equals [!Ref CustomBucketAccess, '']]
4411

4512
Resources:
4613
PclusterCluster:
4714
Type: Custom::PclusterCluster
4815
Properties:
4916
ServiceToken: !Ref ServiceToken
50-
DeletionPolicy: !Ref DeletionPolicy
5117
ClusterName: !Ref ClusterName
52-
ClusterConfiguration:
53-
Imds:
54-
ImdsSupport: v2.0
55-
Tags:
56-
- Key: inside_configuration_key
57-
Value: overridden
58-
DevSettings:
59-
AmiSearchFilters:
60-
Owner: self
61-
Image:
62-
Os: !Ref Os
63-
HeadNode:
64-
InstanceType: t2.small
65-
Networking:
66-
SubnetId: !Ref HeadNodeSubnet
67-
CustomActions: !If
68-
- OnNodeConfiguredCondition
69-
-
70-
OnNodeConfigured:
71-
Script: !Ref OnNodeConfigured
72-
- !Ref AWS::NoValue
73-
Iam: !If
74-
- CustomBucketCondition
75-
-
76-
S3Access:
77-
- BucketName: !Ref CustomBucketAccess
78-
EnableWriteAccess: false
79-
- !Ref AWS::NoValue
80-
Scheduling:
81-
Scheduler: slurm
82-
SlurmQueues:
83-
- Name: queue0
84-
ComputeResources:
85-
- Name: queue0-cr0
86-
InstanceType: t2.micro
87-
MaxCount: !Ref ComputeInstanceMax
88-
Networking:
89-
SubnetIds:
90-
- !Ref ComputeNodeSubnet
9118

9219
Outputs:
9320
HeadNodeIp:

tests/integration-tests/tests/custom_resource/conftest.py

Lines changed: 30 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
import cfn_tools
1818
import pkg_resources
1919
import pytest
20+
import yaml
2021
from cfn_stacks_factory import CfnStack
2122
from troposphere import Output, Ref
2223
from troposphere.iam import ManagedPolicy
@@ -74,10 +75,10 @@ def cluster_custom_resource_provider_generator(cfn_stacks_factory, region, stack
7475

7576
@pytest.fixture(scope="class", name="cluster_custom_resource_provider")
7677
def cluster_custom_resource_provider_fixture(
77-
cfn_stacks_factory,
7878
request,
7979
region,
8080
resource_bucket,
81+
cfn_stacks_factory,
8182
cluster_custom_resource_service_token,
8283
cluster_custom_resource_provider_template,
8384
):
@@ -115,35 +116,43 @@ def cluster_1_click_fixture(cfn_stacks_factory, request, region, key_name, clust
115116
return stack
116117

117118

119+
def get_custom_resource_template(cluster_config_path, cluster_custom_resource_template, deletion_policy="Delete"):
120+
with open(cluster_custom_resource_template, "r", encoding="utf-8") as f:
121+
template = TemplateGenerator(cfn_tools.load_yaml(f.read()))
122+
with open(cluster_config_path, encoding="utf-8") as cluster_config:
123+
template.resources["PclusterCluster"].properties["ClusterConfiguration"] = yaml.safe_load(cluster_config.read())
124+
template.resources["PclusterCluster"].properties["DeletionPolicy"] = deletion_policy
125+
return template
126+
127+
118128
@pytest.fixture(scope="class", name="cluster_custom_resource_factory")
119129
def cluster_custom_resource_factory_fixture(
120-
cfn_stacks_factory,
121130
request,
122131
region,
123-
os,
124-
cluster_custom_resource_template,
125132
cluster_custom_resource_provider,
126133
vpc_stack,
134+
cfn_stacks_factory,
135+
cluster_custom_resource_template,
127136
):
128-
def _produce_cluster_custom_resource_stack(parameters=None):
129-
cluster_name = generate_stack_name("integ-test-custom-resource-c", request.config.getoption("stackname_suffix"))
137+
created_stacks = []
130138

131-
parameters = {
132-
"ClusterName": cluster_name,
133-
"HeadNodeSubnet": vpc_stack.get_public_subnet(),
134-
"ComputeNodeSubnet": vpc_stack.get_private_subnet(),
135-
"ServiceToken": cluster_custom_resource_provider,
136-
"Os": os,
137-
**(parameters or {}),
138-
}
139+
def _produce_cluster_custom_resource_stack(
140+
cluster_config_path, cluster_name=None, deletion_policy="Delete", service_token=None
141+
):
142+
cluster_name = cluster_name or generate_stack_name(
143+
"integ-test-custom-resource-c", request.config.getoption("stackname_suffix")
144+
)
139145

140-
with open(cluster_custom_resource_template, encoding="utf-8") as cfn_file:
141-
template_data = cfn_file.read()
146+
parameters = {"ClusterName": cluster_name, "ServiceToken": service_token or cluster_custom_resource_provider}
147+
148+
template = get_custom_resource_template(
149+
cluster_config_path, cluster_custom_resource_template, deletion_policy=deletion_policy
150+
)
142151

143152
stack = CfnStack(
144153
name=generate_stack_name("integ-tests-custom-resource", request.config.getoption("stackname_suffix")),
145154
region=region,
146-
template=template_data,
155+
template=template.to_yaml(),
147156
parameters=[{"ParameterKey": k, "ParameterValue": v} for k, v in parameters.items()],
148157
capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"],
149158
tags=[
@@ -154,9 +163,13 @@ def _produce_cluster_custom_resource_stack(parameters=None):
154163

155164
cfn_stacks_factory.create_stack(stack, True)
156165
stack.factory = cfn_stacks_factory
166+
created_stacks.append(stack)
157167
return stack
158168

159169
yield _produce_cluster_custom_resource_stack
170+
if not request.config.getoption("no_delete"):
171+
for stack in created_stacks:
172+
stack.factory.delete_stack(stack.name, region)
160173

161174

162175
@pytest.fixture(scope="class", name="resource_bucket_cluster_template")

0 commit comments

Comments
 (0)