
Commit cae269f

Updated test infra setup and fixed integration tests (#44)
1 parent 5430c04 commit cae269f

File tree

3 files changed: +158 -304 lines changed


test/integration_tests/abstract_integration_tests.py

Lines changed: 25 additions & 173 deletions
@@ -10,12 +10,14 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
+import os
 import subprocess
 import time
 import uuid
 import re
 
 import boto3
+import yaml
 from botocore.exceptions import ClientError
 
 from hyperpod_cli.utils import setup_logger
@@ -32,164 +34,37 @@ class AbstractIntegrationTests:
         "InService",
     ]
     suffix = str(uuid.uuid4())[:8]
-    hyperpod_cli_cluster_name = "hyperpod-cli-cluster-" + suffix
-    vpc_eks_stack_name = "hyperpod-cli-stack-" + suffix
+    hyperpod_cli_job_name: str = 'hyperpod-job-' + suffix
+    test_job_file = os.path.expanduser("./test/integration_tests/data/basicJob.yaml")
+    hyperpod_cli_cluster_name = "HyperPodCLI-cluster"
     s3_roles_stack_name = "hyperpod-cli-resource-stack"
     vpc_stack_name = "hyperpod-cli-vpc-stack"
-    eks_cluster_name = "hyperpod-cli-cluster-" + suffix
-    bucket_name = "hyperpod-cli-s3-" + suffix
     test_team_name = "test-team"
 
     def _create_session(self):
         session = boto3.Session()
         return session
 
-    def create_test_resources(self, session):
-        cfn = session.client("cloudformation")
+    def replace_placeholders(self):
+        replacements = {
+            'JOB_NAME': self.hyperpod_cli_job_name,
+        }
+        with open(self.test_job_file, 'r') as file:
+            yaml_content = file.read()
+        pattern = re.compile(r'\$\{([^}^{]+)\}')
 
-        # Get static resources from the static-resource stack
-        self.describe_constant_resources_stack_and_set_values(cfn)
+        def replace(match):
+            key = match.group(1)
+            return str(replacements.get(key, match.group(0)))
 
-        # Get static resources from the VPC stack
-        self.describe_vpc_stack_and_set_values(cfn)
+        processed_yaml = pattern.sub(replace, yaml_content)
 
-        # Create VPC, EKS cluster and roles
-        with open(
-            "test/integration_tests/cloudformation/resources.yaml",
-            "r",
-        ) as fh:
-            template = fh.read()
-        cfn.create_stack(
-            StackName=self.vpc_eks_stack_name,
-            TemplateBody=template,
-            Capabilities=["CAPABILITY_NAMED_IAM"],
-            Parameters=[
-                {
-                    "ParameterKey": "ClusterName",
-                    "ParameterValue": self.eks_cluster_name,
-                    "ResolvedValue": "string",
-                },
-                {
-                    "ParameterKey": "EKSClusterRoleArn",
-                    "ParameterValue": self.cfn_output_map.get("EKSClusterRoleArn"),
-                    "ResolvedValue": "string",
-                },
-                {
-                    "ParameterKey": "SubnetId1",
-                    "ParameterValue": self.cfn_output_map.get("PrivateSubnet1"),
-                    "ResolvedValue": "string",
-                },
-                {
-                    "ParameterKey": "SubnetId2",
-                    "ParameterValue": self.cfn_output_map.get("PrivateSubnet2"),
-                    "ResolvedValue": "string",
-                },
-                {
-                    "ParameterKey": "SecurityGroupId",
-                    "ParameterValue": self.cfn_output_map.get("SecurityGroup"),
-                    "ResolvedValue": "string",
-                },
-            ],
-        )
-        waiter = cfn.get_waiter("stack_create_complete")
-        waiter.wait(
-            StackName=self.vpc_eks_stack_name,
-            WaiterConfig={
-                "Delay": 30,
-                "MaxAttempts": 40,
-            },
-        )
-        describe = cfn.describe_stacks(StackName=self.vpc_eks_stack_name)
-        if describe:
-            cfn_output = describe.get("Stacks")[0]
-            if cfn_output and cfn_output.get("Outputs"):
-                for output in cfn_output.get("Outputs"):
-                    self.cfn_output_map[output.get("OutputKey")] = output.get(
-                        "OutputValue"
-                    )
-
-    def delete_cloudformation_stack(self, session):
-        cfn = session.client("cloudformation")
-        cfn.delete_stack(StackName=self.vpc_eks_stack_name)
-
-    def upload_lifecycle_script(self, session):
-        s3_client = session.client("s3")
-        try:
-            response = s3_client.upload_file(
-                "test/integration_tests/lifecycle_script/on_create_noop.sh",
-                self.cfn_output_map.get("Bucket"),
-                "on_create_noop.sh",
-            )
-        except ClientError as e:
-            logger.error(f"Error uploading lifecycle script to s3 {e}")
-
-    def get_hyperpod_cluster_status(self, sagemaker_client):
-        return sagemaker_client.describe_cluster(
-            ClusterName=self.hyperpod_cli_cluster_name
-        )
-
-    def create_hyperpod_cluster(self, session):
-        # Create a HyperPod cluster using the EKS cluster from the stack above
-        sagemaker_client = session.client("sagemaker")
-        sagemaker_client.create_cluster(
-            ClusterName=self.hyperpod_cli_cluster_name,
-            Orchestrator={"Eks": {"ClusterArn": self.cfn_output_map.get("ClusterArn")}},
-            InstanceGroups=[
-                {
-                    "InstanceGroupName": "group2",
-                    "InstanceType": "ml.c5.2xlarge",
-                    "InstanceCount": 2,
-                    "LifeCycleConfig": {
-                        "SourceS3Uri": f's3://{self.cfn_output_map.get("Bucket")}',
-                        "OnCreate": "on_create_noop.sh",
-                    },
-                    "ExecutionRole": self.cfn_output_map.get("ExecutionRole"),
-                    "ThreadsPerCore": 1,
-                }
-            ],
-            VpcConfig={
-                "SecurityGroupIds": [self.cfn_output_map.get("SecurityGroup")],
-                "Subnets": [self.cfn_output_map.get("PrivateSubnet1")],
-            },
-        )
+        with open(self.test_job_file, 'w') as file:
+            file.write(processed_yaml)
 
-        time.sleep(1)
-        # Wait for the SageMaker cluster creation to complete
-        try:
-            result = self.get_hyperpod_cluster_status(sagemaker_client)
-            while (
-                result.get("ClusterStatus") not in self.hyperpod_cluster_terminal_state
-            ):
-                time.sleep(30)
-                result = self.get_hyperpod_cluster_status(sagemaker_client)
-        except Exception as e:
-            logger.error(e)
-        logger.info(f"Hyperpod cluster created {self.hyperpod_cli_cluster_name}")
-
-    def delete_hyperpod_cluster(self, session):
-        # Delete the HyperPod cluster created above
-        sagemaker_client = session.client("sagemaker")
-        sagemaker_client.delete_cluster(ClusterName=self.hyperpod_cli_cluster_name)
-
-        time.sleep(10)
-        # Wait for the SageMaker cluster deletion to complete
-        try:
-            result = self.get_hyperpod_cluster_status(sagemaker_client)
-            while result.get("ClusterStatus") == "Deleting":
-                time.sleep(30)
-                result = self.get_hyperpod_cluster_status(sagemaker_client)
-        except Exception as e:
-            logger.info(
-                f"Caught exception while trying to describe cluster during teardown {e}"
-            )
-            return
-        raise Exception(
-            f"Hyperpod Cluster {self.hyperpod_cli_cluster_name} failed to delete"
-        )
 
     def create_kube_context(self):
-        eks_cluster_name = self.cfn_output_map.get("ClusterArn").split(":")[-1]
-        eks_cluster_name = eks_cluster_name.split("/")[-1]
+        eks_cluster_name = 'HyperPodCLI-eks-cluster'
         command = [
             "aws",
             "eks",
@@ -204,28 +79,6 @@ def create_kube_context(self):
         except subprocess.CalledProcessError as e:
             raise RuntimeError(f"Failed to update kubeconfig: {e}")
 
-    def describe_constant_resources_stack_and_set_values(self, cfn_client):
-        describe_s3_stack = cfn_client.describe_stacks(
-            StackName=self.s3_roles_stack_name
-        )
-        if describe_s3_stack:
-            cfn_output = describe_s3_stack.get("Stacks")[0]
-            if cfn_output and cfn_output.get("Outputs"):
-                for output in cfn_output.get("Outputs"):
-                    self.cfn_output_map[output.get("OutputKey")] = output.get(
-                        "OutputValue"
-                    )
-
-    def describe_vpc_stack_and_set_values(self, cfn_client):
-        describe_vpc_stack = cfn_client.describe_stacks(StackName=self.vpc_stack_name)
-        if describe_vpc_stack:
-            cfn_output = describe_vpc_stack.get("Stacks")[0]
-            if cfn_output and cfn_output.get("Outputs"):
-                for output in cfn_output.get("Outputs"):
-                    self.cfn_output_map[output.get("OutputKey")] = output.get(
-                        "OutputValue"
-                    )
-
     def apply_helm_charts(self):
         command = ["helm", "dependencies", "update", "helm_chart/HyperPodHelmChart"]

@@ -244,7 +97,8 @@ def apply_helm_charts(self):
 
         apply_command = [
             "helm",
-            "install",
+            "upgrade",
+            "--install",
            "dependencies",
            "helm_chart/HyperPodHelmChart",
            "--namespace",
@@ -410,13 +264,11 @@ def create_quota_allocation_resources(self):
 
     def setup(self):
         self.new_session = self._create_session()
-        self.create_test_resources(self.new_session)
+        self.replace_placeholders()
         self.create_kube_context()
         self.apply_helm_charts()
-        self.create_hyperpod_cluster(self.new_session)
-        self.install_kueue()
-        self.create_quota_allocation_resources()
+        # self.install_kueue()
+        # self.create_quota_allocation_resources()
 
     def tearDown(self):
-        self.delete_hyperpod_cluster(self.new_session)
-        self.delete_cloudformation_stack(self.new_session)
+        logger.info("Tests completed")
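
Net effect on the test lifecycle: setup() no longer provisions CloudFormation stacks or a HyperPod cluster per run; it only rewrites the job template, points kubectl at the pre-provisioned cluster, and applies the Helm chart, and tearDown() correspondingly stops deleting infrastructure. The new replace_placeholders() is a small regex-driven fill over ${KEY} tokens. A standalone sketch of the same substitution logic (the inline template and suffix value here are illustrative):

import re

# Same ${KEY} placeholder syntax as basicJob.yaml; inline template for illustration.
yaml_content = "run:\n  name: ${JOB_NAME} # Current run name\n"
replacements = {"JOB_NAME": "hyperpod-job-1a2b3c4d"}  # suffix value is made up

pattern = re.compile(r"\$\{([^}^{]+)\}")

def replace(match):
    # Unknown ${...} tokens fall back to their original text untouched.
    key = match.group(1)
    return str(replacements.get(key, match.group(0)))

print(pattern.sub(replace, yaml_content))
# run:
#   name: hyperpod-job-1a2b3c4d # Current run name

One side effect worth noting: the method writes the substituted YAML back over the template file, so after the first run no ${JOB_NAME} token remains for a subsequent run to replace.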

test/integration_tests/data/basicJob.yaml

Lines changed: 3 additions & 3 deletions
@@ -22,7 +22,7 @@ training_cfg:
   entry_script: /opt/pytorch-mnist/mnist.py
   script_args: []
   run:
-    name: hyperpod-cli-test # Current run name
+    name: ${JOB_NAME} # Current run name
     nodes: 1 # Number of nodes to use for current training
     ntasks_per_node: 1 # Number of devices to use per node
 cluster:
@@ -33,7 +33,7 @@ cluster:
   service_account_name: null
   # persistent volume, usually used to mount FSx
   persistent_volume_claims: null
-  namespace: hyperpod-ns-test-team
+  namespace: kubeflow
   # required node affinity to select nodes with HyperPod
   # labels and passed health check if burn-in enabled
   label_selector:
@@ -47,7 +47,7 @@ cluster:
     - 100
   pullPolicy: IfNotPresent # policy to pull container, can be Always, IfNotPresent and Never
   restartPolicy: OnFailure # restart policy
-  scheduler_type: Kueue
+  scheduler_type: None
 
 base_results_dir: ./result # Location to store the results, checkpoints and logs.
 container: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-bc09cfd # container to use
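
The commit also adds import yaml to the test module. One plausible use, sketched here as an assumption rather than code from the commit, is a post-substitution sanity check on the job file (the expected top-level keys match the sections visible above):

import re

import yaml

JOB_FILE = "test/integration_tests/data/basicJob.yaml"

with open(JOB_FILE) as fh:
    raw = fh.read()

# After replace_placeholders() has run, no ${...} tokens should remain.
leftover = re.findall(r"\$\{([^}^{]+)\}", raw)
assert not leftover, f"unresolved placeholders: {leftover}"

# The substituted file should still parse as YAML.
spec = yaml.safe_load(raw)
print(sorted(spec))  # expect sections such as 'cluster' and 'training_cfg'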
