1010# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111# ANY KIND, either express or implied. See the License for the specific
1212# language governing permissions and limitations under the License.
13+ import os
1314import subprocess
1415import time
1516import uuid
1617import re
1718
1819import boto3
20+ import yaml
1921from botocore .exceptions import ClientError
2022
2123from hyperpod_cli .utils import setup_logger
@@ -32,164 +34,37 @@ class AbstractIntegrationTests:
3234 "InService" ,
3335 ]
3436 suffix = str (uuid .uuid4 ())[:8 ]
35- hyperpod_cli_cluster_name = "hyperpod-cli-cluster-" + suffix
36- vpc_eks_stack_name = "hyperpod-cli-stack-" + suffix
37+ hyperpod_cli_job_name : str = 'hyperpod-job-' + suffix
38+ test_job_file = os .path .expanduser ("./test/integration_tests/data/basicJob.yaml" )
39+ hyperpod_cli_cluster_name = "HyperPodCLI-cluster"
3740 s3_roles_stack_name = "hyperpod-cli-resource-stack"
3841 vpc_stack_name = "hyperpod-cli-vpc-stack"
39- eks_cluster_name = "hyperpod-cli-cluster-" + suffix
40- bucket_name = "hyperpod-cli-s3-" + suffix
4142 test_team_name = "test-team"
4243
4344 def _create_session (self ):
4445 session = boto3 .Session ()
4546 return session
4647
47- def create_test_resources (self , session ):
48- cfn = session .client ("cloudformation" )
48+ def replace_placeholders (self ):
49+ replacements = {
50+ 'JOB_NAME' : self .hyperpod_cli_job_name ,
51+ }
52+ with open (self .test_job_file , 'r' ) as file :
53+ yaml_content = file .read ()
54+ pattern = re .compile (r'\$\{([^}^{]+)\}' )
4955
50- # Get static resources from static-resource stack
51- self .describe_constant_resources_stack_and_set_values (cfn )
56+ def replace (match ):
57+ key = match .group (1 )
58+ return str (replacements .get (key , match .group (0 )))
5259
53- # Get static resources from static-resource stack
54- self .describe_vpc_stack_and_set_values (cfn )
60+ processed_yaml = pattern .sub (replace , yaml_content )
5561
56- # Create VPC, EKS cluster and roles
57- with open (
58- "test/integration_tests/cloudformation/resources.yaml" ,
59- "r" ,
60- ) as fh :
61- template = fh .read ()
62- cfn .create_stack (
63- StackName = self .vpc_eks_stack_name ,
64- TemplateBody = template ,
65- Capabilities = ["CAPABILITY_NAMED_IAM" ],
66- Parameters = [
67- {
68- "ParameterKey" : "ClusterName" ,
69- "ParameterValue" : self .eks_cluster_name ,
70- "ResolvedValue" : "string" ,
71- },
72- {
73- "ParameterKey" : "EKSClusterRoleArn" ,
74- "ParameterValue" : self .cfn_output_map .get ("EKSClusterRoleArn" ),
75- "ResolvedValue" : "string" ,
76- },
77- {
78- "ParameterKey" : "SubnetId1" ,
79- "ParameterValue" : self .cfn_output_map .get ("PrivateSubnet1" ),
80- "ResolvedValue" : "string" ,
81- },
82- {
83- "ParameterKey" : "SubnetId2" ,
84- "ParameterValue" : self .cfn_output_map .get ("PrivateSubnet2" ),
85- "ResolvedValue" : "string" ,
86- },
87- {
88- "ParameterKey" : "SecurityGroupId" ,
89- "ParameterValue" : self .cfn_output_map .get ("SecurityGroup" ),
90- "ResolvedValue" : "string" ,
91- },
92- ],
93- )
94- waiter = cfn .get_waiter ("stack_create_complete" )
95- waiter .wait (
96- StackName = self .vpc_eks_stack_name ,
97- WaiterConfig = {
98- "Delay" : 30 ,
99- "MaxAttempts" : 40 ,
100- },
101- )
102- describe = cfn .describe_stacks (StackName = self .vpc_eks_stack_name )
103- if describe :
104- cfn_output = describe .get ("Stacks" )[0 ]
105- if cfn_output and cfn_output .get ("Outputs" ):
106- for output in cfn_output .get ("Outputs" ):
107- self .cfn_output_map [output .get ("OutputKey" )] = output .get (
108- "OutputValue"
109- )
110-
111- def delete_cloudformation_stack (self , session ):
112- cfn = session .client ("cloudformation" )
113- cfn .delete_stack (StackName = self .vpc_eks_stack_name )
114-
115- def upload_lifecycle_script (self , session ):
116- s3_client = session .client ("s3" )
117- try :
118- response = s3_client .upload_file (
119- "test/integration_tests/lifecycle_script/on_create_noop.sh" ,
120- self .cfn_output_map .get ("Bucket" ),
121- "on_create_noop.sh" ,
122- )
123- except ClientError as e :
124- logger .error (f"Error uploading lifecycle script to s3 { e } " )
125-
126- def get_hyperpod_cluster_status (self , sagemaker_client ):
127- return sagemaker_client .describe_cluster (
128- ClusterName = self .hyperpod_cli_cluster_name
129- )
130-
131- def create_hyperpod_cluster (self , session ):
132- # Create HyperPod cluster using eks cluster from stack above
133- sagemaker_client = session .client ("sagemaker" )
134- sagemaker_client .create_cluster (
135- ClusterName = self .hyperpod_cli_cluster_name ,
136- Orchestrator = {"Eks" : {"ClusterArn" : self .cfn_output_map .get ("ClusterArn" )}},
137- InstanceGroups = [
138- {
139- "InstanceGroupName" : "group2" ,
140- "InstanceType" : "ml.c5.2xlarge" ,
141- "InstanceCount" : 2 ,
142- "LifeCycleConfig" : {
143- "SourceS3Uri" : f's3://{ self .cfn_output_map .get ("Bucket" )} ' ,
144- "OnCreate" : "on_create_noop.sh" ,
145- },
146- "ExecutionRole" : self .cfn_output_map .get ("ExecutionRole" ),
147- "ThreadsPerCore" : 1 ,
148- }
149- ],
150- VpcConfig = {
151- "SecurityGroupIds" : [self .cfn_output_map .get ("SecurityGroup" )],
152- "Subnets" : [self .cfn_output_map .get ("PrivateSubnet1" )],
153- },
154- )
62+ with open (self .test_job_file , 'w' ) as file :
63+ file .write (processed_yaml )
15564
156- time .sleep (1 )
157- # Wait for sagemkaer stack to create complete
158- try :
159- result = self .get_hyperpod_cluster_status (sagemaker_client )
160- while (
161- result .get ("ClusterStatus" ) not in self .hyperpod_cluster_terminal_state
162- ):
163- time .sleep (30 )
164- result = self .get_hyperpod_cluster_status (sagemaker_client )
165- except Exception as e :
166- logger .error (e )
167- logger .info (f"Hyperpod cluster created { self .hyperpod_cli_cluster_name } " )
168-
169- def delete_hyperpod_cluster (self , session ):
170- # delete HyperPod cluster using eks cluster from stack above
171- sagemaker_client = session .client ("sagemaker" )
172- sagemaker_client .delete_cluster (ClusterName = self .hyperpod_cli_cluster_name )
173-
174- time .sleep (10 )
175- # Wait for sagemaker stack to create complete
176- try :
177- result = self .get_hyperpod_cluster_status (sagemaker_client )
178- while result .get ("ClusterStatus" ) == "Deleting" :
179- time .sleep (30 )
180- result = self .get_hyperpod_cluster_status (sagemaker_client )
181- except Exception as e :
182- logger .info (
183- f"Caught exception while trying to describe cluster during teardown { e } "
184- )
185- return
186- raise Exception (
187- f"Hyperpod Cluster { self .hyperpod_cli_cluster_name } fail to delete"
188- )
18965
19066 def create_kube_context (self ):
191- eks_cluster_name = self .cfn_output_map .get ("ClusterArn" ).split (":" )[- 1 ]
192- eks_cluster_name = eks_cluster_name .split ("/" )[- 1 ]
67+ eks_cluster_name = 'HyperPodCLI-eks-cluster'
19368 command = [
19469 "aws" ,
19570 "eks" ,
@@ -204,28 +79,6 @@ def create_kube_context(self):
20479 except subprocess .CalledProcessError as e :
20580 raise RuntimeError (f"Failed to update kubeconfig: { e } " )
20681
207- def describe_constant_resources_stack_and_set_values (self , cfn_client ):
208- describe_s3_stack = cfn_client .describe_stacks (
209- StackName = self .s3_roles_stack_name
210- )
211- if describe_s3_stack :
212- cfn_output = describe_s3_stack .get ("Stacks" )[0 ]
213- if cfn_output and cfn_output .get ("Outputs" ):
214- for output in cfn_output .get ("Outputs" ):
215- self .cfn_output_map [output .get ("OutputKey" )] = output .get (
216- "OutputValue"
217- )
218-
219- def describe_vpc_stack_and_set_values (self , cfn_client ):
220- describe_vpc_stack = cfn_client .describe_stacks (StackName = self .vpc_stack_name )
221- if describe_vpc_stack :
222- cfn_output = describe_vpc_stack .get ("Stacks" )[0 ]
223- if cfn_output and cfn_output .get ("Outputs" ):
224- for output in cfn_output .get ("Outputs" ):
225- self .cfn_output_map [output .get ("OutputKey" )] = output .get (
226- "OutputValue"
227- )
228-
22982 def apply_helm_charts (self ):
23083 command = ["helm" , "dependencies" , "update" , "helm_chart/HyperPodHelmChart" ]
23184
@@ -244,7 +97,8 @@ def apply_helm_charts(self):
24497
24598 apply_command = [
24699 "helm" ,
247- "install" ,
100+ "upgrade" ,
101+ "--install" ,
248102 "dependencies" ,
249103 "helm_chart/HyperPodHelmChart" ,
250104 "--namespace" ,
@@ -410,13 +264,11 @@ def create_quota_allocation_resources(self):
410264
411265 def setup (self ):
412266 self .new_session = self ._create_session ()
413- self .create_test_resources ( self . new_session )
267+ self .replace_placeholders ( )
414268 self .create_kube_context ()
415269 self .apply_helm_charts ()
416- self .create_hyperpod_cluster (self .new_session )
417- self .install_kueue ()
418- self .create_quota_allocation_resources ()
270+ # self.install_kueue()
271+ # self.create_quota_allocation_resources()
419272
420273 def tearDown (self ):
421- self .delete_hyperpod_cluster (self .new_session )
422- self .delete_cloudformation_stack (self .new_session )
274+ logger .info ("Tests completed" )
0 commit comments