Skip to content

Commit a7ee420

Browse files
Merge branch 'aws:main' into main
2 parents e876bf3 + 1590894 commit a7ee420

File tree

1 file changed

+50
-6
lines changed

1 file changed

+50
-6
lines changed

test/integration_tests/cluster_management/test_hp_cluster_creation.py

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -65,20 +65,49 @@ def get_cluster_status(cluster_name, region):
6565
except Exception as e:
6666
raise AssertionError(f"Failed to get cluster status: {e}")
6767

68+
69+
def wait_for_stack_complete(stack_name, region, timeout_minutes=15,
                            poll_interval_seconds=30, client=None):
    """Wait for CloudFormation stack to be CREATE_COMPLETE.

    Polls ``describe_stacks`` until the stack reports ``CREATE_COMPLETE``,
    reports a failed-creation status, or the timeout elapses.

    Args:
        stack_name: Name (or ID) of the CloudFormation stack to poll.
        region: AWS region used to build the CloudFormation client.
        timeout_minutes: Maximum time to keep polling, in minutes.
        poll_interval_seconds: Pause between polls, in seconds.
        client: Optional pre-built CloudFormation client exposing
            ``describe_stacks``. When omitted, a boto3 client for
            ``region`` is created.

    Returns:
        True once the stack status is ``CREATE_COMPLETE``.

    Raises:
        AssertionError: If the stack reaches a failed-creation status, or
            does not complete within ``timeout_minutes``.
    """
    if client is None:
        import boto3
        client = boto3.client('cloudformation', region_name=region)

    # Statuses that mean the create operation has failed and can no longer
    # end in CREATE_COMPLETE.
    failed_statuses = {
        'CREATE_FAILED',
        'ROLLBACK_IN_PROGRESS',
        'ROLLBACK_FAILED',
        'ROLLBACK_COMPLETE',
    }

    deadline = time.time() + (timeout_minutes * 60)
    while time.time() < deadline:
        status = None
        # Only the API call is best-effort: the stack may not exist yet and
        # transient API errors should not abort the wait.
        try:
            response = client.describe_stacks(StackName=stack_name)
            status = response['Stacks'][0]['StackStatus']
        except Exception as e:
            if "does not exist" in str(e).lower():
                print(f"[STATUS] Stack '{stack_name}' not found yet, waiting for creation...")
            else:
                print(f"[ERROR] Error checking stack status: {e}")

        if status == 'CREATE_COMPLETE':
            return True
        # Evaluated outside the try block so the failure is not swallowed by
        # the broad handler above and surfaces immediately.
        if status in failed_statuses:
            raise AssertionError(f"Stack creation failed with status: {status}")

        time.sleep(poll_interval_seconds)

    raise AssertionError(f"Stack did not complete after {timeout_minutes} minutes")
94+
95+
6896
# --------- Test Configuration ---------
6997
REGION = "us-east-2"
7098

7199
# Global variables to share data between tests
72100
STACK_NAME = None
73101
CREATE_TIME = None
102+
UNIQUE_TIMESTAMP = int(time.time() * 1000)
74103

75104
@pytest.fixture(scope="module")
76105
def runner():
77106
return CliRunner()
78107

79108
@pytest.fixture(scope="module")
80109
def cluster_name():
81-
return "hyperpod-cluster"
110+
return f"hyperpod-{UNIQUE_TIMESTAMP}-cluster-integ-test"
82111

83112
@pytest.fixture(scope="module")
84113
def create_time():
@@ -109,7 +138,8 @@ def test_configure_cluster(runner, cluster_name):
109138
# Configuration mapping for cleaner code
110139
config_options = {
111140
"stage": "prod",
112-
"resource-name-prefix": f"hyperpod-cli-integ-test-{int(time.time())}",
141+
"resource-name-prefix": f"hyperpod-cli-integ-test-{UNIQUE_TIMESTAMP}",
142+
"hyperpod-cluster-name": cluster_name,
113143
"create-vpc-stack": "true",
114144
"create-security-group-stack": "true",
115145
"create-eks-cluster-stack": "true",
@@ -236,7 +266,6 @@ def test_describe_cluster_via_cli(runner, cluster_name):
236266

237267

238268
# --------- Extended Cluster Resource Verification Tests ---------
239-
240269
@pytest.mark.dependency(name="wait_for_cluster", depends=["verify_submission"])
241270
def test_wait_for_cluster_ready(runner, cluster_name):
242271
"""Wait for cluster to be ready by polling cluster status until InService.
@@ -271,9 +300,12 @@ def test_wait_for_cluster_ready(runner, cluster_name):
271300
assert False, f"Cluster creation failed with status: {status}"
272301

273302
except AssertionError as e:
274-
if "AWS CLI not available" in str(e) or "timed out" in str(e):
303+
if "ResourceNotFound" in str(e) or "not found" in str(e):
304+
print(f"[STATUS] Cluster '{cluster_name}' not created yet, waiting...")
305+
elif "AWS CLI not available" in str(e) or "timed out" in str(e):
275306
assert False, str(e)
276-
print(f"[ERROR] Error during polling: {e}")
307+
else:
308+
print(f"[ERROR] Error during polling: {e}")
277309

278310
time.sleep(poll_interval)
279311
# Exponential backoff with cap
@@ -282,7 +314,19 @@ def test_wait_for_cluster_ready(runner, cluster_name):
282314
assert False, f"Timed out waiting for cluster '{cluster_name}' to be InService after {timeout_minutes} minutes"
283315

284316

285-
@pytest.mark.dependency(name="update_cluster", depends=["wait_for_cluster"])
317+
# Runs once the cluster is InService and before the update-cluster test, which
# depends on this one.
@pytest.mark.dependency(name="wait_for_stack", depends=["wait_for_cluster"])
def test_wait_for_stack_completion(runner, cluster_name):
    """Block until the stack named by the STACK_NAME global is CREATE_COMPLETE.

    Fails with an AssertionError when STACK_NAME has not been set, or when
    wait_for_stack_complete raises one.
    """
    # STACK_NAME is a module-level global shared between tests.
    stack = STACK_NAME
    assert stack, "Stack name should be available"

    print(f"⏳ Waiting for CloudFormation stack {stack} to be CREATE_COMPLETE...")
    wait_for_stack_complete(stack, REGION)
    print(f"✅ Stack {stack} is now CREATE_COMPLETE")
327+
328+
329+
@pytest.mark.dependency(name="update_cluster", depends=["wait_for_stack"])
286330
def test_cluster_update_workflow(runner, cluster_name):
287331
"""Test hyp update-cluster command by toggling node recovery setting."""
288332
global STACK_NAME

0 commit comments

Comments
 (0)