
Commit 3ee6d51

aviruthen authored and rsareddy0329 committed
Integration tests for init experience (#242)
* First draft integ tests
* Mini fixes to ensure integ tests work
* Allow integ tests to run from clean directory
* Change torch job creation namespace to default
1 parent 7421a76 commit 3ee6d51

File tree

7 files changed (+1271, -22 lines)


src/sagemaker/hyperpod/cli/commands/init.py

Lines changed: 1 addition & 1 deletion
@@ -386,7 +386,7 @@ def _default_create(region):
     data, template, version = load_config(dir_path)
     namespace = data.get("namespace", "default")
     registry = TEMPLATES[template]["registry"]
-    model = registry.get(version)
+    model = registry.get(str(version))
     if model:
         # Filter out CLI metadata fields before passing to model
         from sagemaker.hyperpod.cli.init_utils import filter_cli_metadata_fields
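
This one-line change guards against a key-type mismatch: if the version in the loaded config arrives as a number (YAML, for instance, parses `version: 1.0` as a float) while the registry is keyed by strings, `registry.get(version)` silently returns None. A minimal sketch of the failure mode, assuming a YAML config and a string-keyed registry (both inferred from the surrounding code, not confirmed by this diff):

import yaml

data = yaml.safe_load("version: 1.0")   # YAML parses 1.0 as a float, not a string
registry = {"1.0": "SomeModelClass"}    # hypothetical registry keyed by version strings

print(registry.get(data["version"]))        # None: the float 1.0 is not the key "1.0"
print(registry.get(str(data["version"])))   # "SomeModelClass": str(1.0) == "1.0"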

src/sagemaker/hyperpod/cli/constants/init_constants.py

Lines changed: 21 additions & 21 deletions
@@ -13,27 +13,27 @@
 CRD = "crd"
 CFN = "cfn"
 TEMPLATES = {
-    # "hyp-jumpstart-endpoint": {
-    #     "registry": JS_REG,
-    #     "schema_pkg": "hyperpod_jumpstart_inference_template",
-    #     "schema_type": CRD,
-    #     'template': KUBERNETES_JS_ENDPOINT_TEMPLATE,
-    #     'type': "jinja"
-    # },
-    # "hyp-custom-endpoint": {
-    #     "registry": C_REG,
-    #     "schema_pkg": "hyperpod_custom_inference_template",
-    #     "schema_type": CRD,
-    #     'template': KUBERNETES_CUSTOM_ENDPOINT_TEMPLATE,
-    #     'type': "jinja"
-    # },
-    # "hyp-pytorch-job": {
-    #     "registry": P_REG,
-    #     "schema_pkg": "hyperpod_pytorch_job_template",
-    #     "schema_type": CRD,
-    #     'template': KUBERNETES_PYTORCH_JOB_TEMPLATE,
-    #     'type': "jinja"
-    # },
+    "hyp-jumpstart-endpoint": {
+        "registry": JS_REG,
+        "schema_pkg": "hyperpod_jumpstart_inference_template",
+        "schema_type": CRD,
+        'template': KUBERNETES_JS_ENDPOINT_TEMPLATE,
+        'type': "jinja"
+    },
+    "hyp-custom-endpoint": {
+        "registry": C_REG,
+        "schema_pkg": "hyperpod_custom_inference_template",
+        "schema_type": CRD,
+        'template': KUBERNETES_CUSTOM_ENDPOINT_TEMPLATE,
+        'type': "jinja"
+    },
+    "hyp-pytorch-job": {
+        "registry": P_REG,
+        "schema_pkg": "hyperpod_pytorch_job_template",
+        "schema_type": CRD,
+        'template': KUBERNETES_PYTORCH_JOB_TEMPLATE,
+        'type': "jinja"
+    },
     "cluster-stack": {
         "schema_pkg": "hyperpod_cluster_stack_template",
         "schema_type": CFN,
Lines changed: 288 additions & 0 deletions
@@ -0,0 +1,288 @@
"""
End-to-end integration tests for the init workflow with the custom endpoint template.

SAFETY WARNING: These tests create real AWS SageMaker endpoints.
Only run them with proper cost controls and cleanup procedures in place.

Tests the complete user workflow: init -> configure -> validate -> create -> wait -> invoke -> delete.
Uses real AWS resources with cost implications.
"""
import time
import yaml
import pytest
import boto3
from pathlib import Path
import os
import tempfile

import sys
from unittest.mock import patch

from test.integration_tests.init.utils import (
    assert_command_succeeded,
    assert_init_files_created,
    assert_config_values,
)

from click.testing import CliRunner
from sagemaker.hyperpod.cli.commands.inference import custom_invoke
from sagemaker.hyperpod.cli.commands.init import init, configure, validate, _default_create as create
from sagemaker.hyperpod.cli.hyp_cli import delete
from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
from test.integration_tests.utils import get_time_str

# --------- Test Configuration ---------
NAMESPACE = "default"
VERSION = "1.0"
REGION = "us-east-2"
TIMEOUT_MINUTES = 15
POLL_INTERVAL_SECONDS = 30

BETA_BUCKET = "sagemaker-hyperpod-beta-integ-test-model-bucket-n"
PROD_BUCKET = "sagemaker-hyperpod-prod-integ-test-model-bucket"
stage = os.getenv("STAGE", "BETA").upper()
BUCKET_LOCATION = BETA_BUCKET if stage == "BETA" else PROD_BUCKET


@pytest.fixture(scope="module")
def runner():
    return CliRunner()


@pytest.fixture(scope="module")
def custom_endpoint_name():
    return "custom-cli-integration-" + get_time_str()


@pytest.fixture(scope="module")
def sagemaker_client():
    return boto3.client("sagemaker", region_name=REGION)


@pytest.fixture(scope="module")
def test_directory():
    """Create a temporary directory for test isolation."""
    with tempfile.TemporaryDirectory() as temp_dir:
        original_cwd = os.getcwd()
        os.chdir(temp_dir)
        try:
            yield temp_dir
        finally:
            os.chdir(original_cwd)


# --------- Custom Endpoint Tests ---------
@pytest.mark.dependency(name="init")
def test_init_custom(runner, custom_endpoint_name, test_directory):
    """Initialize the custom endpoint template and verify file creation."""
    result = runner.invoke(
        init, ["hyp-custom-endpoint", "."], catch_exceptions=False
    )
    assert_command_succeeded(result)
    assert_init_files_created("./", "hyp-custom-endpoint")


@pytest.mark.dependency(name="configure", depends=["init"])
def test_configure_custom(runner, custom_endpoint_name, test_directory):
    """Configure the custom endpoint with an S3 model source and verify config persistence."""
    # The configure command's options are built dynamically from sys.argv at
    # import time, so patch argv and reload the module to get a fresh command.
    with patch.object(sys, 'argv', ['hyp', 'configure']):
        import importlib
        from sagemaker.hyperpod.cli.commands import init
        importlib.reload(init)
        configure = init.configure

        result = runner.invoke(
            configure, [
                # Required fields
                "--endpoint-name", custom_endpoint_name,
                "--model-name", "test-pytorch-model",
                "--instance-type", "ml.c5.2xlarge",
                "--model-source-type", "s3",
                "--image-uri", "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:2.3.0-transformers4.48.0-cpu-py311-ubuntu22.04",
                "--container-port", "8080",
                "--model-volume-mount-name", "model-weights",

                # S3-specific required fields
                "--s3-bucket-name", BUCKET_LOCATION,
                "--model-location", "hf-eqa",
                "--s3-region", REGION,

                # Optional params, but likely needed
                "--env", '{ "SAGEMAKER_PROGRAM": "inference.py", "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code", "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600", "ENDPOINT_SERVER_TIMEOUT": "3600", "MODEL_CACHE_ROOT": "/opt/ml/model", "SAGEMAKER_ENV": "1", "SAGEMAKER_MODEL_SERVER_WORKERS": "1" }',
                "--resources-requests", '{"cpu": "3200m", "nvidia.com/gpu": 0, "memory": "12Gi"}',
                "--resources-limits", '{"cpu": "3200m", "memory": "12Gi", "nvidia.com/gpu": 0}',
            ], catch_exceptions=False
        )
    assert_command_succeeded(result)

    # Verify configuration was saved correctly
    expected_config = {
        # Required fields
        "endpoint_name": custom_endpoint_name,
        "model_name": "test-pytorch-model",
        "instance_type": "ml.c5.2xlarge",
        "model_source_type": "s3",
        "image_uri": "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:2.3.0-transformers4.48.0-cpu-py311-ubuntu22.04",
        "container_port": 8080,
        "model_volume_mount_name": "model-weights",

        # S3-specific required fields
        "s3_bucket_name": BUCKET_LOCATION,
        "model_location": "hf-eqa",
        "s3_region": REGION,

        # Optional params, but likely needed
        "env": {'SAGEMAKER_PROGRAM': 'inference.py', 'SAGEMAKER_SUBMIT_DIRECTORY': '/opt/ml/model/code', 'SAGEMAKER_CONTAINER_LOG_LEVEL': '20', 'SAGEMAKER_MODEL_SERVER_TIMEOUT': '3600', 'ENDPOINT_SERVER_TIMEOUT': '3600', 'MODEL_CACHE_ROOT': '/opt/ml/model', 'SAGEMAKER_ENV': '1', 'SAGEMAKER_MODEL_SERVER_WORKERS': '1'},
        "resources_requests": {'cpu': '3200m', 'nvidia.com/gpu': 0, 'memory': '12Gi'},
        "resources_limits": {'cpu': '3200m', 'memory': '12Gi', 'nvidia.com/gpu': 0},
    }
    assert_config_values("./", expected_config)


@pytest.mark.dependency(name="validate", depends=["configure", "init"])
def test_validate_custom(runner, custom_endpoint_name, test_directory):
    """Validate the custom endpoint configuration for correctness."""
    result = runner.invoke(validate, [], catch_exceptions=False)
    assert_command_succeeded(result)


@pytest.mark.dependency(name="create", depends=["validate", "configure", "init"])
def test_create_custom(runner, custom_endpoint_name, test_directory):
    """Create the custom endpoint deployment and verify template rendering."""
    result = runner.invoke(create, [], catch_exceptions=False)
    assert_command_succeeded(result)

    # Verify expected submission messages appear
    assert "Configuration is valid!" in result.output
    assert "Submitted!" in result.output
    assert "Creating sagemaker model and endpoint" in result.output
    assert custom_endpoint_name in result.output
    assert "The process may take a few minutes" in result.output


@pytest.mark.dependency(name="wait", depends=["create"])
def test_wait_until_inservice(custom_endpoint_name, test_directory):
    """Poll the SDK until the custom endpoint reaches CreationCompleted."""
    print(f"[INFO] Waiting for custom endpoint '{custom_endpoint_name}' to reach CreationCompleted...")
    deadline = time.time() + (TIMEOUT_MINUTES * 60)
    poll_count = 0

    while time.time() < deadline:
        poll_count += 1
        print(f"[DEBUG] Poll #{poll_count}: Checking endpoint status...")

        try:
            ep = HPEndpoint.get(name=custom_endpoint_name, namespace=NAMESPACE)
            state = ep.status.endpoints.sagemaker.state
            print(f"[DEBUG] Current state: {state}")
            if state == "CreationCompleted":
                print("[INFO] Endpoint is in CreationCompleted state.")
                return

            deployment_state = ep.status.deploymentStatus.deploymentObjectOverallState
            if deployment_state == "DeploymentFailed":
                pytest.fail("Endpoint deployment failed.")

        except Exception as e:
            print(f"[ERROR] Exception during polling: {e}")

        time.sleep(POLL_INTERVAL_SECONDS)

    pytest.fail("[ERROR] Timed out waiting for endpoint to reach CreationCompleted")


@pytest.mark.dependency(name="invoke", depends=["wait"])
def test_custom_invoke(runner, custom_endpoint_name, test_directory):
    """Invoke the deployed endpoint and check for an error-free response."""
    result = runner.invoke(custom_invoke, [
        "--endpoint-name", custom_endpoint_name,
        "--body", '{"question" :"what is the name of the planet?", "context":"mars"}',
        "--content-type", "application/list-text"
    ])
    assert result.exit_code == 0
    assert "error" not in result.output.lower()


@pytest.mark.dependency(depends=["invoke"])
def test_custom_delete(runner, custom_endpoint_name, test_directory):
    """Clean up the deployed custom endpoint using the CLI delete command."""
    result = runner.invoke(delete, [
        "hyp-custom-endpoint",
        "--name", custom_endpoint_name,
        "--namespace", NAMESPACE
    ])
    assert_command_succeeded(result)


################################ OLD CONFIG FN ########################################
# @pytest.mark.dependency(name="configure", depends=["init"])
# def test_configure_custom(runner, custom_endpoint_name):
#     with patch.object(sys, 'argv', ['hyp', 'configure']):
#         import importlib
#         from sagemaker.hyperpod.cli.commands import init
#         importlib.reload(init)
#         configure = init.configure
#         """Configure custom endpoint with S3 model source and verify config persistence."""
#         result = runner.invoke(
#             configure, [
#                 # "--endpoint-name", custom_endpoint_name,
#                 # "--model-name", "test-pytorch-model",
#                 # "--instance-type", "ml.g5.8xlarge",
#                 # "--image-uri", "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:latest",
#                 # "--container-port", "8080",
#                 # "--model-source-type", "s3",
#                 # "--s3-bucket-name", "sagemaker-test-bucket",
#                 # "--model-location", "models/test-pytorch-model.tar.gz",
#                 # "--s3-region", "us-east-1",
#                 "--namespace", NAMESPACE,
#                 "--version", VERSION,
#                 "--instance-type", "ml.c5.2xlarge",
#                 "--model-name", "test-model-integration-cli-s3",
#                 "--model-source-type", "s3",
#                 "--model-location", "hf-eqa",
#                 "--s3-bucket-name", BUCKET_LOCATION,
#                 "--s3-region", REGION,
#                 "--image-uri", "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:2.3.0-transformers4.48.0-cpu-py311-ubuntu22.04",
#                 "--container-port", "8080",
#                 "--model-volume-mount-name", "model-weights",
#                 "--endpoint-name", custom_endpoint_name,
#                 "--resources-requests", '{"cpu": "3200m", "nvidia.com/gpu": 0, "memory": "12Gi"}',
#                 "--resources-limits", '{"nvidia.com/gpu": 0}',
#                 "--env", '{ "SAGEMAKER_PROGRAM": "inference.py", "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code", "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600", "ENDPOINT_SERVER_TIMEOUT": "3600", "MODEL_CACHE_ROOT": "/opt/ml/model", "SAGEMAKER_ENV": "1", "SAGEMAKER_MODEL_SERVER_WORKERS": "1" }'
#             ], catch_exceptions=False
#         )
#         assert_command_succeeded(result)

#     # Verify configuration was saved correctly
#     expected_config = {
#         # "endpoint_name": custom_endpoint_name,
#         # "model_name": "test-pytorch-model",
#         # "instance_type": "ml.g5.8xlarge",
#         # "image_uri": "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:latest",
#         # "container_port": "8080",
#         # "model_source_type": "s3",
#         # "s3_bucket_name": "sagemaker-test-bucket",
#         # "model_location": "models/test-pytorch-model.tar.gz",
#         # "s3_region": "us-east-1",
#         "namespace": NAMESPACE,
#         "version": VERSION,
#         "instance-type": "ml.c5.2xlarge",
#         "model-name": "test-model-integration-cli-s3",
#         "model-source-type": "s3",
#         "model-location": "hf-eqa",
#         "s3-bucket-name": BUCKET_LOCATION,
#         "s3-region": REGION,
#         "image-uri": "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:2.3.0-transformers4.48.0-cpu-py311-ubuntu22.04",
#         "container-port": "8080",
#         "model-volume-mount-name": "model-weights",
#         "endpoint-name": custom_endpoint_name,
#         "resources-requests": '{"cpu": "3200m", "nvidia.com/gpu": 0, "memory": "12Gi"}',
#         "resources-limits": '{"nvidia.com/gpu": 0}',
#         "env": '{ "SAGEMAKER_PROGRAM": "inference.py", "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code", "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600", "ENDPOINT_SERVER_TIMEOUT": "3600", "MODEL_CACHE_ROOT": "/opt/ml/model", "SAGEMAKER_ENV": "1", "SAGEMAKER_MODEL_SERVER_WORKERS": "1" }'
#     }
#     assert_config_values("./", expected_config)
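
The helpers `assert_command_succeeded`, `assert_init_files_created`, and `assert_config_values` come from `test.integration_tests.init.utils`, which is not shown in this section. A plausible sketch of the config check, assuming `hyp init` writes a flat `config.yaml` into the target directory (both the filename and the flat key layout are assumptions):

from pathlib import Path
import yaml

def assert_config_values(dir_path, expected):
    """Hypothetical re-implementation: compare saved config values to the expected ones."""
    config = yaml.safe_load(Path(dir_path, "config.yaml").read_text())
    for key, value in expected.items():
        assert config.get(key) == value, (
            f"config mismatch for {key!r}: {config.get(key)!r} != {value!r}"
        )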
