Skip to content

Commit 4bc6c69

Browse files
authored
Merge branch 'aws:main' into main-enable-telemetry
2 parents bbe5e25 + 63192b5 commit 4bc6c69

14 files changed

+729
-44
lines changed

README.md

Lines changed: 9 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ The Amazon SageMaker HyperPod command-line interface (HyperPod CLI) is a tool th
55

66
This documentation serves as a reference for the available HyperPod CLI commands. For a comprehensive user guide, see [Orchestrating SageMaker HyperPod clusters with Amazon EKS](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-eks.html) in the *Amazon SageMaker Developer Guide*.
77

8+
Note: The old `hyperpod` CLI V2 has been moved to the `release_v2` branch. Please refer to the [release_v2 branch](https://github.com/aws/sagemaker-hyperpod-cli/tree/release_v2) for usage.
9+
810
## Table of Contents
911
- [Overview](#overview)
1012
- [Prerequisites](#prerequisites)
@@ -21,8 +23,8 @@ This documentation serves as a reference for the available HyperPod CLI commands
2123
- [Training](#training-)
2224
- [Inference](#inference-)
2325
- [SDK](#sdk-)
24-
- [Training](#training-)
25-
- [Inference](#inference)
26+
- [Training](#training-sdk)
27+
- [Inference](#inference-sdk)
2628

2729

2830
## Overview
@@ -72,27 +74,9 @@ SageMaker HyperPod CLI currently supports start training job with:
7274
1. Verify if the installation succeeded by running the following command.
7375
7476
```
75-
hyperpod --help
77+
hyp --help
7678
```
7779
78-
1. If you have a running HyperPod cluster, you can try to run a training job using the sample configuration file provided at ```/examples/basic-job-example-config.yaml```.
79-
- Get your HyperPod clusters to show their capacities.
80-
```
81-
hyperpod get-clusters
82-
```
83-
- Get your HyperPod clusters to show their capacities and quota allocation info for a team.
84-
```
85-
hyperpod get-clusters -n hyperpod-ns-<team-name>
86-
```
87-
- Connect to one HyperPod cluster and specify a namespace you have access to.
88-
```
89-
hyperpod connect-cluster --cluster-name <cluster-name>
90-
```
91-
- Start a job in your cluster. Change the `instance_type` in the yaml file to be same as the one in your HyperPod cluster. Also change the `namespace` you want to submit a job to, the example uses kubeflow namespace. You need to have installed PyTorch in your cluster.
92-
```
93-
hyperpod start-job --config-file ./examples/basic-job-example-config.yaml
94-
```
95-
9680
## Usage
9781
9882
The HyperPod CLI provides the following commands:
@@ -106,8 +90,8 @@ The HyperPod CLI provides the following commands:
10690
- [Training](#training-)
10791
- [Inference](#inference-)
10892
- [SDK](#sdk-)
109-
- [Training](#training-)
110-
- [Inference](#inference)
93+
- [Training](#training-sdk)
94+
- [Inference](#inference-sdk)
11195
11296
11397
### Getting Cluster information
@@ -267,7 +251,7 @@ hyp delete hyp-jumpstart-endpoint --name endpoint-jumpstart
267251
268252
Along with the CLI, we also have SDKs available that can perform the training and inference functionalities that the CLI performs
269253
270-
### Training
254+
### Training SDK
271255
272256
#### Creating a Training Job
273257
@@ -342,7 +326,7 @@ pytorch_job.create()
342326
343327
344328
345-
### Inference
329+
### Inference SDK
346330
347331
#### Creating a JumpstartModel Endpoint
348332
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Subproject commit ce96b513c3033f815d24469f07e2ef0531aaf8d4

src/sagemaker/hyperpod/cli/commands/inference.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,17 @@ def custom_create(namespace, version, custom_endpoint):
6969
required=True,
7070
help="Required. The body of the request to invoke.",
7171
)
72+
@click.option(
73+
"--content-type",
74+
type=click.STRING,
75+
required=False,
76+
default="application/json",
77+
help="Optional. The content type of the request to invoke. Default set to 'application/json'",
78+
)
7279
def custom_invoke(
7380
endpoint_name: str,
7481
body: str,
82+
content_type: Optional[str]
7583
):
7684
"""
7785
Invoke a model endpoint.
@@ -105,7 +113,7 @@ def custom_invoke(
105113
resp = rt.invoke_endpoint(
106114
EndpointName=endpoint_name,
107115
Body=payload.encode("utf-8"),
108-
ContentType="application/json",
116+
ContentType=content_type,
109117
)
110118
result = resp["Body"].read().decode("utf-8")
111119
click.echo(result)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""Integration tests for the HyperPod CLI custom-inference commands (FSx model source).

Exercises the full endpoint lifecycle against a live cluster:
create -> list -> describe -> wait for deployment -> invoke ->
operator logs -> list pods -> delete.  Tests are order-dependent and
share one module-scoped endpoint name, so they must run as a module.
"""
import os
import time

import boto3
import pytest
from click.testing import CliRunner

from sagemaker.hyperpod.cli.commands.inference import (
    custom_create,
    custom_invoke,
    custom_list,
    custom_describe,
    custom_delete,
    custom_get_operator_logs,
    custom_list_pods,
)
from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint

# --------- Test Configuration ---------
NAMESPACE = "integration"
VERSION = "1.0"
REGION = "us-east-2"
TIMEOUT_MINUTES = 15
POLL_INTERVAL_SECONDS = 30

BETA_FSX = "fs-0454e783bbb7356fc"
PROD_FSX = "fs-03c59e2a7e824a22f"
BETA_TLS = "s3://sagemaker-hyperpod-certificate-beta-us-east-2"
PROD_TLS = "s3://sagemaker-hyperpod-certificate-prod-us-east-2"
# STAGE env var selects between beta and prod fixtures; anything other
# than "BETA" falls through to prod.
stage = os.getenv("STAGE", "BETA").upper()
FSX_LOCATION = BETA_FSX if stage == "BETA" else PROD_FSX
TLS_LOCATION = BETA_TLS if stage == "BETA" else PROD_TLS


@pytest.fixture(scope="module")
def runner():
    """Shared Click CLI test runner."""
    return CliRunner()


@pytest.fixture(scope="module")
def custom_endpoint_name():
    """Module-scoped endpoint name shared by every test below."""
    # NOTE(review): the name is static, so concurrent runs in the same
    # namespace will collide -- consider appending a unique suffix.
    return "custom-cli-integration-fsx"


@pytest.fixture(scope="module")
def sagemaker_client():
    """Boto3 SageMaker client bound to the test region."""
    return boto3.client("sagemaker", region_name=REGION)


# --------- Custom Endpoint Tests ---------

def test_custom_create(runner, custom_endpoint_name):
    """Create a custom endpoint backed by FSx-hosted model weights."""
    result = runner.invoke(custom_create, [
        "--namespace", NAMESPACE,
        "--version", VERSION,
        "--instance-type", "ml.c5.2xlarge",
        "--model-name", "test-model-integration-cli-fsx",
        "--model-source-type", "fsx",
        "--model-location", "hf-eqa",
        "--fsx-file-system-id", FSX_LOCATION,
        "--s3-region", REGION,
        "--image-uri", "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:2.3.0-transformers4.48.0-cpu-py311-ubuntu22.04",
        "--container-port", "8080",
        "--model-volume-mount-name", "model-weights",
        "--endpoint-name", custom_endpoint_name,
        "--resources-requests", '{"cpu": "3200m", "nvidia.com/gpu": 0, "memory": "12Gi"}',
        "--resources-limits", '{"nvidia.com/gpu": 0}',
        "--tls-certificate-output-s3-uri", TLS_LOCATION,
        "--env", '{ "SAGEMAKER_PROGRAM": "inference.py", "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code", "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600", "ENDPOINT_SERVER_TIMEOUT": "3600", "MODEL_CACHE_ROOT": "/opt/ml/model", "SAGEMAKER_ENV": "1", "SAGEMAKER_MODEL_SERVER_WORKERS": "1" }'
    ])
    assert result.exit_code == 0, result.output


def test_custom_list(runner, custom_endpoint_name):
    """The newly created endpoint appears in the namespace listing."""
    result = runner.invoke(custom_list, ["--namespace", NAMESPACE])
    assert result.exit_code == 0
    assert custom_endpoint_name in result.output


def test_custom_describe(runner, custom_endpoint_name):
    """Describe the endpoint with full output."""
    result = runner.invoke(custom_describe, [
        "--name", custom_endpoint_name,
        "--namespace", NAMESPACE,
        "--full"
    ])
    assert result.exit_code == 0
    assert custom_endpoint_name in result.output


def test_wait_until_inservice(custom_endpoint_name):
    """Poll the SDK until the custom endpoint reaches CreationCompleted.

    Fails fast when the deployment reports DeploymentFailed and times out
    after TIMEOUT_MINUTES.
    """
    print(f"[INFO] Waiting for endpoint '{custom_endpoint_name}' to reach CreationCompleted...")
    deadline = time.time() + (TIMEOUT_MINUTES * 60)
    poll_count = 0

    while time.time() < deadline:
        poll_count += 1
        print(f"[DEBUG] Poll #{poll_count}: Checking endpoint status...")

        try:
            ep = HPEndpoint.get(name=custom_endpoint_name, namespace=NAMESPACE)
            state = ep.status.endpoints.sagemaker.state
            print(f"[DEBUG] Current state: {state}")
            if state == "CreationCompleted":
                print("[INFO] Endpoint is in CreationCompleted state.")
                return

            deployment_state = ep.status.deploymentStatus.deploymentObjectOverallState
            if deployment_state == "DeploymentFailed":
                pytest.fail("Endpoint deployment failed.")

        except Exception as e:
            # Transient API/attribute errors while the custom resource is
            # still materializing are tolerated; keep polling until the
            # deadline.  (pytest.fail raises a BaseException subclass, so
            # it is not swallowed here.)
            print(f"[ERROR] Exception during polling: {e}")

        time.sleep(POLL_INTERVAL_SECONDS)

    pytest.fail("[ERROR] Timed out waiting for endpoint to reach CreationCompleted")


def test_custom_invoke(runner, custom_endpoint_name):
    """Invoke the deployed endpoint with an explicit content type."""
    result = runner.invoke(custom_invoke, [
        "--endpoint-name", custom_endpoint_name,
        "--body", '{"question" :"what is the name of the planet?", "context":"mars"}',
        "--content-type", "application/list-text"
    ])
    assert result.exit_code == 0
    assert "error" not in result.output.lower()


def test_custom_get_operator_logs(runner):
    """Operator logs for the last hour are retrievable."""
    result = runner.invoke(custom_get_operator_logs, ["--since-hours", "1"])
    assert result.exit_code == 0


def test_custom_list_pods(runner):
    """Pods in the test namespace are listable."""
    result = runner.invoke(custom_list_pods, ["--namespace", NAMESPACE])
    assert result.exit_code == 0


def test_custom_delete(runner, custom_endpoint_name):
    """Tear down the endpoint created by this module."""
    result = runner.invoke(custom_delete, [
        "--name", custom_endpoint_name,
        "--namespace", NAMESPACE
    ])
    assert result.exit_code == 0
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""Integration tests for the HyperPod CLI custom-inference commands (S3 model source).

Exercises the full endpoint lifecycle against a live cluster:
create -> list -> describe -> wait for deployment -> invoke ->
operator logs -> list pods -> delete.  Tests are order-dependent and
share one module-scoped endpoint name, so they must run as a module.
"""
import os
import time

import boto3
import pytest
from click.testing import CliRunner

from sagemaker.hyperpod.cli.commands.inference import (
    custom_create,
    custom_invoke,
    custom_list,
    custom_describe,
    custom_delete,
    custom_get_operator_logs,
    custom_list_pods,
)
from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint

# --------- Test Configuration ---------
NAMESPACE = "integration"
VERSION = "1.0"
REGION = "us-east-2"
TIMEOUT_MINUTES = 15
POLL_INTERVAL_SECONDS = 30

BETA_BUCKET = "sagemaker-hyperpod-beta-integ-test-model-bucket-n"
PROD_BUCKET = "sagemaker-hyperpod-prod-integ-test-model-bucket"
BETA_TLS = "s3://sagemaker-hyperpod-certificate-beta-us-east-2"
PROD_TLS = "s3://sagemaker-hyperpod-certificate-prod-us-east-2"
# STAGE env var selects between beta and prod fixtures; anything other
# than "BETA" falls through to prod.
stage = os.getenv("STAGE", "BETA").upper()
BUCKET_LOCATION = BETA_BUCKET if stage == "BETA" else PROD_BUCKET
TLS_LOCATION = BETA_TLS if stage == "BETA" else PROD_TLS


@pytest.fixture(scope="module")
def runner():
    """Shared Click CLI test runner."""
    return CliRunner()


@pytest.fixture(scope="module")
def custom_endpoint_name():
    """Module-scoped endpoint name shared by every test below."""
    # NOTE(review): the name is static, so concurrent runs in the same
    # namespace will collide -- consider appending a unique suffix.
    return "custom-cli-integration-s3"


@pytest.fixture(scope="module")
def sagemaker_client():
    """Boto3 SageMaker client bound to the test region."""
    return boto3.client("sagemaker", region_name=REGION)


# --------- Custom Endpoint Tests ---------

def test_custom_create(runner, custom_endpoint_name):
    """Create a custom endpoint backed by S3-hosted model weights."""
    result = runner.invoke(custom_create, [
        "--namespace", NAMESPACE,
        "--version", VERSION,
        "--instance-type", "ml.c5.2xlarge",
        "--model-name", "test-model-integration-cli-s3",
        "--model-source-type", "s3",
        "--model-location", "hf-eqa",
        "--s3-bucket-name", BUCKET_LOCATION,
        "--s3-region", REGION,
        "--image-uri", "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:2.3.0-transformers4.48.0-cpu-py311-ubuntu22.04",
        "--container-port", "8080",
        "--model-volume-mount-name", "model-weights",
        "--endpoint-name", custom_endpoint_name,
        "--resources-requests", '{"cpu": "3200m", "nvidia.com/gpu": 0, "memory": "12Gi"}',
        "--resources-limits", '{"nvidia.com/gpu": 0}',
        "--tls-certificate-output-s3-uri", TLS_LOCATION,
        "--env", '{ "SAGEMAKER_PROGRAM": "inference.py", "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code", "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600", "ENDPOINT_SERVER_TIMEOUT": "3600", "MODEL_CACHE_ROOT": "/opt/ml/model", "SAGEMAKER_ENV": "1", "SAGEMAKER_MODEL_SERVER_WORKERS": "1" }'
    ])
    assert result.exit_code == 0, result.output


def test_custom_list(runner, custom_endpoint_name):
    """The newly created endpoint appears in the namespace listing."""
    result = runner.invoke(custom_list, ["--namespace", NAMESPACE])
    assert result.exit_code == 0
    assert custom_endpoint_name in result.output


def test_custom_describe(runner, custom_endpoint_name):
    """Describe the endpoint with full output."""
    result = runner.invoke(custom_describe, [
        "--name", custom_endpoint_name,
        "--namespace", NAMESPACE,
        "--full"
    ])
    assert result.exit_code == 0
    assert custom_endpoint_name in result.output


def test_wait_until_inservice(custom_endpoint_name):
    """Poll the SDK until the custom endpoint reaches CreationCompleted.

    Fails fast when the deployment reports DeploymentFailed and times out
    after TIMEOUT_MINUTES.
    """
    print(f"[INFO] Waiting for endpoint '{custom_endpoint_name}' to reach CreationCompleted...")
    deadline = time.time() + (TIMEOUT_MINUTES * 60)
    poll_count = 0

    while time.time() < deadline:
        poll_count += 1
        print(f"[DEBUG] Poll #{poll_count}: Checking endpoint status...")

        try:
            ep = HPEndpoint.get(name=custom_endpoint_name, namespace=NAMESPACE)
            state = ep.status.endpoints.sagemaker.state
            print(f"[DEBUG] Current state: {state}")
            if state == "CreationCompleted":
                print("[INFO] Endpoint is in CreationCompleted state.")
                return

            deployment_state = ep.status.deploymentStatus.deploymentObjectOverallState
            if deployment_state == "DeploymentFailed":
                pytest.fail("Endpoint deployment failed.")

        except Exception as e:
            # Transient API/attribute errors while the custom resource is
            # still materializing are tolerated; keep polling until the
            # deadline.  (pytest.fail raises a BaseException subclass, so
            # it is not swallowed here.)
            print(f"[ERROR] Exception during polling: {e}")

        time.sleep(POLL_INTERVAL_SECONDS)

    pytest.fail("[ERROR] Timed out waiting for endpoint to reach CreationCompleted")


def test_custom_invoke(runner, custom_endpoint_name):
    """Invoke the deployed endpoint with an explicit content type."""
    result = runner.invoke(custom_invoke, [
        "--endpoint-name", custom_endpoint_name,
        "--body", '{"question" :"what is the name of the planet?", "context":"mars"}',
        "--content-type", "application/list-text"
    ])
    assert result.exit_code == 0
    assert "error" not in result.output.lower()


def test_custom_get_operator_logs(runner):
    """Operator logs for the last hour are retrievable."""
    result = runner.invoke(custom_get_operator_logs, ["--since-hours", "1"])
    assert result.exit_code == 0


def test_custom_list_pods(runner):
    """Pods in the test namespace are listable."""
    result = runner.invoke(custom_list_pods, ["--namespace", NAMESPACE])
    assert result.exit_code == 0


def test_custom_delete(runner, custom_endpoint_name):
    """Tear down the endpoint created by this module."""
    result = runner.invoke(custom_delete, [
        "--name", custom_endpoint_name,
        "--namespace", NAMESPACE
    ])
    assert result.exit_code == 0

test/integration_tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def test_job_name():
2121
@pytest.fixture(scope="class")
2222
def image_uri():
2323
"""Return a standard PyTorch image URI for testing."""
24-
return "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.2.0-cpu-py310-ubuntu20.04-sagemaker"
24+
return "448049793756.dkr.ecr.us-west-2.amazonaws.com/ptjob:mnist"
2525

2626
@pytest.fixture(scope="class")
2727
def cluster_name():

test/integration_tests/cli/test_cli_custom_inference.py renamed to test/integration_tests/inference/cli/test_cli_custom_inference.py

File renamed without changes.

test/integration_tests/cli/test_cli_jumpstart_inference.py renamed to test/integration_tests/inference/cli/test_cli_jumpstart_inference.py

File renamed without changes.

test/integration_tests/sdk/test_sdk_custom_inferece.py renamed to test/integration_tests/inference/sdk/test_sdk_custom_inferece.py

File renamed without changes.

test/integration_tests/sdk/test_sdk_jumpstart_inference.py renamed to test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py

File renamed without changes.

0 commit comments

Comments
 (0)