
Commit 5ccd69d

Merge branch 'aws:main' into jinja-template
2 parents 3518171 + c5edf2d commit 5ccd69d

File tree

12 files changed, +1064 -40 lines

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -32,4 +32,7 @@ doc/_build/
 /result/
 /results/
 
-.idea/
+.idea/
+
+.venv*
+venv

README.md

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ The SageMaker HyperPod CLI is a tool that helps create training jobs and inferen
 ### Prerequisites for Inference
 
 - HyperPod CLI supports creating Inference Endpoints through jumpstart and through custom Endpoint config
-- You can follow [inference operator doc](https://github.com/aws/sagemaker-hyperpod-cli/tree/master/helm_chart/HyperPodHelmChart/charts/inference-operator) to install it.
+- You can follow [inference operator doc](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-model-deployment-setup.html) to install it.
 
 ## Platform Support
 

helm_chart/HyperPodHelmChart/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ dependencies:
     repository: https://nvidia.github.io/k8s-device-plugin
     condition: nvidia-device-plugin.devicePlugin.enabled
   - name: aws-efa-k8s-device-plugin
-    version: "0.5.3"
+    version: "0.5.10"
    repository: https://aws.github.io/eks-charts/
    condition: aws-efa-k8s-device-plugin.devicePlugin.enabled
  - name: neuron-device-plugin

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json

Lines changed: 3 additions & 3 deletions
@@ -276,9 +276,9 @@
       "title": "Queue Name"
     },
     "accelerators": {
-      "type": "integer",
-      "minimum": 0,
-      "description": "Number of accelerators (GPUs/TPUs)"
+      "type": "integer",
+      "minimum": 0,
+      "description": "Number of accelerators (GPUs/TPUs)"
     },
     "vcpu": {
       "type": "float",

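As a quick check of the `accelerators` fragment above, the property can be exercised with the `jsonschema` package. This is a minimal sketch against a reduced schema object containing only that property; the wrapper object is an assumption, not the full template schema:

```python
# Minimal sketch: validate the "accelerators" fragment shown above with jsonschema.
# The schema dict below is reduced to that single property and is an assumption,
# not the complete hyperpod_pytorch_job_template schema.
from jsonschema import validate, ValidationError

accelerators_fragment = {
    "type": "object",
    "properties": {
        "accelerators": {
            "type": "integer",
            "minimum": 0,
            "description": "Number of accelerators (GPUs/TPUs)",
        }
    },
}

validate({"accelerators": 2}, accelerators_fragment)       # passes

try:
    validate({"accelerators": -1}, accelerators_fragment)  # violates "minimum": 0
except ValidationError as err:
    print(err.message)
```
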
src/sagemaker/hyperpod/cli/commands/cluster.py

Lines changed: 71 additions & 0 deletions
@@ -77,6 +77,8 @@
     _hyperpod_telemetry_emitter,
 )
 from sagemaker.hyperpod.common.telemetry.constants import Feature
+from sagemaker.hyperpod.cli.utils import convert_datetimes
+from sagemaker_core.main.resources import Cluster
 
 RATE_LIMIT = 4
 RATE_LIMIT_PERIOD = 1 # 1 second

@@ -684,6 +686,75 @@ def get_cluster_context(
         sys.exit(1)
 
 
+@click.command("cluster")
+@click.argument("cluster-name", required=True)
+@click.option("--region", help="AWS region")
+@click.option("--debug", is_flag=True, help="Enable debug logging")
+@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "describe_cluster_cli")
+def describe_cluster(cluster_name: str, debug: bool, region: str) -> None:
+    """Describe the status of a HyperPod cluster.
+    Shows detailed information about a SageMaker HyperPod cluster including its current status,
+    instance groups, orchestrator details, and configuration.
+    Usage Examples
+    # Describe a cluster
+    hyp describe cluster my-cluster-name
+    # Describe with specific region
+    hyp describe cluster my-cluster-name --region us-west-2
+    """
+    if debug:
+        set_logging_level(logger, logging.DEBUG)
+
+    try:
+        botocore_config = botocore.config.Config(
+            user_agent_extra=get_user_agent_extra_suffix()
+        )
+        session = boto3.Session(region_name=region) if region else boto3.Session()
+        sm_client = get_sagemaker_client(session, botocore_config)
+
+        # Get cluster details using SageMaker client
+        cluster_dict = sm_client.describe_cluster(ClusterName=cluster_name)
+
+        # Convert datetimes for display
+        cluster_dict = convert_datetimes(cluster_dict)
+
+        logger.debug(f"Describing cluster name: {cluster_name}\ninfo: {json.dumps(cluster_dict, indent=2, default=str)}")
+
+        click.echo(f"📋 Cluster Details for: {cluster_name}")
+
+        # Highlight cluster status
+        cluster_status = cluster_dict.get('ClusterStatus', 'UNKNOWN')
+        click.echo(f"Status: ", nl=False)
+        click.secho(cluster_status)
+
+        table_data = []
+        for key, value in cluster_dict.items():
+            if isinstance(value, (dict, list)):
+                formatted_value = json.dumps(value, indent=2, default=str)
+            else:
+                formatted_value = str(value)
+            table_data.append([key, formatted_value])
+
+        # Only display table if we have data
+        if table_data:
+            click.echo(tabulate(table_data, tablefmt="presto"))
+        else:
+            click.echo("No cluster data available")
+
+    except Exception as e:
+        logger.error(f"Failed to describe cluster: {e}")
+        if debug:
+            logger.exception("Detailed error information:")
+
+        if "does not exist" in str(e) or "not found" in str(e).lower():
+            click.echo(f"❌ Cluster '{cluster_name}' not found")
+        elif "AccessDenied" in str(e):
+            click.echo("❌ Access denied. Check AWS permissions")
+        else:
+            click.echo(f"❌ Error describing cluster: {e}")
+
+        sys.exit(1)
+
+
 @click.command()
 @click.option("--grafana", is_flag=True, help="Returns Grafana Dashboard URL")
 @click.option("--prometheus", is_flag=True, help="Returns Prometheus Workspace URL")
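
The new `hyp describe cluster` command wraps the SageMaker `DescribeCluster` API. A minimal sketch of the equivalent direct boto3 call, with a placeholder cluster name and region, and without the CLI's `convert_datetimes` and table formatting:

```python
# Minimal sketch: the boto3 call that `hyp describe cluster` wraps.
# Cluster name and region are placeholders.
import json

import boto3

sm_client = boto3.client("sagemaker", region_name="us-west-2")
cluster = sm_client.describe_cluster(ClusterName="my-cluster-name")

print(cluster.get("ClusterStatus", "UNKNOWN"))
print(json.dumps(cluster, indent=2, default=str))
```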

src/sagemaker/hyperpod/cli/hyp_cli.py

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,7 @@
 from importlib.metadata import version, PackageNotFoundError
 
 from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \
-    get_monitoring
+    get_monitoring, describe_cluster
 from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack, describe_cluster_stack, \
     list_cluster_stacks, update_cluster, delete_cluster_stack
 from sagemaker.hyperpod.cli.commands.training import (

@@ -183,6 +183,7 @@ def exec():
     describe.add_command(js_describe)
     describe.add_command(custom_describe)
     describe.add_command(describe_cluster_stack)
+    describe.add_command(describe_cluster)
 
     update.add_command(update_cluster)
 
src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 62 additions & 13 deletions
@@ -1,6 +1,7 @@
 from pydantic import ConfigDict, Field
 
-from sagemaker.hyperpod.cli.constants.command_constants import INSTANCE_TYPE_LABEL
+from sagemaker.hyperpod.cli.constants.command_constants import INSTANCE_TYPE_LABEL, NEURON_RESOURCE_LIMIT_KEY, \
+    NVIDIA_GPU_RESOURCE_LIMIT_KEY
 from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import (
     _HyperPodPytorchJob, HyperPodPytorchJobStatus
 )

@@ -20,15 +21,26 @@
 import yaml
 import logging
 
-from sagemaker.hyperpod.training.quota_allocation_util import _is_valid, _get_resources_from_compute_quotas, _get_resources_from_instance, _get_limits
+from sagemaker.hyperpod.training.quota_allocation_util import (
+    _is_valid,
+    _get_resources_from_compute_quotas,
+    _get_resources_from_instance,
+    _get_limits,
+    _resolve_default_memory_values,
+    _set_default_accelerators_val,
+    _validate_accelerators_inputs,
+    _resolve_default_cpu_values,
+    _trim_resource_requests
+)
 
 TRAINING_GROUP = "sagemaker.amazonaws.com"
 API_VERSION = "v1"
 PLURAL = "hyperpodpytorchjobs"
 KIND = "HyperPodPyTorchJob"
 TRAINING_OPERATOR_NAMESPACE = "aws-hyperpod"
 TRAINING_OPERATOR_LABEL = "hp-training-control-plane"
-
+NVIDIA_RESOURCE_KEY = NVIDIA_GPU_RESOURCE_LIMIT_KEY
+NEURON_RESOURCE_KEY = NEURON_RESOURCE_LIMIT_KEY
 
 class HyperPodPytorchJob(_HyperPodPytorchJob):
     """HyperPod PyTorch job for distributed training on Amazon SageMaker HyperPod clusters.

@@ -94,27 +106,64 @@ def _process_replica_resources(cls, data):
             requests = resources.get('requests', {})
             limits = resources.get('limits', {})
 
+            accelerators = None
+            if requests.get('accelerators'):
+                accelerators = int(requests.get('accelerators'))
+            elif requests.get(NVIDIA_RESOURCE_KEY):
+                accelerators = int(requests.get(NVIDIA_RESOURCE_KEY))
+            elif requests.get(NEURON_RESOURCE_KEY):
+                accelerators = int(requests.get(NEURON_RESOURCE_KEY))
+
             # Extract resource values
-            vcpu = float(requests.get('vcpu')) if requests.get('vcpu') else None
+            vcpu = None
+            if requests.get('cpu'):
+                vcpu = float(requests.get('cpu'))
+            elif requests.get('vcpu'):
+                vcpu = float(requests.get('vcpu'))
+
+            vcpu_limit = None
+            if limits.get('cpu'):
+                vcpu_limit = float(limits.get('cpu'))
+            elif limits.get('vcpu'):
+                vcpu_limit = float(limits.get('vcpu'))
+
             memory = cls._extract_numeric_value(requests.get('memory'))
-            accelerators = int(requests.get('accelerators')) if requests.get('accelerators') else None
             memory_limit = cls._extract_numeric_value(limits.get('memory'))
-            vcpu_limit = float(limits.get('vcpu')) if limits.get('vcpu') else None
-            accelerators_limit = int(limits.get('accelerators')) if limits.get('accelerators') else None
+
+            accelerators_limit = None
+            if limits.get('accelerators'):
+                accelerators_limit = int(limits.get('accelerators'))
+            elif limits.get(NVIDIA_RESOURCE_KEY):
+                accelerators_limit = int(limits.get(NVIDIA_RESOURCE_KEY))
+            elif limits.get(NEURON_RESOURCE_KEY):
+                accelerators_limit = int(limits.get(NEURON_RESOURCE_KEY))
+
+            acc_req, acc_lim = _set_default_accelerators_val(instance_type, accelerators, accelerators_limit)
+            _validate_accelerators_inputs(instance_type, acc_req, acc_lim)
 
             # Validate configuration
-            valid, error = _is_valid(vcpu, memory, accelerators, node_count, instance_type)
+            valid, error = _is_valid(vcpu, memory, acc_req, node_count, instance_type)
             if not valid:
                 raise ValueError(error)
 
             # Calculate resource values
-            requests_value = (_get_resources_from_compute_quotas(instance_type, vcpu, memory, accelerators)
-                              or _get_resources_from_instance(instance_type, node_count=1))
-            limits_value = _get_limits(instance_type, vcpu_limit, memory_limit, accelerators_limit)
+            requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req)
+            if requests_values is None:
+                requests_values = _get_resources_from_instance(instance_type, node_count=1)
+            _trim_resource_requests(instance_type, requests_values)
+            if NVIDIA_RESOURCE_KEY in requests_values:
+                acc_lim = requests_values[NVIDIA_RESOURCE_KEY]
+            elif NEURON_RESOURCE_KEY in requests_values:
+                acc_lim = requests_values[NEURON_RESOURCE_KEY]
+
+            limits_values = _get_limits(instance_type, vcpu_limit, memory_limit, acc_lim)
+            _resolve_default_memory_values(instance_type, requests_values, limits_values)
+            _resolve_default_cpu_values(instance_type, requests_values)
 
             # Update data with calculated values
-            data['template']['spec']['containers'][0]['resources']['requests'] = requests_value
-            data['template']['spec']['containers'][0]['resources']['limits'] = limits_value
+            data['template']['spec']['containers'][0]['resources']['requests'] = requests_values
+            data['template']['spec']['containers'][0]['resources']['limits'] = limits_values
+
             return data
         except KeyError as e:
             raise ValueError(f"Missing required configuration key: {str(e)}")
0 commit comments
