
Commit 5ccd69d

Merge branch 'aws:main' into jinja-template
2 parents 3518171 + c5edf2d commit 5ccd69d

File tree

12 files changed, +1064 -40 lines

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -32,4 +32,7 @@ doc/_build/
 /result/
 /results/
 
-.idea/
+.idea/
+
+.venv*
+venv

README.md

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ The SageMaker HyperPod CLI is a tool that helps create training jobs and inferen
 ### Prerequisites for Inference
 
 - HyperPod CLI supports creating Inference Endpoints through jumpstart and through custom Endpoint config
-- You can follow [inference operator doc](https://github.com/aws/sagemaker-hyperpod-cli/tree/master/helm_chart/HyperPodHelmChart/charts/inference-operator) to install it.
+- You can follow [inference operator doc](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-model-deployment-setup.html) to install it.
 
 ## Platform Support
 

helm_chart/HyperPodHelmChart/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ dependencies:
     repository: https://nvidia.github.io/k8s-device-plugin
     condition: nvidia-device-plugin.devicePlugin.enabled
   - name: aws-efa-k8s-device-plugin
-    version: "0.5.3"
+    version: "0.5.10"
    repository: https://aws.github.io/eks-charts/
    condition: aws-efa-k8s-device-plugin.devicePlugin.enabled
  - name: neuron-device-plugin

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json

Lines changed: 3 additions & 3 deletions
@@ -276,9 +276,9 @@
       "title": "Queue Name"
     },
     "accelerators": {
-      "type": "integer",
-      "minimum": 0,
-      "description": "Number of accelerators (GPUs/TPUs)"
+      "type": "integer",
+      "minimum": 0,
+      "description": "Number of accelerators (GPUs/TPUs)"
     },
     "vcpu": {
       "type": "float",

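As a quick check of the `accelerators` fragment above, the property can be exercised with the `jsonschema` package. This is a minimal sketch against a reduced schema object containing only that property; the wrapper object is an assumption, not the full template schema:

```python
# Minimal sketch: validate the "accelerators" fragment shown above with jsonschema.
# The schema dict below is reduced to that single property and is an assumption,
# not the complete hyperpod_pytorch_job_template schema.
from jsonschema import validate, ValidationError

accelerators_fragment = {
    "type": "object",
    "properties": {
        "accelerators": {
            "type": "integer",
            "minimum": 0,
            "description": "Number of accelerators (GPUs/TPUs)",
        }
    },
}

validate({"accelerators": 2}, accelerators_fragment)       # passes

try:
    validate({"accelerators": -1}, accelerators_fragment)  # violates "minimum": 0
except ValidationError as err:
    print(err.message)
```
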
src/sagemaker/hyperpod/cli/commands/cluster.py

Lines changed: 71 additions & 0 deletions
@@ -77,6 +77,8 @@
     _hyperpod_telemetry_emitter,
 )
 from sagemaker.hyperpod.common.telemetry.constants import Feature
+from sagemaker.hyperpod.cli.utils import convert_datetimes
+from sagemaker_core.main.resources import Cluster
 
 RATE_LIMIT = 4
 RATE_LIMIT_PERIOD = 1 # 1 second

@@ -684,6 +686,75 @@ def get_cluster_context(
         sys.exit(1)
 
 
+@click.command("cluster")
+@click.argument("cluster-name", required=True)
+@click.option("--region", help="AWS region")
+@click.option("--debug", is_flag=True, help="Enable debug logging")
+@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "describe_cluster_cli")
+def describe_cluster(cluster_name: str, debug: bool, region: str) -> None:
+    """Describe the status of a HyperPod cluster.
+    Shows detailed information about a SageMaker HyperPod cluster including its current status,
+    instance groups, orchestrator details, and configuration.
+    Usage Examples
+    # Describe a cluster
+    hyp describe cluster my-cluster-name
+    # Describe with specific region
+    hyp describe cluster my-cluster-name --region us-west-2
+    """
+    if debug:
+        set_logging_level(logger, logging.DEBUG)
+
+    try:
+        botocore_config = botocore.config.Config(
+            user_agent_extra=get_user_agent_extra_suffix()
+        )
+        session = boto3.Session(region_name=region) if region else boto3.Session()
+        sm_client = get_sagemaker_client(session, botocore_config)
+
+        # Get cluster details using SageMaker client
+        cluster_dict = sm_client.describe_cluster(ClusterName=cluster_name)
+
+        # Convert datetimes for display
+        cluster_dict = convert_datetimes(cluster_dict)
+
+        logger.debug(f"Describing cluster name: {cluster_name}\ninfo: {json.dumps(cluster_dict, indent=2, default=str)}")
+
+        click.echo(f"📋 Cluster Details for: {cluster_name}")
+
+        # Highlight cluster status
+        cluster_status = cluster_dict.get('ClusterStatus', 'UNKNOWN')
+        click.echo(f"Status: ", nl=False)
+        click.secho(cluster_status)
+
+        table_data = []
+        for key, value in cluster_dict.items():
+            if isinstance(value, (dict, list)):
+                formatted_value = json.dumps(value, indent=2, default=str)
+            else:
+                formatted_value = str(value)
+            table_data.append([key, formatted_value])
+
+        # Only display table if we have data
+        if table_data:
+            click.echo(tabulate(table_data, tablefmt="presto"))
+        else:
+            click.echo("No cluster data available")
+
+    except Exception as e:
+        logger.error(f"Failed to describe cluster: {e}")
+        if debug:
+            logger.exception("Detailed error information:")
+
+        if "does not exist" in str(e) or "not found" in str(e).lower():
+            click.echo(f"❌ Cluster '{cluster_name}' not found")
+        elif "AccessDenied" in str(e):
+            click.echo("❌ Access denied. Check AWS permissions")
+        else:
+            click.echo(f"❌ Error describing cluster: {e}")
+
+        sys.exit(1)
+
+
 @click.command()
 @click.option("--grafana", is_flag=True, help="Returns Grafana Dashboard URL")
 @click.option("--prometheus", is_flag=True, help="Returns Prometheus Workspace URL")
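
The new `hyp describe cluster` command wraps the SageMaker `DescribeCluster` API. A minimal sketch of the equivalent direct boto3 call, with a placeholder cluster name and region, and without the CLI's `convert_datetimes` and table formatting:

```python
# Minimal sketch: the boto3 call that `hyp describe cluster` wraps.
# Cluster name and region are placeholders.
import json

import boto3

sm_client = boto3.client("sagemaker", region_name="us-west-2")
cluster = sm_client.describe_cluster(ClusterName="my-cluster-name")

print(cluster.get("ClusterStatus", "UNKNOWN"))
print(json.dumps(cluster, indent=2, default=str))
```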

src/sagemaker/hyperpod/cli/hyp_cli.py

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,7 @@
 from importlib.metadata import version, PackageNotFoundError
 
 from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \
-    get_monitoring
+    get_monitoring, describe_cluster
 from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack, describe_cluster_stack, \
     list_cluster_stacks, update_cluster, delete_cluster_stack
 from sagemaker.hyperpod.cli.commands.training import (

@@ -183,6 +183,7 @@ def exec():
     describe.add_command(js_describe)
     describe.add_command(custom_describe)
     describe.add_command(describe_cluster_stack)
+    describe.add_command(describe_cluster)
 
     update.add_command(update_cluster)
 
src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 62 additions & 13 deletions
@@ -1,6 +1,7 @@
 from pydantic import ConfigDict, Field
 
-from sagemaker.hyperpod.cli.constants.command_constants import INSTANCE_TYPE_LABEL
+from sagemaker.hyperpod.cli.constants.command_constants import INSTANCE_TYPE_LABEL, NEURON_RESOURCE_LIMIT_KEY, \
+    NVIDIA_GPU_RESOURCE_LIMIT_KEY
 from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import (
     _HyperPodPytorchJob, HyperPodPytorchJobStatus
 )

@@ -20,15 +21,26 @@
 import yaml
 import logging
 
-from sagemaker.hyperpod.training.quota_allocation_util import _is_valid, _get_resources_from_compute_quotas, _get_resources_from_instance, _get_limits
+from sagemaker.hyperpod.training.quota_allocation_util import (
+    _is_valid,
+    _get_resources_from_compute_quotas,
+    _get_resources_from_instance,
+    _get_limits,
+    _resolve_default_memory_values,
+    _set_default_accelerators_val,
+    _validate_accelerators_inputs,
+    _resolve_default_cpu_values,
+    _trim_resource_requests
+)
 
 TRAINING_GROUP = "sagemaker.amazonaws.com"
 API_VERSION = "v1"
 PLURAL = "hyperpodpytorchjobs"
 KIND = "HyperPodPyTorchJob"
 TRAINING_OPERATOR_NAMESPACE = "aws-hyperpod"
 TRAINING_OPERATOR_LABEL = "hp-training-control-plane"
-
+NVIDIA_RESOURCE_KEY = NVIDIA_GPU_RESOURCE_LIMIT_KEY
+NEURON_RESOURCE_KEY = NEURON_RESOURCE_LIMIT_KEY
 
 class HyperPodPytorchJob(_HyperPodPytorchJob):
     """HyperPod PyTorch job for distributed training on Amazon SageMaker HyperPod clusters.

@@ -94,27 +106,64 @@ def _process_replica_resources(cls, data):
             requests = resources.get('requests', {})
             limits = resources.get('limits', {})
 
+            accelerators = None
+            if requests.get('accelerators'):
+                accelerators = int(requests.get('accelerators'))
+            elif requests.get(NVIDIA_RESOURCE_KEY):
+                accelerators = int(requests.get(NVIDIA_RESOURCE_KEY))
+            elif requests.get(NEURON_RESOURCE_KEY):
+                accelerators = int(requests.get(NEURON_RESOURCE_KEY))
+
             # Extract resource values
-            vcpu = float(requests.get('vcpu')) if requests.get('vcpu') else None
+            vcpu = None
+            if requests.get('cpu'):
+                vcpu = float(requests.get('cpu'))
+            elif requests.get('vcpu'):
+                vcpu = float(requests.get('vcpu'))
+
+            vcpu_limit = None
+            if limits.get('cpu'):
+                vcpu_limit = float(limits.get('cpu'))
+            elif limits.get('vcpu'):
+                vcpu_limit = float(limits.get('vcpu'))
+
             memory = cls._extract_numeric_value(requests.get('memory'))
-            accelerators = int(requests.get('accelerators')) if requests.get('accelerators') else None
             memory_limit = cls._extract_numeric_value(limits.get('memory'))
-            vcpu_limit = float(limits.get('vcpu')) if limits.get('vcpu') else None
-            accelerators_limit = int(limits.get('accelerators')) if limits.get('accelerators') else None
+
+            accelerators_limit = None
+            if limits.get('accelerators'):
+                accelerators_limit = int(limits.get('accelerators'))
+            elif limits.get(NVIDIA_RESOURCE_KEY):
+                accelerators_limit = int(limits.get(NVIDIA_RESOURCE_KEY))
+            elif limits.get(NEURON_RESOURCE_KEY):
+                accelerators_limit = int(limits.get(NEURON_RESOURCE_KEY))
+
+            acc_req, acc_lim = _set_default_accelerators_val(instance_type, accelerators, accelerators_limit)
+            _validate_accelerators_inputs(instance_type, acc_req, acc_lim)
 
             # Validate configuration
-            valid, error = _is_valid(vcpu, memory, accelerators, node_count, instance_type)
+            valid, error = _is_valid(vcpu, memory, acc_req, node_count, instance_type)
             if not valid:
                 raise ValueError(error)
 
             # Calculate resource values
-            requests_value = (_get_resources_from_compute_quotas(instance_type, vcpu, memory, accelerators)
-                              or _get_resources_from_instance(instance_type, node_count=1))
-            limits_value = _get_limits(instance_type, vcpu_limit, memory_limit, accelerators_limit)
+            requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req)
+            if requests_values is None:
+                requests_values = _get_resources_from_instance(instance_type, node_count=1)
+            _trim_resource_requests(instance_type, requests_values)
+            if NVIDIA_RESOURCE_KEY in requests_values:
+                acc_lim = requests_values[NVIDIA_RESOURCE_KEY]
+            elif NEURON_RESOURCE_KEY in requests_values:
+                acc_lim = requests_values[NEURON_RESOURCE_KEY]
+
+            limits_values = _get_limits(instance_type, vcpu_limit, memory_limit, acc_lim)
+            _resolve_default_memory_values(instance_type, requests_values, limits_values)
+            _resolve_default_cpu_values(instance_type, requests_values)
 
             # Update data with calculated values
-            data['template']['spec']['containers'][0]['resources']['requests'] = requests_value
-            data['template']['spec']['containers'][0]['resources']['limits'] = limits_value
+            data['template']['spec']['containers'][0]['resources']['requests'] = requests_values
+            data['template']['spec']['containers'][0]['resources']['limits'] = limits_values
+
             return data
         except KeyError as e:
             raise ValueError(f"Missing required configuration key: {str(e)}")
0 commit comments
