Skip to content

Commit 9891db4

Browse files
mohamedzeidan2021 and Mohamed Zeidan authored
added describe cluster cmd (#278)
Co-authored-by: Mohamed Zeidan <[email protected]>
1 parent 315f7ec commit 9891db4

File tree

3 files changed

+355
-1
lines changed

3 files changed

+355
-1
lines changed

src/sagemaker/hyperpod/cli/commands/cluster.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@
7777
_hyperpod_telemetry_emitter,
7878
)
7979
from sagemaker.hyperpod.common.telemetry.constants import Feature
80+
from sagemaker.hyperpod.cli.utils import convert_datetimes
81+
from sagemaker_core.main.resources import Cluster
8082

8183
RATE_LIMIT = 4
8284
RATE_LIMIT_PERIOD = 1 # 1 second
@@ -684,6 +686,75 @@ def get_cluster_context(
684686
sys.exit(1)
685687

686688

689+
def _describe_cluster_error_message(cluster_name: str, error: Exception) -> str:
    """Return the user-facing message for a failed describe-cluster call.

    Args:
        cluster_name: Name the user asked to describe; echoed in the
            "not found" message.
        error: The exception raised while describing the cluster.

    Returns:
        One of the three "❌ ..." messages printed by ``describe_cluster``.
    """
    # botocore's ClientError carries the service error code under
    # ``response["Error"]["Code"]``. Read it defensively so that plain
    # exceptions (which have no ``response``) fall back to message matching.
    response = getattr(error, "response", None)
    error_code = ""
    if isinstance(response, dict):
        error_details = response.get("Error") or {}
        if isinstance(error_details, dict):
            error_code = str(error_details.get("Code", ""))

    # Match case-insensitively against both the code and the rendered message
    # so the classification does not depend on the exact casing the service
    # happens to use.
    searchable = f"{error_code} {error}".lower()

    not_found_markers = ("does not exist", "not found", "resourcenotfound")
    if any(marker in searchable for marker in not_found_markers):
        return f"❌ Cluster '{cluster_name}' not found"
    if "accessdenied" in searchable:
        return "❌ Access denied. Check AWS permissions"
    return f"❌ Error describing cluster: {error}"


@click.command("cluster")
@click.argument("cluster-name", required=True)
@click.option("--region", help="AWS region")
@click.option("--debug", is_flag=True, help="Enable debug logging")
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "describe_cluster_cli")
def describe_cluster(cluster_name: str, debug: bool, region: str) -> None:
    """Describe the status of a HyperPod cluster.
    Shows detailed information about a SageMaker HyperPod cluster including its current status,
    instance groups, orchestrator details, and configuration.
    Usage Examples
    # Describe a cluster
    hyp describe cluster my-cluster-name
    # Describe with specific region
    hyp describe cluster my-cluster-name --region us-west-2
    """
    if debug:
        set_logging_level(logger, logging.DEBUG)

    try:
        botocore_config = botocore.config.Config(
            user_agent_extra=get_user_agent_extra_suffix()
        )
        # Only pass region_name when the user supplied one, so the default
        # session configuration is used otherwise.
        session = boto3.Session(region_name=region) if region else boto3.Session()
        sm_client = get_sagemaker_client(session, botocore_config)

        # Get cluster details using SageMaker client
        cluster_dict = sm_client.describe_cluster(ClusterName=cluster_name)

        # Convert datetimes for display
        cluster_dict = convert_datetimes(cluster_dict)

        logger.debug(f"Describing cluster name: {cluster_name}\ninfo: {json.dumps(cluster_dict, indent=2, default=str)}")

        click.echo(f"📋 Cluster Details for: {cluster_name}")

        # Highlight cluster status
        cluster_status = cluster_dict.get('ClusterStatus', 'UNKNOWN')
        click.echo("Status: ", nl=False)
        click.secho(cluster_status)

        table_data = []
        for key, value in cluster_dict.items():
            # boto3 attaches request bookkeeping (request id, HTTP headers)
            # under "ResponseMetadata"; it is not cluster information, so keep
            # it out of the table. It is still present in the debug log above.
            if key == "ResponseMetadata":
                continue
            if isinstance(value, (dict, list)):
                # Pretty-print nested structures so they stay readable in a cell.
                formatted_value = json.dumps(value, indent=2, default=str)
            else:
                formatted_value = str(value)
            table_data.append([key, formatted_value])

        # Only display table if we have data
        if table_data:
            click.echo(tabulate(table_data, tablefmt="presto"))
        else:
            click.echo("No cluster data available")

    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero rather
        # than surfacing a traceback (the traceback is logged in debug mode).
        logger.error(f"Failed to describe cluster: {e}")
        if debug:
            logger.exception("Detailed error information:")

        click.echo(_describe_cluster_error_message(cluster_name, e))

        sys.exit(1)
756+
757+
687758
@click.command()
688759
@click.option("--grafana", is_flag=True, help="Returns Grafana Dashboard URL")
689760
@click.option("--prometheus", is_flag=True, help="Returns Prometheus Workspace URL")

src/sagemaker/hyperpod/cli/hyp_cli.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from importlib.metadata import version, PackageNotFoundError
99

1010
from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \
11-
get_monitoring
11+
get_monitoring, describe_cluster
1212
from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack, describe_cluster_stack, \
1313
list_cluster_stacks, update_cluster, delete_cluster_stack
1414
from sagemaker.hyperpod.cli.commands.training import (
@@ -183,6 +183,7 @@ def exec():
183183
describe.add_command(js_describe)
184184
describe.add_command(custom_describe)
185185
describe.add_command(describe_cluster_stack)
186+
describe.add_command(describe_cluster)
186187

187188
update.add_command(update_cluster)
188189

Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
import unittest
14+
from unittest.mock import Mock, patch
15+
from click.testing import CliRunner
16+
from botocore.exceptions import ClientError
17+
from sagemaker.hyperpod.cli.commands.cluster import describe_cluster
18+
19+
20+
class DescribeClusterTest(unittest.TestCase):
    """Tests for the ``describe_cluster`` click command with a mocked SageMaker client."""

    def setUp(self):
        self.runner = CliRunner()

        # The same three names are patched for every test; each patch is
        # reverted automatically through addCleanup.
        self.mock_setup_logger = self._start_patch(
            'sagemaker.hyperpod.cli.commands.cluster.setup_logger'
        )
        self.mock_session = self._start_patch(
            'sagemaker.hyperpod.cli.commands.cluster.boto3.Session'
        )
        self.mock_get_sagemaker_client = self._start_patch(
            'sagemaker.hyperpod.cli.commands.cluster.get_sagemaker_client'
        )

        self.mock_setup_logger.return_value = Mock()
        self.mock_session.return_value = Mock()
        self.sm_client = Mock()
        self.mock_get_sagemaker_client.return_value = self.sm_client

    def _start_patch(self, target):
        """Start a patch on *target*, register its teardown, and return the mock."""
        patcher = patch(target)
        mocked = patcher.start()
        self.addCleanup(patcher.stop)
        return mocked

    def _run(self, *cli_args):
        """Invoke the command under test with the given CLI arguments."""
        return self.runner.invoke(describe_cluster, list(cli_args))

    def _fail_describe_with_client_error(self, code, message):
        """Make the mocked describe_cluster call raise a botocore ClientError."""
        payload = {'Error': {'Code': code, 'Message': message}}
        self.sm_client.describe_cluster.side_effect = ClientError(payload, 'DescribeCluster')

    def _assert_described_once(self, expected_name):
        """Check the client was asked exactly once for *expected_name*."""
        self.sm_client.describe_cluster.assert_called_once_with(ClusterName=expected_name)

    def test_describe_cluster_happy_case(self):
        """A valid cluster name yields exit code 0 and the cluster details."""
        self.sm_client.describe_cluster.return_value = {
            "ClusterArn": "arn:aws:sagemaker:us-east-2:123456789012:cluster/test-cluster",
            "ClusterName": "test-cluster",
            "ClusterStatus": "InService",
            "CreationTime": "2023-09-23T14:35:38.223000+00:00",
            "InstanceGroups": [
                {
                    "InstanceGroupName": "controller-group",
                    "InstanceType": "ml.t3.medium",
                    "CurrentCount": 1,
                    "TargetCount": 1,
                },
            ],
            "VpcConfig": {
                "SecurityGroupIds": ["sg-1234567890abcdef0"],
                "Subnets": ["subnet-1234567890abcdef0"],
            },
            "Orchestrator": {
                "Eks": {
                    "ClusterArn": "arn:aws:eks:us-east-2:123456789012:cluster/eks-cluster",
                },
            },
        }

        outcome = self._run("test-cluster")

        self.assertEqual(outcome.exit_code, 0)
        self._assert_described_once("test-cluster")
        self.assertIn("📋 Cluster Details for: test-cluster", outcome.output)
        self.assertIn("test-cluster", outcome.output)
        self.assertIn("InService", outcome.output)

    def test_describe_cluster_with_region_flag(self):
        """``--region`` is forwarded to ``boto3.Session`` as ``region_name``."""
        self.sm_client.describe_cluster.return_value = {
            "ClusterArn": "arn:aws:sagemaker:us-west-2:123456789012:cluster/test-cluster",
            "ClusterName": "test-cluster",
            "ClusterStatus": "InService",
            "CreationTime": "2023-09-23T14:35:38.223000+00:00",
            "InstanceGroups": [
                {
                    "InstanceGroupName": "worker-group",
                    "InstanceType": "ml.p4d.24xlarge",
                    "CurrentCount": 2,
                    "TargetCount": 2,
                },
            ],
        }

        outcome = self._run("test-cluster", "--region", "us-west-2")

        self.assertEqual(outcome.exit_code, 0)
        # The session must have been built for the requested region.
        self.mock_session.assert_called_with(region_name="us-west-2")
        self._assert_described_once("test-cluster")
        self.assertIn("📋 Cluster Details for: test-cluster", outcome.output)
        self.assertIn("test-cluster", outcome.output)
        self.assertIn("InService", outcome.output)

    def test_describe_cluster_unknown_cluster_name(self):
        """A non-existent cluster exits with 1 and prints the not-found message."""
        self._fail_describe_with_client_error('ResourceNotFound', 'Cluster does not exist')

        outcome = self._run("unknown-cluster")

        self.assertEqual(outcome.exit_code, 1)
        self._assert_described_once("unknown-cluster")
        self.assertIn("❌ Cluster 'unknown-cluster' not found", outcome.output)

    def test_describe_cluster_access_denied(self):
        """An AccessDenied error exits with 1 and prints the permissions hint."""
        self._fail_describe_with_client_error(
            'AccessDenied', 'User is not authorized to perform this action'
        )

        outcome = self._run("test-cluster")

        self.assertEqual(outcome.exit_code, 1)
        self._assert_described_once("test-cluster")
        self.assertIn("❌ Access denied. Check AWS permissions", outcome.output)

    def test_describe_cluster_generic_error(self):
        """Any other exception exits with 1 and echoes the error text."""
        self.sm_client.describe_cluster.side_effect = Exception("Unexpected error occurred")

        outcome = self._run("test-cluster")

        self.assertEqual(outcome.exit_code, 1)
        self._assert_described_once("test-cluster")
        self.assertIn("❌ Error describing cluster: Unexpected error occurred", outcome.output)

    def test_describe_cluster_with_debug_flag(self):
        """``--debug`` does not change the successful outcome."""
        self.sm_client.describe_cluster.return_value = {
            "ClusterArn": "arn:aws:sagemaker:us-east-2:123456789012:cluster/test-cluster",
            "ClusterName": "test-cluster",
            "ClusterStatus": "InService",
        }

        outcome = self._run("test-cluster", "--debug")

        self.assertEqual(outcome.exit_code, 0)
        self._assert_described_once("test-cluster")
        self.assertIn("📋 Cluster Details for: test-cluster", outcome.output)

    def test_describe_cluster_empty_response(self):
        """An empty response still succeeds and reports that no data is available."""
        self.sm_client.describe_cluster.return_value = {}

        outcome = self._run("test-cluster")

        self.assertEqual(outcome.exit_code, 0)
        self._assert_described_once("test-cluster")
        self.assertIn("📋 Cluster Details for: test-cluster", outcome.output)
        self.assertIn("No cluster data available", outcome.output)
279+
280+
281+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)