Skip to content

Commit 664be9d

Browse files
rsareddy0329 and Roja Reddy Sareddy authored
Enable Hyperpod telemetry (#116)
* Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry --------- Co-authored-by: Roja Reddy Sareddy <[email protected]>
1 parent 63192b5 commit 664be9d

File tree

11 files changed

+658
-18
lines changed

11 files changed

+658
-18
lines changed

src/sagemaker/hyperpod/cli/commands/cluster.py

Lines changed: 53 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
TEMP_KUBE_CONFIG_FILE,
4343
OutputFormat,
4444
)
45-
from sagemaker.hyperpod.cli.telemetry.user_agent import (
45+
from sagemaker.hyperpod.common.telemetry.user_agent import (
4646
get_user_agent_extra_suffix,
4747
)
4848
from sagemaker.hyperpod.cli.service.list_pods import (
@@ -61,8 +61,17 @@
6161
from sagemaker.hyperpod.cli.utils import (
6262
get_eks_cluster_name,
6363
)
64-
from sagemaker.hyperpod.common.utils import get_cluster_context as get_cluster_context_util
65-
from sagemaker.hyperpod.observability.utils import get_monitoring_config, is_observability_addon_enabled
64+
from sagemaker.hyperpod.common.utils import (
65+
get_cluster_context as get_cluster_context_util,
66+
)
67+
from sagemaker.hyperpod.observability.utils import (
68+
get_monitoring_config,
69+
is_observability_addon_enabled,
70+
)
71+
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
72+
_hyperpod_telemetry_emitter,
73+
)
74+
from sagemaker.hyperpod.common.telemetry.constants import Feature
6675

6776
RATE_LIMIT = 4
6877
RATE_LIMIT_PERIOD = 1 # 1 second
@@ -103,12 +112,13 @@
103112
multiple=True,
104113
help="Optional. The namespace that you want to check the capacity for. Only SageMaker managed namespaces are supported.",
105114
)
115+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_cluster")
106116
def list_cluster(
107117
region: Optional[str],
108118
output: Optional[str],
109119
clusters: Optional[str],
110120
debug: bool,
111-
namespace: Optional[List]
121+
namespace: Optional[List],
112122
):
113123
"""List SageMaker Hyperpod Clusters with cluster metadata.
114124
@@ -261,8 +271,14 @@ def rate_limited_operation(
261271
for ns in namespace:
262272
sm_managed_namespace = k8s_client.get_sagemaker_managed_namespace(ns)
263273
if sm_managed_namespace:
264-
quota_allocation_id = sm_managed_namespace.metadata.labels[SAGEMAKER_QUOTA_ALLOCATION_LABEL]
265-
cluster_queue_name = HYPERPOD_NAMESPACE_PREFIX + quota_allocation_id + SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX
274+
quota_allocation_id = sm_managed_namespace.metadata.labels[
275+
SAGEMAKER_QUOTA_ALLOCATION_LABEL
276+
]
277+
cluster_queue_name = (
278+
HYPERPOD_NAMESPACE_PREFIX
279+
+ quota_allocation_id
280+
+ SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX
281+
)
266282
cluster_queue = k8s_client.get_cluster_queue(cluster_queue_name)
267283
nominal_quota = _get_cluster_queue_nominal_quota(cluster_queue)
268284
quota_usage = _get_cluster_queue_quota_usage(cluster_queue)
@@ -282,8 +298,19 @@ def rate_limited_operation(
282298
nodes_summary["deep_health_check_passed"],
283299
]
284300
for ns in namespace:
285-
capacities.append(ns_nominal_quota.get(ns).get(instance_type, {}).get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A"))
286-
capacities.append(_get_available_quota(ns_nominal_quota.get(ns), ns_quota_usage.get(ns), instance_type, NVIDIA_GPU_RESOURCE_LIMIT_KEY))
301+
capacities.append(
302+
ns_nominal_quota.get(ns)
303+
.get(instance_type, {})
304+
.get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A")
305+
)
306+
capacities.append(
307+
_get_available_quota(
308+
ns_nominal_quota.get(ns),
309+
ns_quota_usage.get(ns),
310+
instance_type,
311+
NVIDIA_GPU_RESOURCE_LIMIT_KEY,
312+
)
313+
)
287314
cluster_capacities.append(capacities)
288315
except Exception as e:
289316
logger.error(f"Error processing cluster {cluster_name}: {e}, continue...")
@@ -305,7 +332,7 @@ def _get_cluster_queue_nominal_quota(cluster_queue):
305332
if resource_name == NVIDIA_GPU_RESOURCE_LIMIT_KEY:
306333
quota = int(quota)
307334
nominal_quota[flavor_name][resource_name] = quota
308-
335+
309336
return nominal_quota
310337

311338

@@ -336,7 +363,7 @@ def _get_available_quota(nominal, usage, flavor, resource_name):
336363
# Some resources need to be further processed by parsing unit like memory, e.g 10Gi
337364
if nominal_quota is not None and usage_quota is not None:
338365
return int(nominal_quota) - int(usage_quota)
339-
366+
340367
return "N/A"
341368

342369

@@ -358,7 +385,9 @@ def _restructure_output(summary_list, namespaces):
358385
for node_summary in summary_list:
359386
node_summary["Namespaces"] = {}
360387
for ns in namespaces:
361-
available_accelerators = node_summary[ns + AVAILABLE_ACCELERATOR_DEVICES_KEY]
388+
available_accelerators = node_summary[
389+
ns + AVAILABLE_ACCELERATOR_DEVICES_KEY
390+
]
362391
total_accelerators = node_summary[ns + TOTAL_ACCELERATOR_DEVICES_KEY]
363392
quota_accelerator_info = {
364393
AVAILABLE_ACCELERATOR_DEVICES_KEY: available_accelerators,
@@ -425,9 +454,9 @@ def _aggregate_nodes_info(
425454

426455
# Accelerator Devices available = Allocatable devices - Allocated devices
427456
if node_name in nodes_resource_allocated_dict:
428-
nodes_summary[instance_type]["accelerator_devices_available"] -= (
429-
nodes_resource_allocated_dict[node_name]
430-
)
457+
nodes_summary[instance_type][
458+
"accelerator_devices_available"
459+
] -= nodes_resource_allocated_dict[node_name]
431460

432461
logger.debug(f"nodes_summary: {nodes_summary}")
433462
return nodes_summary
@@ -550,7 +579,6 @@ def get_cluster_context(
550579
sys.exit(1)
551580

552581

553-
554582
@click.command()
555583
@click.option("--grafana", is_flag=True, help="Returns Grafana Dashboard URL")
556584
@click.option("--prometheus", is_flag=True, help="Returns Prometheus Workspace URL")
@@ -572,14 +600,21 @@ def get_monitoring(grafana: bool, prometheus: bool, list: bool) -> None:
572600
print(f"Grafana dashboard URL: {monitor_config.grafanaURL}")
573601
if list:
574602
metrics_data = monitor_config.availableMetrics
575-
print(tabulate([[k, v.get('level', v.get('enabled'))] for k, v in metrics_data.items()],
576-
headers=['Metric', 'Level/Enabled'], tablefmt='presto'))
603+
print(
604+
tabulate(
605+
[
606+
[k, v.get("level", v.get("enabled"))]
607+
for k, v in metrics_data.items()
608+
],
609+
headers=["Metric", "Level/Enabled"],
610+
tablefmt="presto",
611+
)
612+
)
577613
except Exception as e:
578614
logger.error(f"Failed to get metrics: {e}")
579615
sys.exit(1)
580616

581617

582-
583618
def _update_kube_config(
584619
eks_name: str,
585620
region: Optional[str],

src/sagemaker/hyperpod/cli/telemetry/__init__.py renamed to src/sagemaker/hyperpod/common/telemetry/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,5 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
from .telemetry_logging import _hyperpod_telemetry_emitter
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from __future__ import absolute_import
2+
from enum import Enum
3+
4+
5+
class Feature(Enum):
    """Telemetry feature identifiers.

    Member values are emitted in telemetry records, so they must remain
    stable across releases.
    """

    HYPERPOD = 6  # Added to support telemetry in sagemaker-hyperpod-cli

    def __str__(self):  # pylint: disable=E0307
        """Render the member as its bare feature name (e.g. ``HYPERPOD``)."""
        return self.name
13+
14+
15+
class Status(Enum):
    """Outcome of a telemetry-tracked operation."""

    SUCCESS = 1
    FAILURE = 0

    def __str__(self):  # pylint: disable=E0307
        """Render the member as its bare name (``SUCCESS`` or ``FAILURE``)."""
        return self.name
24+
25+
26+
class Region(str, Enum):
    """Telemetry: all AWS regions the client recognizes.

    Mixes in ``str`` so that members compare equal to their region-code
    strings (e.g. ``Region.US_EAST_1 == "us-east-1"``) and can be used
    directly wherever a plain region string is expected.
    """

    # --- Commercial regions enabled by default ("classic") ---
    US_EAST_1 = "us-east-1"  # IAD
    US_EAST_2 = "us-east-2"  # CMH
    US_WEST_1 = "us-west-1"  # SFO
    US_WEST_2 = "us-west-2"  # PDX
    AP_NORTHEAST_1 = "ap-northeast-1"  # NRT
    AP_NORTHEAST_2 = "ap-northeast-2"  # ICN
    AP_NORTHEAST_3 = "ap-northeast-3"  # KIX
    AP_SOUTH_1 = "ap-south-1"  # BOM
    AP_SOUTHEAST_1 = "ap-southeast-1"  # SIN
    AP_SOUTHEAST_2 = "ap-southeast-2"  # SYD
    CA_CENTRAL_1 = "ca-central-1"  # YUL
    EU_CENTRAL_1 = "eu-central-1"  # FRA
    EU_NORTH_1 = "eu-north-1"  # ARN
    EU_WEST_1 = "eu-west-1"  # DUB
    EU_WEST_2 = "eu-west-2"  # LHR
    EU_WEST_3 = "eu-west-3"  # CDG
    SA_EAST_1 = "sa-east-1"  # GRU
    # --- Opt-in regions (must be explicitly enabled on the account) ---
    AP_EAST_1 = "ap-east-1"  # HKG
    AP_SOUTHEAST_3 = "ap-southeast-3"  # CGK
    AF_SOUTH_1 = "af-south-1"  # CPT
    EU_SOUTH_1 = "eu-south-1"  # MXP
    ME_SOUTH_1 = "me-south-1"  # BAH
    MX_CENTRAL_1 = "mx-central-1"  # QRO
    AP_SOUTHEAST_7 = "ap-southeast-7"  # BKK
    AP_SOUTH_2 = "ap-south-2"  # HYD
    AP_SOUTHEAST_4 = "ap-southeast-4"  # MEL
    EU_CENTRAL_2 = "eu-central-2"  # ZRH
    EU_SOUTH_2 = "eu-south-2"  # ZAZ
    IL_CENTRAL_1 = "il-central-1"  # TLV
    ME_CENTRAL_1 = "me-central-1"  # DXB

0 commit comments

Comments (0)