     TEMP_KUBE_CONFIG_FILE,
     OutputFormat,
 )
-from sagemaker.hyperpod.cli.telemetry.user_agent import (
+from sagemaker.hyperpod.common.telemetry.user_agent import (
     get_user_agent_extra_suffix,
 )
 from sagemaker.hyperpod.cli.service.list_pods import (
 from sagemaker.hyperpod.cli.utils import (
     get_eks_cluster_name,
 )
-from sagemaker.hyperpod.common.utils import get_cluster_context as get_cluster_context_util
-from sagemaker.hyperpod.observability.utils import get_monitoring_config, is_observability_addon_enabled
+from sagemaker.hyperpod.common.utils import (
+    get_cluster_context as get_cluster_context_util,
+)
+from sagemaker.hyperpod.observability.utils import (
+    get_monitoring_config,
+    is_observability_addon_enabled,
+)
+from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
+    _hyperpod_telemetry_emitter,
+)
+from sagemaker.hyperpod.common.telemetry.constants import Feature

 RATE_LIMIT = 4
 RATE_LIMIT_PERIOD = 1  # 1 second
     multiple=True,
     help="Optional. The namespace that you want to check the capacity for. Only SageMaker managed namespaces are supported.",
 )
+@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_cluster")
 def list_cluster(
     region: Optional[str],
     output: Optional[str],
     clusters: Optional[str],
     debug: bool,
-    namespace: Optional[List]
+    namespace: Optional[List],
 ):
     """List SageMaker Hyperpod Clusters with cluster metadata.

@@ -261,8 +271,14 @@ def rate_limited_operation(
             for ns in namespace:
                 sm_managed_namespace = k8s_client.get_sagemaker_managed_namespace(ns)
                 if sm_managed_namespace:
-                    quota_allocation_id = sm_managed_namespace.metadata.labels[SAGEMAKER_QUOTA_ALLOCATION_LABEL]
-                    cluster_queue_name = HYPERPOD_NAMESPACE_PREFIX + quota_allocation_id + SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX
+                    quota_allocation_id = sm_managed_namespace.metadata.labels[
+                        SAGEMAKER_QUOTA_ALLOCATION_LABEL
+                    ]
+                    cluster_queue_name = (
+                        HYPERPOD_NAMESPACE_PREFIX
+                        + quota_allocation_id
+                        + SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX
+                    )
                     cluster_queue = k8s_client.get_cluster_queue(cluster_queue_name)
                     nominal_quota = _get_cluster_queue_nominal_quota(cluster_queue)
                     quota_usage = _get_cluster_queue_quota_usage(cluster_queue)
@@ -282,8 +298,19 @@ def rate_limited_operation(
                 nodes_summary["deep_health_check_passed"],
             ]
             for ns in namespace:
-                capacities.append(ns_nominal_quota.get(ns).get(instance_type, {}).get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A"))
-                capacities.append(_get_available_quota(ns_nominal_quota.get(ns), ns_quota_usage.get(ns), instance_type, NVIDIA_GPU_RESOURCE_LIMIT_KEY))
+                capacities.append(
+                    ns_nominal_quota.get(ns)
+                    .get(instance_type, {})
+                    .get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A")
+                )
+                capacities.append(
+                    _get_available_quota(
+                        ns_nominal_quota.get(ns),
+                        ns_quota_usage.get(ns),
+                        instance_type,
+                        NVIDIA_GPU_RESOURCE_LIMIT_KEY,
+                    )
+                )
             cluster_capacities.append(capacities)
         except Exception as e:
             logger.error(f"Error processing cluster {cluster_name}: {e}, continue...")
@@ -305,7 +332,7 @@ def _get_cluster_queue_nominal_quota(cluster_queue):
             if resource_name == NVIDIA_GPU_RESOURCE_LIMIT_KEY:
                 quota = int(quota)
             nominal_quota[flavor_name][resource_name] = quota
-
+
     return nominal_quota

@@ -336,7 +363,7 @@ def _get_available_quota(nominal, usage, flavor, resource_name):
     # Some resources need to be further processed by parsing unit like memory, e.g 10Gi
     if nominal_quota is not None and usage_quota is not None:
         return int(nominal_quota) - int(usage_quota)
-
+
     return "N/A"

@@ -358,7 +385,9 @@ def _restructure_output(summary_list, namespaces):
     for node_summary in summary_list:
         node_summary["Namespaces"] = {}
         for ns in namespaces:
-            available_accelerators = node_summary[ns + AVAILABLE_ACCELERATOR_DEVICES_KEY]
+            available_accelerators = node_summary[
+                ns + AVAILABLE_ACCELERATOR_DEVICES_KEY
+            ]
             total_accelerators = node_summary[ns + TOTAL_ACCELERATOR_DEVICES_KEY]
             quota_accelerator_info = {
                 AVAILABLE_ACCELERATOR_DEVICES_KEY: available_accelerators,
@@ -425,9 +454,9 @@ def _aggregate_nodes_info(

         # Accelerator Devices available = Allocatable devices - Allocated devices
         if node_name in nodes_resource_allocated_dict:
-            nodes_summary[instance_type]["accelerator_devices_available"] -= (
-                nodes_resource_allocated_dict[node_name]
-            )
+            nodes_summary[instance_type][
+                "accelerator_devices_available"
+            ] -= nodes_resource_allocated_dict[node_name]

     logger.debug(f"nodes_summary: {nodes_summary}")
     return nodes_summary
@@ -550,7 +579,6 @@ def get_cluster_context(
         sys.exit(1)


-
 @click.command()
 @click.option("--grafana", is_flag=True, help="Returns Grafana Dashboard URL")
 @click.option("--prometheus", is_flag=True, help="Returns Prometheus Workspace URL")
@@ -572,14 +600,21 @@ def get_monitoring(grafana: bool, prometheus: bool, list: bool) -> None:
             print(f"Grafana dashboard URL: {monitor_config.grafanaURL}")
         if list:
             metrics_data = monitor_config.availableMetrics
-            print(tabulate([[k, v.get('level', v.get('enabled'))] for k, v in metrics_data.items()],
-                  headers=['Metric', 'Level/Enabled'], tablefmt='presto'))
+            print(
+                tabulate(
+                    [
+                        [k, v.get("level", v.get("enabled"))]
+                        for k, v in metrics_data.items()
+                    ],
+                    headers=["Metric", "Level/Enabled"],
+                    tablefmt="presto",
+                )
+            )
     except Exception as e:
         logger.error(f"Failed to get metrics: {e}")
         sys.exit(1)


-
 def _update_kube_config(
     eks_name: str,
     region: Optional[str],