Skip to content

Commit 664be9d

Browse files
rsareddy0329 and Roja Reddy Sareddy authored
Enable Hyperpod telemetry (#116)
* Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry --------- Co-authored-by: Roja Reddy Sareddy <[email protected]>
1 parent 63192b5 commit 664be9d

File tree

11 files changed

+658
-18
lines changed

11 files changed

+658
-18
lines changed

src/sagemaker/hyperpod/cli/commands/cluster.py

Lines changed: 53 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
TEMP_KUBE_CONFIG_FILE,
4343
OutputFormat,
4444
)
45-
from sagemaker.hyperpod.cli.telemetry.user_agent import (
45+
from sagemaker.hyperpod.common.telemetry.user_agent import (
4646
get_user_agent_extra_suffix,
4747
)
4848
from sagemaker.hyperpod.cli.service.list_pods import (
@@ -61,8 +61,17 @@
6161
from sagemaker.hyperpod.cli.utils import (
6262
get_eks_cluster_name,
6363
)
64-
from sagemaker.hyperpod.common.utils import get_cluster_context as get_cluster_context_util
65-
from sagemaker.hyperpod.observability.utils import get_monitoring_config, is_observability_addon_enabled
64+
from sagemaker.hyperpod.common.utils import (
65+
get_cluster_context as get_cluster_context_util,
66+
)
67+
from sagemaker.hyperpod.observability.utils import (
68+
get_monitoring_config,
69+
is_observability_addon_enabled,
70+
)
71+
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
72+
_hyperpod_telemetry_emitter,
73+
)
74+
from sagemaker.hyperpod.common.telemetry.constants import Feature
6675

6776
RATE_LIMIT = 4
6877
RATE_LIMIT_PERIOD = 1 # 1 second
@@ -103,12 +112,13 @@
103112
multiple=True,
104113
help="Optional. The namespace that you want to check the capacity for. Only SageMaker managed namespaces are supported.",
105114
)
115+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_cluster")
106116
def list_cluster(
107117
region: Optional[str],
108118
output: Optional[str],
109119
clusters: Optional[str],
110120
debug: bool,
111-
namespace: Optional[List]
121+
namespace: Optional[List],
112122
):
113123
"""List SageMaker Hyperpod Clusters with cluster metadata.
114124
@@ -261,8 +271,14 @@ def rate_limited_operation(
261271
for ns in namespace:
262272
sm_managed_namespace = k8s_client.get_sagemaker_managed_namespace(ns)
263273
if sm_managed_namespace:
264-
quota_allocation_id = sm_managed_namespace.metadata.labels[SAGEMAKER_QUOTA_ALLOCATION_LABEL]
265-
cluster_queue_name = HYPERPOD_NAMESPACE_PREFIX + quota_allocation_id + SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX
274+
quota_allocation_id = sm_managed_namespace.metadata.labels[
275+
SAGEMAKER_QUOTA_ALLOCATION_LABEL
276+
]
277+
cluster_queue_name = (
278+
HYPERPOD_NAMESPACE_PREFIX
279+
+ quota_allocation_id
280+
+ SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX
281+
)
266282
cluster_queue = k8s_client.get_cluster_queue(cluster_queue_name)
267283
nominal_quota = _get_cluster_queue_nominal_quota(cluster_queue)
268284
quota_usage = _get_cluster_queue_quota_usage(cluster_queue)
@@ -282,8 +298,19 @@ def rate_limited_operation(
282298
nodes_summary["deep_health_check_passed"],
283299
]
284300
for ns in namespace:
285-
capacities.append(ns_nominal_quota.get(ns).get(instance_type, {}).get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A"))
286-
capacities.append(_get_available_quota(ns_nominal_quota.get(ns), ns_quota_usage.get(ns), instance_type, NVIDIA_GPU_RESOURCE_LIMIT_KEY))
301+
capacities.append(
302+
ns_nominal_quota.get(ns)
303+
.get(instance_type, {})
304+
.get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A")
305+
)
306+
capacities.append(
307+
_get_available_quota(
308+
ns_nominal_quota.get(ns),
309+
ns_quota_usage.get(ns),
310+
instance_type,
311+
NVIDIA_GPU_RESOURCE_LIMIT_KEY,
312+
)
313+
)
287314
cluster_capacities.append(capacities)
288315
except Exception as e:
289316
logger.error(f"Error processing cluster {cluster_name}: {e}, continue...")
@@ -305,7 +332,7 @@ def _get_cluster_queue_nominal_quota(cluster_queue):
305332
if resource_name == NVIDIA_GPU_RESOURCE_LIMIT_KEY:
306333
quota = int(quota)
307334
nominal_quota[flavor_name][resource_name] = quota
308-
335+
309336
return nominal_quota
310337

311338

@@ -336,7 +363,7 @@ def _get_available_quota(nominal, usage, flavor, resource_name):
336363
# Some resources need to be further processed by parsing unit like memory, e.g 10Gi
337364
if nominal_quota is not None and usage_quota is not None:
338365
return int(nominal_quota) - int(usage_quota)
339-
366+
340367
return "N/A"
341368

342369

@@ -358,7 +385,9 @@ def _restructure_output(summary_list, namespaces):
358385
for node_summary in summary_list:
359386
node_summary["Namespaces"] = {}
360387
for ns in namespaces:
361-
available_accelerators = node_summary[ns + AVAILABLE_ACCELERATOR_DEVICES_KEY]
388+
available_accelerators = node_summary[
389+
ns + AVAILABLE_ACCELERATOR_DEVICES_KEY
390+
]
362391
total_accelerators = node_summary[ns + TOTAL_ACCELERATOR_DEVICES_KEY]
363392
quota_accelerator_info = {
364393
AVAILABLE_ACCELERATOR_DEVICES_KEY: available_accelerators,
@@ -425,9 +454,9 @@ def _aggregate_nodes_info(
425454

426455
# Accelerator Devices available = Allocatable devices - Allocated devices
427456
if node_name in nodes_resource_allocated_dict:
428-
nodes_summary[instance_type]["accelerator_devices_available"] -= (
429-
nodes_resource_allocated_dict[node_name]
430-
)
457+
nodes_summary[instance_type][
458+
"accelerator_devices_available"
459+
] -= nodes_resource_allocated_dict[node_name]
431460

432461
logger.debug(f"nodes_summary: {nodes_summary}")
433462
return nodes_summary
@@ -550,7 +579,6 @@ def get_cluster_context(
550579
sys.exit(1)
551580

552581

553-
554582
@click.command()
555583
@click.option("--grafana", is_flag=True, help="Returns Grafana Dashboard URL")
556584
@click.option("--prometheus", is_flag=True, help="Returns Prometheus Workspace URL")
@@ -572,14 +600,21 @@ def get_monitoring(grafana: bool, prometheus: bool, list: bool) -> None:
572600
print(f"Grafana dashboard URL: {monitor_config.grafanaURL}")
573601
if list:
574602
metrics_data = monitor_config.availableMetrics
575-
print(tabulate([[k, v.get('level', v.get('enabled'))] for k, v in metrics_data.items()],
576-
headers=['Metric', 'Level/Enabled'], tablefmt='presto'))
603+
print(
604+
tabulate(
605+
[
606+
[k, v.get("level", v.get("enabled"))]
607+
for k, v in metrics_data.items()
608+
],
609+
headers=["Metric", "Level/Enabled"],
610+
tablefmt="presto",
611+
)
612+
)
577613
except Exception as e:
578614
logger.error(f"Failed to get metrics: {e}")
579615
sys.exit(1)
580616

581617

582-
583618
def _update_kube_config(
584619
eks_name: str,
585620
region: Optional[str],

src/sagemaker/hyperpod/cli/telemetry/__init__.py renamed to src/sagemaker/hyperpod/common/telemetry/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,5 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
from .telemetry_logging import _hyperpod_telemetry_emitter
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from __future__ import absolute_import
2+
from enum import Enum
3+
4+
5+
class Feature(Enum):
    """Telemetry feature identifiers.

    Member values are emitted in telemetry records, so they must remain
    stable across releases.
    """

    HYPERPOD = 6  # Added to support telemetry in sagemaker-hyperpod-cli

    def __str__(self):  # pylint: disable=E0307
        """Render the member as its bare feature name (e.g. ``HYPERPOD``)."""
        return self.name
13+
14+
15+
class Status(Enum):
    """Outcome of a telemetry-tracked operation."""

    SUCCESS = 1
    FAILURE = 0

    def __str__(self):  # pylint: disable=E0307
        """Render the member as its bare name (``SUCCESS`` or ``FAILURE``)."""
        return self.name
24+
25+
26+
class Region(str, Enum):
    """Telemetry: all AWS regions the client recognizes.

    Mixes in ``str`` so that members compare equal to their region-code
    strings (e.g. ``Region.US_EAST_1 == "us-east-1"``) and can be used
    directly wherever a plain region string is expected.
    """

    # --- Commercial regions enabled by default ("classic") ---
    US_EAST_1 = "us-east-1"  # IAD
    US_EAST_2 = "us-east-2"  # CMH
    US_WEST_1 = "us-west-1"  # SFO
    US_WEST_2 = "us-west-2"  # PDX
    AP_NORTHEAST_1 = "ap-northeast-1"  # NRT
    AP_NORTHEAST_2 = "ap-northeast-2"  # ICN
    AP_NORTHEAST_3 = "ap-northeast-3"  # KIX
    AP_SOUTH_1 = "ap-south-1"  # BOM
    AP_SOUTHEAST_1 = "ap-southeast-1"  # SIN
    AP_SOUTHEAST_2 = "ap-southeast-2"  # SYD
    CA_CENTRAL_1 = "ca-central-1"  # YUL
    EU_CENTRAL_1 = "eu-central-1"  # FRA
    EU_NORTH_1 = "eu-north-1"  # ARN
    EU_WEST_1 = "eu-west-1"  # DUB
    EU_WEST_2 = "eu-west-2"  # LHR
    EU_WEST_3 = "eu-west-3"  # CDG
    SA_EAST_1 = "sa-east-1"  # GRU
    # --- Opt-in regions (must be explicitly enabled on the account) ---
    AP_EAST_1 = "ap-east-1"  # HKG
    AP_SOUTHEAST_3 = "ap-southeast-3"  # CGK
    AF_SOUTH_1 = "af-south-1"  # CPT
    EU_SOUTH_1 = "eu-south-1"  # MXP
    ME_SOUTH_1 = "me-south-1"  # BAH
    MX_CENTRAL_1 = "mx-central-1"  # QRO
    AP_SOUTHEAST_7 = "ap-southeast-7"  # BKK
    AP_SOUTH_2 = "ap-south-2"  # HYD
    AP_SOUTHEAST_4 = "ap-southeast-4"  # MEL
    EU_CENTRAL_2 = "eu-central-2"  # ZRH
    EU_SOUTH_2 = "eu-south-2"  # ZAZ
    IL_CENTRAL_1 = "il-central-1"  # TLV
    ME_CENTRAL_1 = "me-central-1"  # DXB

0 commit comments

Comments (0)