Skip to content

Commit 0dedc06

Browse files
author
Roja Reddy Sareddy
committed
Enable Hyperpod telemetry
1 parent 8f0d9ef commit 0dedc06

File tree

10 files changed

+333
-1
lines changed

10 files changed

+333
-1
lines changed

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@
8888
"pydantic>=2.10.6,<3.0.0",
8989
"hyperpod-pytorch-job-template>=1.0.0, <2.0.0",
9090
"hyperpod-custom-inference-template>=1.0.0, <2.0.0",
91-
"hyperpod-jumpstart-inference-template>=1.0.0, <2.0.0"
91+
"hyperpod-jumpstart-inference-template>=1.0.0, <2.0.0",
92+
"sagemaker",
9293
],
9394
entry_points={
9495
"console_scripts": [

src/sagemaker/hyperpod/cli/telemetry/__init__.py renamed to src/sagemaker/hyperpod/common/telemetry/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,5 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
from .telemetry_logging import _hyperpod_telemetry_emitter
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
from __future__ import absolute_import
2+
import logging
3+
import platform
4+
import sys
5+
from time import perf_counter
6+
from typing import List, Tuple
7+
import functools
8+
import requests
9+
import subprocess
10+
import re
11+
12+
import boto3
13+
from sagemaker.telemetry.constants import Feature, Status, Region
14+
from sagemaker.telemetry.telemetry_logging import (
15+
FEATURE_TO_CODE,
16+
STATUS_TO_CODE,
17+
_requests_helper,
18+
_construct_url,
19+
)
20+
import importlib.metadata
21+
22+
SDK_VERSION = importlib.metadata.version("sagemaker-hyperpod")
23+
DEFAULT_AWS_REGION = "us-east-2"
24+
OS_NAME = platform.system() or "UnresolvedOS"
25+
OS_VERSION = platform.release() or "UnresolvedOSVersion"
26+
OS_NAME_VERSION = "{}/{}".format(OS_NAME, OS_VERSION)
27+
PYTHON_VERSION = "{}.{}.{}".format(
28+
sys.version_info.major, sys.version_info.minor, sys.version_info.micro
29+
)
30+
31+
logger = logging.getLogger(__name__)
32+
33+
34+
def get_region_and_account_from_current_context() -> Tuple[str, str]:
35+
"""
36+
Get region and account ID from current kubernetes context
37+
Returns: (region, account_id)
38+
"""
39+
try:
40+
# Get current context
41+
result = subprocess.run(
42+
["kubectl", "config", "current-context"], capture_output=True, text=True
43+
)
44+
45+
if result.returncode == 0:
46+
context = result.stdout.strip()
47+
48+
# Extract region
49+
region_pattern = r"([a-z]{2}-[a-z]+-\d{1})"
50+
region = DEFAULT_AWS_REGION
51+
if match := re.search(region_pattern, context):
52+
region = match.group(1)
53+
54+
# Extract account ID (12 digits)
55+
account_pattern = r"(\d{12})"
56+
account = "unknown"
57+
if match := re.search(account_pattern, context):
58+
account = match.group(1)
59+
60+
return region, account
61+
62+
except Exception as e:
63+
logger.debug(f"Failed to get context info from kubectl: {e}")
64+
65+
return DEFAULT_AWS_REGION, "unknown"
66+
67+
68+
def _send_telemetry_request(
69+
status: int,
70+
feature_list: List[int],
71+
session,
72+
failure_reason: str = None,
73+
failure_type: str = None,
74+
extra_info: str = None,
75+
) -> None:
76+
"""Make GET request to an empty object in S3 bucket"""
77+
try:
78+
region, accountId = get_region_and_account_from_current_context()
79+
80+
try:
81+
Region(region) # Validate the region
82+
except ValueError:
83+
logger.warning(
84+
"Region not found in supported regions. Telemetry request will not be emitted."
85+
)
86+
return
87+
88+
url = _construct_url(
89+
accountId,
90+
region,
91+
str(status),
92+
str(
93+
",".join(map(str, feature_list))
94+
), # Remove brackets and quotes to cut down on length
95+
failure_reason,
96+
failure_type,
97+
extra_info,
98+
)
99+
# Send the telemetry request
100+
logger.info("Sending telemetry request to [%s]", url)
101+
_requests_helper(url, 2)
102+
logger.info("SageMaker Python SDK telemetry successfully emitted.")
103+
except Exception: # pylint: disable=W0703
104+
logger.debug("SageMaker Python SDK telemetry not emitted!")
105+
106+
107+
def _hyperpod_telemetry_emitter(feature: str, func_name: str):
108+
def decorator(func):
109+
@functools.wraps(func)
110+
def wrapper(*args, **kwargs):
111+
extra = (
112+
f"{func_name}"
113+
f"&x-sdkVersion={SDK_VERSION}"
114+
f"&x-env={PYTHON_VERSION}"
115+
f"&x-sys={OS_NAME_VERSION}"
116+
)
117+
start = perf_counter()
118+
try:
119+
result = func(*args, **kwargs)
120+
duration = round(perf_counter() - start, 2)
121+
extra += f"&x-latency={duration}"
122+
_send_telemetry_request(
123+
Status.SUCCESS,
124+
[FEATURE_TO_CODE[str(feature)]],
125+
None,
126+
None,
127+
None,
128+
extra,
129+
)
130+
return result
131+
except Exception as e:
132+
duration = round(perf_counter() - start, 2)
133+
extra += f"&x-latency={duration}"
134+
_send_telemetry_request(
135+
Status.FAILURE,
136+
[FEATURE_TO_CODE[str(feature)]],
137+
None,
138+
str(e),
139+
type(e).__name__,
140+
extra,
141+
)
142+
raise
143+
144+
return wrapper
145+
146+
return decorator
File renamed without changes.

src/sagemaker/hyperpod/inference/hp_endpoint.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
InferenceEndpointConfigStatus,
1212
_HPEndpoint,
1313
)
14+
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
15+
_hyperpod_telemetry_emitter,
16+
)
17+
from sagemaker.telemetry.constants import Feature
1418
from sagemaker.hyperpod.inference.hp_endpoint_base import HPEndpointBase
1519
from typing import Dict, List, Optional
1620
from sagemaker_core.main.resources import Endpoint
@@ -21,6 +25,7 @@ class HPEndpoint(_HPEndpoint, HPEndpointBase):
2125
metadata: Optional[Metadata] = Field(default=None)
2226
status: Optional[InferenceEndpointConfigStatus] = Field(default=None)
2327

28+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_endpoint")
2429
def create(
2530
self,
2631
name=None,
@@ -59,6 +64,7 @@ def create(
5964
f"Creating sagemaker model and endpoint. Endpoint name: {spec.endpointName}.\n The process may take a few minutes..."
6065
)
6166

67+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_endpoint_from_dict")
6268
def create_from_dict(
6369
self,
6470
input: Dict,
@@ -116,6 +122,7 @@ def refresh(self):
116122
return self
117123

118124
@classmethod
125+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_endpoints")
119126
def list(
120127
cls,
121128
namespace: str = None,
@@ -138,6 +145,7 @@ def list(
138145
return endpoints
139146

140147
@classmethod
148+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get_endpoint")
141149
def get(cls, name: str, namespace: str = None) -> Endpoint:
142150
if not namespace:
143151
namespace = get_default_namespace()
@@ -163,6 +171,7 @@ def get(cls, name: str, namespace: str = None) -> Endpoint:
163171

164172
return endpoint
165173

174+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "delete_endpoint")
166175
def delete(self) -> None:
167176
logger = self.get_logger()
168177
logger = setup_logging(logger)
@@ -174,6 +183,7 @@ def delete(self) -> None:
174183
)
175184
logger.info(f"Deleting HPEndpoint: {self.metadata.name}...")
176185

186+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "invoke_endpoint")
177187
def invoke(self, body, content_type="application/json"):
178188
if not self.endpointName:
179189
raise Exception("SageMaker endpoint name not found in this object!")

src/sagemaker/hyperpod/inference/hp_endpoint_base.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
setup_logging,
1616
get_default_namespace,
1717
)
18+
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
19+
_hyperpod_telemetry_emitter,
20+
)
21+
from sagemaker.telemetry.constants import Feature
1822

1923

2024
class HPEndpointBase:
@@ -130,6 +134,7 @@ def call_delete_api(
130134
handle_exception(e, name, namespace)
131135

132136
@classmethod
137+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get_operator_logs")
133138
def get_operator_logs(cls, since_hours: float):
134139
cls.verify_kube_config()
135140

@@ -159,6 +164,7 @@ def get_operator_logs(cls, since_hours: float):
159164
return logs
160165

161166
@classmethod
167+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get_logs")
162168
def get_logs(
163169
cls,
164170
pod: str,
@@ -194,6 +200,7 @@ def get_logs(
194200
return logs
195201

196202
@classmethod
203+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_pods_endpoint")
197204
def list_pods(cls, namespace=None):
198205
cls.verify_kube_config()
199206

@@ -210,6 +217,7 @@ def list_pods(cls, namespace=None):
210217
return pods
211218

212219
@classmethod
220+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_namespaces")
213221
def list_namespaces(cls):
214222
cls.verify_kube_config()
215223

src/sagemaker/hyperpod/inference/hp_jumpstart_endpoint.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,17 @@
1616
_HPJumpStartEndpoint,
1717
JumpStartModelStatus,
1818
)
19+
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
20+
_hyperpod_telemetry_emitter,
21+
)
22+
from sagemaker.telemetry.constants import Feature
1923

2024

2125
class HPJumpStartEndpoint(_HPJumpStartEndpoint, HPEndpointBase):
2226
metadata: Optional[Metadata] = Field(default=None)
2327
status: Optional[JumpStartModelStatus] = Field(default=None)
2428

29+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_js_endpoint")
2530
def create(
2631
self,
2732
name=None,
@@ -64,6 +69,7 @@ def create(
6469
f"Creating JumpStart model and sagemaker endpoint. Endpoint name: {endpoint_name}.\n The process may take a few minutes..."
6570
)
6671

72+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_js_endpoint_from_dict")
6773
def create_from_dict(
6874
self,
6975
input: Dict,
@@ -125,6 +131,7 @@ def refresh(self):
125131
return self
126132

127133
@classmethod
134+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_js_endpoints")
128135
def list(
129136
cls,
130137
namespace: str = None,
@@ -147,6 +154,7 @@ def list(
147154
return endpoints
148155

149156
@classmethod
157+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get_js_endpoint")
150158
def get(cls, name: str, namespace: str = None):
151159
if not namespace:
152160
namespace = get_default_namespace()
@@ -172,6 +180,7 @@ def get(cls, name: str, namespace: str = None):
172180

173181
return endpoint
174182

183+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "delete_js_endpoint")
175184
def delete(self) -> None:
176185
logger = self.get_logger()
177186
logger = setup_logging(logger)
@@ -185,6 +194,7 @@ def delete(self) -> None:
185194
f"Deleting JumpStart model and sagemaker endpoint: {self.metadata.name}. This may take a few minutes..."
186195
)
187196

197+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "invoke_js_endpoint")
188198
def invoke(self, body, content_type="application/json"):
189199
if not self.sageMakerEndpoint or not self.sageMakerEndpoint.name:
190200
raise Exception("SageMaker endpoint name not found in this object!")

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
get_default_namespace,
1414
setup_logging,
1515
)
16+
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
17+
_hyperpod_telemetry_emitter,
18+
)
19+
from sagemaker.telemetry.constants import Feature
1620
import yaml
1721
import logging
1822

@@ -45,6 +49,7 @@ def verify_kube_config(cls):
4549
def get_logger(cls):
4650
return logging.getLogger(__name__)
4751

52+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create")
4853
def create(self, debug=False):
4954
self.verify_kube_config()
5055

@@ -83,6 +88,7 @@ def create(self, debug=False):
8388
handle_exception(e, self.metadata.name, self.metadata.namespace)
8489

8590
@classmethod
91+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list")
8692
def list(cls, namespace=None) -> List["HyperPodPytorchJob"]:
8793
cls.verify_kube_config()
8894

@@ -106,6 +112,7 @@ def list(cls, namespace=None) -> List["HyperPodPytorchJob"]:
106112
logger.error(f"Failed to list HyperpodPytorchJobs!")
107113
handle_exception(e, "", namespace)
108114

115+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "delete")
109116
def delete(self):
110117
self.verify_kube_config()
111118

@@ -128,6 +135,7 @@ def delete(self):
128135
handle_exception(e, self.metadata.name, self.metadata.namespace)
129136

130137
@classmethod
138+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get")
131139
def get(cls, name, namespace=None) -> "HyperPodPytorchJob":
132140
cls.verify_kube_config()
133141

@@ -175,6 +183,7 @@ def refresh(self) -> "HyperPodPytorchJob":
175183
logger.error(f"Failed to refresh HyperPodPytorchJob {self.metadata.name}!")
176184
handle_exception(e, self.metadata.name, self.metadata.namespace)
177185

186+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_pods")
178187
def list_pods(self) -> List[str]:
179188
self.verify_kube_config()
180189

@@ -196,6 +205,7 @@ def list_pods(self) -> List[str]:
196205
logger.error(f"Failed to list pod in namespace {self.metadata.namespace}!")
197206
handle_exception(e, self.metadata.name, self.metadata.namespace)
198207

208+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get_logs_from_pod")
199209
def get_logs_from_pod(self, pod_name: str, container: Optional[str] = None) -> str:
200210
self.verify_kube_config()
201211

test/unit_tests/common/telemetry/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)