Skip to content

Commit ba24c96

Browse files
authored
Adding telemetry to release_v2 (#259)
* Added telemetry to release_v2 * Minor Code Clean-up * Remove exiting system so that telemetry still sends on errors * Adding hyperpod_v2 to names for clarity
1 parent 520886b commit ba24c96

File tree

7 files changed

+732
-16
lines changed

7 files changed

+732
-16
lines changed

src/hyperpod_cli/commands/cluster.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
TEMP_KUBE_CONFIG_FILE,
4444
OutputFormat,
4545
)
46+
from hyperpod_cli.telemetry import _hyperpod_telemetry_emitter
47+
from hyperpod_cli.telemetry.constants import Feature
4648
from hyperpod_cli.telemetry.user_agent import (
4749
get_user_agent_extra_suffix,
4850
)
@@ -107,6 +109,7 @@
107109
multiple=True,
108110
help="Optional. The namespace that you want to check the capacity for. Only SageMaker managed namespaces are supported.",
109111
)
112+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.get_clusters_cli")
110113
def get_clusters(
111114
region: Optional[str],
112115
orchestrator: Optional[str],
@@ -463,6 +466,7 @@ def _aggregate_nodes_info(
463466
is_flag=True,
464467
help="Enable debug mode",
465468
)
469+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.connect_cluster_cli")
466470
def connect_cluster(
467471
cluster_name: str,
468472
region: Optional[str],

src/hyperpod_cli/commands/job.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@
5050
Volume,
5151
USER_NAME_LABEL_KEY,
5252
)
53+
from hyperpod_cli.telemetry import _hyperpod_telemetry_emitter
54+
from hyperpod_cli.telemetry.constants import Feature
5355
from hyperpod_cli.clients.kubernetes_client import (
5456
KubernetesClient,
5557
)
@@ -124,6 +126,7 @@
124126
is_flag=True,
125127
help="Enable debug mode",
126128
)
129+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.get_job_cli")
127130
def get_job(
128131
job_name: str,
129132
namespace: Optional[str],
@@ -144,9 +147,8 @@ def get_job(
144147
result = get_training_job_service.get_training_job(job_name, namespace, verbose)
145148
click.echo(result)
146149
except Exception as e:
147-
sys.exit(
148-
f"Unexpected error happens when trying to get training job {job_name} : {e}"
149-
)
150+
logger.error(f"Unexpected error happens when trying to get training job {job_name} : {e}")
151+
raise
150152

151153

152154
@click.command()
@@ -186,6 +188,7 @@ def get_job(
186188
is_flag=True,
187189
help="Enable debug mode",
188190
)
191+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.list_jobs_cli")
189192
def list_jobs(
190193
namespace: Optional[str],
191194
all_namespaces: Optional[bool],
@@ -205,7 +208,8 @@ def list_jobs(
205208
)
206209
click.echo(result)
207210
except Exception as e:
208-
sys.exit(f"Unexpected error happens when trying to list training job : {e}")
211+
logger.error(f"Unexpected error happens when trying to list training job : {e}")
212+
raise
209213

210214

211215
@click.command()
@@ -228,6 +232,7 @@ def list_jobs(
228232
is_flag=True,
229233
help="Enable debug mode",
230234
)
235+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.list_pods_cli")
231236
def list_pods(
232237
job_name: str,
233238
namespace: Optional[str],
@@ -246,9 +251,8 @@ def list_pods(
246251
result = list_pods_service.list_pods_for_training_job(job_name, namespace, True)
247252
click.echo(result)
248253
except Exception as e:
249-
sys.exit(
250-
f"Unexpected error happens when trying to list pods for training job {job_name} : {e}"
251-
)
254+
logger.error(f"Unexpected error happens when trying to list pods for training job {job_name} : {e}")
255+
raise
252256

253257

254258
@click.command()
@@ -271,6 +275,7 @@ def list_pods(
271275
is_flag=True,
272276
help="Enable debug mode",
273277
)
278+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.cancel_job_cli")
274279
def cancel_job(
275280
job_name: str,
276281
namespace: Optional[str],
@@ -287,9 +292,8 @@ def cancel_job(
287292
result = cancel_training_job_service.cancel_training_job(job_name, namespace)
288293
click.echo(result)
289294
except Exception as e:
290-
sys.exit(
291-
f"Unexpected error happens when trying to cancel training job {job_name} : {e}"
292-
)
295+
logger.error(f"Unexpected error happens when trying to cancel training job {job_name} : {e}")
296+
raise
293297

294298

295299
@click.command()
@@ -536,6 +540,7 @@ def cancel_job(
536540
is_flag=True,
537541
help="Enable debug mode",
538542
)
543+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.start_job_cli")
539544
def start_job(
540545
config_file: Optional[str],
541546
job_name: Optional[str],
@@ -876,6 +881,7 @@ def start_job(
876881
help="Optional. The namespace to use. If not specified, this command will first use the namespace when connecting the cluster. "
877882
"Otherwise if namespace is not configured when connecting to the cluster, a namespace that is managed by SageMaker will be auto discovered.",
878883
)
884+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.patch_job_cli")
879885
def patch_job(patch_type: str, job_name: str, namespace: Optional[str]):
880886

881887
if patch_type not in JobPatchType.get_values():

src/hyperpod_cli/commands/pod.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
setup_logger,
2525
set_logging_level,
2626
)
27+
from hyperpod_cli.telemetry import _hyperpod_telemetry_emitter
28+
from hyperpod_cli.telemetry.constants import Feature
2729

2830
logger = setup_logger(__name__)
2931

@@ -54,6 +56,7 @@
5456
is_flag=True,
5557
help="Enable debug mode",
5658
)
59+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.get_log_cli")
5760
def get_log(
5861
job_name: str,
5962
pod: str,
@@ -73,9 +76,8 @@ def get_log(
7376
)
7477
click.echo(result)
7578
except Exception as e:
76-
sys.exit(
77-
f"Unexpected error happens when trying to get logs for training job {job_name} : {e}"
78-
)
79+
logger.error(f"Unexpected error happens when trying to get logs for training job {job_name} : {e}")
80+
raise
7981

8082
try:
8183
cloudwatch_link = get_logs_service.generate_cloudwatch_link(pod, namespace=namespace)
@@ -148,6 +150,7 @@ def invoke(self, ctx):
148150
is_flag=True,
149151
help="Enable debug mode",
150152
)
153+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_V2, "hyperpod_v2.exec_cli")
151154
def exec(
152155
job_name: str,
153156
namespace: Optional[str],
@@ -173,6 +176,5 @@ def exec(
173176
)
174177
click.echo(result)
175178
except Exception as e:
176-
sys.exit(
177-
f"Unexpected error happens when trying to exec command for pod {pod} : {e}"
178-
)
179+
logger.error(f"Unexpected error happens when trying to exec command for pod {pod} : {e}")
180+
raise

src/hyperpod_cli/telemetry/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,5 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
from .telemetry_logging import _hyperpod_telemetry_emitter
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from __future__ import absolute_import
2+
from enum import Enum
3+
4+
5+
class Feature(Enum):
    """Enumeration of feature names used in telemetry.

    Members carry an integer value. ``str(member)`` is overridden to yield
    the bare member name (for example ``"HYPERPOD_V2"``) instead of the
    default ``"Feature.HYPERPOD_V2"`` form.
    """

    HYPERPOD_V2 = 10

    def __str__(self):  # pylint: disable=E0307
        """Return the feature name."""
        return self.name
13+
14+
15+
class Status(Enum):
    """Enumeration of status values used in telemetry.

    ``SUCCESS`` has the value 1 and ``FAILURE`` the value 0. ``str(member)``
    is overridden to yield the bare member name (for example ``"SUCCESS"``)
    instead of the default ``"Status.SUCCESS"`` form.
    """

    SUCCESS = 1
    FAILURE = 0

    def __str__(self):  # pylint: disable=E0307
        """Return the status name."""
        return self.name
24+
25+
26+
class Region(str, Enum):
    """Telemetry: List of all supported AWS regions.

    The ``str`` mixin makes every member compare equal to its plain region
    string (``Region.US_EAST_1 == "us-east-1"``), and ``Region(value)``
    looks a member up from such a string. The trailing comment on each
    member is the airport-style code noted for that region.
    """

    # Classic
    US_EAST_1 = "us-east-1"  # IAD
    US_EAST_2 = "us-east-2"  # CMH
    US_WEST_1 = "us-west-1"  # SFO
    US_WEST_2 = "us-west-2"  # PDX
    AP_NORTHEAST_1 = "ap-northeast-1"  # NRT
    AP_NORTHEAST_2 = "ap-northeast-2"  # ICN
    AP_NORTHEAST_3 = "ap-northeast-3"  # KIX
    AP_SOUTH_1 = "ap-south-1"  # BOM
    AP_SOUTHEAST_1 = "ap-southeast-1"  # SIN
    AP_SOUTHEAST_2 = "ap-southeast-2"  # SYD
    CA_CENTRAL_1 = "ca-central-1"  # YUL
    EU_CENTRAL_1 = "eu-central-1"  # FRA
    EU_NORTH_1 = "eu-north-1"  # ARN
    EU_WEST_1 = "eu-west-1"  # DUB
    EU_WEST_2 = "eu-west-2"  # LHR
    EU_WEST_3 = "eu-west-3"  # CDG
    SA_EAST_1 = "sa-east-1"  # GRU
    # Opt-in
    AP_EAST_1 = "ap-east-1"  # HKG
    AP_SOUTHEAST_3 = "ap-southeast-3"  # CGK
    AF_SOUTH_1 = "af-south-1"  # CPT
    EU_SOUTH_1 = "eu-south-1"  # MXP
    ME_SOUTH_1 = "me-south-1"  # BAH
    MX_CENTRAL_1 = "mx-central-1"  # QRO
    AP_SOUTHEAST_7 = "ap-southeast-7"  # BKK
    AP_SOUTH_2 = "ap-south-2"  # HYD
    AP_SOUTHEAST_4 = "ap-southeast-4"  # MEL
    EU_CENTRAL_2 = "eu-central-2"  # ZRH
    EU_SOUTH_2 = "eu-south-2"  # ZAZ
    IL_CENTRAL_1 = "il-central-1"  # TLV
    ME_CENTRAL_1 = "me-central-1"  # DXB

0 commit comments

Comments
 (0)