Skip to content

Commit 16734e0

Browse files
authored
Add Cloudwatch console link for container insight logs (#74)
* Add Cloudwatch console link for container insight logs **Description** When executing get-logs function, after printing all the logs, we also provide the cloudwatch link to container insight logs (if the CW observability add-on is enabled) **Testing Done** Tested in my personal account and verified the link, and added unit test * fix unit tests * Add validation to cw url * Update cw url validation function * Add try except for CW link generation
1 parent 18bb566 commit 16734e0

File tree

8 files changed

+164
-6
lines changed

8 files changed

+164
-6
lines changed

src/hyperpod_cli/clients/kubernetes_client.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,11 @@ def get_job(self, job_name: str, namespace: str):
275275
plural=PYTORCH_CUSTOM_OBJECT_PLURAL,
276276
name=job_name,
277277
)
278+
279+
def get_pod_details(self, pod_name: str, namespace: str):
280+
return client.CoreV1Api().read_namespaced_pod(
281+
name=pod_name, namespace=namespace
282+
)
278283

279284
def delete_training_job(self, job_name: str, namespace: str):
280285
return client.CustomObjectsApi().delete_namespaced_custom_object(

src/hyperpod_cli/commands/pod.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,13 @@ def get_log(
7676
sys.exit(
7777
f"Unexpected error happens when trying to get logs for training job {job_name} : {e}"
7878
)
79-
79+
80+
try:
81+
cloudwatch_link = get_logs_service.generate_cloudwatch_link(pod, namespace=namespace)
82+
if cloudwatch_link:
83+
click.echo(cloudwatch_link)
84+
except Exception as e:
85+
click.echo(f"WARNING: Failed to generate container insights cloudwatch link: {e}")
8086

8187
def _exec_command_required_option_pod_and_all_pods():
8288
class OptionRequiredClass(click.Command):

src/hyperpod_cli/service/get_logs.py

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
1313
from typing import Optional
14+
import boto3
1415

1516
from hyperpod_cli.clients.kubernetes_client import (
1617
KubernetesClient,
@@ -19,8 +20,17 @@
1920
from hyperpod_cli.service.list_pods import (
2021
ListPods,
2122
)
23+
from hyperpod_cli.utils import (
24+
get_eks_cluster_name,
25+
get_hyperpod_cluster_region,
26+
validate_region_and_cluster_name,
27+
)
2228
from kubernetes.client.rest import ApiException
2329
from kubernetes.client import V1ResourceAttributes
30+
import re
31+
32+
AMAZON_ClOUDWATCH_OBSERVABILITY = "amazon-cloudwatch-observability"
33+
CONTAINER_INSIGHTS_LOG_REGEX_PATTERN = "https:\/\/([a-z0-9-]+).console.aws.amazon.com\/cloudwatch\/home\?region=([a-z0-9-]+)#logsV2:log-groups\/log-group\/\$252Faws\$252Fcontainerinsights\$252F([a-zA-Z0-9-]+)\$252Fapplication\/log-events\/([a-z0-9-]+)-application.var.log.containers.([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-]+)-([a-z0-9-]+).log"
2434

2535
class GetLogs:
2636
def __init__(self):
@@ -57,7 +67,70 @@ def get_training_job_logs(
5767
raise RuntimeError(
5868
f"Given pod name {pod_name} is not associated with training job {job_name} in namespace {namespace}"
5969
)
60-
6170
return k8s_client.get_logs_for_pod(pod_name, namespace)
6271
except ApiException as e:
6372
raise RuntimeError(f"Unexpected API error: {e.reason} ({e.status})")
73+
74+
def generate_cloudwatch_link(
75+
self,
76+
pod_name: str,
77+
namespace: Optional[str],
78+
):
79+
eks_cluster_name = get_eks_cluster_name()
80+
region = get_hyperpod_cluster_region()
81+
82+
if self.is_container_insights_addon_enabled(eks_cluster_name):
83+
k8s_client = KubernetesClient()
84+
85+
# pod_details is a V1Pod object
86+
pod_details = k8s_client.get_pod_details(pod_name, namespace)
87+
88+
# get node name
89+
if pod_details.spec and pod_details.spec.node_name:
90+
node_name = pod_details.spec.node_name
91+
else:
92+
node_name = None
93+
94+
# get container name
95+
if pod_details.spec and pod_details.spec.containers and pod_details.spec.containers[0].name:
96+
container_name = pod_details.spec.containers[0].name
97+
else:
98+
container_name = None
99+
100+
# get container_id
101+
if pod_details.status and pod_details.status.container_statuses and pod_details.status.container_statuses[0].container_id:
102+
full_container_id = pod_details.status.container_statuses[0].container_id
103+
104+
# full_container_id has format "containerd://xxxxxxxxxx"
105+
container_id = full_container_id[13:] if full_container_id.startswith('containerd://') else None
106+
else:
107+
container_id = None
108+
109+
# Cloudwatch container insight log groups should have the same pod log as API response
110+
cloudwatch_url = self.get_log_url(eks_cluster_name, region, node_name, pod_name, namespace, container_name, container_id)
111+
112+
if not validate_region_and_cluster_name(region, eks_cluster_name):
113+
raise ValueError('Eks cluster name or cluster region is invalid.')
114+
115+
if not re.match(CONTAINER_INSIGHTS_LOG_REGEX_PATTERN, cloudwatch_url):
116+
raise ValueError("Failed to validate cloudwatch log url. Please make sure pod's node name, container name and container id are valid")
117+
118+
cloudwatch_link = f'The pod cloudwatch log stream link is {cloudwatch_url}'
119+
else:
120+
cloudwatch_link = None
121+
122+
return cloudwatch_link
123+
124+
def get_log_url(self, eks_cluster_name, region, node_name, pod_name, namespace, container_name, container_id):
125+
console_prefix = f'https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#'
126+
log_group_prefix = f'logsV2:log-groups/log-group/$252Faws$252Fcontainerinsights$252F{eks_cluster_name}$252Fapplication/log-events/'
127+
log_stream = f'{node_name}-application.var.log.containers.{pod_name}_{namespace}_{container_name}-{container_id}.log'
128+
129+
return console_prefix + log_group_prefix + log_stream
130+
131+
def is_container_insights_addon_enabled(self, eks_cluster_name):
132+
response = boto3.client("eks").list_addons(clusterName=eks_cluster_name, maxResults=50)
133+
if AMAZON_ClOUDWATCH_OBSERVABILITY in response.get('addons', []):
134+
return True
135+
else:
136+
return False

src/hyperpod_cli/utils.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,15 +113,15 @@ def _retrieve_current_hyperpod_context():
113113

114114

115115
def _validate_link(console_url):
116-
pattern = "https:\/\/([a-z0-9-]+).console.aws.amazon.com\/sagemaker\/home\?region=([a-z0-9-]+)#\/cluster-management\/([a-z0-9-]+)"
116+
pattern = "https:\/\/([a-z0-9-]+).console.aws.amazon.com\/sagemaker\/home\?region=([a-z0-9-]+)#\/cluster-management\/([a-zA-Z0-9-]+)"
117117
match = re.match(pattern, console_url)
118118
if match:
119119
return True
120120
else:
121121
return False
122122

123123

124-
def _validate_placeholders(region, cluster_name):
124+
def validate_region_and_cluster_name(region, cluster_name):
125125
output = False
126126
region_char_list = region.split("-")
127127

@@ -135,6 +135,9 @@ def _validate_placeholders(region, cluster_name):
135135
region_prefix_length = len(region_char_list[0])
136136
region_length = len(region_char_list[1])
137137
region_suffix_length = len(region_char_list[2])
138+
139+
cluster_name_match = re.match("[a-zA-Z0-9-]+", cluster_name)
140+
138141
if (
139142
region_prefix_match
140143
and region_match
@@ -143,6 +146,7 @@ def _validate_placeholders(region, cluster_name):
143146
and region_suffix_length == 1
144147
and region_length >= 4
145148
and region_length < 10
149+
and cluster_name_match
146150
and len(cluster_name) >= 1
147151
and len(cluster_name) <= 63
148152
):
@@ -165,6 +169,15 @@ def get_cluster_console_url():
165169
f"https://{region}.console.aws.amazon.com/sagemaker/"
166170
f"home?region={region}#/cluster-management/{cluster_name}"
167171
)
168-
if _validate_link(console_url) and _validate_placeholders(region, cluster_name):
172+
if _validate_link(console_url) and validate_region_and_cluster_name(region, cluster_name):
169173
return console_url
170174
return None
175+
176+
def get_eks_cluster_name():
177+
hyperpod_context_cluster = _retrieve_current_hyperpod_context()
178+
eks_cluster_arn = hyperpod_context_cluster.get("Orchestrator", {}).get("Eks", {}).get("ClusterArn", '')
179+
return eks_cluster_arn.split('cluster/')[-1]
180+
181+
def get_hyperpod_cluster_region():
182+
hyperpod_context_cluster = _retrieve_current_hyperpod_context()
183+
return hyperpod_context_cluster.get("ClusterArn").split(":")[3]

test/unit_tests/clients/test_kubernetes_client.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,15 @@ def test_logs_for_pods(self, mock_core_client: Mock):
520520
result = test_client.get_logs_for_pod("test", "kubeflow")
521521
self.assertIn("test log", result)
522522

523+
@patch(
524+
"kubernetes.client.CoreV1Api",
525+
return_value=Mock(read_namespaced_pod=Mock(return_value="pod details")),
526+
)
527+
def test_get_pod_details(self, mock_core_client: Mock):
528+
test_client = KubernetesClient()
529+
result = test_client.get_pod_details("test", "kubeflow")
530+
self.assertIn("pod details", result)
531+
523532
@patch(
524533
"kubernetes.client.CustomObjectsApi",
525534
return_value=Mock(

test/unit_tests/service/test_get_logs_service.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,20 @@ def test_get_logs_auto_discover_namespace(
125125
"sample-job", "test-pod", None
126126
)
127127
self.assertIn("test logs", result)
128+
129+
def test_get_log_url(
130+
self,
131+
):
132+
eks_cluster_name = 'eks_cluster_name'
133+
region = 'us-west-2'
134+
node_name = 'node_name'
135+
pod_name = 'pod_name'
136+
namespace = 'namespace'
137+
container_name = 'container_name'
138+
container_id = 'container_id'
139+
result_url = self.mock_get_logs.get_log_url(eks_cluster_name, region, node_name, pod_name, namespace, container_name, container_id)
140+
141+
self.assertEqual(
142+
result_url,
143+
'https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fcontainerinsights$252Feks_cluster_name$252Fapplication/log-events/node_name-application.var.log.containers.pod_name_namespace_container_name-container_id.log'
144+
)

test/unit_tests/test_pod.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,14 @@ def setUp(self):
3535

3636
@mock.patch("hyperpod_cli.service.get_logs.GetLogs")
3737
@mock.patch("hyperpod_cli.service.get_logs.GetLogs.get_training_job_logs")
38+
@mock.patch("hyperpod_cli.service.get_logs.GetLogs.generate_cloudwatch_link")
3839
def test_get_logs_happy_case(
3940
self,
41+
mock_get_logs_service_and_generate_cloudwatch_link: mock.Mock,
4042
mock_get_logs_service_and_get_logs: mock.Mock,
4143
mock_get_logs_service: mock.Mock,
4244
):
45+
mock_get_logs_service_and_generate_cloudwatch_link.return_value = 'link'
4346
mock_get_logs_service.return_value = self.mock_get_job_log
4447
mock_get_logs_service_and_get_logs.return_value = "{}"
4548
result = self.runner.invoke(
@@ -55,15 +58,19 @@ def test_get_logs_happy_case(
5558

5659
@mock.patch("hyperpod_cli.service.get_logs.GetLogs")
5760
@mock.patch("hyperpod_cli.service.get_logs.GetLogs.get_training_job_logs")
61+
@mock.patch("hyperpod_cli.service.get_logs.GetLogs.generate_cloudwatch_link")
5862
@mock.patch("logging.Logger.debug")
5963
def test_get_logs_happy_case_debug_mode(
6064
self,
6165
mock_debug: mock.Mock,
66+
mock_get_logs_service_and_generate_cloudwatch_link: mock.Mock,
6267
mock_get_logs_service_and_get_logs: mock.Mock,
6368
mock_get_logs_service: mock.Mock,
6469
):
6570
mock_get_logs_service.return_value = self.mock_get_job_log
6671
mock_get_logs_service_and_get_logs.return_value = "{}"
72+
mock_get_logs_service_and_generate_cloudwatch_link.return_value = 'link'
73+
6774
result = self.runner.invoke(
6875
get_log,
6976
[
@@ -79,13 +86,17 @@ def test_get_logs_happy_case_debug_mode(
7986

8087
@mock.patch("hyperpod_cli.service.get_logs.GetLogs")
8188
@mock.patch("hyperpod_cli.service.get_logs.GetLogs.get_training_job_logs")
89+
@mock.patch("hyperpod_cli.service.get_logs.GetLogs.generate_cloudwatch_link")
8290
def test_describe_job_happy_case_with_namespace(
8391
self,
92+
mock_get_logs_service_and_generate_cloudwatch_link: mock.Mock,
8493
mock_get_logs_service_and_get_logs: mock.Mock,
8594
mock_get_logs_service: mock.Mock,
8695
):
8796
mock_get_logs_service.return_value = self.mock_get_job_log
8897
mock_get_logs_service_and_get_logs.return_value = "{}"
98+
mock_get_logs_service_and_generate_cloudwatch_link.return_value = 'link'
99+
89100
result = self.runner.invoke(
90101
get_log,
91102
[

test/unit_tests/test_utils.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
setup_logger,
2424
get_cluster_console_url,
2525
store_current_hyperpod_context,
26+
get_eks_cluster_name,
27+
get_hyperpod_cluster_region,
2628
)
2729

2830
DATA_JSON = {
@@ -36,7 +38,8 @@
3638
'{"ClusterArn": "arn:aws:sagemaker:us-west-2:1234567890:cluster/test",'
3739
' "ClusterName": "hyperpod-eks-test",'
3840
' "ClusterStatus": "InService",'
39-
' "CreationTime": "2024-08-17 01:26:35.921000+00:00"}'
41+
' "CreationTime": "2024-08-17 01:26:35.921000+00:00",'
42+
' "Orchestrator": {"Eks": {"ClusterArn":"arn:aws:eks:us-west-2:593793038179:cluster/hyperpod-eks-test"}}}'
4043
)
4144

4245
INVALID_DATA = (
@@ -296,3 +299,24 @@ def test_get_cluster_console_url_longer_cluster_name_null(
296299
"/tmp/hyperpod_current_context.json",
297300
"r",
298301
)
302+
303+
def test_get_eks_cluster_name(self):
304+
mock_read = mock_open(read_data=DATA)
305+
with patch("builtins.open", mock_read):
306+
result = get_eks_cluster_name()
307+
self.assertEqual(result, "hyperpod-eks-test")
308+
mock_read.assert_called_once_with(
309+
"/tmp/hyperpod_current_context.json",
310+
"r",
311+
)
312+
313+
def test_get_hyperpod_cluster_region(self):
314+
mock_read = mock_open(read_data=DATA)
315+
with patch("builtins.open", mock_read):
316+
result = get_hyperpod_cluster_region()
317+
self.assertEqual(result, "us-west-2")
318+
mock_read.assert_called_once_with(
319+
"/tmp/hyperpod_current_context.json",
320+
"r",
321+
)
322+

0 commit comments

Comments
 (0)