Skip to content

Commit f753ece

Browse files
committed
Add Cloudwatch console link for container insight logs
**Description** When executing get-logs function, after printing all the logs, we also provide the cloudwatch link to container insight logs (if the CW observability add-on is enabled) **Testing Done** Tested in my personal account and verified the link, and added unit test
1 parent e8b5b27 commit f753ece

File tree

7 files changed

+138
-3
lines changed

7 files changed

+138
-3
lines changed

src/hyperpod_cli/clients/kubernetes_client.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,11 @@ def get_job(self, job_name: str, namespace: str):
275275
plural=PYTORCH_CUSTOM_OBJECT_PLURAL,
276276
name=job_name,
277277
)
278+
279+
def get_pod_details(self, pod_name: str, namespace: str):
280+
return client.CoreV1Api().read_namespaced_pod(
281+
name=pod_name, namespace=namespace
282+
)
278283

279284
def delete_training_job(self, job_name: str, namespace: str):
280285
return client.CustomObjectsApi().delete_namespaced_custom_object(

src/hyperpod_cli/commands/pod.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ def get_log(
7272
job_name, pod, namespace=namespace
7373
)
7474
click.echo(result)
75+
76+
cloudwatch_link = get_logs_service.generate_cloudwatch_link(pod, namespace=namespace)
77+
if cloudwatch_link:
78+
click.echo(cloudwatch_link)
7579
except Exception as e:
7680
sys.exit(
7781
f"Unexpected error happens when trying to get logs for training job {job_name} : {e}"

src/hyperpod_cli/service/get_logs.py

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
1313
from typing import Optional
14+
import boto3
1415

1516
from hyperpod_cli.clients.kubernetes_client import (
1617
KubernetesClient,
@@ -19,9 +20,15 @@
1920
from hyperpod_cli.service.list_pods import (
2021
ListPods,
2122
)
23+
from hyperpod_cli.utils import (
24+
get_eks_cluster_name,
25+
get_hyperpod_cluster_region,
26+
)
2227
from kubernetes.client.rest import ApiException
2328
from kubernetes.client import V1ResourceAttributes
2429

30+
AMAZON_ClOUDWATCH_OBSERVABILITY = "amazon-cloudwatch-observability"
31+
2532
class GetLogs:
2633
def __init__(self):
2734
return
@@ -57,7 +64,67 @@ def get_training_job_logs(
5764
raise RuntimeError(
5865
f"Given pod name {pod_name} is not associated with training job {job_name} in namespace {namespace}"
5966
)
60-
6167
return k8s_client.get_logs_for_pod(pod_name, namespace)
6268
except ApiException as e:
6369
raise RuntimeError(f"Unexpected API error: {e.reason} ({e.status})")
70+
71+
def generate_cloudwatch_link(
72+
self,
73+
pod_name: str,
74+
namespace: Optional[str],
75+
):
76+
eks_cluster_name = get_eks_cluster_name()
77+
78+
if self.is_container_insights_addon_enabled(eks_cluster_name):
79+
k8s_client = KubernetesClient()
80+
81+
# pod_details is a V1Pod object
82+
pod_details = k8s_client.get_pod_details(pod_name, namespace)
83+
84+
# get node name
85+
if pod_details.spec and pod_details.spec.node_name:
86+
node_name = pod_details.spec.node_name
87+
else:
88+
node_name = None
89+
90+
# get container name
91+
if pod_details.spec and pod_details.spec.containers and pod_details.spec.containers[0].name:
92+
container_name = pod_details.spec.containers[0].name
93+
else:
94+
container_name = None
95+
96+
# get container_id
97+
if pod_details.status and pod_details.status.container_statuses and pod_details.status.container_statuses[0].container_id:
98+
full_container_id = pod_details.status.container_statuses[0].container_id
99+
100+
# full_container_id has format "containerd://xxxxxxxxxx"
101+
container_id = full_container_id[13:] if full_container_id.startswith('containerd://') else None
102+
else:
103+
container_id = None
104+
105+
# Cloudwatch container insight log groups should have the same pod log as API response
106+
if node_name and pod_name and namespace and container_name and container_id:
107+
region = get_hyperpod_cluster_region()
108+
109+
cloudwatch_url = self.get_log_url(eks_cluster_name, region, node_name, pod_name, namespace, container_name, container_id)
110+
cloudwatch_link = f'The pod cloudwatch log stream link is {cloudwatch_url}'
111+
else:
112+
cloudwatch_link = 'Failed to load container insights CloudWatch Link!'
113+
else:
114+
cloudwatch_link = None
115+
116+
return cloudwatch_link
117+
118+
def get_log_url(self, eks_cluster_name, region, node_name, pod_name, namespace, container_name, container_id):
119+
console_prefix = f'https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#'
120+
log_group_prefix = f'logsV2:log-groups/log-group/$252Faws$252Fcontainerinsights$252F{eks_cluster_name}$252Fapplication/log-events/'
121+
log_stream = f'{node_name}-application.var.log.containers.{pod_name}_{namespace}_{container_name}-{container_id}.log'
122+
123+
return console_prefix + log_group_prefix + log_stream
124+
125+
def is_container_insights_addon_enabled(self, eks_cluster_name):
126+
response = boto3.client("eks").list_addons(clusterName=eks_cluster_name, maxResults=50)
127+
if AMAZON_ClOUDWATCH_OBSERVABILITY in response.get('addons', []):
128+
return True
129+
else:
130+
return False

src/hyperpod_cli/utils.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def _retrieve_current_hyperpod_context():
113113

114114

115115
def _validate_link(console_url):
116-
pattern = "https:\/\/([a-z0-9-]+).console.aws.amazon.com\/sagemaker\/home\?region=([a-z0-9-]+)#\/cluster-management\/([a-z0-9-]+)"
116+
pattern = "https:\/\/([a-z0-9-]+).console.aws.amazon.com\/sagemaker\/home\?region=([a-z0-9-]+)#\/cluster-management\/([a-zA-Z0-9-]+)"
117117
match = re.match(pattern, console_url)
118118
if match:
119119
return True
@@ -168,3 +168,12 @@ def get_cluster_console_url():
168168
if _validate_link(console_url) and _validate_placeholders(region, cluster_name):
169169
return console_url
170170
return None
171+
172+
def get_eks_cluster_name():
173+
hyperpod_context_cluster = _retrieve_current_hyperpod_context()
174+
eks_cluster_arn = hyperpod_context_cluster.get("Orchestrator", {}).get("Eks", {}).get("ClusterArn", '')
175+
return eks_cluster_arn.split('cluster/')[-1]
176+
177+
def get_hyperpod_cluster_region():
178+
hyperpod_context_cluster = _retrieve_current_hyperpod_context()
179+
return hyperpod_context_cluster.get("ClusterArn").split(":")[3]

test/unit_tests/clients/test_kubernetes_client.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,15 @@ def test_logs_for_pods(self, mock_core_client: Mock):
520520
result = test_client.get_logs_for_pod("test", "kubeflow")
521521
self.assertIn("test log", result)
522522

523+
@patch(
524+
"kubernetes.client.CoreV1Api",
525+
return_value=Mock(read_namespaced_pod=Mock(return_value="pod details")),
526+
)
527+
def test_get_pod_details(self, mock_core_client: Mock):
528+
test_client = KubernetesClient()
529+
result = test_client.get_pod_details("test", "kubeflow")
530+
self.assertIn("pod details", result)
531+
523532
@patch(
524533
"kubernetes.client.CustomObjectsApi",
525534
return_value=Mock(

test/unit_tests/service/test_get_logs_service.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,20 @@ def test_get_logs_auto_discover_namespace(
125125
"sample-job", "test-pod", None
126126
)
127127
self.assertIn("test logs", result)
128+
129+
def test_get_log_url(
130+
self,
131+
):
132+
eks_cluster_name = 'eks_cluster_name'
133+
region = 'us-west-2'
134+
node_name = 'node_name'
135+
pod_name = 'pod_name'
136+
namespace = 'namespace'
137+
container_name = 'container_name'
138+
container_id = 'container_id'
139+
result_url = self.mock_get_logs.get_log_url(eks_cluster_name, region, node_name, pod_name, namespace, container_name, container_id)
140+
141+
self.assertEqual(
142+
result_url,
143+
'https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fcontainerinsights$252Feks_cluster_name$252Fapplication/log-events/node_name-application.var.log.containers.pod_name_namespace_container_name-container_id.log'
144+
)

test/unit_tests/test_utils.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
setup_logger,
2424
get_cluster_console_url,
2525
store_current_hyperpod_context,
26+
get_eks_cluster_name,
27+
get_hyperpod_cluster_region,
2628
)
2729

2830
DATA_JSON = {
@@ -36,7 +38,8 @@
3638
'{"ClusterArn": "arn:aws:sagemaker:us-west-2:1234567890:cluster/test",'
3739
' "ClusterName": "hyperpod-eks-test",'
3840
' "ClusterStatus": "InService",'
39-
' "CreationTime": "2024-08-17 01:26:35.921000+00:00"}'
41+
' "CreationTime": "2024-08-17 01:26:35.921000+00:00",'
42+
' "Orchestrator": {"Eks": {"ClusterArn":"arn:aws:eks:us-west-2:593793038179:cluster/hyperpod-eks-test"}}}'
4043
)
4144

4245
INVALID_DATA = (
@@ -296,3 +299,24 @@ def test_get_cluster_console_url_longer_cluster_name_null(
296299
"/tmp/hyperpod_current_context.json",
297300
"r",
298301
)
302+
303+
def test_get_eks_cluster_name(self):
304+
mock_read = mock_open(read_data=DATA)
305+
with patch("builtins.open", mock_read):
306+
result = get_eks_cluster_name()
307+
self.assertEqual(result, "hyperpod-eks-test")
308+
mock_read.assert_called_once_with(
309+
"/tmp/hyperpod_current_context.json",
310+
"r",
311+
)
312+
313+
def test_get_hyperpod_cluster_region(self):
314+
mock_read = mock_open(read_data=DATA)
315+
with patch("builtins.open", mock_read):
316+
result = get_hyperpod_cluster_region()
317+
self.assertEqual(result, "us-west-2")
318+
mock_read.assert_called_once_with(
319+
"/tmp/hyperpod_current_context.json",
320+
"r",
321+
)
322+

0 commit comments

Comments
 (0)