1111# ANY KIND, either express or implied. See the License for the specific
1212# language governing permissions and limitations under the License.
1313from typing import Optional
14+ import boto3
1415
1516from hyperpod_cli .clients .kubernetes_client import (
1617 KubernetesClient ,
1920from hyperpod_cli .service .list_pods import (
2021 ListPods ,
2122)
23+ from hyperpod_cli .utils import (
24+ get_eks_cluster_name ,
25+ get_hyperpod_cluster_region ,
26+ validate_region_and_cluster_name ,
27+ )
2228from kubernetes .client .rest import ApiException
2329from kubernetes .client import V1ResourceAttributes
30+ import re
31+
# Name of the EKS add-on that ships Container Insights log collection.
# NOTE: the identifier keeps its historical spelling (lower-case "l" in "ClOUD")
# because other code refers to it by this exact name.
AMAZON_ClOUDWATCH_OBSERVABILITY = "amazon-cloudwatch-observability"

# Shape of a CloudWatch console deep link to a pod's Container Insights
# application log stream. Capture groups, in order: console host region,
# query-string region, EKS cluster name, node name, pod name, namespace,
# container name, container id.
# Written as raw strings so the regex escapes are not interpreted (and warned
# about) as string escapes, and with literal dots escaped so that "." only
# matches an actual dot.
CONTAINER_INSIGHTS_LOG_REGEX_PATTERN = (
    r"https://([a-z0-9-]+)\.console\.aws\.amazon\.com/cloudwatch/home\?region=([a-z0-9-]+)"
    r"#logsV2:log-groups/log-group/\$252Faws\$252Fcontainerinsights\$252F([a-zA-Z0-9-]+)\$252Fapplication"
    r"/log-events/([a-z0-9-]+)-application\.var\.log\.containers\."
    r"([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-]+)-([a-z0-9-]+)\.log"
)
2434
2535class GetLogs :
2636 def __init__ (self ):
@@ -57,7 +67,70 @@ def get_training_job_logs(
5767 raise RuntimeError (
5868 f"Given pod name { pod_name } is not associated with training job { job_name } in namespace { namespace } "
5969 )
60-
6170 return k8s_client .get_logs_for_pod (pod_name , namespace )
6271 except ApiException as e :
6372 raise RuntimeError (f"Unexpected API error: { e .reason } ({ e .status } )" )
73+
def generate_cloudwatch_link(
    self,
    pod_name: str,
    namespace: Optional[str],
) -> Optional[str]:
    """Build a message containing the CloudWatch log stream link for a pod.

    The link is only produced when the CloudWatch observability add-on is
    found on the EKS cluster; otherwise ``None`` is returned.

    Args:
        pod_name: Name of the pod whose log stream should be linked.
        namespace: Namespace the pod lives in.

    Returns:
        A human-readable sentence ending with the CloudWatch console URL, or
        ``None`` when the add-on is not enabled on the cluster.

    Raises:
        ValueError: If ``validate_region_and_cluster_name`` rejects the
            region / cluster name, or if the assembled URL does not match
            ``CONTAINER_INSIGHTS_LOG_REGEX_PATTERN`` (for example because the
            node name, container name or container id could not be resolved
            and was left as ``None``).
    """
    eks_cluster_name = get_eks_cluster_name()
    region = get_hyperpod_cluster_region()

    if not self.is_container_insights_addon_enabled(eks_cluster_name):
        return None

    k8s_client = KubernetesClient()

    # pod_details is a V1Pod object
    pod_details = k8s_client.get_pod_details(pod_name, namespace)
    spec = pod_details.spec
    status = pod_details.status

    # Node the pod is scheduled on (None when not available).
    node_name = spec.node_name if spec and spec.node_name else None

    # Only the first container of the pod is considered.
    container_name = None
    if spec and spec.containers and spec.containers[0].name:
        container_name = spec.containers[0].name

    # The container id is reported as "containerd://<id>"; any other runtime
    # prefix leaves the id unresolved.
    container_id = None
    if status and status.container_statuses and status.container_statuses[0].container_id:
        full_container_id = status.container_statuses[0].container_id
        runtime_prefix = 'containerd://'
        if full_container_id.startswith(runtime_prefix):
            container_id = full_container_id[len(runtime_prefix):]

    if not validate_region_and_cluster_name(region, eks_cluster_name):
        raise ValueError('Eks cluster name or cluster region is invalid.')

    # Cloudwatch container insight log groups should have the same pod log as API response
    cloudwatch_url = self.get_log_url(
        eks_cluster_name, region, node_name, pod_name, namespace, container_name, container_id
    )

    # fullmatch (not match) so that nothing may trail the expected ".log"
    # suffix; unresolved parts show up as "None" and are rejected here too.
    if not re.fullmatch(CONTAINER_INSIGHTS_LOG_REGEX_PATTERN, cloudwatch_url):
        raise ValueError("Failed to validate cloudwatch log url. Please make sure pod's node name, container name and container id are valid")

    return f'The pod cloudwatch log stream link is {cloudwatch_url}'
123+
def get_log_url(self, eks_cluster_name, region, node_name, pod_name, namespace, container_name, container_id):
    """Assemble the CloudWatch console URL of a pod's application log stream.

    The URL is the concatenation of the regional console address, the
    Container Insights application log group of ``eks_cluster_name`` and the
    log stream name derived from node, pod, namespace, container name and
    container id. Arguments are interpolated as-is; no validation or
    escaping happens here.
    """
    stream_name = (
        f'{node_name}-application.var.log.containers.'
        f'{pod_name}_{namespace}_{container_name}-{container_id}.log'
    )
    group_path = (
        'logsV2:log-groups/log-group/'
        f'$252Faws$252Fcontainerinsights$252F{eks_cluster_name}$252Fapplication'
        '/log-events/'
    )
    console_base = f'https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#'

    return ''.join((console_base, group_path, stream_name))
130+
def is_container_insights_addon_enabled(self, eks_cluster_name):
    """Return True when the CloudWatch observability add-on is listed on the cluster.

    Calls the EKS ``list_addons`` API for ``eks_cluster_name`` and looks for
    ``AMAZON_ClOUDWATCH_OBSERVABILITY`` among the returned add-on names.
    Result pages are followed through ``nextToken`` so the add-on is found
    even when it is not on the first page of 50 entries.

    Args:
        eks_cluster_name: Name of the EKS cluster to inspect.

    Returns:
        True if the add-on name is present in any page, False otherwise.
    """
    eks_client = boto3.client("eks")
    request = {"clusterName": eks_cluster_name, "maxResults": 50}

    while True:
        response = eks_client.list_addons(**request)
        if AMAZON_ClOUDWATCH_OBSERVABILITY in response.get('addons', []):
            return True

        next_token = response.get('nextToken')
        if not next_token:
            # Last page reached without finding the add-on.
            return False
        request["nextToken"] = next_token
0 commit comments