33import requests
44from celery import shared_task
55from django .conf import settings
6+ from requests .exceptions import RequestException , Timeout
67
78from utils .time_utils import current_epoch_timestamp
89
910logger = logging .getLogger (__name__ )
1011
12+ # Timeout constants (in seconds)
13+ K8S_API_TIMEOUT = 30
14+ HTTP_REQUEST_TIMEOUT = 30
15+
1116
1217def get_pod_details (namespace = 'drdroid' ):
1318 # Import kubernetes here to avoid loading at module import time
1419 from kubernetes import client , config
15-
20+
1621 try :
1722 # Try to load in-cluster config first, then local config
1823 try :
@@ -21,13 +26,36 @@ def get_pod_details(namespace='drdroid'):
2126 config .load_kube_config ()
2227
2328 v1 = client .CoreV1Api ()
24-
25- # Get all pods in the namespace
26- pods = v1 .list_namespaced_pod (namespace )
27- events = v1 .list_namespaced_event (namespace )
28-
29+
30+ # Get all pods in the namespace with timeout
31+ pods = v1 .list_namespaced_pod (namespace , _request_timeout = K8S_API_TIMEOUT )
32+ events = v1 .list_namespaced_event (namespace , _request_timeout = K8S_API_TIMEOUT )
33+
34+ # Get pod metrics for CPU/memory usage
35+ pod_metrics = {}
36+ try :
37+ custom_api = client .CustomObjectsApi ()
38+ metrics = custom_api .list_namespaced_custom_object (
39+ group = "metrics.k8s.io" ,
40+ version = "v1beta1" ,
41+ namespace = namespace ,
42+ plural = "pods" ,
43+ _request_timeout = K8S_API_TIMEOUT
44+ )
45+ for item in metrics .get ('items' , []):
46+ pod_name = item ['metadata' ]['name' ]
47+ container_metrics = {}
48+ for container in item .get ('containers' , []):
49+ container_metrics [container ['name' ]] = {
50+ 'cpu' : container ['usage' ].get ('cpu' , 'unknown' ),
51+ 'memory' : container ['usage' ].get ('memory' , 'unknown' )
52+ }
53+ pod_metrics [pod_name ] = container_metrics
54+ except Exception as e :
55+ logger .warning (f"Could not fetch pod metrics (metrics-server may not be installed): { e } " )
56+
2957 pod_data = []
30-
58+
3159 for pod in pods .items :
3260 # Extract container statuses
3361 container_statuses = []
@@ -40,7 +68,7 @@ def get_pod_details(namespace='drdroid'):
4068 state = "terminated"
4169 elif container .state .waiting :
4270 state = "waiting"
43-
71+
4472 last_state = "unknown"
4573 if container .last_state .running :
4674 last_state = "running"
@@ -49,13 +77,21 @@ def get_pod_details(namespace='drdroid'):
4977 elif container .last_state .waiting :
5078 last_state = "waiting"
5179
52- container_statuses . append ( {
80+ container_info = {
5381 "name" : container .name ,
5482 "state" : state ,
5583 "last_state" : last_state ,
5684 "ready" : container .ready ,
5785 "restart_count" : container .restart_count
58- })
86+ }
87+
88+ # Add resource usage if available
89+ if pod .metadata .name in pod_metrics :
90+ container_usage = pod_metrics [pod .metadata .name ].get (container .name , {})
91+ container_info ["cpu_usage" ] = container_usage .get ('cpu' , 'unknown' )
92+ container_info ["memory_usage" ] = container_usage .get ('memory' , 'unknown' )
93+
94+ container_statuses .append (container_info )
5995
6096 # Extract events for this pod
6197 pod_events = []
@@ -75,38 +111,49 @@ def get_pod_details(namespace='drdroid'):
75111 "containers" : container_statuses ,
76112 "events" : pod_events
77113 })
78-
114+
79115 return pod_data
80116
81117 except Exception as e :
82- logger .error (f"Error fetching pod details: { e } " )
118+ logger .error (f"Error fetching pod details: { e } " , exc_info = True )
83119 return []
84120
85121
86- @shared_task ( max_retries = 3 , default_retry_delay = 10 )
122+ @shared_task
87123def send_ping_to_drd_cloud ():
88124 drd_cloud_host = settings .DRD_CLOUD_API_HOST
89125 drd_cloud_api_token = settings .DRD_CLOUD_API_TOKEN
90126 commit_hash = settings .VPC_AGENT_COMMIT_HASH
91127 current_epoch = current_epoch_timestamp ()
92128
93- # Get pod details
94- pod_details = get_pod_details ()
95-
96- # Establish reachability with DRD Cloud
97- payload = {
98- 'commit_hash' : commit_hash ,
99- 'pods' : pod_details
100- }
101-
102- response = requests .post (f'{ drd_cloud_host } /connectors/proxy/ping' ,
103- headers = {'Authorization' : f'Bearer { drd_cloud_api_token } ' },
104- json = payload )
105-
106- if response .status_code != 200 :
107- logger .error (f'Failed to connect to DRD Cloud at { current_epoch } with code: { response .status_code } '
108- f'and response { response .text } ' )
129+ try :
130+ # Get pod details
131+ pod_details = get_pod_details ()
132+
133+ # Establish reachability with DRD Cloud
134+ payload = {
135+ 'commit_hash' : commit_hash ,
136+ 'pods' : pod_details
137+ }
138+
139+ response = requests .post (
140+ f'{ drd_cloud_host } /connectors/proxy/ping' ,
141+ headers = {'Authorization' : f'Bearer { drd_cloud_api_token } ' },
142+ json = payload ,
143+ timeout = HTTP_REQUEST_TIMEOUT
144+ )
145+
146+ if response .status_code != 200 :
147+ logger .error (f'Failed to connect to DRD Cloud at { current_epoch } with code: { response .status_code } '
148+ f'and response { response .text } ' )
149+ return False
150+ else :
151+ logger .info (f'Successfully connected to DRD Cloud at { current_epoch } ' )
152+ return True
153+
154+ except (RequestException , Timeout ) as e :
155+ logger .error (f'Request error while pinging DRD Cloud at { current_epoch } : { str (e )} ' )
156+ return False
157+ except Exception as e :
158+ logger .error (f'Unexpected error while pinging DRD Cloud at { current_epoch } : { str (e )} ' )
109159 return False
110- else :
111- logger .info (f'Successfully connected to DRD Cloud at { current_epoch } ' )
112- return True
0 commit comments