Skip to content

Commit 862bd49

Browse files
committed
feat: Introduce Celery tasks overview, enhance pod detail fetching with resource metrics and API timeouts, and improve error handling.
1 parent 9bb9197 commit 862bd49

File tree

1 file changed

+79
-32
lines changed

1 file changed

+79
-32
lines changed

agent/tasks.py

Lines changed: 79 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,21 @@
33
import requests
44
from celery import shared_task
55
from django.conf import settings
6+
from requests.exceptions import RequestException, Timeout
67

78
from utils.time_utils import current_epoch_timestamp
89

910
logger = logging.getLogger(__name__)
1011

12+
# Timeout constants (in seconds)
13+
K8S_API_TIMEOUT = 30
14+
HTTP_REQUEST_TIMEOUT = 30
15+
1116

1217
def get_pod_details(namespace='drdroid'):
1318
# Import kubernetes here to avoid loading at module import time
1419
from kubernetes import client, config
15-
20+
1621
try:
1722
# Try to load in-cluster config first, then local config
1823
try:
@@ -21,13 +26,36 @@ def get_pod_details(namespace='drdroid'):
2126
config.load_kube_config()
2227

2328
v1 = client.CoreV1Api()
24-
25-
# Get all pods in the namespace
26-
pods = v1.list_namespaced_pod(namespace)
27-
events = v1.list_namespaced_event(namespace)
28-
29+
30+
# Get all pods in the namespace with timeout
31+
pods = v1.list_namespaced_pod(namespace, _request_timeout=K8S_API_TIMEOUT)
32+
events = v1.list_namespaced_event(namespace, _request_timeout=K8S_API_TIMEOUT)
33+
34+
# Get pod metrics for CPU/memory usage
35+
pod_metrics = {}
36+
try:
37+
custom_api = client.CustomObjectsApi()
38+
metrics = custom_api.list_namespaced_custom_object(
39+
group="metrics.k8s.io",
40+
version="v1beta1",
41+
namespace=namespace,
42+
plural="pods",
43+
_request_timeout=K8S_API_TIMEOUT
44+
)
45+
for item in metrics.get('items', []):
46+
pod_name = item['metadata']['name']
47+
container_metrics = {}
48+
for container in item.get('containers', []):
49+
container_metrics[container['name']] = {
50+
'cpu': container['usage'].get('cpu', 'unknown'),
51+
'memory': container['usage'].get('memory', 'unknown')
52+
}
53+
pod_metrics[pod_name] = container_metrics
54+
except Exception as e:
55+
logger.warning(f"Could not fetch pod metrics (metrics-server may not be installed): {e}")
56+
2957
pod_data = []
30-
58+
3159
for pod in pods.items:
3260
# Extract container statuses
3361
container_statuses = []
@@ -40,7 +68,7 @@ def get_pod_details(namespace='drdroid'):
4068
state = "terminated"
4169
elif container.state.waiting:
4270
state = "waiting"
43-
71+
4472
last_state = "unknown"
4573
if container.last_state.running:
4674
last_state = "running"
@@ -49,13 +77,21 @@ def get_pod_details(namespace='drdroid'):
4977
elif container.last_state.waiting:
5078
last_state = "waiting"
5179

52-
container_statuses.append({
80+
container_info = {
5381
"name": container.name,
5482
"state": state,
5583
"last_state": last_state,
5684
"ready": container.ready,
5785
"restart_count": container.restart_count
58-
})
86+
}
87+
88+
# Add resource usage if available
89+
if pod.metadata.name in pod_metrics:
90+
container_usage = pod_metrics[pod.metadata.name].get(container.name, {})
91+
container_info["cpu_usage"] = container_usage.get('cpu', 'unknown')
92+
container_info["memory_usage"] = container_usage.get('memory', 'unknown')
93+
94+
container_statuses.append(container_info)
5995

6096
# Extract events for this pod
6197
pod_events = []
@@ -75,38 +111,49 @@ def get_pod_details(namespace='drdroid'):
75111
"containers": container_statuses,
76112
"events": pod_events
77113
})
78-
114+
79115
return pod_data
80116

81117
except Exception as e:
82-
logger.error(f"Error fetching pod details: {e}")
118+
logger.error(f"Error fetching pod details: {e}", exc_info=True)
83119
return []
84120

85121

86-
@shared_task(max_retries=3, default_retry_delay=10)
122+
@shared_task
87123
def send_ping_to_drd_cloud():
88124
drd_cloud_host = settings.DRD_CLOUD_API_HOST
89125
drd_cloud_api_token = settings.DRD_CLOUD_API_TOKEN
90126
commit_hash = settings.VPC_AGENT_COMMIT_HASH
91127
current_epoch = current_epoch_timestamp()
92128

93-
# Get pod details
94-
pod_details = get_pod_details()
95-
96-
# Establish reachability with DRD Cloud
97-
payload = {
98-
'commit_hash': commit_hash,
99-
'pods': pod_details
100-
}
101-
102-
response = requests.post(f'{drd_cloud_host}/connectors/proxy/ping',
103-
headers={'Authorization': f'Bearer {drd_cloud_api_token}'},
104-
json=payload)
105-
106-
if response.status_code != 200:
107-
logger.error(f'Failed to connect to DRD Cloud at {current_epoch} with code: {response.status_code} '
108-
f'and response {response.text}')
129+
try:
130+
# Get pod details
131+
pod_details = get_pod_details()
132+
133+
# Establish reachability with DRD Cloud
134+
payload = {
135+
'commit_hash': commit_hash,
136+
'pods': pod_details
137+
}
138+
139+
response = requests.post(
140+
f'{drd_cloud_host}/connectors/proxy/ping',
141+
headers={'Authorization': f'Bearer {drd_cloud_api_token}'},
142+
json=payload,
143+
timeout=HTTP_REQUEST_TIMEOUT
144+
)
145+
146+
if response.status_code != 200:
147+
logger.error(f'Failed to connect to DRD Cloud at {current_epoch} with code: {response.status_code} '
148+
f'and response {response.text}')
149+
return False
150+
else:
151+
logger.info(f'Successfully connected to DRD Cloud at {current_epoch}')
152+
return True
153+
154+
except (RequestException, Timeout) as e:
155+
logger.error(f'Request error while pinging DRD Cloud at {current_epoch}: {str(e)}')
156+
return False
157+
except Exception as e:
158+
logger.error(f'Unexpected error while pinging DRD Cloud at {current_epoch}: {str(e)}')
109159
return False
110-
else:
111-
logger.info(f'Successfully connected to DRD Cloud at {current_epoch}')
112-
return True

0 commit comments

Comments
 (0)