|
15 | 15 | from fastapi import Depends, Request |
16 | 16 | from kubernetes import client as k8s_client |
17 | 17 | from kubernetes import config as k8s_config |
| 18 | +from kubernetes import watch |
18 | 19 | from kubernetes.client.rest import ApiException |
19 | 20 |
|
20 | 21 |
|
@@ -258,20 +259,27 @@ async def get_pod_logs(self, execution_id: str) -> tuple[dict, str]: |
258 | 259 | async def _wait_for_pod_completion(self, pod_name: str) -> k8s_client.V1Pod: |
259 | 260 | if not self.v1: |
260 | 261 | raise KubernetesServiceError(_K8S_CLIENT_NOT_INITIALIZED_MSG) |
261 | | - logger.info(f"Waiting for pod '{pod_name}' to complete...") |
262 | | - for _ in range(self.POD_RETRY_ATTEMPTS): |
263 | | - try: |
264 | | - pod = await asyncio.to_thread(self.v1.read_namespaced_pod, pod_name, self.NAMESPACE) |
265 | | - if pod.status and pod.status.phase in self.POD_SUCCESS_STATES: |
266 | | - logger.info(f"Pod '{pod_name}' reached terminal phase: {pod.status.phase}") |
267 | | - return pod |
268 | | - except ApiException as e: |
269 | | - if e.status == 404: |
270 | | - logger.warning(f"Pod '{pod_name}' not found, retrying...") |
271 | | - else: |
272 | | - logger.error(f"API Error while waiting for pod '{pod_name}': {e.reason}") |
273 | | - await asyncio.sleep(self.POD_RETRY_INTERVAL) |
274 | | - raise KubernetesPodError(f"Timeout waiting for pod '{pod_name}' to complete.") |
| 262 | + |
| 263 | + w = watch.Watch() |
| 264 | + return await asyncio.to_thread(self._watch_pod, w, pod_name) |
| 265 | + |
| 266 | + def _watch_pod(self, w: watch.Watch, pod_name: str) -> k8s_client.V1Pod: |
| 267 | + for event in w.stream( |
| 268 | + self.v1.list_namespaced_pod, |
| 269 | + namespace=self.NAMESPACE, |
| 270 | + field_selector=f"metadata.name={pod_name}", |
| 271 | + timeout_seconds=300 |
| 272 | + ): |
| 273 | + pod = event['object'] |
| 274 | + |
| 275 | + if event['type'] == 'DELETED': |
| 276 | + raise KubernetesPodError(f"Pod '{pod_name}' was deleted") |
| 277 | + |
| 278 | + if pod.status and pod.status.phase in self.POD_SUCCESS_STATES: |
| 279 | + logger.info(f"Pod '{pod_name}' completed: {pod.status.phase}") |
| 280 | + return pod |
| 281 | + |
| 282 | + raise KubernetesPodError(f"Pod '{pod_name}' watch timeout") |
275 | 283 |
|
276 | 284 | async def _get_container_logs(self, pod_name: str, container_name: str) -> str: |
277 | 285 | if not self.v1: |
|
0 commit comments