Skip to content

Commit 15e2555

Browse files
committed
Added DaemonSet to pull required images beforehand
1 parent d9df986 commit 15e2555

File tree

6 files changed

+163
-36
lines changed

6 files changed

+163
-36
lines changed

backend/app/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class Settings(BaseSettings):
2525
K8S_POD_CPU_REQUEST: str = "100m"
2626
K8S_POD_MEMORY_REQUEST: str = "128Mi"
2727
K8S_POD_EXECUTION_TIMEOUT: int = 5 # in seconds
28+
K8S_POD_PRIORITY_CLASS_NAME: Optional[str] = None
2829

2930
SUPPORTED_RUNTIMES: dict[str, list[str]] = Field(
3031
default_factory=lambda: RUNTIME_MATRIX

backend/app/main.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
from contextlib import asynccontextmanager
23
from typing import AsyncGenerator
34

@@ -7,7 +8,7 @@
78
from app.core.logging import logger
89
from app.core.middleware import RequestSizeLimitMiddleware
910
from app.db.mongodb import DatabaseManager
10-
from app.services.kubernetes_service import KubernetesServiceManager
11+
from app.services.kubernetes_service import KubernetesService, KubernetesServiceManager
1112
from fastapi import FastAPI
1213
from fastapi.middleware.cors import CORSMiddleware
1314
from prometheus_fastapi_instrumentator import Instrumentator
@@ -39,6 +40,13 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
3940
app.state.k8s_manager = k8s_manager
4041
logger.info("Kubernetes service manager initialized")
4142

43+
k8s_service = KubernetesService(k8s_manager)
44+
app.state.k8s_service = k8s_service
45+
logger.info("KubernetesService singleton instance created.")
46+
47+
daemonset_task = asyncio.create_task(k8s_service.ensure_image_pre_puller_daemonset())
48+
app.state.daemonset_task = daemonset_task
49+
logger.info("Kubernetes image pre-puller daemonset task scheduled.")
4250
except ConnectionError as e:
4351
logger.critical(f"Failed to initialize DatabaseManager: {e}", extra={"error": str(e)})
4452
raise RuntimeError("Application startup failed: Could not connect to database.") from e
@@ -53,6 +61,15 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
5361

5462
# Shutdown
5563
try:
64+
if hasattr(app.state, "daemonset_task") and app.state.daemonset_task:
65+
task = app.state.daemonset_task
66+
if not task.done():
67+
task.cancel()
68+
try:
69+
await task
70+
except asyncio.CancelledError:
71+
logger.info("Image pre-puller daemonset task cancelled successfully.")
72+
5673
if hasattr(app.state, "k8s_manager") and app.state.k8s_manager:
5774
await app.state.k8s_manager.shutdown_all()
5875
logger.info("All Kubernetes services shut down")

backend/app/runtime_registry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ class RuntimeConfig(NamedTuple):
3838
"versions": ["1.20", "1.21", "1.22"],
3939
"image_tpl": "golang:{version}-alpine",
4040
"file_ext": "go",
41-
"interpreter": ["bash", "-c", "go run /scripts/main.go"],
41+
"interpreter": ["sh", "-c", "go run /scripts/main.go"],
4242
},
4343
}
4444

backend/app/services/kubernetes_service.py

Lines changed: 100 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
import os
44
from datetime import datetime, timedelta, timezone
55
from pathlib import Path
6-
from typing import Any, Dict, List, Set
6+
from typing import Any, Dict, List, Optional, Set
77

88
from app.config import get_settings
99
from app.core.logging import logger
10+
from app.runtime_registry import RUNTIME_REGISTRY
1011
from app.services.circuit_breaker import CircuitBreaker
1112
from app.services.pod_manifest_builder import PodManifestBuilder
1213
from fastapi import Depends, Request
@@ -58,9 +59,16 @@ class KubernetesService:
5859
HEALTH_CHECK_INTERVAL = 60
5960
CONTAINER_KUBECONFIG_PATH = "/app/kubeconfig.yaml"
6061

62+
v1: Optional[k8s_client.CoreV1Api]
63+
apps_v1: Optional[k8s_client.AppsV1Api]
64+
version_api: Optional[k8s_client.VersionApi]
65+
6166
def __init__(self, manager: KubernetesServiceManager):
6267
self.settings = get_settings()
6368
self.manager = manager
69+
self.v1 = None
70+
self.apps_v1 = None
71+
self.version_api = None
6472
self._initialize_kubernetes_client()
6573

6674
self.circuit_breaker = CircuitBreaker()
@@ -74,6 +82,10 @@ def __del__(self) -> None:
7482
self.manager.unregister(self)
7583

7684
async def check_health(self) -> bool:
85+
if not self.version_api:
86+
logger.warning("Kubernetes client not available for health check.")
87+
self._is_healthy = False
88+
return False
7789
try:
7890
if (datetime.now(timezone.utc) - self._last_health_check).seconds < self.HEALTH_CHECK_INTERVAL:
7991
return self._is_healthy
@@ -91,7 +103,6 @@ async def check_health(self) -> bool:
91103

92104
async def graceful_shutdown(self) -> None:
93105
shutdown_deadline = datetime.now(timezone.utc) + timedelta(seconds=self.SHUTDOWN_TIMEOUT)
94-
# Make a copy of keys to avoid modification during iteration issues
95106
for pod_name in list(self._active_pods.keys()):
96107
if datetime.now(timezone.utc) > shutdown_deadline:
97108
logger.warning("Shutdown timeout reached, forcing pod termination")
@@ -109,12 +120,14 @@ def _initialize_kubernetes_client(self) -> None:
109120
try:
110121
self._setup_kubernetes_config()
111122
self.v1 = k8s_client.CoreV1Api()
123+
self.apps_v1 = k8s_client.AppsV1Api()
112124
self.version_api = k8s_client.VersionApi()
113125
self._test_api_connection()
114126
logger.info("Kubernetes client initialized successfully.")
115127
except Exception as e:
116128
logger.error(f"Failed to initialize Kubernetes client: {str(e)}")
117129
self.v1 = None
130+
self.apps_v1 = None
118131
self.version_api = None
119132
raise KubernetesConfigError(f"Failed to initialize Kubernetes client: {str(e)}") from e
120133

@@ -137,6 +150,8 @@ def _setup_kubernetes_config(self) -> None:
137150
logger.info(f"Kubernetes client configured for host: {default_config.host}")
138151

139152
def _test_api_connection(self) -> None:
153+
if not self.version_api:
154+
raise KubernetesConfigError("VersionAPI client not initialized.")
140155
try:
141156
version = self.version_api.get_code()
142157
logger.info(f"Successfully connected to Kubernetes API. Server version: {version.git_version}")
@@ -183,6 +198,7 @@ async def create_execution_pod(
183198
pod_memory_limit=self.settings.K8S_POD_MEMORY_LIMIT,
184199
pod_memory_request=self.settings.K8S_POD_MEMORY_REQUEST,
185200
pod_execution_timeout=self.settings.K8S_POD_EXECUTION_TIMEOUT,
201+
priority_class_name=self.settings.K8S_POD_PRIORITY_CLASS_NAME,
186202
)
187203
pod_manifest = builder.build()
188204
await self._create_namespaced_pod(pod_manifest)
@@ -207,7 +223,6 @@ async def get_pod_logs(self, execution_id: str) -> tuple[dict, str]:
207223
logger.info(f"Raw logs from pod {pod_name}:\n---\n{full_logs}\n---")
208224

209225
try:
210-
# https://stackoverflow.com/questions/15197673/using-pythons-eval-vs-ast-literal-eval
211226
metrics = ast.literal_eval(full_logs)
212227
return metrics, pod_phase
213228
except (ValueError, SyntaxError, TypeError) as e:
@@ -226,6 +241,8 @@ async def get_pod_logs(self, execution_id: str) -> tuple[dict, str]:
226241
self._active_pods.pop(execution_id, None)
227242

228243
async def _wait_for_pod_completion(self, pod_name: str) -> k8s_client.V1Pod:
244+
if not self.v1:
245+
raise KubernetesServiceError(_K8S_CLIENT_NOT_INITIALIZED_MSG)
229246
logger.info(f"Waiting for pod '{pod_name}' to complete...")
230247
for _ in range(self.POD_RETRY_ATTEMPTS):
231248
try:
@@ -242,6 +259,8 @@ async def _wait_for_pod_completion(self, pod_name: str) -> k8s_client.V1Pod:
242259
raise KubernetesPodError(f"Timeout waiting for pod '{pod_name}' to complete.")
243260

244261
async def _get_container_logs(self, pod_name: str, container_name: str) -> str:
262+
if not self.v1:
263+
return f"Error: {_K8S_CLIENT_NOT_INITIALIZED_MSG}"
245264
try:
246265
return await asyncio.to_thread(
247266
self.v1.read_namespaced_pod_log,
@@ -254,6 +273,8 @@ async def _get_container_logs(self, pod_name: str, container_name: str) -> str:
254273
return f"Error retrieving logs: {e.reason}"
255274

256275
async def _create_config_map(self, config_map: k8s_client.V1ConfigMap) -> None:
276+
if not self.v1:
277+
raise KubernetesServiceError(_K8S_CLIENT_NOT_INITIALIZED_MSG)
257278
try:
258279
await asyncio.to_thread(self.v1.create_namespaced_config_map, namespace=self.NAMESPACE, body=config_map)
259280
logger.info(f"ConfigMap '{config_map.metadata.name}' created successfully.")
@@ -262,6 +283,8 @@ async def _create_config_map(self, config_map: k8s_client.V1ConfigMap) -> None:
262283
raise KubernetesServiceError(f"Failed to create ConfigMap: {str(e)}") from e
263284

264285
async def _create_namespaced_pod(self, pod_manifest: Dict[str, Any]) -> None:
286+
if not self.v1:
287+
raise KubernetesPodError(_K8S_CLIENT_NOT_INITIALIZED_MSG)
265288
pod_name = pod_manifest.get("metadata", {}).get("name", "unknown-pod")
266289
try:
267290
await asyncio.to_thread(self.v1.create_namespaced_pod, body=pod_manifest, namespace=self.NAMESPACE)
@@ -286,6 +309,80 @@ async def _cleanup_resources(self, pod_name: str, config_map_name: str) -> None:
286309
except ApiException as e:
287310
logger.error(f"Failed to delete config map '{config_map_name}': {e.reason}")
288311

312+
# DaemonSet: https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/
313+
async def ensure_image_pre_puller_daemonset(self) -> None:
    """Create or replace the DaemonSet that pre-pulls every runtime image.

    One init container is generated per distinct image found in
    ``RUNTIME_REGISTRY``; each simply echoes a message, so the only real work
    is the image pull performed by the kubelet. A ``pause`` container keeps
    the pod alive once all init containers have finished.

    The method is best-effort: API and unexpected errors are logged and
    swallowed, and nothing is raised to the caller. It returns early (with a
    warning) when the AppsV1 client is not initialized.
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    import re

    if not self.apps_v1:
        logger.warning("Kubernetes AppsV1Api client not initialized. Skipping DaemonSet creation.")
        return

    daemonset_name = "runtime-image-pre-puller"
    namespace = self.NAMESPACE
    # NOTE(review): fixed 5 s delay before touching the API; presumably gives
    # the rest of startup time to settle — confirm it is still needed.
    await asyncio.sleep(5)

    try:
        all_images = {
            config.image
            for lang in RUNTIME_REGISTRY.values()
            for config in lang.values()
        }

        init_containers = []
        # Sorted so container order (and therefore the manifest) is stable
        # between runs.
        for i, image_ref in enumerate(sorted(all_images)):
            # Container names must be DNS-1123 labels: lowercase
            # alphanumerics or '-', at most 63 characters, ending with an
            # alphanumeric. Map every other character (':', '.', '_', '@',
            # ...) to '-' instead of handling only a few of them.
            last_segment = image_ref.split("/")[-1]
            sanitized_image_ref = re.sub(r"[^a-z0-9-]", "-", last_segment.lower())
            logger.info(f"DAEMONSET: before: {image_ref} -> {sanitized_image_ref}")
            # The "pull-{i}-" prefix keeps names unique even after truncation.
            container_name = f"pull-{i}-{sanitized_image_ref}"[:63].rstrip("-")
            init_containers.append({
                "name": container_name,
                "image": image_ref,
                "command": ["/bin/sh", "-c", f'echo "Image {image_ref} pulled."'],
                "imagePullPolicy": "Always",
            })

        manifest: Dict[str, Any] = {
            "apiVersion": "apps/v1",
            "kind": "DaemonSet",
            "metadata": {"name": daemonset_name, "namespace": namespace},
            "spec": {
                "selector": {"matchLabels": {"name": daemonset_name}},
                "template": {
                    "metadata": {"labels": {"name": daemonset_name}},
                    "spec": {
                        "initContainers": init_containers,
                        "containers": [{
                            "name": "pause",
                            "image": "registry.k8s.io/pause:3.9",
                        }],
                        # Tolerate every taint so the pod can run on all nodes.
                        "tolerations": [{"operator": "Exists"}],
                    },
                },
                "updateStrategy": {"type": "RollingUpdate"},
            },
        }

        try:
            # Probe for an existing DaemonSet; a 404 means it must be created.
            await asyncio.to_thread(
                self.apps_v1.read_namespaced_daemon_set,
                name=daemonset_name,
                namespace=namespace,
            )
            logger.info(f"DaemonSet '{daemonset_name}' exists. Replacing to ensure it is up-to-date.")
            await asyncio.to_thread(
                self.apps_v1.replace_namespaced_daemon_set,
                name=daemonset_name, namespace=namespace, body=manifest
            )
            logger.info(f"DaemonSet '{daemonset_name}' replaced successfully.")
        except ApiException as e:
            if e.status == 404:
                logger.info(f"DaemonSet '{daemonset_name}' not found. Creating...")
                await asyncio.to_thread(
                    self.apps_v1.create_namespaced_daemon_set, namespace=namespace, body=manifest
                )
                logger.info(f"DaemonSet '{daemonset_name}' created successfully.")
            else:
                raise

    except ApiException as e:
        logger.error(f"K8s API error applying DaemonSet '{daemonset_name}': {e.reason}", exc_info=True)
    except Exception as e:
        # Best-effort background task: log and continue rather than crash.
        logger.error(f"Unexpected error applying image-puller DaemonSet: {e}", exc_info=True)
385+
289386

290387
def get_k8s_manager(request: Request) -> KubernetesServiceManager:
291388
if not hasattr(request.app.state, "k8s_manager"):
backend/app/services/pod_manifest_builder.py

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, Dict, List
1+
from typing import Any, Dict, List, Optional
22

33

44
class PodManifestBuilder:
@@ -13,6 +13,7 @@ def __init__(
1313
pod_memory_limit: str,
1414
pod_memory_request: str,
1515
pod_execution_timeout: int,
16+
priority_class_name: Optional[str],
1617
namespace: str = "default",
1718
):
1819
self.execution_id = execution_id
@@ -24,10 +25,44 @@ def __init__(
2425
self.pod_memory_limit = pod_memory_limit
2526
self.pod_memory_request = pod_memory_request
2627
self.pod_execution_timeout = pod_execution_timeout
28+
self.priority_class_name = priority_class_name
2729
self.namespace = namespace
2830

2931
def build(self) -> Dict[str, Any]:
30-
return {
32+
spec: Dict[str, Any] = {
33+
"containers": [
34+
{
35+
"name": "script-runner",
36+
"image": self.image,
37+
"imagePullPolicy": "IfNotPresent",
38+
"command": self.command,
39+
"args": [],
40+
"resources": {
41+
"limits": {"cpu": self.pod_cpu_limit, "memory": self.pod_memory_limit},
42+
"requests": {"cpu": self.pod_cpu_request, "memory": self.pod_memory_request},
43+
},
44+
"volumeMounts": [
45+
{"name": "script-volume", "mountPath": "/scripts"},
46+
],
47+
}
48+
],
49+
"volumes": [
50+
{
51+
"name": "script-volume",
52+
"configMap": {
53+
"name": self.config_map_name,
54+
"defaultMode": 0o755
55+
}
56+
},
57+
],
58+
"restartPolicy": "Never",
59+
"activeDeadlineSeconds": self.pod_execution_timeout + 5,
60+
}
61+
62+
if self.priority_class_name:
63+
spec["priorityClassName"] = self.priority_class_name
64+
65+
pod_manifest: Dict[str, Any] = {
3166
"apiVersion": "v1",
3267
"kind": "Pod",
3368
"metadata": {
@@ -38,33 +73,7 @@ def build(self) -> Dict[str, Any]:
3873
"execution-id": self.execution_id,
3974
},
4075
},
41-
"spec": {
42-
"containers": [
43-
{
44-
"name": "script-runner",
45-
"image": self.image,
46-
"imagePullPolicy": "IfNotPresent", # Only if not available locally
47-
"command": self.command,
48-
"args": [],
49-
"resources": {
50-
"limits": {"cpu": self.pod_cpu_limit, "memory": self.pod_memory_limit},
51-
"requests": {"cpu": self.pod_cpu_request, "memory": self.pod_memory_request},
52-
},
53-
"volumeMounts": [
54-
{"name": "script-volume", "mountPath": "/scripts"},
55-
],
56-
}
57-
],
58-
"volumes": [
59-
{
60-
"name": "script-volume",
61-
"configMap": {
62-
"name": self.config_map_name,
63-
"defaultMode": 0o755
64-
}
65-
},
66-
],
67-
"restartPolicy": "Never",
68-
"activeDeadlineSeconds": self.pod_execution_timeout + 5,
69-
},
76+
"spec": spec,
7077
}
78+
79+
return pod_manifest

cert-generator/setup-k8s.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ rules:
8787
- apiGroups: ["", "metrics.k8s.io"]
8888
resources: ["configmaps", "pods", "pods/log", "pods/exec", "nodes", "services"]
8989
verbs: ["create", "get", "list", "watch", "delete"]
90+
- apiGroups: ["apps"]
91+
resources: ["daemonsets"]
92+
# RBAC has no "replace" verb: a client-side replace is an HTTP PUT, which is authorized as "update".
verbs: ["get", "list", "watch", "create", "update", "delete"]
9093
EOF
9194
kubectl apply -f - <<EOF
9295
apiVersion: rbac.authorization.k8s.io/v1

0 commit comments

Comments
 (0)