diff --git a/.docker/.env-template b/.docker/.env-template
index 40e175071..2a8a8dcb3 100644
--- a/.docker/.env-template
+++ b/.docker/.env-template
@@ -40,6 +40,6 @@ AZURE_AI_KEY=
 
 # Generate with: openssl rand -hex 32
 LLM_API_KEY=
-# Public domain for the Caddy reverse proxy (e.g., llm.example.com)
-# Required when running llm_proxy. Must point to this server's IP.
-LLM_DOMAIN=
+# LiteLLM Proxy (client-facing auth key for llm_proxy)
+# Generate with: openssl rand -hex 32
+LITELLM_MASTER_KEY=
diff --git a/.docker/compose.controller.yaml b/.docker/compose.controller.yaml
index adacdd68d..aca790411 100644
--- a/.docker/compose.controller.yaml
+++ b/.docker/compose.controller.yaml
@@ -403,6 +403,8 @@ services:
       - "8095:8095"
       - "8096:8096"
       - "8097:8097"
+      - "8098:8098"
+      - "8099:8099"
     volumes:
       - llm_models:/models
     healthcheck:
@@ -416,35 +418,37 @@ services:
       - zuba
   ######################## End LLM SERVER ##########################################
 
-  ######################## LLM PROXY (Caddy reverse proxy) ######################
-  # TLS-terminating reverse proxy for public LLM API access
-  # Routes: https://{LLM_DOMAIN}/{model-name}/v1/chat/completions
-  # Info: https://{LLM_DOMAIN}/models (no auth, lists available models)
+  ######################## LLM PROXY (LiteLLM) ##################################
+  # OpenAI-compatible proxy that routes by model name in the request body.
+  # TLS is terminated at the OpenShift router (edge termination).
+  #
+  # Endpoints:
+  #   POST https://<route>/v1/chat/completions  (auth: Bearer <LITELLM_MASTER_KEY>)
+  #   GET  https://<route>/v1/models            (auth: Bearer <LITELLM_MASTER_KEY>)
   #
   # Usage:
   #   docker compose -f .docker/compose.controller.yaml up llm_proxy
   #   (automatically starts llm_server as a dependency)
   llm_proxy:
     container_name: citz-imb-ai-llm-proxy
-    image: caddy:latest
+    image: docker.litellm.ai/berriai/litellm:main-stable
     depends_on:
       llm_server:
         condition: service_healthy
     ports:
-      - "443:443"
-      - "80:80"
-    environment:
-      LLM_DOMAIN: ${LLM_DOMAIN}
+      - "8080:8080"
     volumes:
-      - ./llm/Caddyfile:/etc/caddy/Caddyfile:ro
-      - ./llm/models.json:/srv/models.json:ro
-      - caddy_data:/data
-      - caddy_config:/config
+      - ./llm/litellm_config.yaml:/app/config.yaml:ro
+    environment:
+      LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY}
+      LLM_API_KEY: ${LLM_API_KEY}
+    command: ["--config", "/app/config.yaml", "--port", "8080"]
     restart: unless-stopped
     networks:
       - zuba
   ######################## End LLM PROXY #########################################
+
   ########################### Networks Definition ################################
 networks:
   zuba:
@@ -463,6 +467,4 @@ volumes:
   airflow_logs:
   analytics_data:
   llm_models:
-  caddy_data:
-  caddy_config:
 ########################### End Volumes Definition ##########################
diff --git a/.docker/llm/Caddyfile b/.docker/llm/Caddyfile
deleted file mode 100644
index f39451bd8..000000000
--- a/.docker/llm/Caddyfile
+++ /dev/null
@@ -1,66 +0,0 @@
-# ─── LLM Server Reverse Proxy ────────────────────────────────────────────────
-#
-# Routing decision: Path-based routing is used because each llama-server
-# instance serves exactly one model and ignores the "model" field in the
-# JSON request body. Unlike the OpenAI API (which routes by request body),
-# llama-server instances are independent processes on separate ports.
-# Caddy cannot inspect JSON request bodies for routing. Path-based routing
-# is the simplest approach that lets clients select models from a single
-# domain without custom middleware.
-#
-# Usage:
-#   POST https://{domain}/{model-name}/v1/chat/completions
-#   GET  https://{domain}/models (no auth, returns available model list)
-#
-# ──────────────────────────────────────────────────────────────────────────────
-
-{$LLM_DOMAIN} {
-    # ─── Public model list (no auth) ─────────────────────────────────────────
-    handle /models {
-        root * /srv
-        rewrite * /models.json
-        header Content-Type "application/json"
-        file_server
-    }
-
-    # ─── Model routes ────────────────────────────────────────────────────────
-    # Each handle_path block strips the model prefix before forwarding.
-    # Models that aren't running will return 502 from Caddy automatically.
-
-    handle_path /qwen25-3b/* {
-        reverse_proxy llm_server:8090
-    }
-
-    handle_path /qwen3-vl-4b/* {
-        reverse_proxy llm_server:8091
-    }
-
-    handle_path /glm-41v-9b/* {
-        reverse_proxy llm_server:8092
-    }
-
-    handle_path /kimi-vl-a3b/* {
-        reverse_proxy llm_server:8093
-    }
-
-    handle_path /devstral-24b/* {
-        reverse_proxy llm_server:8094
-    }
-
-    handle_path /qwen2-05b/* {
-        reverse_proxy llm_server:8095
-    }
-
-    handle_path /qwen3-8b/* {
-        reverse_proxy llm_server:8096
-    }
-
-    handle_path /qwen3-vl-8b/* {
-        reverse_proxy llm_server:8097
-    }
-
-    # ─── Catch-all ───────────────────────────────────────────────────────────
-    handle {
-        respond "Use /models to see available endpoints" 404
-    }
-}
diff --git a/.docker/llm/litellm_config.yaml b/.docker/llm/litellm_config.yaml
new file mode 100644
index 000000000..11585e378
--- /dev/null
+++ b/.docker/llm/litellm_config.yaml
@@ -0,0 +1,78 @@
+# ─── LiteLLM Proxy Configuration ─────────────────────────────────────────────
+#
+# OpenAI-compatible proxy that routes requests to llama-server backends
+# by the "model" field in the request body.
+#
+# Usage:
+#   POST https://<route>/v1/chat/completions
+#     { "model": "qwen25-3b", "messages": [...] }
+#   GET  https://<route>/v1/models
+#
+# Auth:
+#   Client → LiteLLM:       Authorization: Bearer <LITELLM_MASTER_KEY>
+#   LiteLLM → llama-server: Authorization: Bearer <LLM_API_KEY>
+#
+# ──────────────────────────────────────────────────────────────────────────────
+
+model_list:
+  # ── Model 1: Qwen 2.5 3B - Fast text generation ───────────────────────────
+  - model_name: qwen25-3b
+    litellm_params:
+      model: openai/qwen25-3b
+      api_base: http://llm-server-svc:8090/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 2: Qwen3 VL 4B - Vision + OCR ───────────────────────────────────
+  - model_name: qwen3-vl-4b
+    litellm_params:
+      model: openai/qwen3-vl-4b
+      api_base: http://llm-server-svc:8091/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 3: GLM 4.1V 9B - Vision + Thinking ─────────────────────────────
+  - model_name: glm-41v-9b
+    litellm_params:
+      model: openai/glm-41v-9b
+      api_base: http://llm-server-svc:8092/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 4: Kimi VL A3B - Vision + Thinking + OCR ────────────────────────
+  - model_name: kimi-vl-a3b
+    litellm_params:
+      model: openai/kimi-vl-a3b
+      api_base: http://llm-server-svc:8093/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 5: Devstral 24B - Code specialist ───────────────────────────────
+  - model_name: devstral-24b
+    litellm_params:
+      model: openai/devstral-24b
+      api_base: http://llm-server-svc:8094/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 6: Qwen2 0.5B - Ultra-fast tiny model ──────────────────────────
+  - model_name: qwen2-05b
+    litellm_params:
+      model: openai/qwen2-05b
+      api_base: http://llm-server-svc:8095/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 7: Qwen3 8B - Balanced general purpose ─────────────────────────
+  - model_name: qwen3-8b
+    litellm_params:
+      model: openai/qwen3-8b
+      api_base: http://llm-server-svc:8096/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 8: Qwen3 VL 8B - Vision + OCR ──────────────────────────────────
+  - model_name: qwen3-vl-8b
+    litellm_params:
+      model: openai/qwen3-vl-8b
+      api_base: http://llm-server-svc:8097/v1
+      api_key: os.environ/LLM_API_KEY
+
+general_settings:
+  master_key: os.environ/LITELLM_MASTER_KEY
+
+litellm_settings:
+  drop_params: true
diff --git a/.docker/llm/models.json b/.docker/llm/models.json
deleted file mode 100644
index fcf2c7505..000000000
--- a/.docker/llm/models.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "info": "Path-based routing. Each model has its own URL prefix. Append /v1/chat/completions to the endpoint path. Authorization: Bearer <LLM_API_KEY> header required.",
-  "models": [
-    {"name": "qwen25-3b", "endpoint": "/qwen25-3b/v1/chat/completions", "port": 8090, "description": "Qwen 2.5 3B - Fast text generation"},
-    {"name": "qwen3-vl-4b", "endpoint": "/qwen3-vl-4b/v1/chat/completions", "port": 8091, "description": "Qwen3 VL 4B - Vision + OCR"},
-    {"name": "glm-41v-9b", "endpoint": "/glm-41v-9b/v1/chat/completions", "port": 8092, "description": "GLM 4.1V 9B - Vision + Thinking"},
-    {"name": "kimi-vl-a3b", "endpoint": "/kimi-vl-a3b/v1/chat/completions", "port": 8093, "description": "Kimi VL A3B - Vision + Thinking + OCR"},
-    {"name": "devstral-24b", "endpoint": "/devstral-24b/v1/chat/completions", "port": 8094, "description": "Devstral 24B - Code specialist"},
-    {"name": "qwen2-05b", "endpoint": "/qwen2-05b/v1/chat/completions", "port": 8095, "description": "Qwen2 0.5B - Ultra-fast tiny model"},
-    {"name": "qwen3-8b", "endpoint": "/qwen3-8b/v1/chat/completions", "port": 8096, "description": "Qwen3 8B - Balanced general purpose"},
-    {"name": "qwen3-vl-8b", "endpoint": "/qwen3-vl-8b/v1/chat/completions", "port": 8097, "description": "Qwen3 VL 8B - Vision + OCR"}
-  ]
-}