Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .docker/.env-template
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,6 @@ AZURE_AI_KEY=
# Generate with: openssl rand -hex 32
LLM_API_KEY=

# Public domain for the Caddy reverse proxy (e.g., llm.example.com)
# Required when running llm_proxy. Must point to this server's IP.
LLM_DOMAIN=
# LiteLLM Proxy (client-facing auth key for llm_proxy)
# Generate with: openssl rand -hex 32
LITELLM_MASTER_KEY=
30 changes: 15 additions & 15 deletions .docker/compose.controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -416,35 +416,37 @@ services:
- zuba
######################## End LLM SERVER ##########################################

######################## LLM PROXY (Caddy reverse proxy) ######################
# TLS-terminating reverse proxy for public LLM API access
# Routes: https://{LLM_DOMAIN}/{model-name}/v1/chat/completions
# Info: https://{LLM_DOMAIN}/models (no auth, lists available models)
######################## LLM PROXY (LiteLLM) ##################################
# OpenAI-compatible proxy that routes by model name in the request body.
# TLS is terminated at the OpenShift router (edge termination).
#
# Endpoints:
# POST https://<openshift-route>/v1/chat/completions (auth: Bearer <LITELLM_MASTER_KEY>)
# GET https://<openshift-route>/v1/models (auth: Bearer <LITELLM_MASTER_KEY>)
#
# Usage:
# docker compose -f .docker/compose.controller.yaml up llm_proxy
# (automatically starts llm_server as a dependency)
llm_proxy:
container_name: citz-imb-ai-llm-proxy
image: caddy:latest
image: docker.litellm.ai/berriai/litellm:main-stable
depends_on:
llm_server:
condition: service_healthy
ports:
- "443:443"
- "80:80"
environment:
LLM_DOMAIN: ${LLM_DOMAIN}
- "8080:8080"
volumes:
- ./llm/Caddyfile:/etc/caddy/Caddyfile:ro
- ./llm/models.json:/srv/models.json:ro
- caddy_data:/data
- caddy_config:/config
- ./llm/litellm_config.yaml:/app/config.yaml:ro
environment:
LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY}
LLM_API_KEY: ${LLM_API_KEY}
command: ["--config", "/app/config.yaml", "--port", "8080"]
restart: unless-stopped
networks:
- zuba
######################## End LLM PROXY #########################################


########################### Networks Definition ################################
networks:
zuba:
Expand All @@ -463,6 +465,4 @@ volumes:
airflow_logs:
analytics_data:
llm_models:
caddy_data:
caddy_config:
########################### End Volumes Definition ##########################
78 changes: 78 additions & 0 deletions .docker/llm/litellm_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
---
# LiteLLM proxy configuration.
#
# Exposes an OpenAI-compatible API and dispatches each request to the
# llama-server backend selected by the "model" field of the request body.
#
# Endpoints:
#   POST https://<openshift-route>/v1/chat/completions
#        { "model": "qwen25-3b", "messages": [...] }
#   GET  https://<openshift-route>/v1/models
#
# Authentication:
#   client  -> LiteLLM:       Authorization: Bearer <LITELLM_MASTER_KEY>
#   LiteLLM -> llama-server:  Authorization: Bearer <LLM_API_KEY>
#
# "os.environ/<NAME>" is LiteLLM syntax for reading the value from the
# named environment variable at startup.

model_list:
  # Qwen 2.5 3B — fast text generation
  - model_name: qwen25-3b
    litellm_params:
      model: openai/qwen25-3b
      api_base: http://llm_server:8090/v1
      api_key: os.environ/LLM_API_KEY

  # Qwen3 VL 4B — vision + OCR
  - model_name: qwen3-vl-4b
    litellm_params:
      model: openai/qwen3-vl-4b
      api_base: http://llm_server:8091/v1
      api_key: os.environ/LLM_API_KEY

  # GLM 4.1V 9B — vision + thinking
  - model_name: glm-41v-9b
    litellm_params:
      model: openai/glm-41v-9b
      api_base: http://llm_server:8092/v1
      api_key: os.environ/LLM_API_KEY

  # Kimi VL A3B — vision + thinking + OCR
  - model_name: kimi-vl-a3b
    litellm_params:
      model: openai/kimi-vl-a3b
      api_base: http://llm_server:8093/v1
      api_key: os.environ/LLM_API_KEY

  # Devstral 24B — code specialist
  - model_name: devstral-24b
    litellm_params:
      model: openai/devstral-24b
      api_base: http://llm_server:8094/v1
      api_key: os.environ/LLM_API_KEY

  # Qwen2 0.5B — ultra-fast tiny model
  - model_name: qwen2-05b
    litellm_params:
      model: openai/qwen2-05b
      api_base: http://llm_server:8095/v1
      api_key: os.environ/LLM_API_KEY

  # Qwen3 8B — balanced general purpose
  - model_name: qwen3-8b
    litellm_params:
      model: openai/qwen3-8b
      api_base: http://llm_server:8096/v1
      api_key: os.environ/LLM_API_KEY

  # Qwen3 VL 8B — vision + OCR
  - model_name: qwen3-vl-8b
    litellm_params:
      model: openai/qwen3-vl-8b
      api_base: http://llm_server:8097/v1
      api_key: os.environ/LLM_API_KEY

general_settings:
  # Client-facing bearer token required on every proxied request.
  master_key: os.environ/LITELLM_MASTER_KEY

litellm_settings:
  # Silently drop request parameters the target backend does not support
  # instead of rejecting the request.
  drop_params: true
Loading