diff --git a/.docker/.env-template b/.docker/.env-template
index 40e175071..2a8a8dcb3 100644
--- a/.docker/.env-template
+++ b/.docker/.env-template
@@ -40,6 +40,6 @@ AZURE_AI_KEY=
 
 # Generate with: openssl rand -hex 32
 LLM_API_KEY=
-# Public domain for the Caddy reverse proxy (e.g., llm.example.com)
-# Required when running llm_proxy. Must point to this server's IP.
-LLM_DOMAIN=
+# LiteLLM Proxy (client-facing auth key for llm_proxy)
+# Generate with: openssl rand -hex 32
+LITELLM_MASTER_KEY=
diff --git a/.docker/compose.controller.yaml b/.docker/compose.controller.yaml
index adacdd68d..aca790411 100644
--- a/.docker/compose.controller.yaml
+++ b/.docker/compose.controller.yaml
@@ -403,6 +403,8 @@ services:
       - "8095:8095"
       - "8096:8096"
       - "8097:8097"
+      - "8098:8098"
+      - "8099:8099"
     volumes:
       - llm_models:/models
     healthcheck:
@@ -416,35 +418,37 @@ services:
       - zuba
   ######################## End LLM SERVER ##########################################
 
-  ######################## LLM PROXY (Caddy reverse proxy) ######################
-  # TLS-terminating reverse proxy for public LLM API access
-  # Routes: https://{LLM_DOMAIN}/{model-name}/v1/chat/completions
-  # Info: https://{LLM_DOMAIN}/models (no auth, lists available models)
+  ######################## LLM PROXY (LiteLLM) ##################################
+  # OpenAI-compatible proxy that routes by model name in the request body.
+  # TLS is terminated at the OpenShift router (edge termination).
+  #
+  # Endpoints:
+  #   POST https://<route>/v1/chat/completions  (auth: Bearer <LITELLM_MASTER_KEY>)
+  #   GET  https://<route>/v1/models            (auth: Bearer <LITELLM_MASTER_KEY>)
   #
   # Usage:
   #   docker compose -f .docker/compose.controller.yaml up llm_proxy
   #   (automatically starts llm_server as a dependency)
   llm_proxy:
     container_name: citz-imb-ai-llm-proxy
-    image: caddy:latest
+    image: docker.litellm.ai/berriai/litellm:main-stable
     depends_on:
       llm_server:
         condition: service_healthy
     ports:
-      - "443:443"
-      - "80:80"
-    environment:
-      LLM_DOMAIN: ${LLM_DOMAIN}
+      - "8080:8080"
     volumes:
-      - ./llm/Caddyfile:/etc/caddy/Caddyfile:ro
-      - ./llm/models.json:/srv/models.json:ro
-      - caddy_data:/data
-      - caddy_config:/config
+      - ./llm/litellm_config.yaml:/app/config.yaml:ro
+    environment:
+      LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY}
+      LLM_API_KEY: ${LLM_API_KEY}
+    command: ["--config", "/app/config.yaml", "--port", "8080"]
     restart: unless-stopped
     networks:
       - zuba
   ######################## End LLM PROXY #########################################
+
   ########################### Networks Definition ################################
 networks:
   zuba:
@@ -463,6 +467,4 @@ volumes:
   airflow_logs:
   analytics_data:
   llm_models:
-  caddy_data:
-  caddy_config:
 ########################### End Volumes Definition ##########################
diff --git a/.docker/llm/Caddyfile b/.docker/llm/Caddyfile
deleted file mode 100644
index f39451bd8..000000000
--- a/.docker/llm/Caddyfile
+++ /dev/null
@@ -1,66 +0,0 @@
-# ─── LLM Server Reverse Proxy ────────────────────────────────────────────────
-#
-# Routing decision: Path-based routing is used because each llama-server
-# instance serves exactly one model and ignores the "model" field in the
-# JSON request body. Unlike the OpenAI API (which routes by request body),
-# llama-server instances are independent processes on separate ports.
-# Caddy cannot inspect JSON request bodies for routing. Path-based routing
-# is the simplest approach that lets clients select models from a single
-# domain without custom middleware.
-#
-# Usage:
-#   POST https://{domain}/{model-name}/v1/chat/completions
-#   GET  https://{domain}/models (no auth, returns available model list)
-#
-# ──────────────────────────────────────────────────────────────────────────────
-
-{$LLM_DOMAIN} {
-    # ─── Public model list (no auth) ─────────────────────────────────────────
-    handle /models {
-        root * /srv
-        rewrite * /models.json
-        header Content-Type "application/json"
-        file_server
-    }
-
-    # ─── Model routes ────────────────────────────────────────────────────────
-    # Each handle_path block strips the model prefix before forwarding.
-    # Models that aren't running will return 502 from Caddy automatically.
-
-    handle_path /qwen25-3b/* {
-        reverse_proxy llm_server:8090
-    }
-
-    handle_path /qwen3-vl-4b/* {
-        reverse_proxy llm_server:8091
-    }
-
-    handle_path /glm-41v-9b/* {
-        reverse_proxy llm_server:8092
-    }
-
-    handle_path /kimi-vl-a3b/* {
-        reverse_proxy llm_server:8093
-    }
-
-    handle_path /devstral-24b/* {
-        reverse_proxy llm_server:8094
-    }
-
-    handle_path /qwen2-05b/* {
-        reverse_proxy llm_server:8095
-    }
-
-    handle_path /qwen3-8b/* {
-        reverse_proxy llm_server:8096
-    }
-
-    handle_path /qwen3-vl-8b/* {
-        reverse_proxy llm_server:8097
-    }
-
-    # ─── Catch-all ───────────────────────────────────────────────────────────
-    handle {
-        respond "Use /models to see available endpoints" 404
-    }
-}
diff --git a/.docker/llm/litellm_config.yaml b/.docker/llm/litellm_config.yaml
new file mode 100644
index 000000000..11585e378
--- /dev/null
+++ b/.docker/llm/litellm_config.yaml
@@ -0,0 +1,78 @@
+# ─── LiteLLM Proxy Configuration ─────────────────────────────────────────────
+#
+# OpenAI-compatible proxy that routes requests to llama-server backends
+# by the "model" field in the request body.
+#
+# Usage:
+#   POST https://<route>/v1/chat/completions
+#     { "model": "qwen25-3b", "messages": [...] }
+#   GET  https://<route>/v1/models
+#
+# Auth:
+#   Client → LiteLLM:       Authorization: Bearer <LITELLM_MASTER_KEY>
+#   LiteLLM → llama-server: Authorization: Bearer <LLM_API_KEY>
+#
+# ──────────────────────────────────────────────────────────────────────────────
+
+model_list:
+  # ── Model 1: Qwen 2.5 3B - Fast text generation ───────────────────────────
+  - model_name: qwen25-3b
+    litellm_params:
+      model: openai/qwen25-3b
+      api_base: http://llm-server-svc:8090/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 2: Qwen3 VL 4B - Vision + OCR ───────────────────────────────────
+  - model_name: qwen3-vl-4b
+    litellm_params:
+      model: openai/qwen3-vl-4b
+      api_base: http://llm-server-svc:8091/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 3: GLM 4.1V 9B - Vision + Thinking ─────────────────────────────
+  - model_name: glm-41v-9b
+    litellm_params:
+      model: openai/glm-41v-9b
+      api_base: http://llm-server-svc:8092/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 4: Kimi VL A3B - Vision + Thinking + OCR ────────────────────────
+  - model_name: kimi-vl-a3b
+    litellm_params:
+      model: openai/kimi-vl-a3b
+      api_base: http://llm-server-svc:8093/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 5: Devstral 24B - Code specialist ───────────────────────────────
+  - model_name: devstral-24b
+    litellm_params:
+      model: openai/devstral-24b
+      api_base: http://llm-server-svc:8094/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 6: Qwen2 0.5B - Ultra-fast tiny model ──────────────────────────
+  - model_name: qwen2-05b
+    litellm_params:
+      model: openai/qwen2-05b
+      api_base: http://llm-server-svc:8095/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 7: Qwen3 8B - Balanced general purpose ─────────────────────────
+  - model_name: qwen3-8b
+    litellm_params:
+      model: openai/qwen3-8b
+      api_base: http://llm-server-svc:8096/v1
+      api_key: os.environ/LLM_API_KEY
+
+  # ── Model 8: Qwen3 VL 8B - Vision + OCR ──────────────────────────────────
+  - model_name: qwen3-vl-8b
+    litellm_params:
+      model: openai/qwen3-vl-8b
+      api_base: http://llm-server-svc:8097/v1
+      api_key: os.environ/LLM_API_KEY
+
+general_settings:
+  master_key: os.environ/LITELLM_MASTER_KEY
+
+litellm_settings:
+  drop_params: true
diff --git a/.docker/llm/models.json b/.docker/llm/models.json
deleted file mode 100644
index fcf2c7505..000000000
--- a/.docker/llm/models.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "info": "Path-based routing. Each model has its own URL prefix. Append /v1/chat/completions to the endpoint path. Authorization: Bearer <LLM_API_KEY> header required.",
-  "models": [
-    {"name": "qwen25-3b", "endpoint": "/qwen25-3b/v1/chat/completions", "port": 8090, "description": "Qwen 2.5 3B - Fast text generation"},
-    {"name": "qwen3-vl-4b", "endpoint": "/qwen3-vl-4b/v1/chat/completions", "port": 8091, "description": "Qwen3 VL 4B - Vision + OCR"},
-    {"name": "glm-41v-9b", "endpoint": "/glm-41v-9b/v1/chat/completions", "port": 8092, "description": "GLM 4.1V 9B - Vision + Thinking"},
-    {"name": "kimi-vl-a3b", "endpoint": "/kimi-vl-a3b/v1/chat/completions", "port": 8093, "description": "Kimi VL A3B - Vision + Thinking + OCR"},
-    {"name": "devstral-24b", "endpoint": "/devstral-24b/v1/chat/completions", "port": 8094, "description": "Devstral 24B - Code specialist"},
-    {"name": "qwen2-05b", "endpoint": "/qwen2-05b/v1/chat/completions", "port": 8095, "description": "Qwen2 0.5B - Ultra-fast tiny model"},
-    {"name": "qwen3-8b", "endpoint": "/qwen3-8b/v1/chat/completions", "port": 8096, "description": "Qwen3 8B - Balanced general purpose"},
-    {"name": "qwen3-vl-8b", "endpoint": "/qwen3-vl-8b/v1/chat/completions", "port": 8097, "description": "Qwen3 VL 8B - Vision + OCR"}
-  ]
-}