Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .docker/.env-template
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,6 @@ AZURE_AI_KEY=
# Generate with: openssl rand -hex 32
LLM_API_KEY=

# Public domain for the Caddy reverse proxy (e.g., llm.example.com)
# Required when running llm_proxy. Must point to this server's IP.
LLM_DOMAIN=
# LiteLLM Proxy (client-facing auth key for llm_proxy)
# Generate with: openssl rand -hex 32
LITELLM_MASTER_KEY=
30 changes: 15 additions & 15 deletions .docker/compose.controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -416,35 +416,37 @@ services:
- zuba
######################## End LLM SERVER ##########################################

######################## LLM PROXY (Caddy reverse proxy) ######################
# TLS-terminating reverse proxy for public LLM API access
# Routes: https://{LLM_DOMAIN}/{model-name}/v1/chat/completions
# Info: https://{LLM_DOMAIN}/models (no auth, lists available models)
######################## LLM PROXY (LiteLLM) ##################################
# OpenAI-compatible proxy that routes by model name in the request body.
# TLS is terminated at the OpenShift router (edge termination).
#
# Endpoints:
# POST https://<openshift-route>/v1/chat/completions (auth: Bearer <LITELLM_MASTER_KEY>)
# GET https://<openshift-route>/v1/models (auth: Bearer <LITELLM_MASTER_KEY>)
#
# Usage:
# docker compose -f .docker/compose.controller.yaml up llm_proxy
# (automatically starts llm_server as a dependency)
llm_proxy:
container_name: citz-imb-ai-llm-proxy
image: caddy:latest
image: docker.litellm.ai/berriai/litellm:main-stable
depends_on:
llm_server:
condition: service_healthy
ports:
- "443:443"
- "80:80"
environment:
LLM_DOMAIN: ${LLM_DOMAIN}
- "8080:8080"
volumes:
- ./llm/Caddyfile:/etc/caddy/Caddyfile:ro
- ./llm/models.json:/srv/models.json:ro
- caddy_data:/data
- caddy_config:/config
- ./llm/litellm_config.yaml:/app/config.yaml:ro
environment:
LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY}
LLM_API_KEY: ${LLM_API_KEY}
command: ["--config", "/app/config.yaml", "--port", "8080"]
restart: unless-stopped
networks:
- zuba
######################## End LLM PROXY #########################################


########################### Networks Definition ################################
networks:
zuba:
Expand All @@ -463,6 +465,4 @@ volumes:
airflow_logs:
analytics_data:
llm_models:
caddy_data:
caddy_config:
########################### End Volumes Definition ##########################
78 changes: 78 additions & 0 deletions .docker/llm/litellm_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
---
# LiteLLM proxy configuration.
#
# Exposes an OpenAI-compatible API and dispatches each request to the
# llama-server backend selected by the "model" field of the request body.
#
# Endpoints:
#   POST https://<openshift-route>/v1/chat/completions
#        { "model": "qwen25-3b", "messages": [...] }
#   GET  https://<openshift-route>/v1/models
#
# Authentication:
#   client  -> LiteLLM:       Authorization: Bearer <LITELLM_MASTER_KEY>
#   LiteLLM -> llama-server:  Authorization: Bearer <LLM_API_KEY>
#
# "os.environ/<NAME>" is LiteLLM syntax for reading the value from the
# named environment variable at startup.

model_list:
  # Qwen 2.5 3B — fast text generation
  - model_name: qwen25-3b
    litellm_params:
      model: openai/qwen25-3b
      api_base: http://llm_server:8090/v1
      api_key: os.environ/LLM_API_KEY

  # Qwen3 VL 4B — vision + OCR
  - model_name: qwen3-vl-4b
    litellm_params:
      model: openai/qwen3-vl-4b
      api_base: http://llm_server:8091/v1
      api_key: os.environ/LLM_API_KEY

  # GLM 4.1V 9B — vision + thinking
  - model_name: glm-41v-9b
    litellm_params:
      model: openai/glm-41v-9b
      api_base: http://llm_server:8092/v1
      api_key: os.environ/LLM_API_KEY

  # Kimi VL A3B — vision + thinking + OCR
  - model_name: kimi-vl-a3b
    litellm_params:
      model: openai/kimi-vl-a3b
      api_base: http://llm_server:8093/v1
      api_key: os.environ/LLM_API_KEY

  # Devstral 24B — code specialist
  - model_name: devstral-24b
    litellm_params:
      model: openai/devstral-24b
      api_base: http://llm_server:8094/v1
      api_key: os.environ/LLM_API_KEY

  # Qwen2 0.5B — ultra-fast tiny model
  - model_name: qwen2-05b
    litellm_params:
      model: openai/qwen2-05b
      api_base: http://llm_server:8095/v1
      api_key: os.environ/LLM_API_KEY

  # Qwen3 8B — balanced general purpose
  - model_name: qwen3-8b
    litellm_params:
      model: openai/qwen3-8b
      api_base: http://llm_server:8096/v1
      api_key: os.environ/LLM_API_KEY

  # Qwen3 VL 8B — vision + OCR
  - model_name: qwen3-vl-8b
    litellm_params:
      model: openai/qwen3-vl-8b
      api_base: http://llm_server:8097/v1
      api_key: os.environ/LLM_API_KEY

general_settings:
  # Client-facing bearer token required on every proxied request.
  master_key: os.environ/LITELLM_MASTER_KEY

litellm_settings:
  # Silently drop request parameters the target backend does not support
  # instead of rejecting the request.
  drop_params: true
Loading