diff --git a/config/config.yaml b/config/config.yaml
index 085f0cdf9..45ae36250 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -45,17 +45,28 @@ prompt_guard:
 # Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
 # NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
 vllm_endpoints:
-  - name: "endpoint1"
-    address: "172.28.0.20" # Static IPv4 of llm-katan within docker compose network
-    port: 8002
-    weight: 1
+  - name: "math-endpoint"
+    address: "172.28.0.30" # Math model server IP
+    port: 8006 # Math model port
+    weight: 1
+
+  - name: "text-endpoint"
+    address: "172.28.0.30" # Text model server IP
+    port: 8007 # Text model port
+    weight: 1
 
-model_config:
-  "qwen3":
-    reasoning_family: "qwen3" # This model uses Qwen-3 reasoning syntax
-    preferred_endpoints: ["endpoint1"] # Optional: omit to let upstream handle endpoint selection
-    pii_policy:
-      allow_by_default: true
+model_config:
+  "DeepSeek-R1-Distill-Qwen-7B":
+    reasoning_family: "deepseek" # DeepSeek uses the deepseek reasoning syntax
+    preferred_endpoints: ["math-endpoint"]
+    pii_policy:
+      allow_by_default: true
+
+  "qwen3":
+    reasoning_family: "qwen3" # Qwen3 uses the qwen3 reasoning syntax
+    preferred_endpoints: ["text-endpoint"]
+    pii_policy:
+      allow_by_default: true
 
 # Classifier configuration
 classifier:
@@ -139,7 +150,7 @@ categories:
   - name: math
     system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
     model_scores:
-      - model: qwen3
+      - model: DeepSeek-R1-Distill-Qwen-7B
         score: 1.0
         use_reasoning: true # Enable reasoning for complex math
   - name: physics
diff --git a/dashboard/Dockerfile b/dashboard/Dockerfile
index c7732eafa..1f40bc8a5 100644
--- a/dashboard/Dockerfile
+++ b/dashboard/Dockerfile
@@ -1,5 +1,7 @@
 # Build frontend
 FROM node:18-alpine AS frontend-builder
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 WORKDIR /app/frontend
 COPY frontend/package*.json ./
 RUN npm ci
@@ -8,6 +10,8 @@ RUN npm run build
 
 # Build backend
 FROM golang:1.21-alpine AS backend-builder
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 WORKDIR /app/backend
 COPY backend/go.* ./
 RUN go mod download
@@ -16,6 +20,8 @@ RUN CGO_ENABLED=0 GOOS=linux go build -o dashboard-server .
 
 # Final image
 FROM alpine:3.18
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 RUN apk add --no-cache ca-certificates
 WORKDIR /app
 COPY --from=backend-builder /app/backend/dashboard-server .
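A quick way to sanity-check the new `vllm_endpoints` entries against what the backends actually serve is to query each endpoint's model list from inside the compose network. This is only a sketch: it assumes the stack defined later in this diff is running, and that `curl` is available in the `semantic-router` container (its healthcheck already uses it); the address and ports come straight from the config above.

```bash
# Both endpoints share the address 172.28.0.30; they differ only by port.
# Expected served models: DeepSeek-R1-Distill-Qwen-7B on 8006, qwen3 on 8007.
docker compose exec semantic-router curl -s http://172.28.0.30:8006/v1/models
docker compose exec semantic-router curl -s http://172.28.0.30:8007/v1/models
```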
diff --git a/dashboard/backend/Dockerfile b/dashboard/backend/Dockerfile
index f513ffb5f..2b1974ddf 100644
--- a/dashboard/backend/Dockerfile
+++ b/dashboard/backend/Dockerfile
@@ -1,5 +1,7 @@
 # Stage 1: Build frontend with Node.js
 FROM node:20-alpine AS frontend-builder
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 WORKDIR /app/frontend
 COPY dashboard/frontend/package.json dashboard/frontend/package-lock.json dashboard/frontend/tsconfig.json dashboard/frontend/tsconfig.node.json dashboard/frontend/vite.config.ts ./
 COPY dashboard/frontend/src ./src
@@ -11,7 +13,8 @@ RUN npm run build
 # Stage 2: Build backend with Go
 FROM golang:1.24 AS backend-builder
 WORKDIR /app
-
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 # Use Chinese Go proxy to avoid network timeout issues
 ENV GOPROXY=https://goproxy.cn,direct
 ENV GOSUMDB=sum.golang.google.cn
@@ -31,6 +34,8 @@ RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s
 
 # Stage 3: Final runtime image
 FROM alpine:3.19
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 RUN apk --no-cache add ca-certificates wget
 WORKDIR /app
 COPY --from=backend-builder /app/dashboard-backend /app/dashboard-backend
diff --git a/deploy/docker-compose/addons/grafana.ini b/deploy/docker-compose/addons/grafana.ini
index c30fe2016..3b0152db8 100644
--- a/deploy/docker-compose/addons/grafana.ini
+++ b/deploy/docker-compose/addons/grafana.ini
@@ -36,7 +36,8 @@ enabled = false
 
 [server]
 # Configure root URL
-root_url = %(protocol)s://%(domain)s:%(http_port)s/
+# root_url = %(protocol)s://%(domain)s:%(http_port)s/
+root_url = http://10.112.229.41:8700/embedded/grafana/
 # Disable serving from sub path
 serve_from_sub_path = false
diff --git a/deploy/docker-compose/docker-compose-bk.yml b/deploy/docker-compose/docker-compose-bk.yml
new file mode 100644
index 000000000..90d7d8606
--- /dev/null
+++ b/deploy/docker-compose/docker-compose-bk.yml
@@ -0,0 +1,381 @@
+services:
+
+  # Semantic Router External Processor Service
+  semantic-router:
+    image: ghcr.io/vllm-project/semantic-router/extproc:latest
+    container_name: semantic-router
+    ports:
+      - "50051:50051"
+    volumes:
+      - ../../config:/app/config:ro,z
+      - ../../models:/app/models:ro,z
+      - ~/.cache/huggingface:/root/.cache/huggingface:z
+    environment:
+      - LD_LIBRARY_PATH=/app/lib
+      # Use main config by default; override via CONFIG_FILE if needed
+      - CONFIG_FILE=${CONFIG_FILE:-/app/config/config.yaml}
+      # Optional informational envs (router reads YAML for tracing config)
+      - OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4317
+      - OTEL_SERVICE_NAME=vllm-semantic-router
+      - HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+    networks:
+      - semantic-network
+    healthcheck:
+      test: ["CMD", "curl", "-f", "localhost:8080/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s
+    depends_on:
+      mock-vllm-1:
+        condition: service_healthy
+      mock-vllm-2:
+        condition: service_healthy
+
+  # Envoy Proxy Service
+  envoy:
+    image: envoyproxy/envoy:v1.31.7
+    container_name: envoy-proxy
+    environment:
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+    security_opt:
+      - label=disable
+    ports:
+      - "8801:8801" # Main proxy port
"19000:19000" # Admin interface + volumes: + - ./addons/envoy.yaml:/etc/envoy/envoy.yaml:ro,z + command: ["/usr/local/bin/envoy", "-c", "/etc/envoy/envoy.yaml", "--component-log-level", "ext_proc:trace,router:trace,http:trace"] + depends_on: + semantic-router: + condition: service_healthy + networks: + - semantic-network + healthcheck: + test: ["CMD", "bash", "-c", "(echo -e 'GET /ready HTTP/1.1\\r\\nHost: localhost\\r\\n\\r\\n' >&3; timeout 2 cat <&3) 3<>/dev/tcp/localhost/19000 | grep -q LIVE"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + + # Mock vLLM service for testing profile + mock-vllm-1: + image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better + container_name: mock-vllm-1 + profiles: ["testing"] + privileged: true + networks: + semantic-network: + ipv4_address: 172.28.0.30 + devices: + - /dev/dri:/dev/dri # GPU 设备 + volumes: + - /home/intel/LLM/:/llm/models/ + - /home/intel/LLM2/:/llm/models2/ + environment: + - no_proxy=localhost,127.0.0.1 + - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + - VLLM_WORKER_MULTIPROC_METHOD=spawn + - ZE_AFFINITY_MASK=0 + shm_size: "32g" + entrypoint: /bin/bash + command: > + -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server + --model /llm/models/DeepSeek-R1-Distill-Qwen-7B/ + --served-model-name DeepSeek-R1-Distill-Qwen-7B + --dtype=float16 + --enforce-eager + --port 8006 + --host 0.0.0.0 + --trust-remote-code + --disable-sliding-window + --gpu-memory-util=0.9 + --no-enable-prefix-caching + --max-num-batched-tokens=2000 + --disable-log-requests + --max-model-len=3000 + --block-size 64 + -tp=1 + -pp=1 + --quantization fp8" + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost:8006/health"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s # 增加启动时间,因为加载大模型需要更长时间 + + # Mock vLLM service for testing profile + mock-vllm-2: + image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better + container_name: mock-vllm-2 + profiles: ["testing"] + privileged: true + networks: + semantic-network: + ipv4_address: 172.28.0.31 + devices: + - /dev/dri:/dev/dri # GPU 设备 + volumes: + - /home/intel/LLM/:/llm/models/ + - /home/intel/LLM2/:/llm/models2/ + environment: + - no_proxy=localhost,127.0.0.1 + - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + - VLLM_WORKER_MULTIPROC_METHOD=spawn + - ZE_AFFINITY_MASK=1 + shm_size: "32g" + entrypoint: /bin/bash + command: > + -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server + --model /llm/models/Qwen3-8B/ + --served-model-name qwen3 + --dtype=float16 + --enforce-eager + --port 8007 + --host 0.0.0.0 + --trust-remote-code + --disable-sliding-window + --gpu-memory-util=0.9 + --no-enable-prefix-caching + --max-num-batched-tokens=2000 + --disable-log-requests + --max-model-len=3000 + --block-size 64 + -tp=1 + -pp=1 + --quantization fp8" + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost:8007/health"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s # 增加启动时间,因为加载大模型需要更长时间 + + # Jaeger for distributed tracing (OTLP gRPC + UI) + jaeger: + image: jaegertracing/all-in-one:latest + container_name: jaeger + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "4318:4317" # OTLP gRPC (mapped to 4318 on host to avoid conflicts) + - "16686:16686" # Web UI + networks: + - semantic-network + + # Prometheus and Grafana for observability + prometheus: + image: prom/prometheus:v2.53.0 + container_name: prometheus + volumes: + - ./addons/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro,z + - prometheus-data:/prometheus + 
+    command:
+      - --config.file=/etc/prometheus/prometheus.yaml
+      - --storage.tsdb.retention.time=15d
+    ports:
+      - "9090:9090"
+    networks:
+      - semantic-network
+
+  grafana:
+    image: grafana/grafana:11.5.1
+    container_name: grafana
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_SECURITY_ADMIN_PASSWORD=admin
+      - PROMETHEUS_URL=prometheus:9090
+      - GF_SECURITY_ALLOW_EMBEDDING=true
+    ports:
+      - "3000:3000"
+    volumes:
+      - ./addons/grafana.ini:/etc/grafana/grafana.ini:ro,z
+      - ./addons/grafana-datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro,z
+      - ./addons/grafana-datasource-jaeger.yaml:/etc/grafana/provisioning/datasources/datasource_jaeger.yaml:ro,z
+      - ./addons/grafana-dashboard.yaml:/etc/grafana/provisioning/dashboards/dashboard.yaml:ro,z
+      - ./addons/llm-router-dashboard.json:/etc/grafana/provisioning/dashboards/llm-router-dashboard.json:ro,z
+      - grafana-data:/var/lib/grafana
+    networks:
+      - semantic-network
+    depends_on:
+      - prometheus
+
+  # Open WebUI (kept for feature parity and user choice)
+  openwebui:
+    image: ghcr.io/open-webui/open-webui:main
+    container_name: openwebui
+    ports:
+      - "3001:8080" # Expose Open WebUI on host 3001
+    environment:
+      - WEBUI_NAME=Open WebUI
+      # Route Open WebUI's OpenAI-compatible calls through Pipelines by default
+      - OPENAI_API_BASE_URL=http://pipelines:9099
+      - OPENAI_API_KEY=0p3n-w3bu!
+    volumes:
+      - openwebui-data:/app/backend/data
+    networks:
+      - semantic-network
+
+
+  # Chat UI (Hugging Face) replacing Open WebUI
+  chat-ui:
+    image: ghcr.io/huggingface/chat-ui-db:sha-a2b39bc
+    container_name: chat-ui
+    ports:
+      - "3002:3000" # Expose Chat UI on host 3002
+    environment:
+      # Point Chat UI to Envoy's OpenAI-compatible endpoint
+      - OPENAI_BASE_URL=http://envoy-proxy:8801/v1
+      # Provide a token if your upstream requires it; for HF router use hf_xxx
+      - OPENAI_API_KEY=${OPENAI_API_KEY:-changeme}
+      # MongoDB for persistence (use Atlas by overriding MONGODB_URL)
+      - MONGODB_URL=${MONGODB_URL:-mongodb://mongo:27017}
+      - MONGODB_DB_NAME=${MONGODB_DB_NAME:-chat-ui}
+      # Optional theming (override as needed)
+      - PUBLIC_APP_NAME=${PUBLIC_APP_NAME:-HuggingChat}
+      - PUBLIC_APP_ASSETS=${PUBLIC_APP_ASSETS:-chatui}
+      - PUBLIC_APP_DATA_SHARING=1
+      - LOG_LEVEL=${LOG_LEVEL:-debug}
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+    depends_on:
+      envoy:
+        condition: service_started
+      semantic-router:
+        condition: service_healthy
+      mongo:
+        condition: service_started
+    healthcheck:
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3000/', (res) => process.exit(res.statusCode === 200 ? 0 : 1))\""]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 20s
+    networks:
+      - semantic-network
+
+
+  # Open WebUI Pipelines server (executes Python pipelines)
+  pipelines:
+    image: ghcr.io/open-webui/pipelines:main
+    container_name: pipelines
+    environment:
+      - PYTHONUNBUFFERED=1
+    volumes:
+      # Persistent pipelines storage (auto-loaded on start)
+      - type: volume
+        source: openwebui-pipelines
+        target: /app/pipelines
+        volume:
+          nocopy: true
+      # Mount our vLLM Semantic Router pipeline (read-only) into the persistent dir
+      - type: bind
+        source: ./addons/vllm_semantic_router_pipe.py
+        target: /app/pipelines/vllm_semantic_router_pipe.py
+        read_only: true
+    networks:
+      - semantic-network
+
+  # MongoDB for Chat UI persistence (dev default; use Atlas in prod)
+  mongo:
+    image: mongo:7
+    container_name: mongo
+    restart: unless-stopped
+    command: ["mongod", "--setParameter", "logLevel=2"]
+    volumes:
+      - mongo-data:/data/db
+    networks:
+      - semantic-network
+
+  # LLM Katan service for testing
+  llm-katan:
+    image: ${LLM_KATAN_IMAGE:-ghcr.io/vllm-project/semantic-router/llm-katan:latest}
+    container_name: llm-katan
+    profiles: ["testing", "llm-katan"]
+    ports:
+      - "8002:8002"
+    environment:
+      - HUGGINGFACE_HUB_TOKEN=${HUGGINGFACE_HUB_TOKEN:-}
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+    volumes:
+      - ../../models:/app/models:ro,z
+      - hf-cache:/home/llmkatan/.cache/huggingface
+    networks:
+      semantic-network:
+        ipv4_address: 172.28.0.20
+    command: ["llm-katan", "--model", "/app/models/Qwen/Qwen3-0.6B", "--served-model-name", "qwen3", "--host", "0.0.0.0", "--port", "8002"]
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8002/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 10s
+
+  # Semantic Router Dashboard
+  dashboard:
+    # Use pre-built image from GHCR, fallback to local build for development
+    # image: ${DASHBOARD_IMAGE:-ghcr.io/vllm-project/semantic-router/dashboard:latest}
+    build:
+      context: ../../
+      dockerfile: dashboard/backend/Dockerfile
+    container_name: semantic-router-dashboard
+    command: ["/app/dashboard-backend", "-port=8700", "-static=/app/frontend", "-config=/app/config/config.yaml"]
+    environment:
+      - DASHBOARD_PORT=8700
+      - TARGET_GRAFANA_URL=http://grafana:3000
+      - TARGET_PROMETHEUS_URL=http://prometheus:9090
+      - TARGET_JAEGER_URL=http://jaeger:16686
+      - TARGET_ROUTER_API_URL=http://semantic-router:8080
+      - TARGET_ROUTER_METRICS_URL=http://semantic-router:9190/metrics
+      - TARGET_OPENWEBUI_URL=http://openwebui:8080
+      - TARGET_CHATUI_URL=http://chat-ui:3000
+      - ROUTER_CONFIG_PATH=/app/config/config.yaml
+      - NO_PROXY=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
+      - no_proxy=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
+    volumes:
+      - ../../config:/app/config:rw,z
+    ports:
+      - "8700:8700"
+    networks:
+      - semantic-network
+    depends_on:
+      semantic-router:
+        condition: service_healthy
+      grafana:
+        condition: service_started
+      prometheus:
+        condition: service_started
+      chat-ui:
+        condition: service_started
+      openwebui:
+        condition: service_started
+      pipelines:
+        condition: service_started
+    healthcheck:
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8700/healthz"]
+      interval: 10s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
+
+networks:
+  semantic-network:
+    driver: bridge
+    ipam:
+      config:
+        - subnet: 172.28.0.0/16
+
+volumes:
+  models-cache:
+    driver: local
+  prometheus-data:
+  grafana-data:
+  mongo-data:
+  openwebui-data:
+  openwebui-pipelines:
+  hf-cache:
diff --git a/deploy/docker-compose/docker-compose.yml b/deploy/docker-compose/docker-compose.yml
index b7b1e14b1..90d7d8606 100644
--- a/deploy/docker-compose/docker-compose.yml
+++ b/deploy/docker-compose/docker-compose.yml
@@ -19,6 +19,8 @@ services:
       - OTEL_SERVICE_NAME=vllm-semantic-router
       - HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface
       - HF_HUB_ENABLE_HF_TRANSFER=1
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
     networks:
       - semantic-network
     healthcheck:
@@ -27,11 +29,19 @@ services:
       timeout: 5s
       retries: 5
       start_period: 30s
-
+    depends_on:
+      mock-vllm-1:
+        condition: service_healthy
+      mock-vllm-2:
+        condition: service_healthy
+
   # Envoy Proxy Service
   envoy:
     image: envoyproxy/envoy:v1.31.7
     container_name: envoy-proxy
+    environment:
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
     security_opt:
       - label=disable
     ports:
@@ -52,24 +62,99 @@ services:
       retries: 5
       start_period: 10s
 
-  # Mock vLLM service for testing profile
-  mock-vllm:
-    build:
-      context: ../../tools/mock-vllm
-      dockerfile: Dockerfile
-    container_name: mock-vllm
-    profiles: ["testing"]
-    ports:
-      - "8000:8000"
-    networks:
-      semantic-network:
-        ipv4_address: 172.28.0.10
-    healthcheck:
-      test: ["CMD", "curl", "-fsS", "http://localhost:8000/health"]
-      interval: 10s
-      timeout: 5s
-      retries: 5
-      start_period: 5s
+  # Mock vLLM service for testing profile
+  mock-vllm-1:
+    image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better
+    container_name: mock-vllm-1
+    profiles: ["testing"]
+    privileged: true
+    networks:
+      semantic-network:
+        ipv4_address: 172.28.0.30
+    devices:
+      - /dev/dri:/dev/dri # GPU devices
+    volumes:
+      - /home/intel/LLM/:/llm/models/
+      - /home/intel/LLM2/:/llm/models2/
+    environment:
+      - no_proxy=localhost,127.0.0.1
+      - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+      - VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - ZE_AFFINITY_MASK=0
+    shm_size: "32g"
+    entrypoint: /bin/bash
+    command: >
+      -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server
+      --model /llm/models/DeepSeek-R1-Distill-Qwen-7B/
+      --served-model-name DeepSeek-R1-Distill-Qwen-7B
+      --dtype=float16
+      --enforce-eager
+      --port 8006
+      --host 0.0.0.0
+      --trust-remote-code
+      --disable-sliding-window
+      --gpu-memory-util=0.9
+      --no-enable-prefix-caching
+      --max-num-batched-tokens=2000
+      --disable-log-requests
+      --max-model-len=3000
+      --block-size 64
+      -tp=1
+      -pp=1
+      --quantization fp8"
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8006/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s # Longer start period because loading a large model takes more time
+
+  # Mock vLLM service for testing profile
+  mock-vllm-2:
+    image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better
+    container_name: mock-vllm-2
+    profiles: ["testing"]
+    privileged: true
+    networks:
+      semantic-network:
+        ipv4_address: 172.28.0.31
+    devices:
+      - /dev/dri:/dev/dri # GPU devices
+    volumes:
+      - /home/intel/LLM/:/llm/models/
+      - /home/intel/LLM2/:/llm/models2/
+    environment:
+      - no_proxy=localhost,127.0.0.1
+      - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+      - VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - ZE_AFFINITY_MASK=1
+    shm_size: "32g"
+    entrypoint: /bin/bash
+    command: >
+      -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server
+      --model /llm/models/Qwen3-8B/
+      --served-model-name qwen3
+      --dtype=float16
+      --enforce-eager
+      --port 8007
+      --host 0.0.0.0
+      --trust-remote-code
+      --disable-sliding-window
+      --gpu-memory-util=0.9
+      --no-enable-prefix-caching
+      --max-num-batched-tokens=2000
+      --disable-log-requests
+      --max-model-len=3000
+      --block-size 64
+      -tp=1
+      -pp=1
+      --quantization fp8"
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8007/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s # Longer start period because loading a large model takes more time
 
   # Jaeger for distributed tracing (OTLP gRPC + UI)
   jaeger:
@@ -105,6 +190,7 @@ services:
       - GF_SECURITY_ADMIN_USER=admin
       - GF_SECURITY_ADMIN_PASSWORD=admin
       - PROMETHEUS_URL=prometheus:9090
+      - GF_SECURITY_ALLOW_EMBEDDING=true
     ports:
       - "3000:3000"
     volumes:
@@ -135,9 +221,10 @@ services:
     networks:
       - semantic-network
 
+  # Chat UI (Hugging Face) replacing Open WebUI
   chat-ui:
-    image: ghcr.io/huggingface/chat-ui-db:latest
+    image: ghcr.io/huggingface/chat-ui-db:sha-a2b39bc
     container_name: chat-ui
     ports:
       - "3002:3000" # Expose Chat UI on host 3002
@@ -152,7 +239,10 @@ services:
       # Optional theming (override as needed)
       - PUBLIC_APP_NAME=${PUBLIC_APP_NAME:-HuggingChat}
       - PUBLIC_APP_ASSETS=${PUBLIC_APP_ASSETS:-chatui}
-      - LOG_LEVEL=${LOG_LEVEL:-info}
+      - PUBLIC_APP_DATA_SHARING=1
+      - LOG_LEVEL=${LOG_LEVEL:-debug}
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
     depends_on:
       envoy:
         condition: service_started
@@ -161,13 +251,14 @@ services:
       mongo:
         condition: service_started
     healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3000/', (res) => process.exit(res.statusCode === 200 ? 0 : 1))\""]
       interval: 10s
       timeout: 5s
       retries: 5
       start_period: 20s
     networks:
       - semantic-network
+
 
   # Open WebUI Pipelines server (executes Python pipelines)
   pipelines:
@@ -195,6 +286,7 @@ services:
     image: mongo:7
     container_name: mongo
     restart: unless-stopped
+    command: ["mongod", "--setParameter", "logLevel=2"]
     volumes:
       - mongo-data:/data/db
     networks:
@@ -243,6 +335,8 @@ services:
       - TARGET_OPENWEBUI_URL=http://openwebui:8080
       - TARGET_CHATUI_URL=http://chat-ui:3000
       - ROUTER_CONFIG_PATH=/app/config/config.yaml
+      - NO_PROXY=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
+      - no_proxy=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
     volumes:
       - ../../config:/app/config:rw,z
     ports:
diff --git a/tools/mock-vllm/Dockerfile b/tools/mock-vllm/Dockerfile
index 63292c9d4..06b0fe002 100644
--- a/tools/mock-vllm/Dockerfile
+++ b/tools/mock-vllm/Dockerfile
@@ -1,4 +1,6 @@
 FROM python:3.11-slim
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 WORKDIR /app
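With the stack up, the routing change itself can be exercised end to end through Envoy's OpenAI-compatible listener on host port 8801. The sketch below is hedged: the request `model` is only a placeholder (the router selects the backend from the classified category), and whether the response's `model` field reflects the rewritten target depends on router and vLLM behavior, so treat it as a rough indicator. Per the category config above, a math prompt should land on `DeepSeek-R1-Distill-Qwen-7B` and a general prompt on `qwen3`.

```bash
# Math prompt: the "math" category now scores DeepSeek-R1-Distill-Qwen-7B at 1.0.
curl -s http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3", "messages": [{"role": "user", "content": "Integrate x^2 from 0 to 3 and show the steps."}]}' \
  | grep -o '"model"[^,]*'

# General prompt: expected to stay on qwen3 via the text-endpoint.
curl -s http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3", "messages": [{"role": "user", "content": "Summarize what a reverse proxy does."}]}' \
  | grep -o '"model"[^,]*'
```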