@@ -19,6 +19,8 @@ services:
       - OTEL_SERVICE_NAME=vllm-semantic-router
       - HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface
       - HF_HUB_ENABLE_HF_TRANSFER=1
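+      # Keep in-cluster traffic off any corporate HTTP proxy; both spellings are
+      # set because different runtimes read different variants of the variable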
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
     networks:
       - semantic-network
     healthcheck:
@@ -27,11 +29,19 @@ services:
       timeout: 5s
       retries: 5
       start_period: 30s
-
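+    # Wait for both vLLM backends; service_healthy requires their healthchecks
+    # to pass, not just the containers to start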
+    depends_on:
+      mock-vllm-1:
+        condition: service_healthy
+      mock-vllm-2:
+        condition: service_healthy
+
   # Envoy Proxy Service
   envoy:
     image: envoyproxy/envoy:v1.31.7
     container_name: envoy-proxy
+    environment:
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
     security_opt:
       - label=disable
     ports:
@@ -52,24 +62,99 @@ services:
       retries: 5
       start_period: 10s

-  # Mock vLLM service for testing profile
-  mock-vllm:
-    build:
-      context: ../../tools/mock-vllm
-      dockerfile: Dockerfile
-    container_name: mock-vllm
-    profiles: ["testing"]
-    ports:
-      - "8000:8000"
-    networks:
-      semantic-network:
-        ipv4_address: 172.28.0.10
-    healthcheck:
-      test: ["CMD", "curl", "-fsS", "http://localhost:8000/health"]
-      interval: 10s
-      timeout: 5s
-      retries: 5
-      start_period: 5s
+  # Mock vLLM service for testing profile
+  mock-vllm-1:
+    image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better
+    container_name: mock-vllm-1
+    profiles: ["testing"]
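+    # Only started when the "testing" profile is active (docker compose --profile testing up)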
+    privileged: true
+    networks:
+      semantic-network:
+        ipv4_address: 172.28.0.30
+    devices:
+      - /dev/dri:/dev/dri # GPU devices
+    volumes:
+      - /home/intel/LLM/:/llm/models/
+      - /home/intel/LLM2/:/llm/models2/
+    environment:
+      - no_proxy=localhost,127.0.0.1
+      - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+      - VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - ZE_AFFINITY_MASK=0
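+      # ZE_AFFINITY_MASK pins this instance to Intel GPU 0 via Level Zero;
+      # mock-vllm-2 uses mask 1 so the two servers land on separate GPUs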
+    shm_size: "32g"
+    entrypoint: /bin/bash
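+    # numactl -C 0-15 binds the API server to CPU cores 0-15; the model is served
+    # on port 8006, which the healthcheck below probes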
+    command: >
+      -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server
+      --model /llm/models/DeepSeek-R1-Distill-Qwen-7B/
+      --served-model-name DeepSeek-R1-Distill-Qwen-7B
+      --dtype=float16
+      --enforce-eager
+      --port 8006
+      --host 0.0.0.0
+      --trust-remote-code
+      --disable-sliding-window
+      --gpu-memory-util=0.9
+      --no-enable-prefix-caching
+      --max-num-batched-tokens=2000
+      --disable-log-requests
+      --max-model-len=3000
+      --block-size 64
+      -tp=1
+      -pp=1
+      --quantization fp8"
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8006/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s # longer start period, since loading a large model takes more time
+
+  # Mock vLLM service for testing profile
+  mock-vllm-2:
+    image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better
+    container_name: mock-vllm-2
+    profiles: ["testing"]
+    privileged: true
+    networks:
+      semantic-network:
+        ipv4_address: 172.28.0.31
+    devices:
+      - /dev/dri:/dev/dri # GPU devices
+    volumes:
+      - /home/intel/LLM/:/llm/models/
+      - /home/intel/LLM2/:/llm/models2/
+    environment:
+      - no_proxy=localhost,127.0.0.1
+      - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+      - VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - ZE_AFFINITY_MASK=1
+    shm_size: "32g"
+    entrypoint: /bin/bash
+    command: >
+      -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server
+      --model /llm/models/Qwen3-8B/
+      --served-model-name qwen3
+      --dtype=float16
+      --enforce-eager
+      --port 8007
+      --host 0.0.0.0
+      --trust-remote-code
+      --disable-sliding-window
+      --gpu-memory-util=0.9
+      --no-enable-prefix-caching
+      --max-num-batched-tokens=2000
+      --disable-log-requests
+      --max-model-len=3000
+      --block-size 64
+      -tp=1
+      -pp=1
+      --quantization fp8"
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8007/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s # longer start period, since loading a large model takes more time

   # Jaeger for distributed tracing (OTLP gRPC + UI)
   jaeger:
@@ -105,6 +190,7 @@ services:
       - GF_SECURITY_ADMIN_USER=admin
       - GF_SECURITY_ADMIN_PASSWORD=admin
       - PROMETHEUS_URL=prometheus:9090
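+      # Allow embedding Grafana panels in iframes (Grafana otherwise sends X-Frame-Options: deny)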
+      - GF_SECURITY_ALLOW_EMBEDDING=true
     ports:
       - "3000:3000"
     volumes:
@@ -135,9 +221,10 @@ services:
     networks:
       - semantic-network

+
   # Chat UI (Hugging Face) replacing Open WebUI
   chat-ui:
-    image: ghcr.io/huggingface/chat-ui-db:latest
+    image: ghcr.io/huggingface/chat-ui-db:sha-a2b39bc
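+    # Pinned to a fixed sha tag rather than :latest so deployments are reproducible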
     container_name: chat-ui
     ports:
       - "3002:3000" # Expose Chat UI on host 3002
@@ -152,7 +239,10 @@ services:
       # Optional theming (override as needed)
       - PUBLIC_APP_NAME=${PUBLIC_APP_NAME:-HuggingChat}
       - PUBLIC_APP_ASSETS=${PUBLIC_APP_ASSETS:-chatui}
-      - LOG_LEVEL=${LOG_LEVEL:-info}
+      - PUBLIC_APP_DATA_SHARING=1
+      - LOG_LEVEL=${LOG_LEVEL:-debug}
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
     depends_on:
       envoy:
         condition: service_started
@@ -161,13 +251,14 @@ services:
       mongo:
         condition: service_started
     healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/"]
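+      # Probe with node, which the Node-based chat-ui image is guaranteed to ship
+      # (wget may be absent from the pinned image)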
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3000/', (res) => process.exit(res.statusCode === 200 ? 0 : 1))\""]
       interval: 10s
       timeout: 5s
       retries: 5
       start_period: 20s
     networks:
       - semantic-network
+

   # Open WebUI Pipelines server (executes Python pipelines)
   pipelines:
@@ -195,6 +286,7 @@ services:
     image: mongo:7
     container_name: mongo
     restart: unless-stopped
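+    # logLevel=2 raises mongod log verbosity (0 is the default; 1-5 add debug detail)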
+    command: ["mongod", "--setParameter", "logLevel=2"]
     volumes:
       - mongo-data:/data/db
     networks:
@@ -243,6 +335,8 @@ services:
       - TARGET_OPENWEBUI_URL=http://openwebui:8080
       - TARGET_CHATUI_URL=http://chat-ui:3000
       - ROUTER_CONFIG_PATH=/app/config/config.yaml
+      - NO_PROXY=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
+      - no_proxy=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
     volumes:
       - ../../config:/app/config:rw,z
     ports: