diff --git a/config/config.yaml b/config/config.yaml
index 085f0cdf9..45ae36250 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -45,17 +45,28 @@ prompt_guard:
 # Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
 # NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
 vllm_endpoints:
-  - name: "endpoint1"
-    address: "172.28.0.20" # Static IPv4 of llm-katan within docker compose network
-    port: 8002
-    weight: 1
+  - name: "math-endpoint"
+    address: "172.28.0.30" # Math model server IP
+    port: 8006 # Math model port
+    weight: 1
+
+  - name: "text-endpoint"
+    address: "172.28.0.30" # Text model server IP
+    port: 8007 # Text model port
+    weight: 1
 
-model_config:
-  "qwen3":
-    reasoning_family: "qwen3" # This model uses Qwen-3 reasoning syntax
-    preferred_endpoints: ["endpoint1"] # Optional: omit to let upstream handle endpoint selection
-    pii_policy:
-      allow_by_default: true
+model_config:
+  "DeepSeek-R1-Distill-Qwen-7B":
+    reasoning_family: "deepseek" # DeepSeek uses the deepseek reasoning syntax
+    preferred_endpoints: ["math-endpoint"]
+    pii_policy:
+      allow_by_default: true
+
+  "qwen3":
+    reasoning_family: "qwen3" # Qwen3 uses the qwen3 reasoning syntax
+    preferred_endpoints: ["text-endpoint"]
+    pii_policy:
+      allow_by_default: true
 
 # Classifier configuration
 classifier:
@@ -139,7 +150,7 @@ categories:
   - name: math
     system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
     model_scores:
-      - model: qwen3
+      - model: DeepSeek-R1-Distill-Qwen-7B
         score: 1.0
         use_reasoning: true # Enable reasoning for complex math
   - name: physics
diff --git a/dashboard/Dockerfile b/dashboard/Dockerfile
index c7732eafa..1f40bc8a5 100644
--- a/dashboard/Dockerfile
+++ b/dashboard/Dockerfile
@@ -1,5 +1,7 @@
 # Build frontend
 FROM node:18-alpine AS frontend-builder
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 WORKDIR /app/frontend
 COPY frontend/package*.json ./
 RUN npm ci
@@ -8,6 +10,8 @@ RUN npm run build
 
 # Build backend
 FROM golang:1.21-alpine AS backend-builder
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 WORKDIR /app/backend
 COPY backend/go.* ./
 RUN go mod download
@@ -16,6 +20,8 @@ RUN CGO_ENABLED=0 GOOS=linux go build -o dashboard-server .
 
 # Final image
 FROM alpine:3.18
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 RUN apk add --no-cache ca-certificates
 WORKDIR /app
 COPY --from=backend-builder /app/backend/dashboard-server .
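A quick way to sanity-check the new `vllm_endpoints` entries against what the backends actually serve is to query each endpoint's model list from inside the compose network. This is only a sketch: it assumes the stack defined later in this diff is running, and that `curl` is available in the `semantic-router` container (its healthcheck already uses it); the address and ports come straight from the config above.

```bash
# Both endpoints share the address 172.28.0.30; they differ only by port.
# Expected served models: DeepSeek-R1-Distill-Qwen-7B on 8006, qwen3 on 8007.
docker compose exec semantic-router curl -s http://172.28.0.30:8006/v1/models
docker compose exec semantic-router curl -s http://172.28.0.30:8007/v1/models
```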
diff --git a/dashboard/backend/Dockerfile b/dashboard/backend/Dockerfile
index f513ffb5f..2b1974ddf 100644
--- a/dashboard/backend/Dockerfile
+++ b/dashboard/backend/Dockerfile
@@ -1,5 +1,7 @@
 # Stage 1: Build frontend with Node.js
 FROM node:20-alpine AS frontend-builder
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 WORKDIR /app/frontend
 COPY dashboard/frontend/package.json dashboard/frontend/package-lock.json dashboard/frontend/tsconfig.json dashboard/frontend/tsconfig.node.json dashboard/frontend/vite.config.ts ./
 COPY dashboard/frontend/src ./src
@@ -11,7 +13,8 @@ RUN npm run build
 # Stage 2: Build backend with Go
 FROM golang:1.24 AS backend-builder
 WORKDIR /app
-
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 # Use Chinese Go proxy to avoid network timeout issues
 ENV GOPROXY=https://goproxy.cn,direct
 ENV GOSUMDB=sum.golang.google.cn
@@ -31,6 +34,8 @@ RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s
 
 # Stage 3: Final runtime image
 FROM alpine:3.19
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 RUN apk --no-cache add ca-certificates wget
 WORKDIR /app
 COPY --from=backend-builder /app/dashboard-backend /app/dashboard-backend
diff --git a/deploy/docker-compose/addons/grafana.ini b/deploy/docker-compose/addons/grafana.ini
index c30fe2016..3b0152db8 100644
--- a/deploy/docker-compose/addons/grafana.ini
+++ b/deploy/docker-compose/addons/grafana.ini
@@ -36,7 +36,8 @@ enabled = false
 
 [server]
 # Configure root URL
-root_url = %(protocol)s://%(domain)s:%(http_port)s/
+# root_url = %(protocol)s://%(domain)s:%(http_port)s/
+root_url = http://10.112.229.41:8700/embedded/grafana/
 # Disable serving from sub path
 serve_from_sub_path = false
diff --git a/deploy/docker-compose/docker-compose-bk.yml b/deploy/docker-compose/docker-compose-bk.yml
new file mode 100644
index 000000000..90d7d8606
--- /dev/null
+++ b/deploy/docker-compose/docker-compose-bk.yml
@@ -0,0 +1,381 @@
+services:
+
+  # Semantic Router External Processor Service
+  semantic-router:
+    image: ghcr.io/vllm-project/semantic-router/extproc:latest
+    container_name: semantic-router
+    ports:
+      - "50051:50051"
+    volumes:
+      - ../../config:/app/config:ro,z
+      - ../../models:/app/models:ro,z
+      - ~/.cache/huggingface:/root/.cache/huggingface:z
+    environment:
+      - LD_LIBRARY_PATH=/app/lib
+      # Use main config by default; override via CONFIG_FILE if needed
+      - CONFIG_FILE=${CONFIG_FILE:-/app/config/config.yaml}
+      # Optional informational envs (router reads YAML for tracing config)
+      - OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4317
+      - OTEL_SERVICE_NAME=vllm-semantic-router
+      - HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+    networks:
+      - semantic-network
+    healthcheck:
+      test: ["CMD", "curl", "-f", "localhost:8080/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s
+    depends_on:
+      mock-vllm-1:
+        condition: service_healthy
+      mock-vllm-2:
+        condition: service_healthy
+
+  # Envoy Proxy Service
+  envoy:
+    image: envoyproxy/envoy:v1.31.7
+    container_name: envoy-proxy
+    environment:
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+    security_opt:
+      - label=disable
+    ports:
+      - "8801:8801" # Main proxy port
"19000:19000" # Admin interface + volumes: + - ./addons/envoy.yaml:/etc/envoy/envoy.yaml:ro,z + command: ["/usr/local/bin/envoy", "-c", "/etc/envoy/envoy.yaml", "--component-log-level", "ext_proc:trace,router:trace,http:trace"] + depends_on: + semantic-router: + condition: service_healthy + networks: + - semantic-network + healthcheck: + test: ["CMD", "bash", "-c", "(echo -e 'GET /ready HTTP/1.1\\r\\nHost: localhost\\r\\n\\r\\n' >&3; timeout 2 cat <&3) 3<>/dev/tcp/localhost/19000 | grep -q LIVE"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + + # Mock vLLM service for testing profile + mock-vllm-1: + image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better + container_name: mock-vllm-1 + profiles: ["testing"] + privileged: true + networks: + semantic-network: + ipv4_address: 172.28.0.30 + devices: + - /dev/dri:/dev/dri # GPU 设备 + volumes: + - /home/intel/LLM/:/llm/models/ + - /home/intel/LLM2/:/llm/models2/ + environment: + - no_proxy=localhost,127.0.0.1 + - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + - VLLM_WORKER_MULTIPROC_METHOD=spawn + - ZE_AFFINITY_MASK=0 + shm_size: "32g" + entrypoint: /bin/bash + command: > + -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server + --model /llm/models/DeepSeek-R1-Distill-Qwen-7B/ + --served-model-name DeepSeek-R1-Distill-Qwen-7B + --dtype=float16 + --enforce-eager + --port 8006 + --host 0.0.0.0 + --trust-remote-code + --disable-sliding-window + --gpu-memory-util=0.9 + --no-enable-prefix-caching + --max-num-batched-tokens=2000 + --disable-log-requests + --max-model-len=3000 + --block-size 64 + -tp=1 + -pp=1 + --quantization fp8" + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost:8006/health"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s # 增加启动时间,因为加载大模型需要更长时间 + + # Mock vLLM service for testing profile + mock-vllm-2: + image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better + container_name: mock-vllm-2 + profiles: ["testing"] + privileged: true + networks: + semantic-network: + ipv4_address: 172.28.0.31 + devices: + - /dev/dri:/dev/dri # GPU 设备 + volumes: + - /home/intel/LLM/:/llm/models/ + - /home/intel/LLM2/:/llm/models2/ + environment: + - no_proxy=localhost,127.0.0.1 + - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + - VLLM_WORKER_MULTIPROC_METHOD=spawn + - ZE_AFFINITY_MASK=1 + shm_size: "32g" + entrypoint: /bin/bash + command: > + -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server + --model /llm/models/Qwen3-8B/ + --served-model-name qwen3 + --dtype=float16 + --enforce-eager + --port 8007 + --host 0.0.0.0 + --trust-remote-code + --disable-sliding-window + --gpu-memory-util=0.9 + --no-enable-prefix-caching + --max-num-batched-tokens=2000 + --disable-log-requests + --max-model-len=3000 + --block-size 64 + -tp=1 + -pp=1 + --quantization fp8" + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost:8007/health"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s # 增加启动时间,因为加载大模型需要更长时间 + + # Jaeger for distributed tracing (OTLP gRPC + UI) + jaeger: + image: jaegertracing/all-in-one:latest + container_name: jaeger + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "4318:4317" # OTLP gRPC (mapped to 4318 on host to avoid conflicts) + - "16686:16686" # Web UI + networks: + - semantic-network + + # Prometheus and Grafana for observability + prometheus: + image: prom/prometheus:v2.53.0 + container_name: prometheus + volumes: + - ./addons/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro,z + - prometheus-data:/prometheus + 
+    command:
+      - --config.file=/etc/prometheus/prometheus.yaml
+      - --storage.tsdb.retention.time=15d
+    ports:
+      - "9090:9090"
+    networks:
+      - semantic-network
+
+  grafana:
+    image: grafana/grafana:11.5.1
+    container_name: grafana
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_SECURITY_ADMIN_PASSWORD=admin
+      - PROMETHEUS_URL=prometheus:9090
+      - GF_SECURITY_ALLOW_EMBEDDING=true
+    ports:
+      - "3000:3000"
+    volumes:
+      - ./addons/grafana.ini:/etc/grafana/grafana.ini:ro,z
+      - ./addons/grafana-datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro,z
+      - ./addons/grafana-datasource-jaeger.yaml:/etc/grafana/provisioning/datasources/datasource_jaeger.yaml:ro,z
+      - ./addons/grafana-dashboard.yaml:/etc/grafana/provisioning/dashboards/dashboard.yaml:ro,z
+      - ./addons/llm-router-dashboard.json:/etc/grafana/provisioning/dashboards/llm-router-dashboard.json:ro,z
+      - grafana-data:/var/lib/grafana
+    networks:
+      - semantic-network
+    depends_on:
+      - prometheus
+
+  # Open WebUI (kept for feature parity and user choice)
+  openwebui:
+    image: ghcr.io/open-webui/open-webui:main
+    container_name: openwebui
+    ports:
+      - "3001:8080" # Expose Open WebUI on host 3001
+    environment:
+      - WEBUI_NAME=Open WebUI
+      # Route Open WebUI's OpenAI-compatible calls through Pipelines by default
+      - OPENAI_API_BASE_URL=http://pipelines:9099
+      - OPENAI_API_KEY=0p3n-w3bu!
+    volumes:
+      - openwebui-data:/app/backend/data
+    networks:
+      - semantic-network
+
+
+  # Chat UI (Hugging Face) replacing Open WebUI
+  chat-ui:
+    image: ghcr.io/huggingface/chat-ui-db:sha-a2b39bc
+    container_name: chat-ui
+    ports:
+      - "3002:3000" # Expose Chat UI on host 3002
+    environment:
+      # Point Chat UI to Envoy's OpenAI-compatible endpoint
+      - OPENAI_BASE_URL=http://envoy-proxy:8801/v1
+      # Provide a token if your upstream requires it; for HF router use hf_xxx
+      - OPENAI_API_KEY=${OPENAI_API_KEY:-changeme}
+      # MongoDB for persistence (use Atlas by overriding MONGODB_URL)
+      - MONGODB_URL=${MONGODB_URL:-mongodb://mongo:27017}
+      - MONGODB_DB_NAME=${MONGODB_DB_NAME:-chat-ui}
+      # Optional theming (override as needed)
+      - PUBLIC_APP_NAME=${PUBLIC_APP_NAME:-HuggingChat}
+      - PUBLIC_APP_ASSETS=${PUBLIC_APP_ASSETS:-chatui}
+      - PUBLIC_APP_DATA_SHARING=1
+      - LOG_LEVEL=${LOG_LEVEL:-debug}
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+    depends_on:
+      envoy:
+        condition: service_started
+      semantic-router:
+        condition: service_healthy
+      mongo:
+        condition: service_started
+    healthcheck:
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3000/', (res) => process.exit(res.statusCode === 200 ? 0 : 1))\""]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 20s
+    networks:
+      - semantic-network
+
+
+  # Open WebUI Pipelines server (executes Python pipelines)
+  pipelines:
+    image: ghcr.io/open-webui/pipelines:main
+    container_name: pipelines
+    environment:
+      - PYTHONUNBUFFERED=1
+    volumes:
+      # Persistent pipelines storage (auto-loaded on start)
+      - type: volume
+        source: openwebui-pipelines
+        target: /app/pipelines
+        volume:
+          nocopy: true
+      # Mount our vLLM Semantic Router pipeline (read-only) into the persistent dir
+      - type: bind
+        source: ./addons/vllm_semantic_router_pipe.py
+        target: /app/pipelines/vllm_semantic_router_pipe.py
+        read_only: true
+    networks:
+      - semantic-network
+
+  # MongoDB for Chat UI persistence (dev default; use Atlas in prod)
+  mongo:
+    image: mongo:7
+    container_name: mongo
+    restart: unless-stopped
+    command: ["mongod", "--setParameter", "logLevel=2"]
+    volumes:
+      - mongo-data:/data/db
+    networks:
+      - semantic-network
+
+  # LLM Katan service for testing
+  llm-katan:
+    image: ${LLM_KATAN_IMAGE:-ghcr.io/vllm-project/semantic-router/llm-katan:latest}
+    container_name: llm-katan
+    profiles: ["testing", "llm-katan"]
+    ports:
+      - "8002:8002"
+    environment:
+      - HUGGINGFACE_HUB_TOKEN=${HUGGINGFACE_HUB_TOKEN:-}
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+    volumes:
+      - ../../models:/app/models:ro,z
+      - hf-cache:/home/llmkatan/.cache/huggingface
+    networks:
+      semantic-network:
+        ipv4_address: 172.28.0.20
+    command: ["llm-katan", "--model", "/app/models/Qwen/Qwen3-0.6B", "--served-model-name", "qwen3", "--host", "0.0.0.0", "--port", "8002"]
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8002/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 10s
+
+  # Semantic Router Dashboard
+  dashboard:
+    # Use pre-built image from GHCR, fallback to local build for development
+    # image: ${DASHBOARD_IMAGE:-ghcr.io/vllm-project/semantic-router/dashboard:latest}
+    build:
+      context: ../../
+      dockerfile: dashboard/backend/Dockerfile
+    container_name: semantic-router-dashboard
+    command: ["/app/dashboard-backend", "-port=8700", "-static=/app/frontend", "-config=/app/config/config.yaml"]
+    environment:
+      - DASHBOARD_PORT=8700
+      - TARGET_GRAFANA_URL=http://grafana:3000
+      - TARGET_PROMETHEUS_URL=http://prometheus:9090
+      - TARGET_JAEGER_URL=http://jaeger:16686
+      - TARGET_ROUTER_API_URL=http://semantic-router:8080
+      - TARGET_ROUTER_METRICS_URL=http://semantic-router:9190/metrics
+      - TARGET_OPENWEBUI_URL=http://openwebui:8080
+      - TARGET_CHATUI_URL=http://chat-ui:3000
+      - ROUTER_CONFIG_PATH=/app/config/config.yaml
+      - NO_PROXY=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
+      - no_proxy=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
+    volumes:
+      - ../../config:/app/config:rw,z
+    ports:
+      - "8700:8700"
+    networks:
+      - semantic-network
+    depends_on:
+      semantic-router:
+        condition: service_healthy
+      grafana:
+        condition: service_started
+      prometheus:
+        condition: service_started
+      chat-ui:
+        condition: service_started
+      openwebui:
+        condition: service_started
+      pipelines:
+        condition: service_started
+    healthcheck:
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8700/healthz"]
+      interval: 10s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
+
+networks:
+  semantic-network:
+    driver: bridge
+    ipam:
+      config:
+        - subnet: 172.28.0.0/16
+
+volumes:
+  models-cache:
+    driver: local
+  prometheus-data:
+  grafana-data:
+  mongo-data:
+  openwebui-data:
+  openwebui-pipelines:
+  hf-cache:
diff --git a/deploy/docker-compose/docker-compose.yml b/deploy/docker-compose/docker-compose.yml
index b7b1e14b1..90d7d8606 100644
--- a/deploy/docker-compose/docker-compose.yml
+++ b/deploy/docker-compose/docker-compose.yml
@@ -19,6 +19,8 @@ services:
       - OTEL_SERVICE_NAME=vllm-semantic-router
       - HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface
       - HF_HUB_ENABLE_HF_TRANSFER=1
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
     networks:
       - semantic-network
     healthcheck:
@@ -27,11 +29,19 @@ services:
       timeout: 5s
       retries: 5
       start_period: 30s
-
+    depends_on:
+      mock-vllm-1:
+        condition: service_healthy
+      mock-vllm-2:
+        condition: service_healthy
+
   # Envoy Proxy Service
   envoy:
     image: envoyproxy/envoy:v1.31.7
     container_name: envoy-proxy
+    environment:
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
     security_opt:
       - label=disable
     ports:
@@ -52,24 +62,99 @@ services:
       retries: 5
       start_period: 10s
 
-  # Mock vLLM service for testing profile
-  mock-vllm:
-    build:
-      context: ../../tools/mock-vllm
-      dockerfile: Dockerfile
-    container_name: mock-vllm
-    profiles: ["testing"]
-    ports:
-      - "8000:8000"
-    networks:
-      semantic-network:
-        ipv4_address: 172.28.0.10
-    healthcheck:
-      test: ["CMD", "curl", "-fsS", "http://localhost:8000/health"]
-      interval: 10s
-      timeout: 5s
-      retries: 5
-      start_period: 5s
+  # Mock vLLM service for testing profile
+  mock-vllm-1:
+    image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better
+    container_name: mock-vllm-1
+    profiles: ["testing"]
+    privileged: true
+    networks:
+      semantic-network:
+        ipv4_address: 172.28.0.30
+    devices:
+      - /dev/dri:/dev/dri # GPU devices
+    volumes:
+      - /home/intel/LLM/:/llm/models/
+      - /home/intel/LLM2/:/llm/models2/
+    environment:
+      - no_proxy=localhost,127.0.0.1
+      - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+      - VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - ZE_AFFINITY_MASK=0
+    shm_size: "32g"
+    entrypoint: /bin/bash
+    command: >
+      -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server
+      --model /llm/models/DeepSeek-R1-Distill-Qwen-7B/
+      --served-model-name DeepSeek-R1-Distill-Qwen-7B
+      --dtype=float16
+      --enforce-eager
+      --port 8006
+      --host 0.0.0.0
+      --trust-remote-code
+      --disable-sliding-window
+      --gpu-memory-util=0.9
+      --no-enable-prefix-caching
+      --max-num-batched-tokens=2000
+      --disable-log-requests
+      --max-model-len=3000
+      --block-size 64
+      -tp=1
+      -pp=1
+      --quantization fp8"
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8006/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s # Longer start period because loading a large model takes more time
+
+  # Mock vLLM service for testing profile
+  mock-vllm-2:
+    image: amr-registry.caas.intel.com/intelanalytics/llm-scaler-vllm:temp-b6-better
+    container_name: mock-vllm-2
+    profiles: ["testing"]
+    privileged: true
+    networks:
+      semantic-network:
+        ipv4_address: 172.28.0.31
+    devices:
+      - /dev/dri:/dev/dri # GPU devices
+    volumes:
+      - /home/intel/LLM/:/llm/models/
+      - /home/intel/LLM2/:/llm/models2/
+    environment:
+      - no_proxy=localhost,127.0.0.1
+      - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+      - VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - ZE_AFFINITY_MASK=1
+    shm_size: "32g"
+    entrypoint: /bin/bash
+    command: >
+      -c "numactl -C 0-15 python3 -m vllm.entrypoints.openai.api_server
+      --model /llm/models/Qwen3-8B/
+      --served-model-name qwen3
+      --dtype=float16
+      --enforce-eager
+      --port 8007
+      --host 0.0.0.0
+      --trust-remote-code
+      --disable-sliding-window
+      --gpu-memory-util=0.9
+      --no-enable-prefix-caching
+      --max-num-batched-tokens=2000
+      --disable-log-requests
+      --max-model-len=3000
+      --block-size 64
+      -tp=1
+      -pp=1
+      --quantization fp8"
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8007/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s # Longer start period because loading a large model takes more time
 
   # Jaeger for distributed tracing (OTLP gRPC + UI)
   jaeger:
@@ -105,6 +190,7 @@ services:
       - GF_SECURITY_ADMIN_USER=admin
       - GF_SECURITY_ADMIN_PASSWORD=admin
       - PROMETHEUS_URL=prometheus:9090
+      - GF_SECURITY_ALLOW_EMBEDDING=true
     ports:
       - "3000:3000"
     volumes:
@@ -135,9 +221,10 @@ services:
     networks:
       - semantic-network
 
+  # Chat UI (Hugging Face) replacing Open WebUI
   chat-ui:
-    image: ghcr.io/huggingface/chat-ui-db:latest
+    image: ghcr.io/huggingface/chat-ui-db:sha-a2b39bc
     container_name: chat-ui
     ports:
       - "3002:3000" # Expose Chat UI on host 3002
@@ -152,7 +239,10 @@ services:
       # Optional theming (override as needed)
       - PUBLIC_APP_NAME=${PUBLIC_APP_NAME:-HuggingChat}
       - PUBLIC_APP_ASSETS=${PUBLIC_APP_ASSETS:-chatui}
-      - LOG_LEVEL=${LOG_LEVEL:-info}
+      - PUBLIC_APP_DATA_SHARING=1
+      - LOG_LEVEL=${LOG_LEVEL:-debug}
+      - NO_PROXY=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
+      - no_proxy=envoy-proxy,mongo,semantic-router,localhost,127.0.0.1
     depends_on:
       envoy:
         condition: service_started
@@ -161,13 +251,14 @@ services:
       mongo:
         condition: service_started
     healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3000/', (res) => process.exit(res.statusCode === 200 ? 0 : 1))\""]
       interval: 10s
       timeout: 5s
       retries: 5
       start_period: 20s
     networks:
       - semantic-network
+
 
   # Open WebUI Pipelines server (executes Python pipelines)
   pipelines:
@@ -195,6 +286,7 @@ services:
     image: mongo:7
     container_name: mongo
     restart: unless-stopped
+    command: ["mongod", "--setParameter", "logLevel=2"]
     volumes:
       - mongo-data:/data/db
     networks:
@@ -243,6 +335,8 @@ services:
       - TARGET_OPENWEBUI_URL=http://openwebui:8080
       - TARGET_CHATUI_URL=http://chat-ui:3000
       - ROUTER_CONFIG_PATH=/app/config/config.yaml
+      - NO_PROXY=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
+      - no_proxy=grafana,chat-ui,semantic-router,prometheus,jaeger,openwebui,mongo,localhost,127.0.0.1
     volumes:
       - ../../config:/app/config:rw,z
     ports:
diff --git a/tools/mock-vllm/Dockerfile b/tools/mock-vllm/Dockerfile
index 63292c9d4..06b0fe002 100644
--- a/tools/mock-vllm/Dockerfile
+++ b/tools/mock-vllm/Dockerfile
@@ -1,4 +1,6 @@
 FROM python:3.11-slim
+ENV http_proxy=http://proxy.iil.intel.com:911
+ENV https_proxy=http://proxy.iil.intel.com:911
 WORKDIR /app
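With the stack up, the routing change itself can be exercised end to end through Envoy's OpenAI-compatible listener on host port 8801. The sketch below is hedged: the request `model` is only a placeholder (the router selects the backend from the classified category), and whether the response's `model` field reflects the rewritten target depends on router and vLLM behavior, so treat it as a rough indicator. Per the category config above, a math prompt should land on `DeepSeek-R1-Distill-Qwen-7B` and a general prompt on `qwen3`.

```bash
# Math prompt: the "math" category now scores DeepSeek-R1-Distill-Qwen-7B at 1.0.
curl -s http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3", "messages": [{"role": "user", "content": "Integrate x^2 from 0 to 3 and show the steps."}]}' \
  | grep -o '"model"[^,]*'

# General prompt: expected to stay on qwen3 via the text-endpoint.
curl -s http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3", "messages": [{"role": "user", "content": "Summarize what a reverse proxy does."}]}' \
  | grep -o '"model"[^,]*'
```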