Commit 274d029
update config & update docker compose README
Signed-off-by: JaredforReal <[email protected]>
1 parent 78ab285 commit 274d029

File tree

3 files changed: +221 −51 lines

deploy/docker-compose/README.md

Lines changed: 66 additions & 3 deletions
```diff
@@ -4,8 +4,9 @@ This directory contains the primary `docker-compose.yml` used to run the Semantic Router
 
 - Envoy proxy (ExtProc integration)
 - Semantic Router (extproc)
-- Observability (Prometheus + Grafana)
+- Observability (Prometheus + Grafana + Jaeger)
 - Dashboard (unified UI: config, monitoring, topology, playground)
+- Chat UI (Hugging Face Chat UI with MongoDB)
 - Open WebUI + Pipelines (for the Playground tab)
 - Optional test services (mock-vllm, llm-katan via profiles)
 
```

```diff
@@ -26,11 +27,14 @@ Example mappings:
 - `semantic-router` (port: 50051 for gRPC ExtProc; has internal health on 8080)
 - `prometheus` (port: 9090)
 - `grafana` (port: 3000)
+- `jaeger` (ports: 4318, 16686)
+- `chat-ui` (port: 3002 → 3000 in-container)
+- `mongo` (no host port by default)
 - `openwebui` (port: 3001 → 8080 in-container)
 - `pipelines` (no host port by default)
 - `dashboard` (port: 8700)
 - `mock-vllm` (port: 8000; profile: testing)
-- `llm-katan` (port: 8002 → 8000; profiles: testing, llm-katan)
+- `llm-katan` (port: 8002; profiles: testing, llm-katan)
 
 ## Profiles
 
```
```diff
@@ -46,6 +50,8 @@ These host ports are exposed when you bring the stack up:
 - Envoy admin: http://localhost:19000
 - Grafana: http://localhost:3000 (admin/admin)
 - Prometheus: http://localhost:9090
+- Jaeger: http://localhost:16686 (tracing UI)
+- Chat UI: http://localhost:3002 (Hugging Face Chat UI)
 - Open WebUI: http://localhost:3001
 - Mock vLLM (testing profile): http://localhost:8000
 - LLM Katan (testing/llm-katan profiles): http://localhost:8002
```
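For reference, the `jaeger` ports listed above imply a compose service along these lines. This is a hedged sketch, not the project's actual service block: the image tag and the `COLLECTOR_OTLP_ENABLED` flag are assumptions based on the stock Jaeger all-in-one image, so check `docker-compose.yml` for the real definition.

```yaml
# Sketch only - see docker-compose.yml for the actual service definition
jaeger:
  image: jaegertracing/all-in-one:latest # assumed image
  environment:
    - COLLECTOR_OTLP_ENABLED=true # enables the OTLP collector
  ports:
    - "16686:16686" # tracing UI
    - "4318:4318"   # OTLP ingest
```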
```diff
@@ -107,15 +113,17 @@ The `dashboard` service exposes a unified UI at http://localhost:8700 with:
 - Monitoring: iframe embed of Grafana
 - Config: `GET /api/router/config/all` and `POST /api/router/config/update` mapped to `/app/config/config.yaml`
 - Topology: visualizes routing/config
-- Playground: iframe embed of Open WebUI
+- Playground: iframe embed of Open WebUI and Chat UI
 
 Environment variables set in Compose:
 
 - `TARGET_GRAFANA_URL=http://grafana:3000`
 - `TARGET_PROMETHEUS_URL=http://prometheus:9090`
+- `TARGET_JAEGER_URL=http://jaeger:16686`
 - `TARGET_ROUTER_API_URL=http://semantic-router:8080`
 - `TARGET_ROUTER_METRICS_URL=http://semantic-router:9190/metrics`
 - `TARGET_OPENWEBUI_URL=http://openwebui:8080`
+- `TARGET_CHATUI_URL=http://chat-ui:3000`
 - `ROUTER_CONFIG_PATH=/app/config/config.yaml`
 
 Volumes:
```
````diff
@@ -126,11 +134,66 @@ Image selection:
 
 - Uses `DASHBOARD_IMAGE` if provided; otherwise builds from `dashboard/backend/Dockerfile` at `docker compose up` time.
 
+## Chat UI (Hugging Face)
+
+The `chat-ui` service provides a modern chat interface using Hugging Face's Chat UI:
+
+- **URL**: http://localhost:3002
+- **Database**: MongoDB for conversation persistence
+- **API Integration**: Routes through Envoy proxy for OpenAI-compatible API calls
+- **Configuration**:
+  - `OPENAI_BASE_URL=http://envoy-proxy:8801/v1` (routes through Envoy)
+  - `OPENAI_API_KEY` (configurable via environment variable)
+  - `MONGODB_URL=mongodb://mongo:27017` (local MongoDB by default)
+
+### Environment Variables
+
+You can customize Chat UI behavior by setting these environment variables:
+
+```bash
+# API Configuration
+export OPENAI_API_KEY="your-api-key-here"
+export MONGODB_URL="mongodb://mongo:27017" # or Atlas URL for production
+export MONGODB_DB_NAME="chat-ui"
+
+# UI Customization
+export PUBLIC_APP_NAME="HuggingChat"
+export PUBLIC_APP_ASSETS="chatui"
+export LOG_LEVEL="info"
+```
+
 ## Open WebUI + Pipelines
 
 - `openwebui` is exposed at http://localhost:3001 (proxied via the Dashboard too)
 - `pipelines` mounts `./addons/vllm_semantic_router_pipe.py` into `/app/pipelines/` for easy integration
 
+## Observability Stack
+
+The stack includes a complete observability solution:
+
+### Prometheus
+
+- **URL**: http://localhost:9090
+- **Configuration**: `./addons/prometheus.yaml`
+- **Data Retention**: 15 days
+- **Storage**: Persistent volume `prometheus-data`
+
+### Grafana
+
+- **URL**: http://localhost:3000
+- **Credentials**: admin/admin
+- **Configuration**:
+  - Datasources: Prometheus and Jaeger
+  - Dashboard: LLM Router dashboard
+  - Storage: Persistent volume `grafana-data`
+
+### Jaeger (Distributed Tracing)
+
+- **URL**: http://localhost:16686
+- **OTLP Endpoint**: http://localhost:4318 (gRPC)
+- **Configuration**: OTLP collector enabled
+- **Integration**: Semantic Router sends traces via OTLP
+
 ## Networking
 
 All services join the `semantic-network` bridge network with a fixed subnet to make in-network lookups stable. Host-published ports are listed above under Services & Ports.
````
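The Chat UI environment variables shown in this README can also be supplied without editing the main compose file, via a `docker-compose.override.yml`. A minimal sketch (assumes the service is named `chat-ui`, as in the service list above; the default values are placeholders, not project defaults):

```yaml
# docker-compose.override.yml - sketch, not part of the repository
services:
  chat-ui:
    environment:
      - OPENAI_API_KEY=${OPENAI_API_KEY:-changeme}
      - MONGODB_URL=${MONGODB_URL:-mongodb://mongo:27017}
      - PUBLIC_APP_NAME=${PUBLIC_APP_NAME:-HuggingChat}
```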

deploy/kubernetes/config.yaml

Lines changed: 82 additions & 28 deletions
```diff
@@ -5,11 +5,11 @@ bert_model:
 
 semantic_cache:
   enabled: true
-  backend_type: "memory" # Options: "memory" or "milvus"
+  backend_type: "memory" # Options: "memory" or "milvus"
   similarity_threshold: 0.8
-  max_entries: 1000 # Only applies to memory backend
+  max_entries: 1000 # Only applies to memory backend
   ttl_seconds: 3600
-  eviction_policy: "fifo"
+  eviction_policy: "fifo"
 
 tools:
   enabled: true
```
```diff
@@ -19,7 +19,7 @@ tools:
   fallback_to_empty: true
 
 prompt_guard:
-  enabled: true
+  enabled: true # Global default - can be overridden per category with jailbreak_enabled
   use_modernbert: true
   model_id: "models/jailbreak_classifier_modernbert-base_model"
   threshold: 0.7
```
```diff
@@ -32,13 +32,13 @@ prompt_guard:
 # NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
 vllm_endpoints:
   - name: "endpoint1"
-    address: "127.0.0.1" # IPv4 address - REQUIRED format
-    port: 8000
+    address: "172.28.0.20" # Static IPv4 of llm-katan within docker compose network
+    port: 8002
     weight: 1
 
 model_config:
-  "openai/gpt-oss-20b":
-    reasoning_family: "gpt-oss" # This model uses GPT-OSS reasoning syntax
+  "qwen3":
+    reasoning_family: "qwen3" # This model uses Qwen-3 reasoning syntax
     preferred_endpoints: ["endpoint1"]
     pii_policy:
       allow_by_default: true
```
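The config comments above are strict about the `address` format: a bare IPv4 only, with the port in a separate field. A quick shape check before deploying can be sketched in shell (`check_addr` is a hypothetical helper, not part of the project; the regex checks dotted-quad shape only, not octet ranges):

```shell
# check_addr: accept only a bare dotted-quad IPv4 address (shape check only)
check_addr() {
  if printf '%s\n' "$1" | grep -Eq '^[0-9]{1,3}(\.[0-9]{1,3}){3}$'; then
    echo "ok: $1"
  else
    echo "invalid: $1"
  fi
}

check_addr "172.28.0.20"        # -> ok: bare IPv4 is accepted
check_addr "http://172.28.0.20" # -> invalid: protocol prefixes not supported
check_addr "example.com"        # -> invalid: domain names not supported
check_addr "172.28.0.20:8002"   # -> invalid: the port belongs in the 'port' field
```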
```diff
@@ -61,77 +61,113 @@ classifier:
 # Categories with new use_reasoning field structure
 categories:
   - name: business
+    system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations."
+    # jailbreak_enabled: true # Optional: Override global jailbreak detection per category
+    # jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
-        use_reasoning: false # Business performs better without reasoning
+        use_reasoning: false # Business performs better without reasoning
   - name: law
+    system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.4
         use_reasoning: false
   - name: psychology
+    system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
+    semantic_cache_enabled: true
+    semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.6
         use_reasoning: false
   - name: biology
+    system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.9
         use_reasoning: false
   - name: chemistry
+    system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.6
-        use_reasoning: true # Enable reasoning for complex chemistry
+        use_reasoning: true # Enable reasoning for complex chemistry
   - name: history
+    system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
         use_reasoning: false
   - name: other
+    system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
+    semantic_cache_enabled: true
+    semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
         use_reasoning: false
   - name: health
+    system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
+    semantic_cache_enabled: true
+    semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.5
         use_reasoning: false
   - name: economics
+    system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 1.0
         use_reasoning: false
   - name: math
+    system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 1.0
-        use_reasoning: true # Enable reasoning for complex math
+        use_reasoning: true # Enable reasoning for complex math
   - name: physics
+    system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
-        use_reasoning: true # Enable reasoning for physics
+        use_reasoning: true # Enable reasoning for physics
   - name: computer science
+    system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.6
         use_reasoning: false
   - name: philosophy
+    system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.5
         use_reasoning: false
   - name: engineering
+    system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards."
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
         use_reasoning: false
 
-default_model: openai/gpt-oss-20b
+default_model: "qwen3"
+
+# Auto model name for automatic model selection (optional)
+# This is the model name that clients should use to trigger automatic model selection
+# If not specified, defaults to "MoM" (Mixture of Models)
+# For backward compatibility, "auto" is always accepted as an alias
+# Example: auto_model_name: "MoM" # or any other name you prefer
+# auto_model_name: "MoM"
+
+# Include configured models in /v1/models list endpoint (optional, default: false)
+# When false (default): only the auto model name is returned in the /v1/models endpoint
+# When true: all models configured in model_config are also included in the /v1/models endpoint
+# This is useful for clients that need to discover all available models
+# Example: include_config_models_in_list: true
+# include_config_models_in_list: false
 
 # Reasoning family configurations
 reasoning_families:
```
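Distilled from the category changes above, here is a single category entry showing the per-category override fields this commit introduces or documents. Values are illustrative; the `jailbreak_*` keys appear only as comments in the diff and are shown uncommented here purely for shape:

```yaml
# Sketch of one category entry using the per-category override fields above
categories:
  - name: health
    system_prompt: "..." # category-specific system prompt (shortened here)
    jailbreak_enabled: true # overrides the global prompt_guard.enabled
    jailbreak_threshold: 0.8 # overrides the global jailbreak threshold
    semantic_cache_enabled: true # per-category cache toggle
    semantic_cache_similarity_threshold: 0.95 # stricter matching for sensitive topics
    model_scores:
      - model: qwen3
        score: 0.5
        use_reasoning: false
```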
```diff
@@ -164,5 +200,23 @@ api:
     detailed_goroutine_tracking: true
     high_resolution_timing: false
     sample_rate: 1.0
-    duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
+    duration_buckets:
+      [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
     size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
+
+# Observability Configuration
+observability:
+  tracing:
+    enabled: true # Enable distributed tracing for docker-compose stack
+    provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry
+    exporter:
+      type: "otlp" # Export spans to Jaeger (via OTLP gRPC)
+      endpoint: "jaeger:4317" # Jaeger collector inside compose network
+      insecure: true # Use insecure connection (no TLS)
+    sampling:
+      type: "always_on" # Sampling: always_on, always_off, probabilistic
+      rate: 1.0 # Sampling rate for probabilistic (0.0-1.0)
+    resource:
+      service_name: "vllm-semantic-router"
+      service_version: "v0.1.0"
+      deployment_environment: "development"
```
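Two notes on the tracing block above. First, `jaeger:4317` is the conventional OTLP/gRPC port, while 4318 is OTLP over HTTP, so the README's "4318 (gRPC)" label and this exporter endpoint should not be read as the same thing. Second, `always_on` traces every request; per the comments above, the same block supports probabilistic sampling. A sketch sampling 10% of traces (values illustrative):

```yaml
observability:
  tracing:
    enabled: true
    provider: "opentelemetry"
    exporter:
      type: "otlp"
      endpoint: "jaeger:4317" # OTLP gRPC
      insecure: true
    sampling:
      type: "probabilistic" # sample a fraction of requests instead of all
      rate: 0.1 # 10% of traces
    resource:
      service_name: "vllm-semantic-router"
```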
