config/config.yaml (57 changes: 37 additions & 20 deletions)
@@ -45,15 +45,32 @@ prompt_guard:
# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
vllm_endpoints:
- name: "endpoint1"
address: "172.28.0.20" # Static IPv4 of llm-katan within docker compose network
- name: "nemotron-super-endpoint"
address: "172.32.0.1" # Host machine IP address (accessible from Docker containers)
port: 8001
weight: 1
- name: "gpt-oss-120b-endpoint"
address: "172.32.0.1" # Host machine IP address (accessible from Docker containers)
port: 8002
weight: 1
- name: "qwen3-32b-endpoint"
address: "172.32.0.1" # Host machine IP address (accessible from Docker containers)
port: 8003
weight: 1

model_config:
"qwen3":
reasoning_family: "qwen3" # This model uses Qwen-3 reasoning syntax
preferred_endpoints: ["endpoint1"]
"gpt-oss-120b":
reasoning_family: "gpt-oss" # GPT-OSS reasoning syntax
preferred_endpoints: ["gpt-oss-120b-endpoint"]
pii_policy:
allow_by_default: true
"qwen3-32b":
reasoning_family: "qwen3" # Qwen-3 reasoning syntax
preferred_endpoints: ["qwen3-32b-endpoint"]
pii_policy:
allow_by_default: true
"nemotron-super-1_5":
preferred_endpoints: ["nemotron-super-endpoint"]
pii_policy:
allow_by_default: true

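For orientation, the sketch below condenses the naming chain this hunk introduces: each value in preferred_endpoints refers to a name declared under vllm_endpoints, and the keys of model_config are the model names that categories and default_model use further down. It is a minimal illustration distilled from this diff, not a complete config; assume the by-name wiring works as the matching identifiers suggest.

# Minimal sketch of the naming chain used by this change (illustration only):
# vllm_endpoints[].name   <-- model_config.*.preferred_endpoints
# model_config key        <-- categories[].model_scores[].model and default_model
vllm_endpoints:
  - name: "qwen3-32b-endpoint"       # referenced by preferred_endpoints below
    address: "172.32.0.1"            # host IP; domain names are not supported here
    port: 8003
    weight: 1

model_config:
  "qwen3-32b":                       # this key is the model name used elsewhere
    reasoning_family: "qwen3"
    preferred_endpoints: ["qwen3-32b-endpoint"]
    pii_policy:
      allow_by_default: true

default_model: qwen3-32b             # matches a model_config key, as in this PR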
@@ -79,91 +96,91 @@ categories:
# jailbreak_enabled: true # Optional: Override global jailbreak detection per category
# jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category
model_scores:
- model: qwen3
- model: qwen3-32b
score: 0.7
use_reasoning: false # Business performs better without reasoning
- name: law
system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters."
model_scores:
- model: qwen3
- model: nemotron-super-1_5
score: 0.4
use_reasoning: false
- name: psychology
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances
model_scores:
- model: qwen3
- model: gpt-oss-120b
score: 0.6
use_reasoning: false
- name: biology
system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems."
model_scores:
- model: qwen3
- model: qwen3-32b
score: 0.9
use_reasoning: false
- name: chemistry
system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations."
model_scores:
- model: qwen3
- model: gpt-oss-120b
score: 0.6
use_reasoning: true # Enable reasoning for complex chemistry
- name: history
system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis."
model_scores:
- model: qwen3
- model: nemotron-super-1_5
score: 0.7
use_reasoning: false
- name: other
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive
model_scores:
- model: qwen3
- model: qwen3-32b
score: 0.7
use_reasoning: false
- name: health
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes
model_scores:
- model: qwen3
- model: gpt-oss-120b
score: 0.5
use_reasoning: false
- name: economics
system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses."
model_scores:
- model: qwen3
- model: nemotron-super-1_5
score: 1.0
use_reasoning: false
- name: math
system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
model_scores:
- model: qwen3
- model: gpt-oss-120b
score: 1.0
use_reasoning: true # Enable reasoning for complex math
- name: physics
system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate."
model_scores:
- model: qwen3
- model: qwen3-32b
score: 0.7
use_reasoning: true # Enable reasoning for physics
- name: computer science
system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful."
model_scores:
- model: qwen3
- model: nemotron-super-1_5
score: 0.6
use_reasoning: false
- name: philosophy
system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates."
model_scores:
- model: qwen3
- model: gpt-oss-120b
score: 0.5
use_reasoning: false
- name: engineering
system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards."
model_scores:
- model: qwen3
- model: nemotron-super-1_5
score: 0.7
use_reasoning: false

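As a usage note for the hunk above, a hypothetical category entry is sketched below. It combines the three new model names with the optional per-category overrides mentioned in the comments (jailbreak_enabled, jailbreak_threshold, and the semantic_cache settings). The category name and score values are invented for illustration, and listing more than one model per category is an assumption suggested by the list form of model_scores, not something this PR does.

# Hypothetical category entry (illustration only, not part of this PR)
- name: finance                               # made-up category name
  system_prompt: "You are a financial analysis expert. Provide clear, data-driven answers."
  semantic_cache_enabled: true
  semantic_cache_similarity_threshold: 0.90   # stricter matching for numeric answers
  jailbreak_enabled: true                     # optional per-category override
  jailbreak_threshold: 0.8
  model_scores:
    - model: gpt-oss-120b                     # matches a model_config key
      score: 0.8
      use_reasoning: true                     # reasoning for multi-step analysis
    - model: qwen3-32b                        # assumed lower-ranked alternative
      score: 0.5
      use_reasoning: false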
@@ -214,7 +231,7 @@ router:
traditional_attention_dropout_prob: 0.1 # Traditional model attention dropout probability
tie_break_confidence: 0.5 # Confidence value for tie-breaking situations

default_model: qwen3
default_model: qwen3-32b

# Reasoning family configurations
reasoning_families:
deploy/docker-compose/docker-compose.yml (10 changes: 7 additions & 3 deletions)
@@ -19,6 +19,8 @@ services:
- OTEL_SERVICE_NAME=vllm-semantic-router
- HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface
- HF_HUB_ENABLE_HF_TRANSFER=1
extra_hosts:
- "host.docker.internal:172.32.0.1" # Allow container to reach host LLM endpoints
networks:
- semantic-network
healthcheck:
@@ -40,6 +42,8 @@ services:
volumes:
- ./addons/envoy.yaml:/etc/envoy/envoy.yaml:ro,z
command: ["/usr/local/bin/envoy", "-c", "/etc/envoy/envoy.yaml", "--component-log-level", "ext_proc:trace,router:trace,http:trace"]
extra_hosts:
- "host.docker.internal:172.32.0.1" # Allow container to reach host LLM endpoints
depends_on:
semantic-router:
condition: service_healthy
@@ -63,7 +67,7 @@ services:
- "8000:8000"
networks:
semantic-network:
ipv4_address: 172.28.0.10
ipv4_address: 172.32.0.10
healthcheck:
test: ["CMD", "curl", "-fsS", "http://localhost:8000/health"]
interval: 10s
@@ -215,7 +219,7 @@ services:
- hf-cache:/home/llmkatan/.cache/huggingface
networks:
semantic-network:
ipv4_address: 172.28.0.20
ipv4_address: 172.32.0.20
command: ["llm-katan", "--model", "/app/models/Qwen/Qwen3-0.6B", "--served-model-name", "qwen3", "--host", "0.0.0.0", "--port", "8002"]
healthcheck:
test: ["CMD", "curl", "-fsS", "http://localhost:8002/health"]
@@ -274,7 +278,7 @@ networks:
driver: bridge
ipam:
config:
- subnet: 172.28.0.0/16
- subnet: 172.32.0.0/16

volumes:
models-cache:
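Taken together, the docker-compose changes move the project network from 172.28.0.0/16 to 172.32.0.0/16, update the static container IPs to match, and add extra_hosts entries so containers can reach LLM servers running on the host. The condensed sketch below shows how those pieces fit; service and image names other than semantic-router are placeholders, and it assumes Docker's usual behavior of giving the host-side gateway the first address of a user-defined bridge subnet, here 172.32.0.1, which is why extra_hosts and the endpoint addresses in config.yaml point there.

# Condensed, hypothetical sketch of the networking change (illustration only).
services:
  semantic-router:
    image: example/semantic-router:latest     # placeholder image name
    extra_hosts:
      - "host.docker.internal:172.32.0.1"     # reach vLLM servers on the host
    networks:
      - semantic-network

  pinned-service:                             # placeholder for services with static IPs
    image: example/service:latest
    networks:
      semantic-network:
        ipv4_address: 172.32.0.10             # static IPs move with the new subnet

networks:
  semantic-network:
    driver: bridge
    ipam:
      config:
        - subnet: 172.32.0.0/16               # was 172.28.0.0/16 before this change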