Skip to content

Commit 4310d92

Browse files
committed
feat: add RedisVL as new semantic cache storage
Signed-off-by: Huamin Chen <[email protected]>
1 parent 37e5941 commit 4310d92

File tree

13 files changed

+1777
-0
lines changed

13 files changed

+1777
-0
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ _run:
1212
-f tools/make/docs.mk \
1313
-f tools/make/linter.mk \
1414
-f tools/make/milvus.mk \
15+
-f tools/make/redis.mk \
1516
-f tools/make/models.mk \
1617
-f tools/make/pre-commit.mk \
1718
-f tools/make/docker.mk \

config/config.redis.yaml

Lines changed: 342 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,342 @@
1+
bert_model:
2+
model_id: models/all-MiniLM-L12-v2
3+
threshold: 0.6
4+
use_cpu: true
5+
6+
semantic_cache:
7+
enabled: true # Global cache enabled (applies to all requests)
8+
backend_type: "redis" # Using Redis vector database for semantic cache
9+
similarity_threshold: 0.80 # Global threshold (lowered for better matching)
10+
ttl_seconds: 3600
11+
backend_config_path: "config/semantic-cache/redis.yaml"
12+
13+
# Embedding model for semantic similarity matching
14+
# Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
15+
# Default: "bert" (fastest, lowest memory)
16+
embedding_model: "bert"
17+
18+
tools:
19+
enabled: true
20+
top_k: 3
21+
similarity_threshold: 0.2
22+
tools_db_path: "config/tools_db.json"
23+
fallback_to_empty: true
24+
25+
prompt_guard:
26+
enabled: true # Global default - can be overridden per category with jailbreak_enabled
27+
use_modernbert: true
28+
model_id: "models/jailbreak_classifier_modernbert-base_model"
29+
threshold: 0.7
30+
use_cpu: true
31+
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
32+
33+
# vLLM Endpoints Configuration
34+
# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6)
35+
# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
36+
# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
37+
vllm_endpoints:
38+
- name: "local_vllm"
39+
address: "127.0.0.1" # Local vLLM instance
40+
port: 8000
41+
weight: 1
42+
43+
model_config:
44+
"openai/gpt-oss-20b":
45+
reasoning_family: "gpt-oss" # GPT-OSS uses reasoning_effort parameter
46+
preferred_endpoints: ["local_vllm"]
47+
48+
# Classifier configuration
49+
classifier:
50+
category_model:
51+
model_id: "models/category_classifier_modernbert-base_model"
52+
use_modernbert: true
53+
threshold: 0.6
54+
use_cpu: true
55+
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
56+
pii_model:
57+
model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
58+
use_modernbert: true
59+
threshold: 0.7
60+
use_cpu: true
61+
pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"
62+
63+
# Categories define domain metadata only (no routing logic)
64+
categories:
65+
- name: business
66+
description: "Business and management related queries"
67+
mmlu_categories: ["business"]
68+
- name: law
69+
description: "Legal questions and law-related topics"
70+
mmlu_categories: ["law"]
71+
- name: psychology
72+
description: "Psychology and mental health topics"
73+
mmlu_categories: ["psychology"]
74+
- name: biology
75+
description: "Biology and life sciences questions"
76+
mmlu_categories: ["biology"]
77+
- name: chemistry
78+
description: "Chemistry and chemical sciences questions"
79+
mmlu_categories: ["chemistry"]
80+
- name: history
81+
description: "Historical questions and cultural topics"
82+
mmlu_categories: ["history"]
83+
- name: other
84+
description: "General knowledge and miscellaneous topics"
85+
mmlu_categories: ["other"]
86+
- name: health
87+
description: "Health and medical information queries"
88+
mmlu_categories: ["health"]
89+
- name: economics
90+
description: "Economics and financial topics"
91+
mmlu_categories: ["economics"]
92+
- name: math
93+
description: "Mathematics and quantitative reasoning"
94+
mmlu_categories: ["math"]
95+
- name: physics
96+
description: "Physics and physical sciences"
97+
mmlu_categories: ["physics"]
98+
- name: computer_science
99+
description: "Computer science and programming"
100+
mmlu_categories: ["computer_science"]
101+
- name: philosophy
102+
description: "Philosophy and ethical questions"
103+
mmlu_categories: ["philosophy"]
104+
- name: engineering
105+
description: "Engineering and technical problem-solving"
106+
mmlu_categories: ["engineering"]
107+
108+
# Decisions define routing logic with domain-based conditions
109+
# Redis semantic cache is enabled for selected high-value categories
110+
strategy: "priority"
111+
112+
decisions:
113+
- name: "psychology_decision"
114+
description: "Psychology and mental health topics - with Redis semantic cache"
115+
priority: 100
116+
rules:
117+
operator: "AND"
118+
conditions:
119+
- type: "domain"
120+
name: "psychology"
121+
modelRefs:
122+
- model: "openai/gpt-oss-20b"
123+
use_reasoning: false
124+
plugins:
125+
- type: "system_prompt"
126+
configuration:
127+
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
128+
- type: "semantic-cache"
129+
configuration:
130+
enabled: true
131+
similarity_threshold: 0.92
132+
- type: "pii"
133+
configuration:
134+
enabled: true
135+
pii_types_allowed: []
136+
137+
- name: "health_decision"
138+
description: "Health and medical information queries - with Redis semantic cache"
139+
priority: 100
140+
rules:
141+
operator: "AND"
142+
conditions:
143+
- type: "domain"
144+
name: "health"
145+
modelRefs:
146+
- model: "openai/gpt-oss-20b"
147+
use_reasoning: false
148+
plugins:
149+
- type: "system_prompt"
150+
configuration:
151+
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
152+
- type: "semantic-cache"
153+
configuration:
154+
enabled: true
155+
similarity_threshold: 0.95
156+
- type: "pii"
157+
configuration:
158+
enabled: true
159+
pii_types_allowed: []
160+
161+
- name: "general_decision"
162+
description: "General knowledge and miscellaneous topics - with Redis semantic cache"
163+
priority: 50
164+
rules:
165+
operator: "AND"
166+
conditions:
167+
- type: "domain"
168+
name: "other"
169+
modelRefs:
170+
- model: "openai/gpt-oss-20b"
171+
use_reasoning: false
172+
plugins:
173+
- type: "system_prompt"
174+
configuration:
175+
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
176+
- type: "semantic-cache"
177+
configuration:
178+
enabled: true
179+
similarity_threshold: 0.75
180+
- type: "pii"
181+
configuration:
182+
enabled: true
183+
pii_types_allowed: []
184+
185+
# Other categories without semantic-cache for comparison
186+
- name: "business_decision"
187+
description: "Business and management queries"
188+
priority: 100
189+
rules:
190+
operator: "AND"
191+
conditions:
192+
- type: "domain"
193+
name: "business"
194+
modelRefs:
195+
- model: "openai/gpt-oss-20b"
196+
use_reasoning: false
197+
plugins:
198+
- type: "system_prompt"
199+
configuration:
200+
system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development."
201+
- type: "pii"
202+
configuration:
203+
enabled: true
204+
pii_types_allowed: []
205+
206+
- name: "math_decision"
207+
description: "Mathematics and quantitative reasoning"
208+
priority: 100
209+
rules:
210+
operator: "AND"
211+
conditions:
212+
- type: "domain"
213+
name: "math"
214+
modelRefs:
215+
- model: "openai/gpt-oss-20b"
216+
use_reasoning: true
217+
plugins:
218+
- type: "system_prompt"
219+
configuration:
220+
system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
221+
- type: "pii"
222+
configuration:
223+
enabled: true
224+
pii_types_allowed: []
225+
226+
- name: "computer_science_decision"
227+
description: "Computer science and programming"
228+
priority: 100
229+
rules:
230+
operator: "AND"
231+
conditions:
232+
- type: "domain"
233+
name: "computer_science"
234+
modelRefs:
235+
- model: "openai/gpt-oss-20b"
236+
use_reasoning: false
237+
plugins:
238+
- type: "system_prompt"
239+
configuration:
240+
system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering."
241+
- type: "pii"
242+
configuration:
243+
enabled: true
244+
pii_types_allowed: []
245+
246+
# Router Configuration for Dual-Path Selection
247+
router:
248+
high_confidence_threshold: 0.99
249+
low_latency_threshold_ms: 2000
250+
lora_baseline_score: 0.8
251+
traditional_baseline_score: 0.7
252+
embedding_baseline_score: 0.75
253+
success_confidence_threshold: 0.8
254+
large_batch_threshold: 4
255+
lora_default_execution_time_ms: 1345
256+
traditional_default_execution_time_ms: 4567
257+
default_confidence_threshold: 0.95
258+
default_max_latency_ms: 5000
259+
default_batch_size: 4
260+
default_avg_execution_time_ms: 3000
261+
lora_default_confidence: 0.99
262+
traditional_default_confidence: 0.95
263+
lora_default_success_rate: 0.98
264+
traditional_default_success_rate: 0.95
265+
multi_task_lora_weight: 0.30
266+
single_task_traditional_weight: 0.30
267+
large_batch_lora_weight: 0.25
268+
small_batch_traditional_weight: 0.25
269+
medium_batch_weight: 0.10
270+
high_confidence_lora_weight: 0.25
271+
low_confidence_traditional_weight: 0.25
272+
low_latency_lora_weight: 0.30
273+
high_latency_traditional_weight: 0.10
274+
performance_history_weight: 0.20
275+
traditional_bert_confidence_threshold: 0.95
276+
traditional_modernbert_confidence_threshold: 0.8
277+
traditional_pii_detection_threshold: 0.5
278+
traditional_token_classification_threshold: 0.9
279+
traditional_dropout_prob: 0.1
280+
traditional_attention_dropout_prob: 0.1
281+
tie_break_confidence: 0.5
282+
283+
default_model: openai/gpt-oss-20b
284+
285+
# Reasoning family configurations
286+
reasoning_families:
287+
deepseek:
288+
type: "chat_template_kwargs"
289+
parameter: "thinking"
290+
291+
qwen3:
292+
type: "chat_template_kwargs"
293+
parameter: "enable_thinking"
294+
295+
gpt-oss:
296+
type: "reasoning_effort"
297+
parameter: "reasoning_effort"
298+
gpt:
299+
type: "reasoning_effort"
300+
parameter: "reasoning_effort"
301+
302+
# Global default reasoning effort level
303+
default_reasoning_effort: high
304+
305+
# API Configuration
306+
api:
307+
batch_classification:
308+
max_batch_size: 100
309+
concurrency_threshold: 5
310+
max_concurrency: 8
311+
metrics:
312+
enabled: true
313+
detailed_goroutine_tracking: true
314+
high_resolution_timing: false
315+
sample_rate: 1.0
316+
duration_buckets:
317+
[0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
318+
size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
319+
320+
# Embedding Models Configuration
321+
embedding_models:
322+
qwen3_model_path: "models/Qwen3-Embedding-0.6B"
323+
gemma_model_path: "models/embeddinggemma-300m"
324+
use_cpu: true # Set to false for GPU acceleration (requires CUDA)
325+
326+
# Observability Configuration
327+
observability:
328+
tracing:
329+
enabled: true # Enable distributed tracing for docker-compose stack
330+
provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry
331+
exporter:
332+
type: "otlp" # Export spans to Jaeger (via OTLP gRPC)
333+
endpoint: "jaeger:4317" # Jaeger collector inside compose network
334+
insecure: true # Use insecure connection (no TLS)
335+
sampling:
336+
type: "always_on" # Sampling: always_on, always_off, probabilistic
337+
rate: 1.0 # Sampling rate for probabilistic (0.0-1.0)
338+
resource:
339+
service_name: "vllm-semantic-router"
340+
service_version: "v0.1.0"
341+
deployment_environment: "development"
342+

0 commit comments

Comments (0)