Skip to content

Commit 519d4d8

Browse files
authored
feat: add RedisVL as new semantic cache storage (#734)
1 parent 37e5941 commit 519d4d8

File tree

15 files changed

+1789
-3
lines changed

15 files changed

+1789
-3
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ _run:
1212
-f tools/make/docs.mk \
1313
-f tools/make/linter.mk \
1414
-f tools/make/milvus.mk \
15+
-f tools/make/redis.mk \
1516
-f tools/make/models.mk \
1617
-f tools/make/pre-commit.mk \
1718
-f tools/make/docker.mk \

candle-binding/src/model_architectures/traditional/deberta_v3_test.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ fn test_deberta_v3_invalid_path() {
2929
#[test]
3030
fn test_deberta_v3_debug_format() {
3131
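// Compile-time check that an Option<Box<dyn Debug>> binding type-checks; with a bare `None` this does not itself verify that DebertaV3Classifier implements Debug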
32-
let _type_check: Option<Box<dyn std::fmt::Debug>> = None::<Box<DebertaV3Classifier>>;
32+
let _type_check: Option<Box<dyn std::fmt::Debug>> = None;
3333
}
3434

3535
#[cfg(test)]

config/config.redis.yaml

Lines changed: 341 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,341 @@
1+
bert_model:
2+
model_id: models/all-MiniLM-L12-v2
3+
threshold: 0.6
4+
use_cpu: true
5+
6+
semantic_cache:
7+
enabled: true # Global cache enabled (applies to all requests)
8+
backend_type: "redis" # Using Redis vector database for semantic cache
9+
similarity_threshold: 0.80 # Global default threshold (lowered for better matching); the semantic-cache plugins under `decisions` set their own per-decision thresholds
10+
ttl_seconds: 3600
11+
backend_config_path: "config/semantic-cache/redis.yaml"
12+
# Embedding model for semantic similarity matching
13+
# Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
14+
# Default: "bert" (fastest, lowest memory)
15+
embedding_model: "bert"
16+
17+
tools:
18+
enabled: true
19+
top_k: 3
20+
similarity_threshold: 0.2
21+
tools_db_path: "config/tools_db.json"
22+
fallback_to_empty: true
23+
24+
prompt_guard:
25+
enabled: true # Global default - can be overridden per category with jailbreak_enabled
26+
use_modernbert: true
27+
model_id: "models/jailbreak_classifier_modernbert-base_model"
28+
threshold: 0.7
29+
use_cpu: true
30+
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
31+
32+
# vLLM Endpoints Configuration
33+
# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6)
34+
# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
35+
# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
36+
vllm_endpoints:
37+
- name: "local_vllm"
38+
address: "127.0.0.1" # Local vLLM instance
39+
port: 8000
40+
weight: 1
41+
42+
model_config:
43+
"openai/gpt-oss-20b":
44+
reasoning_family: "gpt-oss" # GPT-OSS uses reasoning_effort parameter
45+
preferred_endpoints: ["local_vllm"]
46+
47+
# Classifier configuration
48+
classifier:
49+
category_model:
50+
model_id: "models/category_classifier_modernbert-base_model"
51+
use_modernbert: true
52+
threshold: 0.6
53+
use_cpu: true
54+
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
55+
pii_model:
56+
model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
57+
use_modernbert: true
58+
threshold: 0.7
59+
use_cpu: true
60+
pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"
61+
62+
# Categories define domain metadata only (no routing logic)
63+
categories:
64+
- name: business
65+
description: "Business and management related queries"
66+
mmlu_categories: ["business"]
67+
- name: law
68+
description: "Legal questions and law-related topics"
69+
mmlu_categories: ["law"]
70+
- name: psychology
71+
description: "Psychology and mental health topics"
72+
mmlu_categories: ["psychology"]
73+
- name: biology
74+
description: "Biology and life sciences questions"
75+
mmlu_categories: ["biology"]
76+
- name: chemistry
77+
description: "Chemistry and chemical sciences questions"
78+
mmlu_categories: ["chemistry"]
79+
- name: history
80+
description: "Historical questions and cultural topics"
81+
mmlu_categories: ["history"]
82+
- name: other
83+
description: "General knowledge and miscellaneous topics"
84+
mmlu_categories: ["other"]
85+
- name: health
86+
description: "Health and medical information queries"
87+
mmlu_categories: ["health"]
88+
- name: economics
89+
description: "Economics and financial topics"
90+
mmlu_categories: ["economics"]
91+
- name: math
92+
description: "Mathematics and quantitative reasoning"
93+
mmlu_categories: ["math"]
94+
- name: physics
95+
description: "Physics and physical sciences"
96+
mmlu_categories: ["physics"]
97+
- name: computer_science
98+
description: "Computer science and programming"
99+
mmlu_categories: ["computer_science"]
100+
- name: philosophy
101+
description: "Philosophy and ethical questions"
102+
mmlu_categories: ["philosophy"]
103+
- name: engineering
104+
description: "Engineering and technical problem-solving"
105+
mmlu_categories: ["engineering"]
106+
107+
# Decisions define routing logic with domain-based conditions
108+
# Redis semantic cache is enabled for selected high-value categories: psychology, health, and general ("other")
109+
strategy: "priority"
110+
111+
decisions:
112+
- name: "psychology_decision"
113+
description: "Psychology and mental health topics - with Redis semantic cache"
114+
priority: 100
115+
rules:
116+
operator: "AND"
117+
conditions:
118+
- type: "domain"
119+
name: "psychology"
120+
modelRefs:
121+
- model: "openai/gpt-oss-20b"
122+
use_reasoning: false
123+
plugins:
124+
- type: "system_prompt"
125+
configuration:
126+
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
127+
- type: "semantic-cache"
128+
configuration:
129+
enabled: true
130+
similarity_threshold: 0.92
131+
- type: "pii"
132+
configuration:
133+
enabled: true
134+
pii_types_allowed: []
135+
136+
- name: "health_decision"
137+
description: "Health and medical information queries - with Redis semantic cache"
138+
priority: 100
139+
rules:
140+
operator: "AND"
141+
conditions:
142+
- type: "domain"
143+
name: "health"
144+
modelRefs:
145+
- model: "openai/gpt-oss-20b"
146+
use_reasoning: false
147+
plugins:
148+
- type: "system_prompt"
149+
configuration:
150+
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
151+
- type: "semantic-cache"
152+
configuration:
153+
enabled: true
154+
similarity_threshold: 0.95
155+
- type: "pii"
156+
configuration:
157+
enabled: true
158+
pii_types_allowed: []
159+
160+
- name: "general_decision"
161+
description: "General knowledge and miscellaneous topics - with Redis semantic cache"
162+
priority: 50
163+
rules:
164+
operator: "AND"
165+
conditions:
166+
- type: "domain"
167+
name: "other"
168+
modelRefs:
169+
- model: "openai/gpt-oss-20b"
170+
use_reasoning: false
171+
plugins:
172+
- type: "system_prompt"
173+
configuration:
174+
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
175+
- type: "semantic-cache"
176+
configuration:
177+
enabled: true
178+
similarity_threshold: 0.75
179+
- type: "pii"
180+
configuration:
181+
enabled: true
182+
pii_types_allowed: []
183+
184+
# Other categories (business, math, computer_science) are configured without the semantic-cache plugin, for comparison
185+
- name: "business_decision"
186+
description: "Business and management queries"
187+
priority: 100
188+
rules:
189+
operator: "AND"
190+
conditions:
191+
- type: "domain"
192+
name: "business"
193+
modelRefs:
194+
- model: "openai/gpt-oss-20b"
195+
use_reasoning: false
196+
plugins:
197+
- type: "system_prompt"
198+
configuration:
199+
system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development."
200+
- type: "pii"
201+
configuration:
202+
enabled: true
203+
pii_types_allowed: []
204+
205+
- name: "math_decision"
206+
description: "Mathematics and quantitative reasoning"
207+
priority: 100
208+
rules:
209+
operator: "AND"
210+
conditions:
211+
- type: "domain"
212+
name: "math"
213+
modelRefs:
214+
- model: "openai/gpt-oss-20b"
215+
use_reasoning: true
216+
plugins:
217+
- type: "system_prompt"
218+
configuration:
219+
system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
220+
- type: "pii"
221+
configuration:
222+
enabled: true
223+
pii_types_allowed: []
224+
225+
- name: "computer_science_decision"
226+
description: "Computer science and programming"
227+
priority: 100
228+
rules:
229+
operator: "AND"
230+
conditions:
231+
- type: "domain"
232+
name: "computer_science"
233+
modelRefs:
234+
- model: "openai/gpt-oss-20b"
235+
use_reasoning: false
236+
plugins:
237+
- type: "system_prompt"
238+
configuration:
239+
system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering."
240+
- type: "pii"
241+
configuration:
242+
enabled: true
243+
pii_types_allowed: []
244+
245+
# Router Configuration for Dual-Path Selection
246+
router:
247+
high_confidence_threshold: 0.99
248+
low_latency_threshold_ms: 2000
249+
lora_baseline_score: 0.8
250+
traditional_baseline_score: 0.7
251+
embedding_baseline_score: 0.75
252+
success_confidence_threshold: 0.8
253+
large_batch_threshold: 4
254+
lora_default_execution_time_ms: 1345
255+
traditional_default_execution_time_ms: 4567
256+
default_confidence_threshold: 0.95
257+
default_max_latency_ms: 5000
258+
default_batch_size: 4
259+
default_avg_execution_time_ms: 3000
260+
lora_default_confidence: 0.99
261+
traditional_default_confidence: 0.95
262+
lora_default_success_rate: 0.98
263+
traditional_default_success_rate: 0.95
264+
multi_task_lora_weight: 0.30
265+
single_task_traditional_weight: 0.30
266+
large_batch_lora_weight: 0.25
267+
small_batch_traditional_weight: 0.25
268+
medium_batch_weight: 0.10
269+
high_confidence_lora_weight: 0.25
270+
low_confidence_traditional_weight: 0.25
271+
low_latency_lora_weight: 0.30
272+
high_latency_traditional_weight: 0.10
273+
performance_history_weight: 0.20
274+
traditional_bert_confidence_threshold: 0.95
275+
traditional_modernbert_confidence_threshold: 0.8
276+
traditional_pii_detection_threshold: 0.5
277+
traditional_token_classification_threshold: 0.9
278+
traditional_dropout_prob: 0.1
279+
traditional_attention_dropout_prob: 0.1
280+
tie_break_confidence: 0.5
281+
282+
default_model: openai/gpt-oss-20b
283+
284+
# Reasoning family configurations
285+
reasoning_families:
286+
deepseek:
287+
type: "chat_template_kwargs"
288+
parameter: "thinking"
289+
290+
qwen3:
291+
type: "chat_template_kwargs"
292+
parameter: "enable_thinking"
293+
294+
gpt-oss:
295+
type: "reasoning_effort"
296+
parameter: "reasoning_effort"
297+
gpt:
298+
type: "reasoning_effort"
299+
parameter: "reasoning_effort"
300+
301+
# Global default reasoning effort level
302+
default_reasoning_effort: high
303+
304+
# API Configuration
305+
api:
306+
batch_classification:
307+
max_batch_size: 100
308+
concurrency_threshold: 5
309+
max_concurrency: 8
310+
metrics:
311+
enabled: true
312+
detailed_goroutine_tracking: true
313+
high_resolution_timing: false
314+
sample_rate: 1.0
315+
duration_buckets:
316+
[0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
317+
size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
318+
319+
# Embedding Models Configuration
320+
embedding_models:
321+
qwen3_model_path: "models/Qwen3-Embedding-0.6B"
322+
gemma_model_path: "models/embeddinggemma-300m"
323+
use_cpu: true # Set to false for GPU acceleration (requires CUDA)
324+
325+
# Observability Configuration
326+
observability:
327+
tracing:
328+
enabled: true # Enable distributed tracing for docker-compose stack
329+
provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry
330+
exporter:
331+
type: "otlp" # Export spans to Jaeger (via OTLP gRPC)
332+
endpoint: "jaeger:4317" # Jaeger collector inside compose network
333+
insecure: true # Use insecure connection (no TLS)
334+
sampling:
335+
type: "always_on" # Sampling: always_on, always_off, probabilistic
336+
rate: 1.0 # Sampling rate for probabilistic (0.0-1.0)
337+
resource:
338+
service_name: "vllm-semantic-router"
339+
service_version: "v0.1.0"
340+
deployment_environment: "development"
341+

0 commit comments

Comments
 (0)