Skip to content

Commit 46be391

Browse files
committed
feat: add HNSW index to the in-memory semantic cache and implement a hybrid cache that uses an in-memory index and a Milvus-based doc store
Signed-off-by: Huamin Chen <[email protected]>
1 parent 108374b commit 46be391

24 files changed

+5280
-39
lines changed

candle-binding/Cargo.lock

Lines changed: 52 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

candle-binding/Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ crate-type = ["staticlib", "cdylib"]
1111

1212
[dependencies]
1313
anyhow = { version = "1", features = ["backtrace"] }
14-
candle-core = "0.8.4"
15-
candle-nn = "0.8.4"
16-
candle-transformers = "0.8.4"
14+
candle-core = { version = "0.8.4", features = ["cuda"] }
15+
candle-nn = { version = "0.8.4", features = ["cuda"] }
16+
candle-transformers = { version = "0.8.4", features = ["cuda"] }
1717
tokenizers = { version = "0.21.0", features = ["http"] }
1818
hf-hub = "0.4.1"
1919
safetensors = "0.4.1"

config/config.development.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ semantic_cache:
1414
max_entries: 100
1515
ttl_seconds: 600
1616
eviction_policy: "fifo"
17+
use_hnsw: true # Enable HNSW for faster search
18+
hnsw_m: 16
19+
hnsw_ef_construction: 200
1720

1821
tools:
1922
enabled: false

config/config.hybrid.yaml

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
bert_model:
2+
model_id: models/all-MiniLM-L12-v2
3+
threshold: 0.6
4+
use_cpu: true
5+
6+
semantic_cache:
7+
enabled: true
8+
backend_type: "hybrid" # Hybrid HNSW + Milvus backend
9+
similarity_threshold: 0.85
10+
ttl_seconds: 3600
11+
12+
# Hybrid cache specific settings
13+
max_memory_entries: 100000 # Max entries in HNSW index (100K)
14+
15+
# HNSW parameters
16+
hnsw_m: 16 # Number of bi-directional links
17+
hnsw_ef_construction: 200 # Construction quality parameter
18+
19+
# Milvus configuration file path
20+
backend_config_path: "config/milvus.yaml"
21+
22+
tools:
23+
enabled: true
24+
top_k: 3
25+
similarity_threshold: 0.2
26+
tools_db_path: "config/tools_db.json"
27+
fallback_to_empty: true
28+
29+
prompt_guard:
30+
enabled: true
31+
use_modernbert: true
32+
model_id: "models/jailbreak_classifier_modernbert-base_model"
33+
threshold: 0.7
34+
use_cpu: true
35+
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
36+
37+
# vLLM Endpoints Configuration
38+
vllm_endpoints:
39+
- name: "endpoint1"
40+
address: "172.28.0.20"
41+
port: 8002
42+
weight: 1
43+
44+
model_config:
45+
"qwen3":
46+
reasoning_family: "qwen3"
47+
preferred_endpoints: ["endpoint1"]
48+
pii_policy:
49+
allow_by_default: true
50+
51+
# Classifier configuration
52+
classifier:
53+
enabled: true
54+
model_path: "models/qwen3-router_model/router_qwen_generative_model.safetensors"
55+
tokenizer_path: "models/qwen3-router_model"
56+
use_cpu: true
57+
threshold: 0.7
58+

config/config.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,20 @@ bert_model:
55

66
semantic_cache:
77
enabled: true
8-
backend_type: "memory" # Options: "memory" or "milvus"
8+
backend_type: "memory" # Options: "memory", "milvus", or "hybrid"
99
similarity_threshold: 0.8
1010
max_entries: 1000 # Only applies to memory backend
1111
ttl_seconds: 3600
1212
eviction_policy: "fifo"
13+
# HNSW index configuration (use_hnsw applies to the memory backend only; hnsw_m and hnsw_ef_construction are also used by the hybrid backend)
14+
use_hnsw: true # Enable HNSW index for faster similarity search
15+
hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory)
16+
hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build)
17+
18+
# Hybrid cache configuration (when backend_type: "hybrid")
19+
# Combines in-memory HNSW for fast search with Milvus for scalable storage
20+
# max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000)
21+
# backend_config_path: "config/milvus.yaml" # Path to Milvus config
1322

1423
tools:
1524
enabled: true

src/semantic-router/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ require (
9393
go.yaml.in/yaml/v2 v2.4.2 // indirect
9494
golang.org/x/net v0.43.0 // indirect
9595
golang.org/x/sync v0.16.0 // indirect
96-
golang.org/x/sys v0.35.0 // indirect
96+
golang.org/x/sys v0.37.0 // indirect
9797
golang.org/x/text v0.28.0 // indirect
9898
golang.org/x/tools v0.35.0 // indirect
9999
google.golang.org/genproto/googleapis/api v0.0.0-20250929231259-57b25ae835d4 // indirect

src/semantic-router/go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,8 @@ golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBc
428428
golang.org/x/sys v0.0.0-20220209214540-3681064d5158/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
429429
golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI=
430430
golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
431+
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
432+
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
431433
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
432434
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
433435
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=

src/semantic-router/pkg/cache/cache_factory.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,17 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) {
2424
switch config.BackendType {
2525
case InMemoryCacheType, "":
2626
// Use in-memory cache as the default backend
27-
observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f",
28-
config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold)
27+
observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f, UseHNSW: %t",
28+
config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold, config.UseHNSW)
2929
options := InMemoryCacheOptions{
3030
Enabled: config.Enabled,
3131
SimilarityThreshold: config.SimilarityThreshold,
3232
MaxEntries: config.MaxEntries,
3333
TTLSeconds: config.TTLSeconds,
3434
EvictionPolicy: config.EvictionPolicy,
35+
UseHNSW: config.UseHNSW,
36+
HNSWM: config.HNSWM,
37+
HNSWEfConstruction: config.HNSWEfConstruction,
3538
}
3639
return NewInMemoryCache(options), nil
3740

@@ -46,6 +49,20 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) {
4649
}
4750
return NewMilvusCache(options)
4851

52+
case HybridCacheType:
53+
observability.Debugf("Creating Hybrid cache backend - MaxMemory: %d, TTL: %ds, Threshold: %.3f",
54+
config.MaxMemoryEntries, config.TTLSeconds, config.SimilarityThreshold)
55+
options := HybridCacheOptions{
56+
Enabled: config.Enabled,
57+
SimilarityThreshold: config.SimilarityThreshold,
58+
TTLSeconds: config.TTLSeconds,
59+
MaxMemoryEntries: config.MaxMemoryEntries,
60+
HNSWM: config.HNSWM,
61+
HNSWEfConstruction: config.HNSWEfConstruction,
62+
MilvusConfigPath: config.BackendConfigPath,
63+
}
64+
return NewHybridCache(options)
65+
4966
default:
5067
observability.Debugf("Unsupported cache backend type: %s", config.BackendType)
5168
return nil, fmt.Errorf("unsupported cache backend type: %s", config.BackendType)

src/semantic-router/pkg/cache/cache_interface.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ const (
5858

5959
// MilvusCacheType specifies the Milvus vector database backend
6060
MilvusCacheType CacheBackendType = "milvus"
61+
62+
// HybridCacheType specifies the hybrid HNSW + Milvus backend
63+
HybridCacheType CacheBackendType = "hybrid"
6164
)
6265

6366
// EvictionPolicyType defines the available eviction policies
@@ -96,4 +99,16 @@ type CacheConfig struct {
9699

97100
// BackendConfigPath points to backend-specific configuration files
98101
BackendConfigPath string `yaml:"backend_config_path,omitempty"`
102+
103+
// UseHNSW enables HNSW index for faster search in memory backend
104+
UseHNSW bool `yaml:"use_hnsw,omitempty"`
105+
106+
// HNSWM is the number of bi-directional links per node (default: 16)
107+
HNSWM int `yaml:"hnsw_m,omitempty"`
108+
109+
// HNSWEfConstruction is the size of the dynamic candidate list used during index construction (default: 200)
110+
HNSWEfConstruction int `yaml:"hnsw_ef_construction,omitempty"`
111+
112+
// Hybrid cache specific settings
113+
MaxMemoryEntries int `yaml:"max_memory_entries,omitempty"` // Max entries in HNSW for hybrid cache
99114
}

0 commit comments

Comments
 (0)