Skip to content

Commit 7abe96a

Browse files
authored
feat: Implement hybrid cache that uses an in-memory index and a Milvus-based doc store (#504)
* feat: add HNSW index to in-memory semantic cache and implement hybrid cache that uses an in-memory index and a Milvus-based doc store Signed-off-by: Huamin Chen <[email protected]>
* chore: run go mod tidy to clean up module dependencies Signed-off-by: Huamin Chen <[email protected]>
* conditionally build candle cuda support Signed-off-by: Huamin Chen <[email protected]>
* rebuild index upon restart Signed-off-by: Huamin Chen <[email protected]>
* precommit fix Signed-off-by: Huamin Chen <[email protected]>
* fix precommit Signed-off-by: Huamin Chen <[email protected]>
* fix precommit Signed-off-by: Huamin Chen <[email protected]>
* fix precommit Signed-off-by: Huamin Chen <[email protected]>
* disable cuda build on ci Signed-off-by: Huamin Chen <[email protected]>
* review feedback Signed-off-by: Huamin Chen <[email protected]>
* review feedback Signed-off-by: Huamin Chen <[email protected]>
* review feedback Signed-off-by: Huamin Chen <[email protected]>
* review feedback Signed-off-by: Huamin Chen <[email protected]>
---------
Signed-off-by: Huamin Chen <[email protected]>
1 parent 811ddb4 commit 7abe96a

32 files changed

+5533
-103
lines changed

.github/workflows/pre-commit.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,8 @@ jobs:
9797

9898
- name: Run pre-commit check
9999
run: make precommit-check
100+
env:
101+
CI: true
100102

101103
- name: Show pre-commit results
102104
if: failure()

.github/workflows/publish-crate.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,17 +71,17 @@ jobs:
7171
exit 1
7272
fi
7373
74-
- name: Run tests
74+
- name: Run tests (CPU-only, no CUDA)
7575
working-directory: candle-binding
76-
run: cargo test --verbose
76+
run: cargo test --no-default-features --verbose
7777

78-
- name: Check crate
78+
- name: Check crate (CPU-only, no CUDA)
7979
working-directory: candle-binding
80-
run: cargo check --verbose
80+
run: cargo check --no-default-features --verbose
8181

82-
- name: Build crate
82+
- name: Build crate (CPU-only, no CUDA)
8383
working-directory: candle-binding
84-
run: cargo build --release --verbose
84+
run: cargo build --release --no-default-features --verbose
8585

8686
- name: Dry run publish
8787
working-directory: candle-binding

.github/workflows/test-and-build.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ jobs:
6969
- name: Check go mod tidy
7070
run: make check-go-mod-tidy
7171

72-
- name: Build Rust library
73-
run: make rust
72+
- name: Build Rust library (CPU-only, no CUDA)
73+
run: make rust-ci
7474

7575
- name: Install HuggingFace CLI
7676
run: |
@@ -86,6 +86,7 @@ jobs:
8686
- name: Run semantic router tests
8787
run: make test
8888
env:
89+
CI: true
8990
CGO_ENABLED: 1
9091
LD_LIBRARY_PATH: ${{ github.workspace }}/candle-binding/target/release
9192

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ repos:
8181
pass_filenames: false
8282
- id: cargo-check
8383
name: cargo check
84-
entry: bash -c 'cd candle-binding && cargo check'
84+
entry: bash -c 'cd candle-binding && cargo check --no-default-features'
8585
language: system
8686
files: \.rs$
8787
pass_filenames: false

Dockerfile.extproc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,24 +30,24 @@ COPY candle-binding/Cargo.loc[k] ./candle-binding/
3030
COPY tools/make/ tools/make/
3131
COPY Makefile ./
3232

33-
# Pre-build dependencies to cache them
33+
# Pre-build dependencies to cache them (CPU-only, no CUDA)
3434
RUN cd candle-binding && \
3535
mkdir -p src && \
3636
echo "fn main() {}" > src/lib.rs && \
37-
cargo build --release && \
37+
cargo build --release --no-default-features && \
3838
rm -rf src
3939

4040
# Copy source code and build
4141
COPY candle-binding/src/ ./candle-binding/src/
4242

43-
# Use Makefile to build the Rust library (rebuild with actual source code)
44-
RUN echo "Building Rust library with actual source code..." && \
43+
# Use Makefile to build the Rust library (rebuild with actual source code, CPU-only, no CUDA)
44+
RUN echo "Building Rust library with actual source code (CPU-only, no CUDA)..." && \
4545
echo "Checking source files:" && \
4646
ls -la candle-binding/src/ && \
4747
echo "Forcing clean rebuild..." && \
4848
cd candle-binding && \
4949
cargo clean && \
50-
cargo build --release && \
50+
cargo build --release --no-default-features && \
5151
echo "Checking built library:" && \
5252
find target -name "*.so" -type f && \
5353
ls -la target/release/

Dockerfile.extproc.cross

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,29 +72,29 @@ COPY candle-binding/Cargo.loc[k] ./candle-binding/
7272
COPY tools/make/ tools/make/
7373
COPY Makefile ./
7474

75-
# Create a modified Makefile for cross-compilation
75+
# Create a modified Makefile for cross-compilation (CPU-only, no CUDA)
7676
RUN if [ "$TARGETARCH" = "arm64" ]; then \
77-
echo "Modifying rust.mk for ARM64 cross-compilation..."; \
78-
sed -i 's/cd candle-binding && cargo build --release/cd candle-binding \&\& cargo build --release --target aarch64-unknown-linux-gnu/' tools/make/rust.mk; \
77+
echo "Modifying rust.mk for ARM64 cross-compilation (CPU-only, no CUDA)..."; \
78+
sed -i 's/cd candle-binding && cargo build --release/cd candle-binding \&\& cargo build --release --no-default-features --target aarch64-unknown-linux-gnu/' tools/make/rust.mk; \
7979
cat tools/make/rust.mk | grep "cargo build"; \
8080
fi
8181

82-
# Pre-build dependencies to cache them
82+
# Pre-build dependencies to cache them (CPU-only, no CUDA)
8383
RUN cd candle-binding && \
8484
mkdir -p src && \
8585
echo "fn main() {}" > src/lib.rs && \
8686
if [ "$TARGETARCH" = "arm64" ]; then \
87-
cargo build --release --target aarch64-unknown-linux-gnu; \
87+
cargo build --release --no-default-features --target aarch64-unknown-linux-gnu; \
8888
else \
89-
cargo build --release; \
89+
cargo build --release --no-default-features; \
9090
fi && \
9191
rm -rf src
9292

9393
# Copy source code and build
9494
COPY candle-binding/src/ ./candle-binding/src/
9595

96-
# Build with cross-compilation (rebuild with actual source code)
97-
RUN echo "Building Rust library with actual source code..." && \
96+
# Build with cross-compilation (rebuild with actual source code, CPU-only, no CUDA)
97+
RUN echo "Building Rust library with actual source code (CPU-only, no CUDA)..." && \
9898
echo "Current directory: $(pwd)" && \
9999
echo "TARGETARCH: $TARGETARCH" && \
100100
ls -la candle-binding/src/ && \
@@ -107,9 +107,9 @@ RUN echo "Building Rust library with actual source code..." && \
107107
export CC_aarch64_unknown_linux_gnu=aarch64-linux-gnu-gcc; \
108108
export CXX_aarch64_unknown_linux_gnu=aarch64-linux-gnu-g++; \
109109
export AR_aarch64_unknown_linux_gnu=aarch64-linux-gnu-ar; \
110-
cargo build --release --target aarch64-unknown-linux-gnu; \
110+
cargo build --release --no-default-features --target aarch64-unknown-linux-gnu; \
111111
else \
112-
cargo build --release --target x86_64-unknown-linux-gnu; \
112+
cargo build --release --no-default-features --target x86_64-unknown-linux-gnu; \
113113
fi && \
114114
echo "Checking built library..." && \
115115
find target -name "*.so" -type f

candle-binding/Cargo.lock

Lines changed: 52 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

candle-binding/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ license = "MIT OR Apache-2.0"
99
name = "candle_semantic_router"
1010
crate-type = ["staticlib", "cdylib"]
1111

12+
[features]
13+
default = ["cuda"]
14+
cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
15+
1216
[dependencies]
1317
anyhow = { version = "1", features = ["backtrace"] }
1418
candle-core = "0.8.4"

config/config.development.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ semantic_cache:
1414
max_entries: 100
1515
ttl_seconds: 600
1616
eviction_policy: "fifo"
17+
use_hnsw: true # Enable HNSW for faster search
18+
hnsw_m: 16
19+
hnsw_ef_construction: 200
1720

1821
tools:
1922
enabled: false

config/config.hybrid.yaml

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
bert_model:
2+
model_id: models/all-MiniLM-L12-v2
3+
threshold: 0.6
4+
use_cpu: true
5+
6+
semantic_cache:
7+
enabled: true
8+
backend_type: "hybrid" # Hybrid HNSW + Milvus backend
9+
similarity_threshold: 0.85
10+
ttl_seconds: 3600
11+
12+
# Hybrid cache specific settings
13+
max_memory_entries: 100000 # Max entries in HNSW index (100K)
14+
15+
# HNSW parameters
16+
hnsw_m: 16 # Number of bi-directional links
17+
hnsw_ef_construction: 200 # Construction quality parameter
18+
19+
# Milvus configuration file path
20+
backend_config_path: "config/milvus.yaml"
21+
22+
tools:
23+
enabled: true
24+
top_k: 3
25+
similarity_threshold: 0.2
26+
tools_db_path: "config/tools_db.json"
27+
fallback_to_empty: true
28+
29+
prompt_guard:
30+
enabled: true
31+
use_modernbert: true
32+
model_id: "models/jailbreak_classifier_modernbert-base_model"
33+
threshold: 0.7
34+
use_cpu: true
35+
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
36+
37+
# vLLM Endpoints Configuration
38+
vllm_endpoints:
39+
- name: "endpoint1"
40+
address: "172.28.0.20"
41+
port: 8002
42+
weight: 1
43+
44+
model_config:
45+
"qwen3":
46+
reasoning_family: "qwen3"
47+
preferred_endpoints: ["endpoint1"]
48+
pii_policy:
49+
allow_by_default: true
50+
51+
# Classifier configuration
52+
classifier:
53+
enabled: true
54+
model_path: "models/qwen3-router_model/router_qwen_generative_model.safetensors"
55+
tokenizer_path: "models/qwen3-router_model"
56+
use_cpu: true
57+
threshold: 0.7
58+

0 commit comments

Comments
 (0)