diff --git a/Makefile b/Makefile
index 9f3f4258..64b51766 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,66 @@
-.PHONY: all build clean test docker-build podman-build docker-run podman-run
+.PHONY: all build clean test docker-build podman-build docker-run podman-run start-milvus stop-milvus restart-milvus milvus-status clean-milvus test-milvus-cache test-semantic-router-milvus help
 
 # Default target
 all: build
 
+# Help target
+help:
+	@echo "Available targets:"
+	@echo " Build targets:"
+	@echo " all - Build everything (default)"
+	@echo " build - Build Rust library and Go router"
+	@echo " rust - Build only the Rust library"
+	@echo " build-router - Build only the Go router"
+	@echo " clean - Clean build artifacts"
+	@echo ""
+	@echo " Run targets:"
+	@echo " run-router - Run the router (CONFIG_FILE=config/config.yaml)"
+	@echo " run-envoy - Run Envoy proxy"
+	@echo ""
+	@echo " Test targets:"
+	@echo " test - Run all tests"
+	@echo " test-binding - Test candle-binding"
+	@echo " test-semantic-router - Test semantic router"
+	@echo " test-category-classifier - Test category classifier"
+	@echo " test-pii-classifier - Test PII classifier"
+	@echo " test-jailbreak-classifier - Test jailbreak classifier"
+	@echo ""
+	@echo " Milvus targets (CONTAINER_RUNTIME=docker|podman):"
+	@echo " start-milvus - Start Milvus container for testing"
+	@echo " stop-milvus - Stop and remove Milvus container"
+	@echo " restart-milvus - Restart Milvus container"
+	@echo " milvus-status - Check Milvus container status"
+	@echo " clean-milvus - Stop container and clean data"
+	@echo " test-milvus-cache - Test cache with Milvus backend"
+	@echo " test-semantic-router-milvus - Test router with Milvus cache"
+	@echo " Example: CONTAINER_RUNTIME=podman make start-milvus"
+	@echo ""
+	@echo " Demo targets:"
+	@echo " test-auto-prompt-reasoning - Test reasoning mode"
+	@echo " test-auto-prompt-no-reasoning - Test normal mode"
+	@echo " test-pii - Test PII detection"
+	@echo " test-prompt-guard - Test jailbreak detection"
+	@echo " test-tools - Test tool auto-selection"
+	@echo ""
+	@echo " Documentation targets:"
+	@echo " docs-dev - Start documentation dev server"
+	@echo " docs-build - Build documentation"
+	@echo " docs-serve - Serve built documentation"
+	@echo " docs-clean - Clean documentation artifacts"
+	@echo ""
+	@echo " Environment variables:"
+	@echo " CONTAINER_RUNTIME - Container runtime (docker|podman, default: docker)"
+	@echo " CONFIG_FILE - Config file path (default: config/config.yaml)"
+	@echo " VLLM_ENDPOINT - vLLM endpoint URL for testing"
+	@echo ""
+	@echo " Usage examples:"
+	@echo " make start-milvus # Use Docker (default)"
+	@echo " CONTAINER_RUNTIME=podman make start-milvus # Use Podman"
+	@echo " CONFIG_FILE=custom.yaml make run-router # Use custom config"
+
+# Container runtime (docker or podman)
+CONTAINER_RUNTIME ?= docker
+
 # vLLM env var
 VLLM_ENDPOINT ?=
 
@@ -30,7 +88,7 @@ rust:
 build-router: rust
 	@echo "Building router..."
 	@mkdir -p bin
-	@cd src/semantic-router && go build -o ../../bin/router cmd/main.go
+	@cd src/semantic-router && go build --tags=milvus -o ../../bin/router cmd/main.go
 
 # Config file path with default
 CONFIG_FILE ?= config/config.yaml
 
@@ -104,9 +162,12 @@ test-jailbreak-classifier: rust
 	cd src/training/prompt_guard_fine_tuning && CGO_ENABLED=1 go run jailbreak_classifier_verifier.go
 
 # Unit test semantic-router
+# By default, Milvus tests are skipped. To enable them, set SKIP_MILVUS_TESTS=false
+# Example: make test-semantic-router SKIP_MILVUS_TESTS=false
 test-semantic-router: build-router
 	@echo "Testing semantic-router..."
 	@export LD_LIBRARY_PATH=${PWD}/candle-binding/target/release && \
+	export SKIP_MILVUS_TESTS=$${SKIP_MILVUS_TESTS:-true} && \
 	cd src/semantic-router && CGO_ENABLED=1 go test -v ./...
 
 # Test the Rust library and the Go binding
 
@@ -195,6 +256,65 @@ download-models:
 		hf download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model --local-dir models/pii_classifier_modernbert-base_presidio_token_model; \
 	fi
 
+# Milvus container management
+start-milvus:
+	@echo "Starting Milvus container for testing with $(CONTAINER_RUNTIME)..."
+	@mkdir -p /tmp/milvus-data
+	@$(CONTAINER_RUNTIME) run -d \
+		--name milvus-semantic-cache \
+		--security-opt seccomp:unconfined \
+		-e ETCD_USE_EMBED=true \
+		-e ETCD_DATA_DIR=/var/lib/milvus/etcd \
+		-e ETCD_CONFIG_PATH=/milvus/configs/advanced/etcd.yaml \
+		-e COMMON_STORAGETYPE=local \
+		-e CLUSTER_ENABLED=false \
+		-p 19530:19530 \
+		-p 9091:9091 \
+		-v /tmp/milvus-data:/var/lib/milvus \
+		milvusdb/milvus:v2.3.3 \
+		milvus run standalone
+	@echo "Waiting for Milvus to be ready..."
+	@sleep 15
+	@echo "Milvus should be available at localhost:19530"
+
+stop-milvus:
+	@echo "Stopping Milvus container..."
+	@$(CONTAINER_RUNTIME) stop milvus-semantic-cache || true
+	@$(CONTAINER_RUNTIME) rm milvus-semantic-cache || true
+	@sudo rm -rf /tmp/milvus-data || true
+	@echo "Milvus container stopped and removed"
+
+restart-milvus: stop-milvus start-milvus
+
+milvus-status:
+	@echo "Checking Milvus container status..."
+	@if $(CONTAINER_RUNTIME) ps --filter "name=milvus-semantic-cache" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | grep -q milvus-semantic-cache; then \
+		echo "Milvus container is running:"; \
+		$(CONTAINER_RUNTIME) ps --filter "name=milvus-semantic-cache" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"; \
+	else \
+		echo "Milvus container is not running"; \
+		echo "Run 'make start-milvus' to start it"; \
+	fi
+
+clean-milvus: stop-milvus
+	@echo "Cleaning up Milvus data..."
+	@sudo rm -rf milvus-data || rm -rf milvus-data
+	@echo "Milvus data directory cleaned"
+
+# Test semantic cache with Milvus backend
+test-milvus-cache: start-milvus rust
+	@echo "Testing semantic cache with Milvus backend..."
+	@export LD_LIBRARY_PATH=$${PWD}/candle-binding/target/release && \
+	cd src/semantic-router && CGO_ENABLED=1 go test -tags=milvus -v ./pkg/cache/
+	@echo "Consider running 'make stop-milvus' when done testing"
+
+# Test semantic-router with Milvus enabled
+test-semantic-router-milvus: build-router start-milvus
+	@echo "Testing semantic-router with Milvus cache backend..."
+	@export LD_LIBRARY_PATH=$${PWD}/candle-binding/target/release && \
+	cd src/semantic-router && CGO_ENABLED=1 go test -tags=milvus -v ./...
+	@echo "Consider running 'make stop-milvus' when done testing"
+
 # Documentation targets
 docs-install:
 	@echo "Installing documentation dependencies..."
diff --git a/config/cache/milvus.yaml b/config/cache/milvus.yaml
new file mode 100644
index 00000000..bac19b4d
--- /dev/null
+++ b/config/cache/milvus.yaml
@@ -0,0 +1,171 @@
+# Milvus Vector Database Configuration for Semantic Cache
+#
+# This configuration file contains settings for using Milvus as the semantic cache backend.
+# To use this configuration:
+# 1. Set backend_type: "milvus" in your main config.yaml
+# 2. Set backend_config_path: "config/cache/milvus.yaml" in your main config.yaml
+# 3. Ensure Milvus server is running and accessible
+# 4. Build with Milvus support: go build -tags=milvus
+
+# Milvus connection settings
+connection:
+  # Milvus server host (change for production deployment)
+  host: "localhost" # For production: use your Milvus cluster endpoint
+
+  # Milvus server port
+  port: 19530 # Standard Milvus port
+
+  # Database name (optional, defaults to "default")
+  database: "semantic_router_cache"
+
+  # Connection timeout in seconds
+  timeout: 30
+
+  # Authentication (enable for production)
+  auth:
+    enabled: false # Set to true for production
+    username: "" # Your Milvus username
+    password: "" # Your Milvus password
+
+  # TLS/SSL configuration (recommended for production)
+  tls:
+    enabled: false # Set to true for secure connections
+    cert_file: "" # Path to client certificate
+    key_file: "" # Path to client private key
+    ca_file: "" # Path to CA certificate
+
+# Collection settings
+collection:
+  # Name of the collection to store cache entries
+  name: "semantic_cache"
+
+  # Description of the collection
+  description: "Semantic cache for LLM request-response pairs"
+
+  # Vector field configuration
+  vector_field:
+    # Name of the vector field
+    name: "embedding"
+
+    # Dimension of the embeddings (auto-detected from model at runtime)
+    dimension: 384 # This value is ignored - dimension is auto-detected from the embedding model
+
+    # Metric type for similarity calculation
+    metric_type: "IP" # Inner Product (cosine similarity for normalized vectors)
+
+    # Index configuration for the vector field
+    index:
+      # Index type (HNSW is recommended for most use cases)
+      type: "HNSW"
+
+      # Index parameters
+      params:
+        M: 16 # Number of bi-directional links for each node
+        efConstruction: 64 # Search scope during index construction
+
+# Search configuration
+search:
+  # Search parameters
+  params:
+    ef: 64 # Search scope during search (should be >= topk)
+
+  # Number of top results to retrieve for similarity comparison
+  topk: 10
+
+  # Consistency level for search operations
+  consistency_level: "Session" # Options: Strong, Session, Bounded, Eventually
+
+# Performance and resource settings
+performance:
+  # Connection pool settings
+  connection_pool:
+    # Maximum number of connections in the pool
+    max_connections: 10
+
+    # Maximum idle connections
+    max_idle_connections: 5
+
+    # Connection timeout for acquiring from pool
+    acquire_timeout: 5
+
+  # Batch operation settings
+  batch:
+    # Maximum batch size for insert operations
+    insert_batch_size: 1000
+
+    # Batch timeout in seconds
+    timeout: 30
+
+# Data management
+data_management:
+  # Automatic data expiration (TTL) settings
+  ttl:
+    # Enable automatic TTL-based cleanup (requires TTL to be set in main config)
+    enabled: true
+
+    # Field name to store timestamp for TTL calculation
+    timestamp_field: "timestamp"
+
+    # Cleanup interval in seconds (how often to run cleanup)
+    cleanup_interval: 3600 # 1 hour
+
+  # Compaction settings
+  compaction:
+    # Enable automatic compaction
+    enabled: true
+
+    # Compaction interval in seconds
+    interval: 86400 # 24 hours
+
+# Logging and monitoring
+logging:
+  # Log level for Milvus client operations (debug, info, warn, error)
+  level: "info"
+
+  # Enable query/search logging for debugging
+  enable_query_log: false
+
+  # Enable performance metrics collection
+  enable_metrics: true
+
+# Development and debugging settings
+development:
+  # Drop collection on startup (WARNING: This will delete all cached data)
+  drop_collection_on_startup: true # Enable for development to test dynamic dimensions
+
+  # Create collection if it doesn't exist
+  auto_create_collection: true
+
+  # Print detailed error messages
+  verbose_errors: true
+
+# Example configurations for different environments:
+#
+# Local Development (Docker):
+#   connection:
+#     host: "localhost"
+#     port: 19530
+#     auth:
+#       enabled: false
+#   development:
+#     drop_collection_on_startup: true # Clean start for development
+#
+# Production (Zilliz Cloud):
+#   connection:
+#     host: "your-cluster-endpoint.zillizcloud.com"
+#     port: 443
+#     auth:
+#       enabled: true
+#       username: "your-username"
+#       password: "your-password"
+#     tls:
+#       enabled: true
+#   development:
+#     drop_collection_on_startup: false
+#     auto_create_collection: false # Pre-create collections in production
+#
+# Kubernetes Deployment:
+#   connection:
+#     host: "milvus-service.milvus-system.svc.cluster.local"
+#     port: 19530
+#     timeout: 60 # Longer timeout for cluster environments
diff --git a/config/config.yaml b/config/config.yaml
index 2f722822..32f585e7 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -4,9 +4,25 @@ bert_model:
   use_cpu: true
 semantic_cache:
   enabled: true
+  backend_type: "memory" # Options: "memory" or "milvus"
   similarity_threshold: 0.8
-  max_entries: 1000
+  max_entries: 1000 # Only applies to memory backend
   ttl_seconds: 3600
+
+  # For production environments, use Milvus for scalable caching:
+  # backend_type: "milvus"
+  # backend_config_path: "config/cache/milvus.yaml"
+
+  # Development/Testing: Use in-memory cache (current configuration)
+  # - Fast startup and no external dependencies
+  # - Limited to single instance scaling
+  # - Data lost on restart
+
+  # Production: Use Milvus vector database
+  # - Horizontally scalable and persistent
+  # - Supports distributed deployments
+  # - Requires Milvus cluster setup
+  # - To enable: uncomment the lines above and install Milvus dependencies
 tools:
   enabled: true # Set to true to enable automatic tool selection
   top_k: 3 # Number of most relevant tools to select
diff --git a/src/semantic-router/go.mod b/src/semantic-router/go.mod
index 3b6513b2..da335254 100644
--- a/src/semantic-router/go.mod
+++ b/src/semantic-router/go.mod
@@ -4,13 +4,17 @@ go 1.24.1
 
 replace (
 	github.com/vllm-project/semantic-router/candle-binding => ../../candle-binding
+	github.com/vllm-project/semantic-router/src/semantic-router/pkg/cache => ./pkg/cache
 	github.com/vllm-project/semantic-router/src/semantic-router/pkg/config => ./pkg/config
 	github.com/vllm-project/semantic-router/src/semantic-router/pkg/extproc => ./pkg/extproc
+	github.com/vllm-project/semantic-router/src/semantic-router/pkg/metrics => ./pkg/metrics
+	github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability => ./pkg/observability
 )
 
 require (
 	github.com/envoyproxy/go-control-plane/envoy v1.32.4
 	github.com/fsnotify/fsnotify v1.7.0
+	github.com/milvus-io/milvus-sdk-go/v2 v2.4.2
 	github.com/onsi/ginkgo/v2 v2.23.4
 	github.com/onsi/gomega v1.38.0
 	github.com/openai/openai-go v1.12.0
@@ -25,12 +29,23 @@ require (
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 // indirect
+	github.com/cockroachdb/errors v1.9.1 // indirect
+	github.com/cockroachdb/logtags v0.0.0-20211118104740-dabe8e521a4f // indirect
+	github.com/cockroachdb/redact v1.1.3 // indirect
 	github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
+	github.com/getsentry/sentry-go v0.12.0 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-task/slim-sprig/v3 v3.0.0 //
indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.4 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect + github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 // indirect + github.com/kr/pretty v0.3.1 // indirect + github.com/kr/text v0.2.0 // indirect + github.com/milvus-io/milvus-proto/go-api/v2 v2.4.10-0.20240819025435-512e3b98866a // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.65.0 // indirect @@ -43,6 +58,7 @@ require ( go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/multierr v1.10.0 // indirect golang.org/x/net v0.41.0 // indirect + golang.org/x/sync v0.15.0 // indirect golang.org/x/sys v0.33.0 // indirect golang.org/x/text v0.26.0 // indirect golang.org/x/tools v0.33.0 // indirect diff --git a/src/semantic-router/go.sum b/src/semantic-router/go.sum index 77570f5b..acb526d9 100644 --- a/src/semantic-router/go.sum +++ b/src/semantic-router/go.sum @@ -1,47 +1,212 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +github.com/AndreasBriese/bbloom v0.0.0-20190306092124-e2d15f34fcf9/go.mod h1:bOvUY6CB00SOBii9/FifXqc0awNKxLFCL/+pkDPuyl8= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/CloudyKit/fastprinter v0.0.0-20200109182630-33d98a066a53/go.mod h1:+3IMCy2vIlbG1XG/0ggNQv0SvxCAIpPM5b1nCz56Xno= +github.com/CloudyKit/jet/v3 v3.0.0/go.mod h1:HKQPgSJmdK8hdoAbKUUWajkHyHo4RaU5rMdUywE7VMo= +github.com/Joker/hpp v1.0.0/go.mod h1:8x5n+M1Hp5hC0g8okX3sR3vFQwynaX/UgSOM9MeBKzY= +github.com/Shopify/goreferrer v0.0.0-20181106222321-ec9c9a553398/go.mod h1:a1uqRtAwp2Xwc6WNPJEufxJ7fx3npB4UV/JOLmbu5I0= +github.com/ajg/form v1.5.1/go.mod h1:uL1WgH+h2mgNtvBq0339dVnzXdBETtL2LeUXaIv25UY= +github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= +github.com/aymerick/raymond v2.0.3-0.20180322193309-b565731e1464+incompatible/go.mod h1:osfaiScAUVup+UC9Nfq76eWqDhXlp+4UYaA8uhTBO6g= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 h1:boJj011Hh+874zpIySeApCX4GeOjPl9qhRF3QuIZq+Q= github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/cockroachdb/datadriven v1.0.2/go.mod h1:a9RdTaap04u637JoCzcUoIcDmvwSUtcUFtT/C3kJlTU= +github.com/cockroachdb/errors v1.9.1 h1:yFVvsI0VxmRShfawbt/laCIDy/mtTqqnvoNgiy5bEV8= +github.com/cockroachdb/errors v1.9.1/go.mod 
h1:2sxOtL2WIc096WSZqZ5h8fa17rdDq9HZOZLBCor4mBk= +github.com/cockroachdb/logtags v0.0.0-20211118104740-dabe8e521a4f h1:6jduT9Hfc0njg5jJ1DdKCFPdMBrp/mdZfCpa5h+WM74= +github.com/cockroachdb/logtags v0.0.0-20211118104740-dabe8e521a4f/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs= +github.com/cockroachdb/redact v1.1.3 h1:AKZds10rFSIj7qADf0g46UixK8NNLwWTNdCIGS5wfSQ= +github.com/cockroachdb/redact v1.1.3/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= +github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM= +github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= +github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= +github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= +github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgraph-io/badger v1.6.0/go.mod h1:zwt7syl517jmP8s94KqSxTlM6IMsdhYy6psNgSztDR4= +github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= +github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/eknkc/amber v0.0.0-20171010120322-cdade1c07385/go.mod h1:0vRUJqYpeSZifjYj7uP3BG/gKcuzL9xWVV/Y+cK33KM= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8kXZ5CQAFYVjQcdVIr83A= github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= +github.com/etcd-io/bbolt v1.3.3/go.mod h1:ZF2nL25h33cCyBtcyWeZ2/I3HQOfTP+0PIEvHjkjCrw= +github.com/fasthttp-contrib/websocket v0.0.0-20160511215533-1f3b11f56072/go.mod h1:duJ4Jxv5lDcvg4QuQr0oowTf7dz4/CR8NtyCooz9HL8= +github.com/fatih/structs v1.1.0/go.mod h1:9NiDSp5zOcgEDl+j00MP/WkGVPOlPRLejGD8Ga6PJ7M= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/gavv/httpexpect v2.0.0+incompatible/go.mod h1:x+9tiU1YnrOvnB725RkpoLv1M62hOWzwo5OXotisrKc= +github.com/getsentry/sentry-go v0.12.0 h1:era7g0re5iY13bHSdN/xMkyV+5zZppjRVQhZrXCaEIk= +github.com/getsentry/sentry-go v0.12.0/go.mod 
h1:NSap0JBYWzHND8oMbyi0+XZhUalc1TBdRL1M71JZW2c= +github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= +github.com/gin-gonic/gin v1.4.0/go.mod h1:OW2EZn3DO8Ln9oIKOvM++LBO+5UPHJJDH72/q/3rZdM= +github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98= +github.com/go-errors/errors v1.0.1 h1:LUHzmkK3GUKUrL/1gfBUxAHzcev3apQlezX/+O7ma6w= +github.com/go-errors/errors v1.0.1/go.mod h1:f4zRHt4oKfwPJE5k8C9vpYG+aDHdBFUsgrm6/TyX73Q= +github.com/go-faker/faker/v4 v4.1.0 h1:ffuWmpDrducIUOO0QSKSF5Q2dxAht+dhsT9FvVHhPEI= +github.com/go-faker/faker/v4 v4.1.0/go.mod h1:uuNc0PSRxF8nMgjGrrrU4Nw5cF30Jc6Kd0/FUTTYbhg= +github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= +github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab/go.mod h1:/P9AEU963A2AYjv4d1V5eVL1CQbEJq6aCNHDDjibzu8= +github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo= +github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= +github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= +github.com/gogo/googleapis v0.0.0-20180223154316-0cd9801be74a/go.mod h1:gf4bu3Q80BeJ6H1S1vYPm8/ELATdvryBaNFGgqEef3s= +github.com/gogo/googleapis v1.4.1/go.mod h1:2lpHqI5OcWCtVElxXnPt+s8oJvMpySlOyM6xDCrzib4= +github.com/gogo/protobuf v1.2.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/gogo/status v1.1.0/go.mod h1:BFv9nrluPLmrS0EmGVvLaPNmRosr9KapBYd5/hpY1WM= +github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod 
h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/gomodule/redigo v1.7.1-0.20190724094224-574c33c3df38/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 h1:+9834+KizmvFV7pXQGSXQTsaWhq2GjuNUt0aUU0YBYw= +github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y= +github.com/hashicorp/go-version v1.2.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= +github.com/hydrogen18/memlistener v0.0.0-20200120041712-dcc25e7acd91/go.mod h1:qEIFzExnS6016fRpRfxrExeVn2gbClQA99gQhnIcdhE= +github.com/imkira/go-interpol v1.1.0/go.mod h1:z0h2/2T3XF8kyEPpRgJ3kmNv+C43p+I/CoI+jC3w2iA= +github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= +github.com/iris-contrib/blackfriday v2.0.0+incompatible/go.mod h1:UzZ2bDEoaSGPbkg6SAB4att1aAwTmVIx/5gCVqeyUdI= +github.com/iris-contrib/go.uuid v2.0.0+incompatible/go.mod h1:iz2lgM/1UnEf1kP0L/+fafWORmlnuysV2EMP8MW+qe0= +github.com/iris-contrib/jade v1.1.3/go.mod h1:H/geBymxJhShH5kecoiOCSssPX7QWYH7UaeZTSWddIk= +github.com/iris-contrib/pongo2 v0.0.1/go.mod h1:Ssh+00+3GAZqSQb30AvBRNxBx7rf0GqwkjqxNd0u65g= 
+github.com/iris-contrib/schema v0.0.1/go.mod h1:urYA3uvUNG1TIIjOSCzHr9/LmbQo8LrOcOqfqxa4hXw= +github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= +github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k= +github.com/kataras/golog v0.0.10/go.mod h1:yJ8YKCmyL+nWjERB90Qwn+bdyBZsaQwU3bTVFgkFIp8= +github.com/kataras/iris/v12 v12.1.8/go.mod h1:LMYy4VlP67TQ3Zgriz8RE2h2kMZV2SgMYbq3UhfoFmE= +github.com/kataras/neffos v0.0.14/go.mod h1:8lqADm8PnbeFfL7CLXh1WHw53dG27MC3pgi2R1rmoTE= +github.com/kataras/pio v0.0.2/go.mod h1:hAoW0t9UmXi4R5Oyq5Z4irTbaTsOemSrDGUtaTl7Dro= +github.com/kataras/sitemap v0.0.5/go.mod h1:KY2eugMKiPwsJgx7+U103YZehfvNGOXURubcGyk0Bz8= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.8.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= +github.com/klauspost/compress v1.9.7/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/klauspost/cpuid v1.2.1/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/labstack/echo/v4 v4.5.0/go.mod h1:czIriw4a0C1dFun+ObrXp7ok03xON0N1awStJ6ArI7Y= +github.com/labstack/gommon v0.3.0/go.mod h1:MULnywXg0yavhxWKc+lOruYdAhDwPK9wf0OL7NoOu+k= +github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= +github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= +github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= +github.com/mattn/go-colorable v0.1.11/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= +github.com/mattn/go-isatty v0.0.7/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ= +github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 
+github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= +github.com/mattn/goveralls v0.0.2/go.mod h1:8d1ZMHsd7fW6IRPKQh46F2WRpyib5/X4FOpevwGNQEw= +github.com/mediocregopher/radix/v3 v3.4.2/go.mod h1:8FL3F6UQRXHXIBSPUs5h0RybMF8i4n7wVopoX3x7Bv8= +github.com/microcosm-cc/bluemonday v1.0.2/go.mod h1:iVP4YcDBq+n/5fb23BhYFvIMq/leAFZyRl6bYmGDlGc= +github.com/milvus-io/milvus-proto/go-api/v2 v2.4.10-0.20240819025435-512e3b98866a h1:0B/8Fo66D8Aa23Il0yrQvg1KKz92tE/BJ5BvkUxxAAk= +github.com/milvus-io/milvus-proto/go-api/v2 v2.4.10-0.20240819025435-512e3b98866a/go.mod h1:1OIl0v5PQeNxIJhCvY+K55CBUOYDZevw9g9380u1Wek= +github.com/milvus-io/milvus-sdk-go/v2 v2.4.2 h1:Xqf+S7iicElwYoS2Zly8Nf/zKHuZsNy1xQajfdtygVY= +github.com/milvus-io/milvus-sdk-go/v2 v2.4.2/go.mod h1:ulO1YUXKH0PGg50q27grw048GDY9ayB4FPmh7D+FFTA= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/moul/http2curl v1.0.0/go.mod h1:8UbvGypXm98wA/IqH45anm5Y2Z6ep6O31QGOAZ3H0fQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/nats-io/jwt v0.3.0/go.mod h1:fRYCDE99xlTsqUzISS1Bi75UBJ6ljOJQOAAu5VglpSg= +github.com/nats-io/nats.go v1.9.1/go.mod h1:ZjDU1L/7fJ09jvUSRVBR2e7+RnLiiIQyqyzEE/Zbp4w= +github.com/nats-io/nkeys v0.1.0/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= +github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.10.3/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= +github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY= github.com/onsi/gomega v1.38.0/go.mod h1:OcXcwId0b9QsE7Y49u+BTrL4IdKOBOKnD6VQNTJEB6o= github.com/openai/openai-go v1.12.0 h1:NBQCnXzqOTv5wsgNC36PrFEiskGfO5wccfCWDo9S1U0= github.com/openai/openai-go v1.12.0/go.mod h1:g461MYGXEXBVdV5SaR/5tNzNbSfwTBBefwc+LlDCK0Y= +github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= +github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= +github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 
h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -50,14 +215,41 @@ github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4 github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= +github.com/rogpeppe/go-internal v1.8.1/go.mod h1:JeRgkft04UBgHMgCIwADu4Pn6Mtm5d4nPKWu0nJ5d+o= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= +github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= +github.com/schollz/closestmatch v2.1.0+incompatible/go.mod h1:RtP1ddjLong6gTkbtmuhtR2uUrrJOpYzYRvbcPAid+g= +github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= +github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= +github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= +github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= +github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU= +github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= +github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.0 
h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= @@ -70,6 +262,27 @@ github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= +github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= +github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= +github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= +github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= +github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= +github.com/valyala/fasthttp v1.6.0/go.mod h1:FstJa9V+Pj9vQ7OJie2qMHdwemEDaDiSdBnvPM1Su9w= +github.com/valyala/fasttemplate v1.0.1/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8= +github.com/valyala/fasttemplate v1.2.1/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= +github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio= +github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= +github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= +github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= +github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= +github.com/yalp/jsonpath v0.0.0-20180802001716-5cc68e5049a0/go.mod h1:/LWChgwKmvncFJFHJ7Gvn9wZArjbV5/FppcK2fKk/tI= +github.com/yudai/gojsondiff v1.0.0/go.mod h1:AY32+k2cwILAkW1fbgxQ5mUmMiZFgLIV+FBNExI05xg= +github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82/go.mod h1:lgjkn3NuSvDfVJdfcVVdX+jpBxNmX4rDAzaS45IcYoM= +github.com/yudai/pp v2.0.1+incompatible/go.mod h1:PuxR/8QJ7cyCkFp/aUDS+JY727OFEZkTdatxwunjIkc= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/otel v1.34.0 
h1:zRLXxLCgL1WyKsPVrgbSdMN4c0FMkDAskSTQP+0hdUY= @@ -82,30 +295,168 @@ go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= +go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ= go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191227163750-53104e6ec876/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20210508222113-6edffad5e616/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net 
v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190327091125-710a502c58a2/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= +golang.org/x/net v0.0.0-20211008194852-3b03d305991f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= +golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 
+golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220209214540-3681064d5158/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +golang.org/x/time v0.0.0-20201208040808-7e3f01d25324/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20181221001348-537d06c36207/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190327201419-c70d86f8b7cf/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools 
v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/genproto v0.0.0-20180518175338-11a468237815/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20200423170343-7949de9c1215/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto v0.0.0-20210624195500-8bfb893ecb84/go.mod h1:SzzZ/N+nwJDaO1kznhnlzqS8ocJICar6hYhVyhi++24= google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f h1:OxYkA3wjPsZyBylwymxSHa7ViiW1Sml4ToBrncvFehI= google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f/go.mod h1:+2Yz8+CLJbIfL9z73EW45avw8Lmge3xVElCP9zEKi50= +google.golang.org/grpc v1.12.0/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= +google.golang.org/grpc v1.38.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM= google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= +google.golang.org/grpc/examples v0.0.0-20220617181431-3e7b97febc7f h1:rqzndB2lIQGivcXdTuY3Y9NBvr70X+y77woofSRluec= +google.golang.org/grpc/examples v0.0.0-20220617181431-3e7b97febc7f/go.mod h1:gxndsbNG1n4TZcHGgsYEfVGnTxqfEdfiDv6/DADXX9o= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf 
v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/go-playground/assert.v1 v1.2.1/go.mod h1:9RXL0bg/zibRAgZUYszZSwO/z8Y/a8bDuhia5mkpMnE= +gopkg.in/go-playground/validator.v8 v8.18.2/go.mod h1:RX2a/7Ha8BgOhfk7j780h4/u/RRjR0eouCJSH80/M2Y= +gopkg.in/ini.v1 v1.51.1/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/mgo.v2 v2.0.0-20180705113604-9856a29383ce/go.mod h1:yeKp02qBN3iKW1OzL3MGk2IdtZzaj7SFntXj72NppTA= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20191120175047-4206685974f2/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/src/semantic-router/pkg/cache/cache.go b/src/semantic-router/pkg/cache/cache.go index 6dd4be2f..f2f3e814 100644 --- a/src/semantic-router/pkg/cache/cache.go +++ b/src/semantic-router/pkg/cache/cache.go @@ -3,304 +3,28 @@ package cache import ( "encoding/json" "fmt" - "log" - "sort" - "sync" - "time" - - candle_binding "github.com/vllm-project/semantic-router/candle-binding" ) -// CacheEntry represents a cached request-response pair -type CacheEntry struct { - RequestBody []byte - ResponseBody []byte - Model string - Query string - Embedding []float32 - Timestamp time.Time -} - -// SemanticCache implements a semantic cache using BERT embeddings -type SemanticCache struct 
{ - entries []CacheEntry - mu sync.RWMutex - similarityThreshold float32 - maxEntries int - ttlSeconds int - enabled bool -} - -// SemanticCacheOptions holds options for creating a new semantic cache -type SemanticCacheOptions struct { - SimilarityThreshold float32 - MaxEntries int - TTLSeconds int - Enabled bool -} - -// NewSemanticCache creates a new semantic cache with the given options -func NewSemanticCache(options SemanticCacheOptions) *SemanticCache { - return &SemanticCache{ - entries: []CacheEntry{}, - similarityThreshold: options.SimilarityThreshold, - maxEntries: options.MaxEntries, - ttlSeconds: options.TTLSeconds, - enabled: options.Enabled, - } -} - -// IsEnabled returns whether the cache is enabled -func (c *SemanticCache) IsEnabled() bool { - return c.enabled -} - -// AddPendingRequest adds a pending request to the cache (without response yet) -func (c *SemanticCache) AddPendingRequest(model string, query string, requestBody []byte) (string, error) { - if !c.enabled { - return query, nil - } - - // Generate embedding for the query - embedding, err := candle_binding.GetEmbedding(query, 512) - if err != nil { - return "", fmt.Errorf("failed to generate embedding: %w", err) - } - - c.mu.Lock() - defer c.mu.Unlock() - - // Cleanup expired entries if TTL is set - c.cleanupExpiredEntries() - - // Create a new entry with the pending request - entry := CacheEntry{ - RequestBody: requestBody, - Model: model, - Query: query, - Embedding: embedding, - Timestamp: time.Now(), - } - - c.entries = append(c.entries, entry) - // log.Printf("Added pending cache entry for: %s", query) - - // Enforce max entries limit if set - if c.maxEntries > 0 && len(c.entries) > c.maxEntries { - // Sort by timestamp (oldest first) - sort.Slice(c.entries, func(i, j int) bool { - return c.entries[i].Timestamp.Before(c.entries[j].Timestamp) - }) - // Remove oldest entries - c.entries = c.entries[len(c.entries)-c.maxEntries:] - log.Printf("Trimmed cache to %d entries", c.maxEntries) - } - - return query, nil -} - -// UpdateWithResponse updates a pending request with its response -func (c *SemanticCache) UpdateWithResponse(query string, responseBody []byte) error { - if !c.enabled { - return nil - } - - c.mu.Lock() - defer c.mu.Unlock() - - // Cleanup expired entries while we have the write lock - c.cleanupExpiredEntries() - - // Find the pending request by query - for i, entry := range c.entries { - if entry.Query == query && entry.ResponseBody == nil { - // Update with response - c.entries[i].ResponseBody = responseBody - c.entries[i].Timestamp = time.Now() - // log.Printf("Cache entry updated: %s", query) - return nil - } - } - - return fmt.Errorf("no pending request found for query: %s", query) -} - -// AddEntry adds a complete entry to the cache -func (c *SemanticCache) AddEntry(model string, query string, requestBody, responseBody []byte) error { - if !c.enabled { - return nil - } - - // Generate embedding for the query - embedding, err := candle_binding.GetEmbedding(query, 512) - if err != nil { - return fmt.Errorf("failed to generate embedding: %w", err) - } - - entry := CacheEntry{ - RequestBody: requestBody, - ResponseBody: responseBody, - Model: model, - Query: query, - Embedding: embedding, - Timestamp: time.Now(), - } - - c.mu.Lock() - defer c.mu.Unlock() - - // Cleanup expired entries - c.cleanupExpiredEntries() - - c.entries = append(c.entries, entry) - log.Printf("Added cache entry: %s", query) - - // Enforce max entries limit - if c.maxEntries > 0 && len(c.entries) > c.maxEntries { - // Sort by 
timestamp (oldest first) - sort.Slice(c.entries, func(i, j int) bool { - return c.entries[i].Timestamp.Before(c.entries[j].Timestamp) - }) - // Remove oldest entries - c.entries = c.entries[len(c.entries)-c.maxEntries:] - } - - return nil -} - -// FindSimilar looks for a similar request in the cache -func (c *SemanticCache) FindSimilar(model string, query string) ([]byte, bool, error) { - if !c.enabled { - return nil, false, nil - } - - // Generate embedding for the query - queryEmbedding, err := candle_binding.GetEmbedding(query, 512) - if err != nil { - return nil, false, fmt.Errorf("failed to generate embedding: %w", err) - } - - c.mu.RLock() - defer c.mu.RUnlock() - - // Cleanup expired entries - c.cleanupExpiredEntriesReadOnly() - - type SimilarityResult struct { - Entry CacheEntry - Similarity float32 - } - - // Only compare with entries that have responses - results := make([]SimilarityResult, 0, len(c.entries)) - for _, entry := range c.entries { - if entry.ResponseBody == nil { - continue // Skip entries without responses - } - - // Only compare with entries with the same model - if entry.Model != model { - continue - } - - // Calculate similarity - var dotProduct float32 - for i := 0; i < len(queryEmbedding) && i < len(entry.Embedding); i++ { - dotProduct += queryEmbedding[i] * entry.Embedding[i] - } - - results = append(results, SimilarityResult{ - Entry: entry, - Similarity: dotProduct, - }) - } - - // No results found - if len(results) == 0 { - return nil, false, nil - } - - // Sort by similarity (highest first) - sort.Slice(results, func(i, j int) bool { - return results[i].Similarity > results[j].Similarity - }) - - // Check if the best match exceeds the threshold - if results[0].Similarity >= c.similarityThreshold { - log.Printf("Cache hit: similarity=%.4f, threshold=%.4f", - results[0].Similarity, c.similarityThreshold) - return results[0].Entry.ResponseBody, true, nil - } - - log.Printf("Cache miss: best similarity=%.4f, threshold=%.4f", - results[0].Similarity, c.similarityThreshold) - return nil, false, nil -} - -// cleanupExpiredEntries removes expired entries from the cache -// Assumes the caller holds a write lock -func (c *SemanticCache) cleanupExpiredEntries() { - if c.ttlSeconds <= 0 { - return - } - - now := time.Now() - validEntries := make([]CacheEntry, 0, len(c.entries)) - - for _, entry := range c.entries { - // Keep entries that haven't expired - if now.Sub(entry.Timestamp).Seconds() < float64(c.ttlSeconds) { - validEntries = append(validEntries, entry) - } - } - - if len(validEntries) < len(c.entries) { - log.Printf("Removed %d expired cache entries", len(c.entries)-len(validEntries)) - c.entries = validEntries - } -} - -// cleanupExpiredEntriesReadOnly checks for expired entries but doesn't modify the cache -// Used during read operations where we only have a read lock -func (c *SemanticCache) cleanupExpiredEntriesReadOnly() { - if c.ttlSeconds <= 0 { - return - } - - now := time.Now() - expiredCount := 0 - - for _, entry := range c.entries { - if now.Sub(entry.Timestamp).Seconds() >= float64(c.ttlSeconds) { - expiredCount++ - } - } - - if expiredCount > 0 { - log.Printf("Found %d expired cache entries during read operation", expiredCount) - } -} - -// ChatMessage represents a message in the OpenAI chat format +// ChatMessage represents a message in the OpenAI chat format with role and content type ChatMessage struct { Role string `json:"role"` Content string `json:"content"` } -// OpenAIRequest represents an OpenAI API request +// OpenAIRequest represents 
the structure of an OpenAI API request type OpenAIRequest struct { Model string `json:"model"` Messages []ChatMessage `json:"messages"` } -// ExtractQueryFromOpenAIRequest extracts the user query from an OpenAI request +// ExtractQueryFromOpenAIRequest parses an OpenAI request and extracts the user query func ExtractQueryFromOpenAIRequest(requestBody []byte) (string, string, error) { var req OpenAIRequest if err := json.Unmarshal(requestBody, &req); err != nil { return "", "", fmt.Errorf("invalid request body: %w", err) } - // Extract user messages + // Find user messages in the conversation var userMessages []string for _, msg := range req.Messages { if msg.Role == "user" { @@ -308,10 +32,10 @@ func ExtractQueryFromOpenAIRequest(requestBody []byte) (string, string, error) { } } - // Join all user messages + // Use the most recent user message as the query query := "" if len(userMessages) > 0 { - query = userMessages[len(userMessages)-1] // Use the last user message + query = userMessages[len(userMessages)-1] } return req.Model, query, nil diff --git a/src/semantic-router/pkg/cache/cache_factory.go b/src/semantic-router/pkg/cache/cache_factory.go new file mode 100644 index 00000000..396d1eb6 --- /dev/null +++ b/src/semantic-router/pkg/cache/cache_factory.go @@ -0,0 +1,143 @@ +package cache + +import ( + "fmt" + "os" + + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability" +) + +// NewCacheBackend creates a cache backend instance from the provided configuration +func NewCacheBackend(config CacheConfig) (CacheBackend, error) { + if !config.Enabled { + // Create a disabled cache backend + observability.Debugf("Cache disabled - creating disabled in-memory cache backend") + return NewInMemoryCache(InMemoryCacheOptions{ + Enabled: false, + }), nil + } + + switch config.BackendType { + case InMemoryCacheType, "": + // Use in-memory cache as the default backend + observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f", + config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold) + options := InMemoryCacheOptions{ + Enabled: config.Enabled, + SimilarityThreshold: config.SimilarityThreshold, + MaxEntries: config.MaxEntries, + TTLSeconds: config.TTLSeconds, + } + return NewInMemoryCache(options), nil + + case MilvusCacheType: + observability.Debugf("Creating Milvus cache backend - ConfigPath: %s, TTL: %ds, Threshold: %.3f", + config.BackendConfigPath, config.TTLSeconds, config.SimilarityThreshold) + if config.BackendConfigPath == "" { + return nil, fmt.Errorf("backend_config_path is required for Milvus cache backend") + } + + // Ensure the Milvus configuration file exists + if _, err := os.Stat(config.BackendConfigPath); os.IsNotExist(err) { + observability.Debugf("Milvus config file not found: %s", config.BackendConfigPath) + return nil, fmt.Errorf("Milvus config file not found: %s", config.BackendConfigPath) + } + observability.Debugf("Milvus config file found: %s", config.BackendConfigPath) + + options := MilvusCacheOptions{ + Enabled: config.Enabled, + SimilarityThreshold: config.SimilarityThreshold, + TTLSeconds: config.TTLSeconds, + ConfigPath: config.BackendConfigPath, + } + return NewMilvusCache(options) + + default: + observability.Debugf("Unsupported cache backend type: %s", config.BackendType) + return nil, fmt.Errorf("unsupported cache backend type: %s", config.BackendType) + } +} + +// ValidateCacheConfig validates cache configuration parameters +func ValidateCacheConfig(config CacheConfig) error { + if 
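
// Illustrative sketch, not part of this change: how a caller could combine
// ExtractQueryFromOpenAIRequest (defined above in cache.go) with a CacheBackend
// lookup. Assumes package cache; lookupCachedResponse is a hypothetical helper name.
func lookupCachedResponse(backend CacheBackend, requestBody []byte) ([]byte, bool, error) {
	model, query, err := ExtractQueryFromOpenAIRequest(requestBody)
	if err != nil {
		return nil, false, err
	}
	if query == "" {
		// Nothing to key the cache on when the request carries no user message.
		return nil, false, nil
	}
	return backend.FindSimilar(model, query)
}
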
!config.Enabled { + return nil // Skip validation for disabled cache + } + + // Check similarity threshold range + if config.SimilarityThreshold < 0.0 || config.SimilarityThreshold > 1.0 { + return fmt.Errorf("similarity_threshold must be between 0.0 and 1.0, got: %f", config.SimilarityThreshold) + } + + // Check TTL value + if config.TTLSeconds < 0 { + return fmt.Errorf("ttl_seconds cannot be negative, got: %d", config.TTLSeconds) + } + + // Check max entries for in-memory cache + if config.BackendType == InMemoryCacheType || config.BackendType == "" { + if config.MaxEntries < 0 { + return fmt.Errorf("max_entries cannot be negative for in-memory cache, got: %d", config.MaxEntries) + } + } + + // Check backend-specific requirements + switch config.BackendType { + case MilvusCacheType: + if config.BackendConfigPath == "" { + return fmt.Errorf("backend_config_path is required for Milvus cache backend") + } + } + + return nil +} + +// GetDefaultCacheConfig provides sensible default cache configuration values +func GetDefaultCacheConfig() CacheConfig { + return CacheConfig{ + BackendType: InMemoryCacheType, + Enabled: true, + SimilarityThreshold: 0.8, + MaxEntries: 1000, + TTLSeconds: 3600, + } +} + +// CacheBackendInfo describes the capabilities and features of a cache backend +type CacheBackendInfo struct { + Type CacheBackendType `json:"type"` + Name string `json:"name"` + Description string `json:"description"` + Features []string `json:"features"` +} + +// GetAvailableCacheBackends returns metadata for all supported cache backends +func GetAvailableCacheBackends() []CacheBackendInfo { + return []CacheBackendInfo{ + { + Type: InMemoryCacheType, + Name: "In-Memory Cache", + Description: "High-performance in-memory semantic cache with BERT embeddings", + Features: []string{ + "Fast access", + "No external dependencies", + "Automatic memory management", + "TTL support", + "Entry limit support", + }, + }, + { + Type: MilvusCacheType, + Name: "Milvus Vector Database", + Description: "Enterprise-grade semantic cache powered by Milvus vector database", + Features: []string{ + "Highly scalable", + "Persistent storage", + "Distributed architecture", + "Advanced indexing", + "High availability", + "TTL support", + }, + }, + } +} diff --git a/src/semantic-router/pkg/cache/cache_interface.go b/src/semantic-router/pkg/cache/cache_interface.go new file mode 100644 index 00000000..b1940f10 --- /dev/null +++ b/src/semantic-router/pkg/cache/cache_interface.go @@ -0,0 +1,80 @@ +package cache + +import "time" + +// CacheEntry represents a complete cached request-response pair with associated metadata +type CacheEntry struct { + RequestBody []byte + ResponseBody []byte + Model string + Query string + Embedding []float32 + Timestamp time.Time +} + +// CacheBackend defines the interface for semantic cache implementations +type CacheBackend interface { + // IsEnabled returns whether caching is currently active + IsEnabled() bool + + // AddPendingRequest stores a request awaiting its response + // Returns the processed query string and any error + AddPendingRequest(model string, query string, requestBody []byte) (string, error) + + // UpdateWithResponse completes a pending request with the received response + UpdateWithResponse(query string, responseBody []byte) error + + // AddEntry stores a complete request-response pair in the cache + AddEntry(model string, query string, requestBody, responseBody []byte) error + + // FindSimilar searches for semantically similar cached requests + // Returns the cached 
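
// Illustrative sketch, not part of this change: validating a configuration and
// constructing a backend through the factory above. Assumes package cache, where
// fmt is already imported; newBackendFromConfig is a hypothetical helper name.
func newBackendFromConfig(cfg CacheConfig) (CacheBackend, error) {
	// Reject malformed settings (threshold outside [0.0, 1.0], negative TTL or
	// entry limits, missing Milvus config path) before any backend is created.
	if err := ValidateCacheConfig(cfg); err != nil {
		return nil, fmt.Errorf("invalid cache config: %w", err)
	}
	return NewCacheBackend(cfg)
}

// A possible call site (values illustrative): backend, err := newBackendFromConfig(GetDefaultCacheConfig());
// callers are expected to defer backend.Close() once the backend is no longer needed.
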
response, match status, and any error + FindSimilar(model string, query string) ([]byte, bool, error) + + // Close releases all resources held by the cache backend + Close() error + + // GetStats provides cache performance and usage metrics + GetStats() CacheStats +} + +// CacheStats holds performance metrics and usage statistics for cache operations +type CacheStats struct { + TotalEntries int `json:"total_entries"` + HitCount int64 `json:"hit_count"` + MissCount int64 `json:"miss_count"` + HitRatio float64 `json:"hit_ratio"` + LastCleanupTime *time.Time `json:"last_cleanup_time,omitempty"` +} + +// CacheBackendType defines the available cache backend implementations +type CacheBackendType string + +const ( + // InMemoryCacheType specifies the in-memory cache backend + InMemoryCacheType CacheBackendType = "memory" + + // MilvusCacheType specifies the Milvus vector database backend + MilvusCacheType CacheBackendType = "milvus" +) + +// CacheConfig contains configuration settings shared across all cache backends +type CacheConfig struct { + // BackendType specifies which cache implementation to use + BackendType CacheBackendType `yaml:"backend_type"` + + // Enabled controls whether semantic caching is active + Enabled bool `yaml:"enabled"` + + // SimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0) + SimilarityThreshold float32 `yaml:"similarity_threshold"` + + // MaxEntries limits the number of cached entries (for in-memory backend) + MaxEntries int `yaml:"max_entries,omitempty"` + + // TTLSeconds sets cache entry expiration time (0 disables expiration) + TTLSeconds int `yaml:"ttl_seconds,omitempty"` + + // BackendConfigPath points to backend-specific configuration files + BackendConfigPath string `yaml:"backend_config_path,omitempty"` +} diff --git a/src/semantic-router/pkg/cache/cache_test.go b/src/semantic-router/pkg/cache/cache_test.go index d4787b47..31b9379f 100644 --- a/src/semantic-router/pkg/cache/cache_test.go +++ b/src/semantic-router/pkg/cache/cache_test.go @@ -1,17 +1,16 @@ package cache_test import ( - "encoding/json" - "fmt" - "sync" + "os" + "path/filepath" + "strings" "testing" - "time" + + candle_binding "github.com/vllm-project/semantic-router/candle-binding" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/cache" . "github.com/onsi/ginkgo/v2" . 
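
// Illustrative sketch, not part of this change: the request lifecycle the
// CacheBackend interface above is designed around. serveWithCache and send are
// hypothetical names; the real router wiring lives outside this file.
func serveWithCache(backend CacheBackend, model, query string, requestBody []byte,
	send func([]byte) ([]byte, error)) ([]byte, error) {
	// Serve from the cache when a semantically similar request has already completed.
	if cached, hit, err := backend.FindSimilar(model, query); err == nil && hit {
		return cached, nil
	}
	// Record the pending request, forward it upstream, then attach the response.
	// Cache failures are deliberately non-fatal in this sketch; hit/miss counters
	// stay observable through GetStats().
	_, _ = backend.AddPendingRequest(model, query, requestBody)
	response, err := send(requestBody)
	if err != nil {
		return nil, err
	}
	_ = backend.UpdateWithResponse(query, response)
	return response, nil
}
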
"github.com/onsi/gomega" - - candle "github.com/vllm-project/semantic-router/candle-binding" - "github.com/vllm-project/semantic-router/src/semantic-router/pkg/cache" ) func TestCache(t *testing.T) { @@ -20,680 +19,622 @@ func TestCache(t *testing.T) { } var _ = BeforeSuite(func() { - err := candle.InitModel("", true) + // Initialize BERT model once for all cache tests (Linux only) + err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true) Expect(err).NotTo(HaveOccurred()) }) var _ = Describe("Cache Package", func() { var ( - semanticCache *cache.SemanticCache - defaultOptions cache.SemanticCacheOptions + tempDir string ) BeforeEach(func() { - defaultOptions = cache.SemanticCacheOptions{ - SimilarityThreshold: 0.8, - MaxEntries: 100, - TTLSeconds: 3600, - Enabled: true, - } - semanticCache = cache.NewSemanticCache(defaultOptions) + var err error + tempDir, err = os.MkdirTemp("", "cache_test") + Expect(err).NotTo(HaveOccurred()) }) - Describe("NewSemanticCache", func() { - It("should create a cache with correct options", func() { - options := cache.SemanticCacheOptions{ - SimilarityThreshold: 0.9, - MaxEntries: 50, - TTLSeconds: 1800, - Enabled: true, - } - c := cache.NewSemanticCache(options) - Expect(c).NotTo(BeNil()) - Expect(c.IsEnabled()).To(BeTrue()) - }) - - It("should create a disabled cache when specified", func() { - options := cache.SemanticCacheOptions{ - Enabled: false, - } - c := cache.NewSemanticCache(options) - Expect(c.IsEnabled()).To(BeFalse()) - }) + AfterEach(func() { + os.RemoveAll(tempDir) }) - Describe("IsEnabled", func() { - It("should return the correct enabled status", func() { - enabledCache := cache.NewSemanticCache(cache.SemanticCacheOptions{Enabled: true}) - Expect(enabledCache.IsEnabled()).To(BeTrue()) + Describe("Cache Factory", func() { + Describe("NewCacheBackend", func() { + Context("with memory backend", func() { + It("should create in-memory cache backend successfully", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: true, + SimilarityThreshold: 0.8, + MaxEntries: 1000, + TTLSeconds: 3600, + } - disabledCache := cache.NewSemanticCache(cache.SemanticCacheOptions{Enabled: false}) - Expect(disabledCache.IsEnabled()).To(BeFalse()) - }) - }) + backend, err := cache.NewCacheBackend(config) + Expect(err).NotTo(HaveOccurred()) + Expect(backend).NotTo(BeNil()) + Expect(backend.IsEnabled()).To(BeTrue()) + }) - Describe("AddEntry", func() { - Context("when cache is enabled", func() { - It("should add a complete entry successfully", func() { - model := "model-a" - query := "What is the capital of France?" 
- requestBody := []byte(`{"model": "model-a", "messages": [{"role": "user", "content": "What is the capital of France?"}]}`) - responseBody := []byte(`{"choices": [{"message": {"content": "Paris"}}]}`) + It("should create disabled cache when enabled is false", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: false, + SimilarityThreshold: 0.8, + MaxEntries: 1000, + TTLSeconds: 3600, + } - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - Expect(err).NotTo(HaveOccurred()) - }) + backend, err := cache.NewCacheBackend(config) + Expect(err).NotTo(HaveOccurred()) + Expect(backend).NotTo(BeNil()) + Expect(backend.IsEnabled()).To(BeFalse()) + }) - It("should handle empty query gracefully", func() { - model := "model-a" - query := "" - requestBody := []byte(`{"model": "model-a"}`) - responseBody := []byte(`{"choices": []}`) + It("should default to memory backend when backend_type is empty", func() { + config := cache.CacheConfig{ + BackendType: "", // Empty should default to memory + Enabled: true, + SimilarityThreshold: 0.8, + MaxEntries: 500, + TTLSeconds: 1800, + } - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - // Should not error, but may not generate embedding for empty query - // The actual behavior depends on the candle_binding implementation - Expect(err).To(Or(BeNil(), HaveOccurred())) + backend, err := cache.NewCacheBackend(config) + Expect(err).NotTo(HaveOccurred()) + Expect(backend).NotTo(BeNil()) + Expect(backend.IsEnabled()).To(BeTrue()) + }) }) - Context("with max entries limit", func() { + Context("with Milvus backend", func() { + var milvusConfigPath string + BeforeEach(func() { - options := cache.SemanticCacheOptions{ - SimilarityThreshold: 0.8, - MaxEntries: 3, - TTLSeconds: 0, // No TTL for this test - Enabled: true, + // Skip Milvus tests if environment variable is set + if os.Getenv("SKIP_MILVUS_TESTS") == "true" { + Skip("Milvus tests skipped due to SKIP_MILVUS_TESTS=true") } - semanticCache = cache.NewSemanticCache(options) - }) - It("should enforce max entries limit by removing oldest entries", func() { - // Add entries beyond the limit - for i := 0; i < 5; i++ { - query := fmt.Sprintf("Query %d", i) - model := "model-a" - requestBody := []byte(fmt.Sprintf(`{"query": "%s"}`, query)) - responseBody := []byte(fmt.Sprintf(`{"response": "Response %d"}`, i)) - - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - Expect(err).To(Or(BeNil(), HaveOccurred())) // Embedding generation might fail in test + // Create a test Milvus configuration file + milvusConfigPath = filepath.Join(tempDir, "milvus.yaml") + milvusConfig := ` +connection: + host: "localhost" + port: 19530 + database: "test_cache" + timeout: 30 + +collection: + name: "test_semantic_cache" + description: "Test semantic cache collection" + vector_field: + name: "embedding" + dimension: 512 + metric_type: "IP" + index: + type: "HNSW" + params: + M: 16 + efConstruction: 64 + +search: + params: + ef: 64 + topk: 10 + consistency_level: "Session" + +development: + auto_create_collection: true + verbose_errors: true +` + err := os.WriteFile(milvusConfigPath, []byte(milvusConfig), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) - // Small delay to ensure different timestamps - time.Sleep(time.Millisecond) + It("should return error when backend_config_path is missing", func() { + config := cache.CacheConfig{ + BackendType: cache.MilvusCacheType, + Enabled: true, + SimilarityThreshold: 0.8, + TTLSeconds: 3600, + // 
BackendConfigPath is missing } - // The cache should not exceed max entries - // We can't directly access the entries count, but we can test the behavior - // by checking that older entries are removed + backend, err := cache.NewCacheBackend(config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("backend_config_path is required")) + Expect(backend).To(BeNil()) }) - }) - }) - Context("when cache is disabled", func() { - BeforeEach(func() { - options := cache.SemanticCacheOptions{Enabled: false} - semanticCache = cache.NewSemanticCache(options) - }) + It("should return error when backend_config_path file doesn't exist", func() { + config := cache.CacheConfig{ + BackendType: cache.MilvusCacheType, + Enabled: true, + SimilarityThreshold: 0.8, + TTLSeconds: 3600, + BackendConfigPath: "/nonexistent/milvus.yaml", + } - It("should return immediately without error", func() { - model := "model-a" - query := "Test query" - requestBody := []byte(`{"test": "data"}`) - responseBody := []byte(`{"result": "success"}`) + backend, err := cache.NewCacheBackend(config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("config file not found")) + Expect(backend).To(BeNil()) + }) - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - Expect(err).NotTo(HaveOccurred()) - }) - }) - }) + It("should create Milvus cache backend successfully with valid config", func() { + config := cache.CacheConfig{ + BackendType: cache.MilvusCacheType, + Enabled: true, + SimilarityThreshold: 0.85, + TTLSeconds: 7200, + BackendConfigPath: milvusConfigPath, + } - Describe("AddPendingRequest", func() { - Context("when cache is enabled", func() { - It("should add a pending request and return the query", func() { - model := "model-a" - query := "What is machine learning?" 
- requestBody := []byte(`{"model": "model-a", "messages": [{"role": "user", "content": "What is machine learning?"}]}`) - - returnedQuery, err := semanticCache.AddPendingRequest(model, query, requestBody) - Expect(err).To(Or(BeNil(), HaveOccurred())) // Embedding generation might fail - if err == nil { - Expect(returnedQuery).To(Equal(query)) - } - }) + backend, err := cache.NewCacheBackend(config) - It("should handle empty query", func() { - model := "model-a" - query := "" - requestBody := []byte(`{"model": "model-a"}`) + // Skip test if Milvus is not reachable + if err != nil { + if strings.Contains(err.Error(), "failed to create Milvus client") || + strings.Contains(err.Error(), "connection") || + strings.Contains(err.Error(), "dial") { + Skip("Milvus server not available: " + err.Error()) + } + // For other errors, fail the test + Expect(err).NotTo(HaveOccurred()) + } else { + // If Milvus is available, creation should succeed + Expect(backend).NotTo(BeNil()) + Expect(backend.IsEnabled()).To(BeTrue()) + } + }) - returnedQuery, err := semanticCache.AddPendingRequest(model, query, requestBody) - // Should handle empty query gracefully - Expect(err).To(Or(BeNil(), HaveOccurred())) - if err == nil { - Expect(returnedQuery).To(Equal(query)) - } - }) - }) + It("should handle disabled Milvus cache", func() { + config := cache.CacheConfig{ + BackendType: cache.MilvusCacheType, + Enabled: false, + SimilarityThreshold: 0.8, + TTLSeconds: 3600, + BackendConfigPath: milvusConfigPath, + } - Context("when cache is disabled", func() { - BeforeEach(func() { - options := cache.SemanticCacheOptions{Enabled: false} - semanticCache = cache.NewSemanticCache(options) + backend, err := cache.NewCacheBackend(config) + Expect(err).NotTo(HaveOccurred()) + Expect(backend).NotTo(BeNil()) + Expect(backend.IsEnabled()).To(BeFalse()) + }) }) - It("should return the query without processing", func() { - model := "model-a" - query := "Test query" - requestBody := []byte(`{"test": "data"}`) + Context("with unsupported backend type", func() { + It("should return error for unsupported backend type", func() { + config := cache.CacheConfig{ + BackendType: "redis", // Unsupported + Enabled: true, + SimilarityThreshold: 0.8, + TTLSeconds: 3600, + } - returnedQuery, err := semanticCache.AddPendingRequest(model, query, requestBody) - Expect(err).NotTo(HaveOccurred()) - Expect(returnedQuery).To(Equal(query)) + backend, err := cache.NewCacheBackend(config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("unsupported cache backend type")) + Expect(backend).To(BeNil()) + }) }) }) - }) - Describe("UpdateWithResponse", func() { - Context("when cache is enabled", func() { - It("should update a pending request with response", func() { - model := "model-a" - query := "Test query for update" - requestBody := []byte(`{"model": "model-a"}`) - responseBody := []byte(`{"response": "test response"}`) - - // First add a pending request - _, err := semanticCache.AddPendingRequest(model, query, requestBody) - Expect(err).NotTo(HaveOccurred()) + Describe("ValidateCacheConfig", func() { + It("should validate enabled memory backend configuration", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: true, + SimilarityThreshold: 0.8, + MaxEntries: 1000, + TTLSeconds: 3600, + } - // Then update it with response - err = semanticCache.UpdateWithResponse(query, responseBody) + err := cache.ValidateCacheConfig(config) Expect(err).NotTo(HaveOccurred()) }) - It("should return error for 
non-existent pending request", func() { - query := "Non-existent query" - responseBody := []byte(`{"response": "test"}`) - - err := semanticCache.UpdateWithResponse(query, responseBody) - Expect(err).To(HaveOccurred()) - Expect(err.Error()).To(ContainSubstring("no pending request found")) - }) - }) + It("should validate disabled cache configuration", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: false, + SimilarityThreshold: 2.0, // Invalid, but should be ignored for disabled cache + MaxEntries: -1, // Invalid, but should be ignored for disabled cache + } - Context("when cache is disabled", func() { - BeforeEach(func() { - options := cache.SemanticCacheOptions{Enabled: false} - semanticCache = cache.NewSemanticCache(options) + err := cache.ValidateCacheConfig(config) + Expect(err).NotTo(HaveOccurred()) // Disabled cache should skip validation }) - It("should return immediately without error", func() { - query := "Test query" - responseBody := []byte(`{"response": "test"}`) + It("should return error for invalid similarity threshold", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: true, + SimilarityThreshold: 1.5, // Invalid: > 1.0 + MaxEntries: 1000, + TTLSeconds: 3600, + } - err := semanticCache.UpdateWithResponse(query, responseBody) - Expect(err).NotTo(HaveOccurred()) + err := cache.ValidateCacheConfig(config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("similarity_threshold must be between 0.0 and 1.0")) }) - }) - }) - Describe("FindSimilar", func() { - Context("when cache is enabled", func() { - It("should return cache miss for empty cache", func() { - model := "model-a" - query := "What is AI?" + It("should return error for negative similarity threshold", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: true, + SimilarityThreshold: -0.1, // Invalid: < 0.0 + MaxEntries: 1000, + TTLSeconds: 3600, + } - response, found, err := semanticCache.FindSimilar(model, query) - Expect(err).NotTo(HaveOccurred()) - Expect(found).To(BeFalse()) - Expect(response).To(BeNil()) + err := cache.ValidateCacheConfig(config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("similarity_threshold must be between 0.0 and 1.0")) }) - It("should handle empty query gracefully", func() { - model := "model-a" - query := "" - - response, found, err := semanticCache.FindSimilar(model, query) - // Should handle empty query - Expect(err).To(Or(BeNil(), HaveOccurred())) - if err == nil { - Expect(found).To(BeFalse()) - Expect(response).To(BeNil()) + It("should return error for negative TTL", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: true, + SimilarityThreshold: 0.8, + MaxEntries: 1000, + TTLSeconds: -1, // Invalid: negative TTL } - }) - Context("with entries in cache", func() { - BeforeEach(func() { - // Add some test entries if possible - model := "model-a" - query := "What is the weather?" 
- requestBody := []byte(`{"model": "model-a"}`) - responseBody := []byte(`{"weather": "sunny"}`) - - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - if err != nil { - Skip("Skipping test due to candle_binding dependency") - } - }) - - It("should find similar entries based on model matching", func() { - model := "model-a" - query := "Weather information" - - _, _, err := semanticCache.FindSimilar(model, query) - Expect(err).NotTo(HaveOccurred()) - // Result depends on embedding similarity and threshold - }) - - It("should not find entries for different models", func() { - model := "model-b" // Different model - query := "What is the weather?" - - response, found, err := semanticCache.FindSimilar(model, query) - Expect(err).NotTo(HaveOccurred()) - Expect(found).To(BeFalse()) - Expect(response).To(BeNil()) - }) - }) - }) - - Context("when cache is disabled", func() { - BeforeEach(func() { - options := cache.SemanticCacheOptions{Enabled: false} - semanticCache = cache.NewSemanticCache(options) + err := cache.ValidateCacheConfig(config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("ttl_seconds cannot be negative")) }) - It("should return cache miss immediately", func() { - model := "model-a" - query := "Any query" + It("should return error for negative max entries in memory backend", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: true, + SimilarityThreshold: 0.8, + MaxEntries: -1, // Invalid: negative max entries + TTLSeconds: 3600, + } - response, found, err := semanticCache.FindSimilar(model, query) - Expect(err).NotTo(HaveOccurred()) - Expect(found).To(BeFalse()) - Expect(response).To(BeNil()) + err := cache.ValidateCacheConfig(config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("max_entries cannot be negative")) }) - }) - }) - Describe("TTL Functionality", func() { - Context("with TTL enabled", func() { - BeforeEach(func() { - options := cache.SemanticCacheOptions{ - SimilarityThreshold: 0.8, - MaxEntries: 100, - TTLSeconds: 1, // 1 second TTL for testing + It("should return error for Milvus backend without config path", func() { + config := cache.CacheConfig{ + BackendType: cache.MilvusCacheType, Enabled: true, + SimilarityThreshold: 0.8, + TTLSeconds: 3600, + // BackendConfigPath is missing } - semanticCache = cache.NewSemanticCache(options) - }) - It("should expire entries after TTL", func() { - model := "model-a" - query := "TTL test query" - requestBody := []byte(`{"model": "model-a"}`) - responseBody := []byte(`{"response": "test"}`) + err := cache.ValidateCacheConfig(config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("backend_config_path is required for Milvus")) + }) - // Add entry - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - if err != nil { - Skip("Skipping test due to candle_binding dependency") + It("should validate edge case values", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: true, + SimilarityThreshold: 0.0, // Valid: minimum threshold + MaxEntries: 0, // Valid: unlimited entries + TTLSeconds: 0, // Valid: no expiration } - // Wait for TTL to expire - time.Sleep(2 * time.Second) - - // Try to find the entry - should trigger cleanup and not find expired entry - _, _, err = semanticCache.FindSimilar(model, query) + err := cache.ValidateCacheConfig(config) Expect(err).NotTo(HaveOccurred()) - // Entry should be expired and not found, or found but 
will be cleaned up }) - }) - Context("without TTL", func() { - BeforeEach(func() { - options := cache.SemanticCacheOptions{ - SimilarityThreshold: 0.8, - MaxEntries: 100, - TTLSeconds: 0, // No TTL + It("should validate maximum threshold value", func() { + config := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, Enabled: true, + SimilarityThreshold: 1.0, // Valid: maximum threshold + MaxEntries: 10000, + TTLSeconds: 86400, } - semanticCache = cache.NewSemanticCache(options) - }) - It("should not expire entries", func() { - model := "model-a" - query := "No TTL test query" - requestBody := []byte(`{"model": "model-a"}`) - responseBody := []byte(`{"response": "test"}`) + err := cache.ValidateCacheConfig(config) + Expect(err).NotTo(HaveOccurred()) + }) + }) - // Add entry - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - if err != nil { - Skip("Skipping test due to candle_binding dependency") - } + Describe("GetDefaultCacheConfig", func() { + It("should return valid default configuration", func() { + config := cache.GetDefaultCacheConfig() - // Wait some time - time.Sleep(100 * time.Millisecond) + Expect(config.BackendType).To(Equal(cache.InMemoryCacheType)) + Expect(config.Enabled).To(BeTrue()) + Expect(config.SimilarityThreshold).To(Equal(float32(0.8))) + Expect(config.MaxEntries).To(Equal(1000)) + Expect(config.TTLSeconds).To(Equal(3600)) + Expect(config.BackendConfigPath).To(BeEmpty()) - // Entry should still be searchable - _, _, err = semanticCache.FindSimilar(model, query) + // Default config should pass validation + err := cache.ValidateCacheConfig(config) Expect(err).NotTo(HaveOccurred()) - // Without TTL, entry should persist (subject to similarity matching) }) }) - }) - Describe("Concurrent Access", func() { - It("should handle concurrent AddEntry calls safely", func() { - const numGoroutines = 10 - var wg sync.WaitGroup - errors := make([]error, numGoroutines) - - wg.Add(numGoroutines) - for i := 0; i < numGoroutines; i++ { - go func(index int) { - defer wg.Done() - model := "model-a" - query := fmt.Sprintf("Concurrent query %d", index) - requestBody := []byte(fmt.Sprintf(`{"index": %d}`, index)) - responseBody := []byte(fmt.Sprintf(`{"result": %d}`, index)) - - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - errors[index] = err - }(i) - } + Describe("GetAvailableCacheBackends", func() { + It("should return information about available backends", func() { + backends := cache.GetAvailableCacheBackends() + + Expect(backends).To(HaveLen(2)) // Memory and Milvus + + // Check memory backend info + memoryBackend := backends[0] + Expect(memoryBackend.Type).To(Equal(cache.InMemoryCacheType)) + Expect(memoryBackend.Name).To(Equal("In-Memory Cache")) + Expect(memoryBackend.Description).To(ContainSubstring("in-memory semantic cache")) + Expect(memoryBackend.Features).To(ContainElement("Fast access")) + Expect(memoryBackend.Features).To(ContainElement("No external dependencies")) + + // Check Milvus backend info + milvusBackend := backends[1] + Expect(milvusBackend.Type).To(Equal(cache.MilvusCacheType)) + Expect(milvusBackend.Name).To(Equal("Milvus Vector Database")) + Expect(milvusBackend.Description).To(ContainSubstring("Milvus vector database")) + Expect(milvusBackend.Features).To(ContainElement("Highly scalable")) + Expect(milvusBackend.Features).To(ContainElement("Persistent storage")) + }) + }) + }) - wg.Wait() + Describe("InMemoryCache", func() { + var ( + inMemoryCache cache.CacheBackend + ) - // Check that no race 
conditions occurred - // Some errors might occur due to candle_binding, but no panics should happen - for i := 0; i < numGoroutines; i++ { - // We don't assert on specific errors since candle_binding might not be available - // The important thing is that no race conditions or panics occurred + BeforeEach(func() { + options := cache.InMemoryCacheOptions{ + Enabled: true, + SimilarityThreshold: 0.8, + MaxEntries: 100, + TTLSeconds: 300, } + inMemoryCache = cache.NewInMemoryCache(options) }) - It("should handle concurrent FindSimilar calls safely", func() { - const numGoroutines = 10 - var wg sync.WaitGroup - results := make([]bool, numGoroutines) - errors := make([]error, numGoroutines) - - wg.Add(numGoroutines) - for i := 0; i < numGoroutines; i++ { - go func(index int) { - defer wg.Done() - model := "model-a" - query := fmt.Sprintf("Search query %d", index) - - _, found, err := semanticCache.FindSimilar(model, query) - results[index] = found - errors[index] = err - }(i) + AfterEach(func() { + if inMemoryCache != nil { + inMemoryCache.Close() } + // BERT model is initialized once per process, no need to reset + }) + + It("should implement CacheBackend interface", func() { + // Check that the concrete type implements the interface + var _ cache.CacheBackend = inMemoryCache + Expect(inMemoryCache).NotTo(BeNil()) + }) - wg.Wait() + It("should report enabled status correctly", func() { + Expect(inMemoryCache.IsEnabled()).To(BeTrue()) - // Check that no race conditions occurred - for i := 0; i < numGoroutines; i++ { - // We don't assert on specific results since cache is likely empty - // The important thing is that no race conditions or panics occurred + // Create disabled cache + disabledOptions := cache.InMemoryCacheOptions{ + Enabled: false, + SimilarityThreshold: 0.8, + MaxEntries: 100, + TTLSeconds: 300, } + disabledCache := cache.NewInMemoryCache(disabledOptions) + defer disabledCache.Close() + + Expect(disabledCache.IsEnabled()).To(BeFalse()) }) - It("should handle mixed concurrent operations safely", func() { - const numGoroutines = 20 - var wg sync.WaitGroup - - wg.Add(numGoroutines) - for i := 0; i < numGoroutines; i++ { - go func(index int) { - defer wg.Done() - model := "model-a" - query := fmt.Sprintf("Mixed operation query %d", index) - - if index%2 == 0 { - // Add entry - requestBody := []byte(fmt.Sprintf(`{"index": %d}`, index)) - responseBody := []byte(fmt.Sprintf(`{"result": %d}`, index)) - semanticCache.AddEntry(model, query, requestBody, responseBody) - } else { - // Search for similar - semanticCache.FindSimilar(model, query) - } - }(i) - } + It("should handle basic cache operations without embeddings", func() { + // Test GetStats on empty cache + stats := inMemoryCache.GetStats() + Expect(stats.TotalEntries).To(Equal(0)) + Expect(stats.HitCount).To(Equal(int64(0))) + Expect(stats.MissCount).To(Equal(int64(0))) + Expect(stats.HitRatio).To(Equal(0.0)) + }) + + It("should handle AddEntry operation with embeddings", func() { + err := inMemoryCache.AddEntry("test-model", "test query", []byte("request"), []byte("response")) + Expect(err).NotTo(HaveOccurred()) - wg.Wait() - // If we reach here without panic, the concurrent access handling is working + stats := inMemoryCache.GetStats() + Expect(stats.TotalEntries).To(Equal(1)) }) - }) - Describe("ExtractQueryFromOpenAIRequest", func() { - It("should extract model and query from valid OpenAI request", func() { - request := cache.OpenAIRequest{ - Model: "model-a", - Messages: []cache.ChatMessage{ - {Role: "system", Content: "You 
are a helpful assistant."}, - {Role: "user", Content: "What is the capital of France?"}, - {Role: "assistant", Content: "The capital of France is Paris."}, - {Role: "user", Content: "What about Germany?"}, - }, - } + It("should handle FindSimilar operation with embeddings", func() { + // First add an entry + err := inMemoryCache.AddEntry("test-model", "test query", []byte("request"), []byte("response")) + Expect(err).NotTo(HaveOccurred()) - requestBody, err := json.Marshal(request) + // Search for similar query + response, found, err := inMemoryCache.FindSimilar("test-model", "test query") Expect(err).NotTo(HaveOccurred()) + Expect(found).To(BeTrue()) // Should find exact match + Expect(response).To(Equal([]byte("response"))) - model, query, err := cache.ExtractQueryFromOpenAIRequest(requestBody) + // Search for different model (should not match) + response, found, err = inMemoryCache.FindSimilar("different-model", "test query") Expect(err).NotTo(HaveOccurred()) - Expect(model).To(Equal("model-a")) - Expect(query).To(Equal("What about Germany?")) // Should get the last user message + Expect(found).To(BeFalse()) // Should not match different model + Expect(response).To(BeNil()) }) - It("should handle request with only system messages", func() { - request := cache.OpenAIRequest{ - Model: "model-b", - Messages: []cache.ChatMessage{ - {Role: "system", Content: "You are a helpful assistant."}, - }, - } + It("should handle AddPendingRequest and UpdateWithResponse", func() { + query, err := inMemoryCache.AddPendingRequest("test-model", "test query", []byte("request")) + Expect(err).NotTo(HaveOccurred()) + Expect(query).To(Equal("test query")) - requestBody, err := json.Marshal(request) + // Update with response + err = inMemoryCache.UpdateWithResponse("test query", []byte("response")) Expect(err).NotTo(HaveOccurred()) - model, query, err := cache.ExtractQueryFromOpenAIRequest(requestBody) + // Should now be able to find it + response, found, err := inMemoryCache.FindSimilar("test-model", "test query") Expect(err).NotTo(HaveOccurred()) - Expect(model).To(Equal("model-b")) - Expect(query).To(BeEmpty()) // No user messages + Expect(found).To(BeTrue()) + Expect(response).To(Equal([]byte("response"))) }) - It("should handle request with multiple user messages", func() { - request := cache.OpenAIRequest{ - Model: "model-a", - Messages: []cache.ChatMessage{ - {Role: "user", Content: "First user message"}, - {Role: "assistant", Content: "Assistant response"}, - {Role: "user", Content: "Second user message"}, - {Role: "user", Content: "Third user message"}, - }, + It("should respect similarity threshold", func() { + // Add entry with a very high similarity threshold + highThresholdOptions := cache.InMemoryCacheOptions{ + Enabled: true, + SimilarityThreshold: 0.99, // Very high threshold + MaxEntries: 100, + TTLSeconds: 300, } + highThresholdCache := cache.NewInMemoryCache(highThresholdOptions) + defer highThresholdCache.Close() - requestBody, err := json.Marshal(request) + err := highThresholdCache.AddEntry("test-model", "machine learning", []byte("request"), []byte("ml response")) Expect(err).NotTo(HaveOccurred()) - model, query, err := cache.ExtractQueryFromOpenAIRequest(requestBody) + // Exact match should work + response, found, err := highThresholdCache.FindSimilar("test-model", "machine learning") Expect(err).NotTo(HaveOccurred()) - Expect(model).To(Equal("model-a")) - Expect(query).To(Equal("Third user message")) // Should get the last user message + Expect(found).To(BeTrue()) + 
Expect(response).To(Equal([]byte("ml response"))) + + // Different query should not match due to high threshold + response, found, err = highThresholdCache.FindSimilar("test-model", "artificial intelligence") + Expect(err).NotTo(HaveOccurred()) + Expect(found).To(BeFalse()) + Expect(response).To(BeNil()) }) - It("should handle empty messages array", func() { - request := cache.OpenAIRequest{ - Model: "model-a", - Messages: []cache.ChatMessage{}, - } + It("should track hit and miss statistics", func() { + // Add an entry with a specific query + err := inMemoryCache.AddEntry("test-model", "What is machine learning?", []byte("request"), []byte("ML is a subset of AI")) + Expect(err).NotTo(HaveOccurred()) - requestBody, err := json.Marshal(request) + // Search for the exact cached query (should be a hit) + response, found, err := inMemoryCache.FindSimilar("test-model", "What is machine learning?") Expect(err).NotTo(HaveOccurred()) + Expect(found).To(BeTrue()) + Expect(response).To(Equal([]byte("ML is a subset of AI"))) - model, query, err := cache.ExtractQueryFromOpenAIRequest(requestBody) + // Search for a completely unrelated query (should be a miss) + response, found, err = inMemoryCache.FindSimilar("test-model", "How do I cook pasta?") Expect(err).NotTo(HaveOccurred()) - Expect(model).To(Equal("model-a")) - Expect(query).To(BeEmpty()) + Expect(found).To(BeFalse()) + Expect(response).To(BeNil()) + + // Check statistics + stats := inMemoryCache.GetStats() + Expect(stats.HitCount).To(Equal(int64(1))) + Expect(stats.MissCount).To(Equal(int64(1))) + Expect(stats.HitRatio).To(Equal(0.5)) }) - It("should return error for invalid JSON", func() { - invalidJSON := []byte(`{"model": "model-a", "messages": [invalid json}`) - - model, query, err := cache.ExtractQueryFromOpenAIRequest(invalidJSON) + It("should handle error when updating non-existent pending request", func() { + err := inMemoryCache.UpdateWithResponse("non-existent-query", []byte("response")) Expect(err).To(HaveOccurred()) - Expect(model).To(BeEmpty()) - Expect(query).To(BeEmpty()) - Expect(err.Error()).To(ContainSubstring("invalid request body")) + Expect(err.Error()).To(ContainSubstring("no pending request found")) }) - It("should handle missing model field", func() { - request := map[string]interface{}{ - "messages": []cache.ChatMessage{ - {Role: "user", Content: "Test message"}, - }, - } - - requestBody, err := json.Marshal(request) + It("should handle close operation", func() { + err := inMemoryCache.Close() Expect(err).NotTo(HaveOccurred()) - model, query, err := cache.ExtractQueryFromOpenAIRequest(requestBody) - Expect(err).NotTo(HaveOccurred()) - Expect(model).To(BeEmpty()) // Missing model field - Expect(query).To(Equal("Test message")) + // Stats should show zero entries after close + stats := inMemoryCache.GetStats() + Expect(stats.TotalEntries).To(Equal(0)) }) - It("should handle request with empty content", func() { - request := cache.OpenAIRequest{ - Model: "model-a", - Messages: []cache.ChatMessage{ - {Role: "user", Content: ""}, - {Role: "user", Content: "Non-empty message"}, - }, + It("should handle disabled cache operations gracefully", func() { + disabledOptions := cache.InMemoryCacheOptions{ + Enabled: false, + SimilarityThreshold: 0.8, + MaxEntries: 100, + TTLSeconds: 300, } + disabledCache := cache.NewInMemoryCache(disabledOptions) + defer disabledCache.Close() - requestBody, err := json.Marshal(request) + // Disabled cache operations should not error but should be no-ops + // They should NOT try to generate 
embeddings + query, err := disabledCache.AddPendingRequest("model", "query", []byte("request")) Expect(err).NotTo(HaveOccurred()) + Expect(query).To(Equal("query")) - model, query, err := cache.ExtractQueryFromOpenAIRequest(requestBody) + err = disabledCache.UpdateWithResponse("query", []byte("response")) Expect(err).NotTo(HaveOccurred()) - Expect(model).To(Equal("model-a")) - Expect(query).To(Equal("Non-empty message")) // Should get the last non-empty user message - }) - }) - Describe("Edge Cases and Error Conditions", func() { - It("should handle very large request/response bodies", func() { - model := "model-a" - query := "Large data test" - largeData := make([]byte, 1024*1024) // 1MB of data - for i := range largeData { - largeData[i] = byte(i % 256) - } + err = disabledCache.AddEntry("model", "query", []byte("request"), []byte("response")) + Expect(err).NotTo(HaveOccurred()) - err := semanticCache.AddEntry(model, query, largeData, largeData) - // Should handle large data gracefully - Expect(err).To(Or(BeNil(), HaveOccurred())) + response, found, err := disabledCache.FindSimilar("model", "query") + Expect(err).NotTo(HaveOccurred()) + Expect(found).To(BeFalse()) + Expect(response).To(BeNil()) + + // Stats should show zero activity + stats := disabledCache.GetStats() + Expect(stats.TotalEntries).To(Equal(0)) + Expect(stats.HitCount).To(Equal(int64(0))) + Expect(stats.MissCount).To(Equal(int64(0))) }) + }) - It("should handle special characters in queries", func() { - model := "model-a" - query := "Query with special chars: 你好, émoji 🚀, and unicode ∀∃∅" - requestBody := []byte(`{"special": "chars"}`) - responseBody := []byte(`{"response": "special"}`) - - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - Expect(err).To(Or(BeNil(), HaveOccurred())) + Describe("Cache Backend Types", func() { + It("should have correct backend type constants", func() { + Expect(cache.InMemoryCacheType).To(Equal(cache.CacheBackendType("memory"))) + Expect(cache.MilvusCacheType).To(Equal(cache.CacheBackendType("milvus"))) }) + }) - It("should handle very long queries", func() { - model := "model-a" - query := string(make([]byte, 10000)) // Very long query - for i := range query { - query = query[:i] + "a" + Describe("Cache Configuration Types", func() { + It("should support all required configuration fields", func() { + config := cache.CacheConfig{ + BackendType: cache.MilvusCacheType, + Enabled: true, + SimilarityThreshold: 0.9, + MaxEntries: 2000, + TTLSeconds: 7200, + BackendConfigPath: "config/cache/milvus.yaml", } - requestBody := []byte(`{"long": "query"}`) - responseBody := []byte(`{"response": "long"}`) - - err := semanticCache.AddEntry(model, query, requestBody, responseBody) - Expect(err).To(Or(BeNil(), HaveOccurred())) - }) - - It("should handle nil request/response bodies", func() { - model := "model-a" - query := "Nil test" - err := semanticCache.AddEntry(model, query, nil, nil) - Expect(err).To(Or(BeNil(), HaveOccurred())) + // Verify all fields are accessible + Expect(string(config.BackendType)).To(Equal("milvus")) + Expect(config.Enabled).To(BeTrue()) + Expect(config.SimilarityThreshold).To(Equal(float32(0.9))) + Expect(config.MaxEntries).To(Equal(2000)) + Expect(config.TTLSeconds).To(Equal(7200)) + Expect(config.BackendConfigPath).To(Equal("config/cache/milvus.yaml")) }) }) - Describe("Similarity Threshold Edge Cases", func() { - Context("with very low threshold", func() { - BeforeEach(func() { - options := cache.SemanticCacheOptions{ - SimilarityThreshold: 0.0, // Very 
low threshold - MaxEntries: 100, - TTLSeconds: 0, - Enabled: true, - } - semanticCache = cache.NewSemanticCache(options) - }) - - It("should potentially match more entries", func() { - // Add an entry - model := "model-a" - query1 := "What is AI?" - requestBody := []byte(`{"model": "model-a"}`) - responseBody := []byte(`{"response": "AI info"}`) - - err := semanticCache.AddEntry(model, query1, requestBody, responseBody) - if err != nil { - Skip("Skipping test due to candle_binding dependency") - } + Describe("Cache Stats", func() { + It("should calculate hit ratio correctly", func() { + stats := cache.CacheStats{ + TotalEntries: 100, + HitCount: 75, + MissCount: 25, + HitRatio: 0.75, + } - // Search with different query - query2 := "Completely different query" - _, _, err = semanticCache.FindSimilar(model, query2) - Expect(err).NotTo(HaveOccurred()) - // With very low threshold, might find matches - }) + Expect(stats.HitRatio).To(Equal(0.75)) + Expect(stats.HitCount + stats.MissCount).To(Equal(int64(100))) }) - Context("with very high threshold", func() { - BeforeEach(func() { - options := cache.SemanticCacheOptions{ - SimilarityThreshold: 0.999, // Very high threshold - MaxEntries: 100, - TTLSeconds: 0, - Enabled: true, - } - semanticCache = cache.NewSemanticCache(options) - }) - - It("should rarely match entries", func() { - // Add an entry - model := "model-a" - query1 := "What is AI?" - requestBody := []byte(`{"model": "model-a"}`) - responseBody := []byte(`{"response": "AI info"}`) - - err := semanticCache.AddEntry(model, query1, requestBody, responseBody) - if err != nil { - Skip("Skipping test due to candle_binding dependency") - } + It("should handle zero values correctly", func() { + stats := cache.CacheStats{ + TotalEntries: 0, + HitCount: 0, + MissCount: 0, + HitRatio: 0.0, + } - // Search with slightly different query - query2 := "What is artificial intelligence?" 
- _, found, err := semanticCache.FindSimilar(model, query2) - Expect(err).NotTo(HaveOccurred()) - // With very high threshold, should rarely find matches - Expect(found).To(BeFalse()) - }) + Expect(stats.HitRatio).To(Equal(0.0)) + Expect(stats.TotalEntries).To(Equal(0)) }) }) }) diff --git a/src/semantic-router/pkg/cache/inmemory_cache.go b/src/semantic-router/pkg/cache/inmemory_cache.go new file mode 100644 index 00000000..9928683d --- /dev/null +++ b/src/semantic-router/pkg/cache/inmemory_cache.go @@ -0,0 +1,404 @@ +//go:build !windows && cgo +// +build !windows,cgo + +package cache + +import ( + "fmt" + "sort" + "sync" + "sync/atomic" + "time" + + candle_binding "github.com/vllm-project/semantic-router/candle-binding" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/metrics" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability" +) + +// InMemoryCache provides a high-performance semantic cache using BERT embeddings in memory +type InMemoryCache struct { + entries []CacheEntry + mu sync.RWMutex + similarityThreshold float32 + maxEntries int + ttlSeconds int + enabled bool + hitCount int64 + missCount int64 + lastCleanupTime *time.Time +} + +// InMemoryCacheOptions contains configuration parameters for the in-memory cache +type InMemoryCacheOptions struct { + SimilarityThreshold float32 + MaxEntries int + TTLSeconds int + Enabled bool +} + +// NewInMemoryCache initializes a new in-memory semantic cache instance +func NewInMemoryCache(options InMemoryCacheOptions) *InMemoryCache { + observability.Debugf("Initializing in-memory cache: enabled=%t, maxEntries=%d, ttlSeconds=%d, threshold=%.3f", + options.Enabled, options.MaxEntries, options.TTLSeconds, options.SimilarityThreshold) + return &InMemoryCache{ + entries: []CacheEntry{}, + similarityThreshold: options.SimilarityThreshold, + maxEntries: options.MaxEntries, + ttlSeconds: options.TTLSeconds, + enabled: options.Enabled, + } +} + +// IsEnabled returns the current cache activation status +func (c *InMemoryCache) IsEnabled() bool { + return c.enabled +} + +// AddPendingRequest stores a request that is awaiting its response +func (c *InMemoryCache) AddPendingRequest(model string, query string, requestBody []byte) (string, error) { + start := time.Now() + + if !c.enabled { + return query, nil + } + + // Generate semantic embedding for the query + embedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension + if err != nil { + metrics.RecordCacheOperation("memory", "add_pending", "error", time.Since(start).Seconds()) + return "", fmt.Errorf("failed to generate embedding: %w", err) + } + + c.mu.Lock() + defer c.mu.Unlock() + + // Remove expired entries to maintain cache hygiene + c.cleanupExpiredEntries() + + // Create cache entry for the pending request + entry := CacheEntry{ + RequestBody: requestBody, + Model: model, + Query: query, + Embedding: embedding, + Timestamp: time.Now(), + } + + c.entries = append(c.entries, entry) + observability.Debugf("InMemoryCache.AddPendingRequest: added pending entry (total entries: %d, embedding_dim: %d)", + len(c.entries), len(embedding)) + + // Apply entry limit to prevent unbounded memory growth + if c.maxEntries > 0 && len(c.entries) > c.maxEntries { + // Sort entries by timestamp to identify oldest + sort.Slice(c.entries, func(i, j int) bool { + return c.entries[i].Timestamp.Before(c.entries[j].Timestamp) + }) + // Keep only the most recent entries + removedCount := len(c.entries) - c.maxEntries + c.entries = 
c.entries[len(c.entries)-c.maxEntries:] + observability.Debugf("InMemoryCache: size limit exceeded, removed %d oldest entries (limit: %d)", + removedCount, c.maxEntries) + observability.LogEvent("cache_trimmed", map[string]interface{}{ + "backend": "memory", + "removed_count": removedCount, + "max_entries": c.maxEntries, + }) + } + + // Record metrics + metrics.RecordCacheOperation("memory", "add_pending", "success", time.Since(start).Seconds()) + metrics.UpdateCacheEntries("memory", len(c.entries)) + + return query, nil +} + +// UpdateWithResponse completes a pending request by adding the response +func (c *InMemoryCache) UpdateWithResponse(query string, responseBody []byte) error { + start := time.Now() + + if !c.enabled { + return nil + } + + c.mu.Lock() + defer c.mu.Unlock() + + // Clean up expired entries during the update + c.cleanupExpiredEntries() + + // Locate the pending request and complete it + for i, entry := range c.entries { + if entry.Query == query && entry.ResponseBody == nil { + // Complete the cache entry with the response + c.entries[i].ResponseBody = responseBody + c.entries[i].Timestamp = time.Now() + observability.Debugf("InMemoryCache.UpdateWithResponse: updated entry with response (response_size: %d bytes)", + len(responseBody)) + + // Record successful completion + metrics.RecordCacheOperation("memory", "update_response", "success", time.Since(start).Seconds()) + return nil + } + } + + // No matching pending request found + metrics.RecordCacheOperation("memory", "update_response", "error", time.Since(start).Seconds()) + return fmt.Errorf("no pending request found for query: %s", query) +} + +// AddEntry stores a complete request-response pair in the cache +func (c *InMemoryCache) AddEntry(model string, query string, requestBody, responseBody []byte) error { + start := time.Now() + + if !c.enabled { + return nil + } + + // Generate semantic embedding for the query + embedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension + if err != nil { + metrics.RecordCacheOperation("memory", "add_entry", "error", time.Since(start).Seconds()) + return fmt.Errorf("failed to generate embedding: %w", err) + } + + entry := CacheEntry{ + RequestBody: requestBody, + ResponseBody: responseBody, + Model: model, + Query: query, + Embedding: embedding, + Timestamp: time.Now(), + } + + c.mu.Lock() + defer c.mu.Unlock() + + // Clean up expired entries before adding new one + c.cleanupExpiredEntries() + + c.entries = append(c.entries, entry) + observability.Debugf("InMemoryCache.AddEntry: added complete entry (total entries: %d, request_size: %d, response_size: %d)", + len(c.entries), len(requestBody), len(responseBody)) + observability.LogEvent("cache_entry_added", map[string]interface{}{ + "backend": "memory", + "query": query, + "model": model, + }) + + // Apply entry limit if configured + if c.maxEntries > 0 && len(c.entries) > c.maxEntries { + // Sort by timestamp to identify oldest entries + sort.Slice(c.entries, func(i, j int) bool { + return c.entries[i].Timestamp.Before(c.entries[j].Timestamp) + }) + // Keep only the most recent entries + c.entries = c.entries[len(c.entries)-c.maxEntries:] + } + + // Record success metrics + metrics.RecordCacheOperation("memory", "add_entry", "success", time.Since(start).Seconds()) + metrics.UpdateCacheEntries("memory", len(c.entries)) + + return nil +} + +// FindSimilar searches for semantically similar cached requests +func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, error) { + start := 
time.Now() + + if !c.enabled { + observability.Debugf("InMemoryCache.FindSimilar: cache disabled") + return nil, false, nil + } + queryPreview := query + if len(query) > 50 { + queryPreview = query[:50] + "..." + } + observability.Debugf("InMemoryCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)", + model, queryPreview, len(query)) + + // Generate semantic embedding for similarity comparison + queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension + if err != nil { + metrics.RecordCacheOperation("memory", "find_similar", "error", time.Since(start).Seconds()) + return nil, false, fmt.Errorf("failed to generate embedding: %w", err) + } + + c.mu.RLock() + defer c.mu.RUnlock() + + // Check for expired entries during search + c.cleanupExpiredEntriesReadOnly() + + type SimilarityResult struct { + Entry CacheEntry + Similarity float32 + } + + // Compare with completed entries for the same model + results := make([]SimilarityResult, 0, len(c.entries)) + for _, entry := range c.entries { + if entry.ResponseBody == nil { + continue // Skip incomplete entries + } + + // Only consider entries for the same model + if entry.Model != model { + continue + } + + // Compute semantic similarity using dot product + var dotProduct float32 + for i := 0; i < len(queryEmbedding) && i < len(entry.Embedding); i++ { + dotProduct += queryEmbedding[i] * entry.Embedding[i] + } + + results = append(results, SimilarityResult{ + Entry: entry, + Similarity: dotProduct, + }) + } + + // Handle case where no suitable entries exist + if len(results) == 0 { + atomic.AddInt64(&c.missCount, 1) + observability.Debugf("InMemoryCache.FindSimilar: no entries found with responses (total entries: %d)", len(c.entries)) + metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil + } + + // Sort results by similarity score (highest first) + sort.Slice(results, func(i, j int) bool { + return results[i].Similarity > results[j].Similarity + }) + + // Check if the best match meets the similarity threshold + if results[0].Similarity >= c.similarityThreshold { + atomic.AddInt64(&c.hitCount, 1) + observability.Debugf("InMemoryCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", + results[0].Similarity, c.similarityThreshold, len(results[0].Entry.ResponseBody)) + observability.LogEvent("cache_hit", map[string]interface{}{ + "backend": "memory", + "similarity": results[0].Similarity, + "threshold": c.similarityThreshold, + "model": model, + }) + metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds()) + metrics.RecordCacheHit() + return results[0].Entry.ResponseBody, true, nil + } + + atomic.AddInt64(&c.missCount, 1) + observability.Debugf("InMemoryCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)", + results[0].Similarity, c.similarityThreshold, len(results)) + observability.LogEvent("cache_miss", map[string]interface{}{ + "backend": "memory", + "best_similarity": results[0].Similarity, + "threshold": c.similarityThreshold, + "model": model, + "entries_checked": len(results), + }) + metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil +} + +// Close releases all resources held by the cache +func (c *InMemoryCache) Close() error { + c.mu.Lock() + defer c.mu.Unlock() + + // Clear all entries to free memory + 
c.entries = nil + return nil +} + +// GetStats provides current cache performance metrics +func (c *InMemoryCache) GetStats() CacheStats { + c.mu.RLock() + defer c.mu.RUnlock() + + hits := atomic.LoadInt64(&c.hitCount) + misses := atomic.LoadInt64(&c.missCount) + total := hits + misses + + var hitRatio float64 + if total > 0 { + hitRatio = float64(hits) / float64(total) + } + + stats := CacheStats{ + TotalEntries: len(c.entries), + HitCount: hits, + MissCount: misses, + HitRatio: hitRatio, + } + + if c.lastCleanupTime != nil { + stats.LastCleanupTime = c.lastCleanupTime + } + + return stats +} + +// cleanupExpiredEntries removes entries that have exceeded their TTL +// Caller must hold a write lock +func (c *InMemoryCache) cleanupExpiredEntries() { + if c.ttlSeconds <= 0 { + return + } + + now := time.Now() + validEntries := make([]CacheEntry, 0, len(c.entries)) + + for _, entry := range c.entries { + // Retain entries that are still within their TTL + if now.Sub(entry.Timestamp).Seconds() < float64(c.ttlSeconds) { + validEntries = append(validEntries, entry) + } + } + + if len(validEntries) < len(c.entries) { + expiredCount := len(c.entries) - len(validEntries) + observability.Debugf("InMemoryCache: TTL cleanup removed %d expired entries (remaining: %d)", + expiredCount, len(validEntries)) + observability.LogEvent("cache_cleanup", map[string]interface{}{ + "backend": "memory", + "expired_count": expiredCount, + "remaining_count": len(validEntries), + "ttl_seconds": c.ttlSeconds, + }) + c.entries = validEntries + cleanupTime := time.Now() + c.lastCleanupTime = &cleanupTime + } +} + +// cleanupExpiredEntriesReadOnly identifies expired entries without modifying the cache +// Used during read operations with only a read lock held +func (c *InMemoryCache) cleanupExpiredEntriesReadOnly() { + if c.ttlSeconds <= 0 { + return + } + + now := time.Now() + expiredCount := 0 + + for _, entry := range c.entries { + if now.Sub(entry.Timestamp).Seconds() >= float64(c.ttlSeconds) { + expiredCount++ + } + } + + if expiredCount > 0 { + observability.Debugf("InMemoryCache: found %d expired entries during read (TTL: %ds)", + expiredCount, c.ttlSeconds) + observability.LogEvent("cache_expired_entries_found", map[string]interface{}{ + "backend": "memory", + "expired_count": expiredCount, + "ttl_seconds": c.ttlSeconds, + }) + } +} diff --git a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go new file mode 100644 index 00000000..a4edcde3 --- /dev/null +++ b/src/semantic-router/pkg/cache/milvus_cache.go @@ -0,0 +1,676 @@ +package cache + +import ( + "context" + "crypto/md5" + "fmt" + "os" + "sync" + "sync/atomic" + "time" + + "github.com/milvus-io/milvus-sdk-go/v2/client" + "github.com/milvus-io/milvus-sdk-go/v2/entity" + candle_binding "github.com/vllm-project/semantic-router/candle-binding" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/metrics" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability" + "gopkg.in/yaml.v3" +) + +// MilvusConfig defines the complete configuration structure for Milvus cache backend +type MilvusConfig struct { + Connection struct { + Host string `yaml:"host"` + Port int `yaml:"port"` + Database string `yaml:"database"` + Timeout int `yaml:"timeout"` + Auth struct { + Enabled bool `yaml:"enabled"` + Username string `yaml:"username"` + Password string `yaml:"password"` + } `yaml:"auth"` + TLS struct { + Enabled bool `yaml:"enabled"` + CertFile string `yaml:"cert_file"` + KeyFile string 
`yaml:"key_file"` + CAFile string `yaml:"ca_file"` + } `yaml:"tls"` + } `yaml:"connection"` + Collection struct { + Name string `yaml:"name"` + Description string `yaml:"description"` + VectorField struct { + Name string `yaml:"name"` + Dimension int `yaml:"dimension"` + MetricType string `yaml:"metric_type"` + } `yaml:"vector_field"` + Index struct { + Type string `yaml:"type"` + Params struct { + M int `yaml:"M"` + EfConstruction int `yaml:"efConstruction"` + } `yaml:"params"` + } `yaml:"index"` + } `yaml:"collection"` + Search struct { + Params struct { + Ef int `yaml:"ef"` + } `yaml:"params"` + TopK int `yaml:"topk"` + ConsistencyLevel string `yaml:"consistency_level"` + } `yaml:"search"` + Performance struct { + ConnectionPool struct { + MaxConnections int `yaml:"max_connections"` + MaxIdleConnections int `yaml:"max_idle_connections"` + AcquireTimeout int `yaml:"acquire_timeout"` + } `yaml:"connection_pool"` + Batch struct { + InsertBatchSize int `yaml:"insert_batch_size"` + Timeout int `yaml:"timeout"` + } `yaml:"batch"` + } `yaml:"performance"` + DataManagement struct { + TTL struct { + Enabled bool `yaml:"enabled"` + TimestampField string `yaml:"timestamp_field"` + CleanupInterval int `yaml:"cleanup_interval"` + } `yaml:"ttl"` + Compaction struct { + Enabled bool `yaml:"enabled"` + Interval int `yaml:"interval"` + } `yaml:"compaction"` + } `yaml:"data_management"` + Logging struct { + Level string `yaml:"level"` + EnableQueryLog bool `yaml:"enable_query_log"` + EnableMetrics bool `yaml:"enable_metrics"` + } `yaml:"logging"` + Development struct { + DropCollectionOnStartup bool `yaml:"drop_collection_on_startup"` + AutoCreateCollection bool `yaml:"auto_create_collection"` + VerboseErrors bool `yaml:"verbose_errors"` + } `yaml:"development"` +} + +// MilvusCache provides a scalable semantic cache implementation using Milvus vector database +type MilvusCache struct { + client client.Client + config *MilvusConfig + collectionName string + similarityThreshold float32 + ttlSeconds int + enabled bool + hitCount int64 + missCount int64 + lastCleanupTime *time.Time + mu sync.RWMutex +} + +// MilvusCacheOptions contains configuration parameters for Milvus cache initialization +type MilvusCacheOptions struct { + SimilarityThreshold float32 + TTLSeconds int + Enabled bool + ConfigPath string +} + +// NewMilvusCache initializes a new Milvus-backed semantic cache instance +func NewMilvusCache(options MilvusCacheOptions) (*MilvusCache, error) { + if !options.Enabled { + observability.Debugf("MilvusCache: disabled, returning stub") + return &MilvusCache{ + enabled: false, + }, nil + } + + // Load Milvus configuration from file + observability.Debugf("MilvusCache: loading config from %s", options.ConfigPath) + config, err := loadMilvusConfig(options.ConfigPath) + if err != nil { + observability.Debugf("MilvusCache: failed to load config: %v", err) + return nil, fmt.Errorf("failed to load Milvus config: %w", err) + } + observability.Debugf("MilvusCache: config loaded - host=%s:%d, collection=%s, dimension=auto-detect", + config.Connection.Host, config.Connection.Port, config.Collection.Name) + + // Establish connection to Milvus server + connectionString := fmt.Sprintf("%s:%d", config.Connection.Host, config.Connection.Port) + observability.Debugf("MilvusCache: connecting to Milvus at %s", connectionString) + milvusClient, err := client.NewGrpcClient(context.Background(), connectionString) + if err != nil { + observability.Debugf("MilvusCache: failed to connect: %v", err) + return nil, 
fmt.Errorf("failed to create Milvus client: %w", err) + } + observability.Debugf("MilvusCache: successfully connected to Milvus") + + cache := &MilvusCache{ + client: milvusClient, + config: config, + collectionName: config.Collection.Name, + similarityThreshold: options.SimilarityThreshold, + ttlSeconds: options.TTLSeconds, + enabled: options.Enabled, + } + + // Set up the collection for caching + observability.Debugf("MilvusCache: initializing collection '%s'", config.Collection.Name) + if err := cache.initializeCollection(); err != nil { + observability.Debugf("MilvusCache: failed to initialize collection: %v", err) + milvusClient.Close() + return nil, fmt.Errorf("failed to initialize collection: %w", err) + } + observability.Debugf("MilvusCache: initialization complete") + + return cache, nil +} + +// loadMilvusConfig reads and parses the Milvus configuration from file +func loadMilvusConfig(configPath string) (*MilvusConfig, error) { + if configPath == "" { + return nil, fmt.Errorf("Milvus config path is required") + } + + data, err := os.ReadFile(configPath) + if err != nil { + return nil, fmt.Errorf("failed to read config file: %w", err) + } + + var config MilvusConfig + if err := yaml.Unmarshal(data, &config); err != nil { + return nil, fmt.Errorf("failed to parse config file: %w", err) + } + + return &config, nil +} + +// initializeCollection sets up the Milvus collection and index structures +func (c *MilvusCache) initializeCollection() error { + ctx := context.Background() + + // Verify collection existence + hasCollection, err := c.client.HasCollection(ctx, c.collectionName) + if err != nil { + return fmt.Errorf("failed to check collection existence: %w", err) + } + + // Handle development mode collection reset + if c.config.Development.DropCollectionOnStartup && hasCollection { + if err := c.client.DropCollection(ctx, c.collectionName); err != nil { + observability.Debugf("MilvusCache: failed to drop collection: %v", err) + return fmt.Errorf("failed to drop collection: %w", err) + } + hasCollection = false + observability.Debugf("MilvusCache: dropped existing collection '%s' for development", c.collectionName) + observability.LogEvent("collection_dropped", map[string]interface{}{ + "backend": "milvus", + "collection": c.collectionName, + "reason": "development_mode", + }) + } + + // Create collection if it doesn't exist + if !hasCollection { + if !c.config.Development.AutoCreateCollection { + return fmt.Errorf("collection %s does not exist and auto-creation is disabled", c.collectionName) + } + + if err := c.createCollection(); err != nil { + observability.Debugf("MilvusCache: failed to create collection: %v", err) + return fmt.Errorf("failed to create collection: %w", err) + } + observability.Debugf("MilvusCache: created new collection '%s' with dimension %d", + c.collectionName, c.config.Collection.VectorField.Dimension) + observability.LogEvent("collection_created", map[string]interface{}{ + "backend": "milvus", + "collection": c.collectionName, + "dimension": c.config.Collection.VectorField.Dimension, + }) + } + + // Load collection into memory for queries + observability.Debugf("MilvusCache: loading collection '%s' into memory", c.collectionName) + if err := c.client.LoadCollection(ctx, c.collectionName, false); err != nil { + observability.Debugf("MilvusCache: failed to load collection: %v", err) + return fmt.Errorf("failed to load collection: %w", err) + } + observability.Debugf("MilvusCache: collection loaded successfully") + + return nil +} + +// createCollection 
builds the Milvus collection with the appropriate schema +func (c *MilvusCache) createCollection() error { + ctx := context.Background() + + // Determine embedding dimension automatically + testEmbedding, err := candle_binding.GetEmbedding("test", 0) // Auto-detect + if err != nil { + return fmt.Errorf("failed to detect embedding dimension: %w", err) + } + actualDimension := len(testEmbedding) + + observability.Debugf("MilvusCache.createCollection: auto-detected embedding dimension: %d", actualDimension) + + // Define schema with auto-detected dimension + schema := &entity.Schema{ + CollectionName: c.collectionName, + Description: c.config.Collection.Description, + Fields: []*entity.Field{ + { + Name: "id", + DataType: entity.FieldTypeVarChar, + PrimaryKey: true, + TypeParams: map[string]string{"max_length": "64"}, + }, + { + Name: "model", + DataType: entity.FieldTypeVarChar, + TypeParams: map[string]string{"max_length": "256"}, + }, + { + Name: "query", + DataType: entity.FieldTypeVarChar, + TypeParams: map[string]string{"max_length": "65535"}, + }, + { + Name: "request_body", + DataType: entity.FieldTypeVarChar, + TypeParams: map[string]string{"max_length": "65535"}, + }, + { + Name: "response_body", + DataType: entity.FieldTypeVarChar, + TypeParams: map[string]string{"max_length": "65535"}, + }, + { + Name: c.config.Collection.VectorField.Name, + DataType: entity.FieldTypeFloatVector, + TypeParams: map[string]string{ + "dim": fmt.Sprintf("%d", actualDimension), // Use auto-detected dimension + }, + }, + { + Name: "timestamp", + DataType: entity.FieldTypeInt64, + }, + }, + } + + // Create collection + if err := c.client.CreateCollection(ctx, schema, 1); err != nil { + return err + } + + // Create index + indexParams := map[string]string{ + "index_type": c.config.Collection.Index.Type, + "metric_type": c.config.Collection.VectorField.MetricType, + "params": fmt.Sprintf(`{"M": %d, "efConstruction": %d}`, + c.config.Collection.Index.Params.M, + c.config.Collection.Index.Params.EfConstruction), + } + + observability.Debugf("MilvusCache.createCollection: creating index for %d-dimensional vectors", actualDimension) + + // Create index with updated API + index := entity.NewGenericIndex(c.config.Collection.VectorField.Name, entity.IndexType(c.config.Collection.Index.Type), indexParams) + if err := c.client.CreateIndex(ctx, c.collectionName, c.config.Collection.VectorField.Name, index, false); err != nil { + return err + } + + return nil +} + +// IsEnabled returns the current cache activation status +func (c *MilvusCache) IsEnabled() bool { + return c.enabled +} + +// AddPendingRequest stores a request that is awaiting its response +func (c *MilvusCache) AddPendingRequest(model string, query string, requestBody []byte) (string, error) { + start := time.Now() + + if !c.enabled { + return query, nil + } + + // Store incomplete entry for later completion with response + result, err := c.addEntry(model, query, requestBody, nil) + + if err != nil { + metrics.RecordCacheOperation("milvus", "add_pending", "error", time.Since(start).Seconds()) + } else { + metrics.RecordCacheOperation("milvus", "add_pending", "success", time.Since(start).Seconds()) + } + + return result, err +} + +// UpdateWithResponse completes a pending request by adding the response +func (c *MilvusCache) UpdateWithResponse(query string, responseBody []byte) error { + start := time.Now() + + if !c.enabled { + return nil + } + + queryPreview := query + if len(query) > 50 { + queryPreview = query[:50] + "..." 
+ } + + observability.Debugf("MilvusCache.UpdateWithResponse: updating pending entry (query: %s, response_size: %d)", + queryPreview, len(responseBody)) + + // Find the pending entry and complete it with the response + // Query for the incomplete entry to retrieve its metadata + ctx := context.Background() + queryExpr := fmt.Sprintf("query == \"%s\" && response_body == \"\"", query) + + observability.Debugf("MilvusCache.UpdateWithResponse: searching for pending entry with expr: %s", queryExpr) + + results, err := c.client.Query(ctx, c.collectionName, []string{}, queryExpr, + []string{"model", "request_body"}) + + if err != nil { + observability.Debugf("MilvusCache.UpdateWithResponse: query failed: %v", err) + metrics.RecordCacheOperation("milvus", "update_response", "error", time.Since(start).Seconds()) + return fmt.Errorf("failed to query pending entry: %w", err) + } + + if len(results) == 0 { + observability.Debugf("MilvusCache.UpdateWithResponse: no pending entry found, adding as new complete entry") + // Create new complete entry when no pending entry exists + _, err := c.addEntry("unknown", query, []byte(""), responseBody) + if err != nil { + metrics.RecordCacheOperation("milvus", "update_response", "error", time.Since(start).Seconds()) + } else { + metrics.RecordCacheOperation("milvus", "update_response", "success", time.Since(start).Seconds()) + } + return err + } + + // Get the model and request body from the pending entry + modelColumn := results[0].(*entity.ColumnVarChar) + requestColumn := results[1].(*entity.ColumnVarChar) + + if modelColumn.Len() > 0 { + model := modelColumn.Data()[0] + requestBody := requestColumn.Data()[0] + + observability.Debugf("MilvusCache.UpdateWithResponse: found pending entry, adding complete entry (model: %s)", model) + + // Create the complete entry with response data + _, err := c.addEntry(model, query, []byte(requestBody), responseBody) + if err != nil { + metrics.RecordCacheOperation("milvus", "update_response", "error", time.Since(start).Seconds()) + return fmt.Errorf("failed to add complete entry: %w", err) + } + + observability.Debugf("MilvusCache.UpdateWithResponse: successfully added complete entry with response") + metrics.RecordCacheOperation("milvus", "update_response", "success", time.Since(start).Seconds()) + } + + return nil +} + +// AddEntry stores a complete request-response pair in the cache +func (c *MilvusCache) AddEntry(model string, query string, requestBody, responseBody []byte) error { + start := time.Now() + + if !c.enabled { + return nil + } + + _, err := c.addEntry(model, query, requestBody, responseBody) + + if err != nil { + metrics.RecordCacheOperation("milvus", "add_entry", "error", time.Since(start).Seconds()) + } else { + metrics.RecordCacheOperation("milvus", "add_entry", "success", time.Since(start).Seconds()) + } + + return err +} + +// addEntry handles the internal logic for storing entries in Milvus +func (c *MilvusCache) addEntry(model string, query string, requestBody, responseBody []byte) (string, error) { + // Generate semantic embedding for the query + embedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension + if err != nil { + return "", fmt.Errorf("failed to generate embedding: %w", err) + } + + // Generate unique ID + id := fmt.Sprintf("%x", md5.Sum([]byte(fmt.Sprintf("%s_%s_%d", model, query, time.Now().UnixNano())))) + + ctx := context.Background() + + // Prepare data for insertion + ids := []string{id} + models := []string{model} + queries := []string{query} + requestBodies := 
[]string{string(requestBody)} + responseBodies := []string{string(responseBody)} + embeddings := [][]float32{embedding} + timestamps := []int64{time.Now().Unix()} + + // Create columns + idColumn := entity.NewColumnVarChar("id", ids) + modelColumn := entity.NewColumnVarChar("model", models) + queryColumn := entity.NewColumnVarChar("query", queries) + requestColumn := entity.NewColumnVarChar("request_body", requestBodies) + responseColumn := entity.NewColumnVarChar("response_body", responseBodies) + embeddingColumn := entity.NewColumnFloatVector(c.config.Collection.VectorField.Name, len(embedding), embeddings) + timestampColumn := entity.NewColumnInt64("timestamp", timestamps) + + // Insert the entry into the collection + observability.Debugf("MilvusCache.addEntry: inserting entry into collection '%s' (embedding_dim: %d, request_size: %d, response_size: %d)", + c.collectionName, len(embedding), len(requestBody), len(responseBody)) + _, err = c.client.Insert(ctx, c.collectionName, "", idColumn, modelColumn, queryColumn, requestColumn, responseColumn, embeddingColumn, timestampColumn) + if err != nil { + observability.Debugf("MilvusCache.addEntry: insert failed: %v", err) + return "", fmt.Errorf("failed to insert cache entry: %w", err) + } + + // Ensure data is persisted to storage + if err := c.client.Flush(ctx, c.collectionName, false); err != nil { + observability.Warnf("Failed to flush cache entry: %v", err) + } + + observability.Debugf("MilvusCache.addEntry: successfully added entry to Milvus") + observability.LogEvent("cache_entry_added", map[string]interface{}{ + "backend": "milvus", + "collection": c.collectionName, + "query": query, + "model": model, + "embedding_dimension": len(embedding), + }) + return query, nil +} + +// FindSimilar searches for semantically similar cached requests +func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, error) { + start := time.Now() + + if !c.enabled { + observability.Debugf("MilvusCache.FindSimilar: cache disabled") + return nil, false, nil + } + queryPreview := query + if len(query) > 50 { + queryPreview = query[:50] + "..." 
+ } + observability.Debugf("MilvusCache.FindSimilar: searching for model='%s', query='%s' (len=%d chars)", + model, queryPreview, len(query)) + + // Generate semantic embedding for similarity comparison + queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension + if err != nil { + metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) + return nil, false, fmt.Errorf("failed to generate embedding: %w", err) + } + + ctx := context.Background() + + // Query for completed entries with the same model + // Using Query approach for comprehensive similarity search + queryExpr := fmt.Sprintf("model == \"%s\" && response_body != \"\"", model) + observability.Debugf("MilvusCache.FindSimilar: querying with expr: %s (embedding_dim: %d)", + queryExpr, len(queryEmbedding)) + + // Use Query to get all matching entries, then compute similarity manually + results, err := c.client.Query(ctx, c.collectionName, []string{}, queryExpr, + []string{"query", "response_body", c.config.Collection.VectorField.Name}) + + if err != nil { + observability.Debugf("MilvusCache.FindSimilar: query failed: %v", err) + atomic.AddInt64(&c.missCount, 1) + metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil + } + + if len(results) == 0 { + atomic.AddInt64(&c.missCount, 1) + observability.Debugf("MilvusCache.FindSimilar: no entries found with responses") + metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil + } + + // Calculate semantic similarity for each candidate + bestSimilarity := float32(-1.0) + var bestResponse string + + // Find columns by type instead of assuming order + var queryColumn *entity.ColumnVarChar + var responseColumn *entity.ColumnVarChar + var embeddingColumn *entity.ColumnFloatVector + + for _, col := range results { + switch typedCol := col.(type) { + case *entity.ColumnVarChar: + if typedCol.Name() == "query" { + queryColumn = typedCol + } else if typedCol.Name() == "response_body" { + responseColumn = typedCol + } + case *entity.ColumnFloatVector: + if typedCol.Name() == c.config.Collection.VectorField.Name { + embeddingColumn = typedCol + } + } + } + + if queryColumn == nil || responseColumn == nil || embeddingColumn == nil { + observability.Debugf("MilvusCache.FindSimilar: missing required columns in results") + atomic.AddInt64(&c.missCount, 1) + metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil + } + + for i := 0; i < queryColumn.Len(); i++ { + storedEmbedding := embeddingColumn.Data()[i] + + // Calculate dot product similarity score + var similarity float32 + for j := 0; j < len(queryEmbedding) && j < len(storedEmbedding); j++ { + similarity += queryEmbedding[j] * storedEmbedding[j] + } + + if similarity > bestSimilarity { + bestSimilarity = similarity + bestResponse = responseColumn.Data()[i] + } + } + + observability.Debugf("MilvusCache.FindSimilar: best similarity=%.4f, threshold=%.4f (checked %d entries)", + bestSimilarity, c.similarityThreshold, queryColumn.Len()) + + if bestSimilarity >= c.similarityThreshold { + atomic.AddInt64(&c.hitCount, 1) + observability.Debugf("MilvusCache.FindSimilar: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes", + bestSimilarity, c.similarityThreshold, len(bestResponse)) + 
observability.LogEvent("cache_hit", map[string]interface{}{ + "backend": "milvus", + "similarity": bestSimilarity, + "threshold": c.similarityThreshold, + "model": model, + "collection": c.collectionName, + }) + metrics.RecordCacheOperation("milvus", "find_similar", "hit", time.Since(start).Seconds()) + metrics.RecordCacheHit() + return []byte(bestResponse), true, nil + } + + atomic.AddInt64(&c.missCount, 1) + observability.Debugf("MilvusCache.FindSimilar: CACHE MISS - best_similarity=%.4f < threshold=%.4f", + bestSimilarity, c.similarityThreshold) + observability.LogEvent("cache_miss", map[string]interface{}{ + "backend": "milvus", + "best_similarity": bestSimilarity, + "threshold": c.similarityThreshold, + "model": model, + "collection": c.collectionName, + "entries_checked": queryColumn.Len(), + }) + metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds()) + metrics.RecordCacheMiss() + return nil, false, nil +} + +// Close releases all resources held by the cache +func (c *MilvusCache) Close() error { + if c.client != nil { + return c.client.Close() + } + return nil +} + +// GetStats provides current cache performance metrics +func (c *MilvusCache) GetStats() CacheStats { + c.mu.RLock() + defer c.mu.RUnlock() + + hits := atomic.LoadInt64(&c.hitCount) + misses := atomic.LoadInt64(&c.missCount) + total := hits + misses + + var hitRatio float64 + if total > 0 { + hitRatio = float64(hits) / float64(total) + } + + // Retrieve collection statistics from Milvus + totalEntries := 0 + if c.enabled && c.client != nil { + ctx := context.Background() + stats, err := c.client.GetCollectionStatistics(ctx, c.collectionName) + if err == nil { + // Extract entity count from statistics + if entityCount, ok := stats["row_count"]; ok { + fmt.Sscanf(entityCount, "%d", &totalEntries) + observability.Debugf("MilvusCache.GetStats: collection '%s' contains %d entries", + c.collectionName, totalEntries) + } + } else { + observability.Debugf("MilvusCache.GetStats: failed to get collection stats: %v", err) + } + } + + cacheStats := CacheStats{ + TotalEntries: totalEntries, + HitCount: hits, + MissCount: misses, + HitRatio: hitRatio, + } + + if c.lastCleanupTime != nil { + cacheStats.LastCleanupTime = c.lastCleanupTime + } + + return cacheStats +} diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index fe1318e1..7a1441f3 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -49,7 +49,26 @@ type RouterConfig struct { ReasoningFamilies map[string]ReasoningFamilyConfig `yaml:"reasoning_families,omitempty"` // Semantic cache configuration - SemanticCache SemanticCacheConfig `yaml:"semantic_cache"` + SemanticCache struct { + // Type of cache backend to use + BackendType string `yaml:"backend_type,omitempty"` + + // Enable semantic caching + Enabled bool `yaml:"enabled"` + + // Similarity threshold for cache hits (0.0-1.0) + // If not specified, will use the BertModel.Threshold + SimilarityThreshold *float32 `yaml:"similarity_threshold,omitempty"` + + // Maximum number of cache entries to keep (applies to in-memory cache) + MaxEntries int `yaml:"max_entries,omitempty"` + + // Time-to-live for cache entries in seconds (0 means no expiration) + TTLSeconds int `yaml:"ttl_seconds,omitempty"` + + // Path to backend-specific configuration file + BackendConfigPath string `yaml:"backend_config_path,omitempty"` + } `yaml:"semantic_cache"` // Prompt guard configuration PromptGuard PromptGuardConfig 
`yaml:"prompt_guard"` @@ -67,22 +86,6 @@ type RouterConfig struct { API APIConfig `yaml:"api"` } -// SemanticCacheConfig represents configuration for the semantic cache -type SemanticCacheConfig struct { - // Enable semantic caching - Enabled bool `yaml:"enabled"` - - // Similarity threshold for cache hits (0.0-1.0) - // If not specified, will use the BertModel.Threshold - SimilarityThreshold *float32 `yaml:"similarity_threshold,omitempty"` - - // Maximum number of cache entries to keep - MaxEntries int `yaml:"max_entries,omitempty"` - - // Time-to-live for cache entries in seconds (0 means no expiration) - TTLSeconds int `yaml:"ttl_seconds,omitempty"` -} - // APIConfig represents configuration for API endpoints type APIConfig struct { // Batch classification configuration diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 464f0cc7..cc83f8c1 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -146,12 +146,16 @@ tools: // Verify default model Expect(cfg.DefaultModel).To(Equal("model-b")) - // Verify semantic cache + // Verify semantic cache (legacy fields) Expect(cfg.SemanticCache.Enabled).To(BeTrue()) Expect(*cfg.SemanticCache.SimilarityThreshold).To(Equal(float32(0.9))) Expect(cfg.SemanticCache.MaxEntries).To(Equal(1000)) Expect(cfg.SemanticCache.TTLSeconds).To(Equal(3600)) + // New fields should have default/zero values when not specified + Expect(cfg.SemanticCache.BackendType).To(BeEmpty()) + Expect(cfg.SemanticCache.BackendConfigPath).To(BeEmpty()) + // Verify prompt guard Expect(cfg.PromptGuard.Enabled).To(BeTrue()) Expect(cfg.PromptGuard.ModelID).To(Equal("test-jailbreak-model")) @@ -968,6 +972,311 @@ default_model: "missing-default-model" }) }) + Describe("Semantic Cache Backend Configuration", func() { + Context("with memory backend configuration", func() { + BeforeEach(func() { + configContent := ` +semantic_cache: + enabled: true + backend_type: "memory" + similarity_threshold: 0.85 + max_entries: 2000 + ttl_seconds: 1800 +` + err := os.WriteFile(configFile, []byte(configContent), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should parse memory backend configuration correctly", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + Expect(cfg.SemanticCache.Enabled).To(BeTrue()) + Expect(cfg.SemanticCache.BackendType).To(Equal("memory")) + Expect(*cfg.SemanticCache.SimilarityThreshold).To(Equal(float32(0.85))) + Expect(cfg.SemanticCache.MaxEntries).To(Equal(2000)) + Expect(cfg.SemanticCache.TTLSeconds).To(Equal(1800)) + Expect(cfg.SemanticCache.BackendConfigPath).To(BeEmpty()) + }) + }) + + Context("with milvus backend configuration", func() { + BeforeEach(func() { + configContent := ` +semantic_cache: + enabled: true + backend_type: "milvus" + similarity_threshold: 0.9 + ttl_seconds: 7200 + backend_config_path: "config/cache/milvus.yaml" +` + err := os.WriteFile(configFile, []byte(configContent), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should parse milvus backend configuration correctly", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + Expect(cfg.SemanticCache.Enabled).To(BeTrue()) + Expect(cfg.SemanticCache.BackendType).To(Equal("milvus")) + Expect(*cfg.SemanticCache.SimilarityThreshold).To(Equal(float32(0.9))) + Expect(cfg.SemanticCache.TTLSeconds).To(Equal(7200)) + Expect(cfg.SemanticCache.BackendConfigPath).To(Equal("config/cache/milvus.yaml")) + 
+ // MaxEntries should be ignored for Milvus backend + Expect(cfg.SemanticCache.MaxEntries).To(Equal(0)) + }) + }) + + Context("with disabled cache", func() { + BeforeEach(func() { + configContent := ` +semantic_cache: + enabled: false + backend_type: "memory" + similarity_threshold: 0.8 + max_entries: 1000 + ttl_seconds: 3600 +` + err := os.WriteFile(configFile, []byte(configContent), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should preserve configuration even when cache is disabled", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + Expect(cfg.SemanticCache.Enabled).To(BeFalse()) + Expect(cfg.SemanticCache.BackendType).To(Equal("memory")) + Expect(*cfg.SemanticCache.SimilarityThreshold).To(Equal(float32(0.8))) + }) + }) + + Context("with minimal configuration", func() { + BeforeEach(func() { + configContent := ` +semantic_cache: + enabled: true +` + err := os.WriteFile(configFile, []byte(configContent), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should handle minimal configuration with default values", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + Expect(cfg.SemanticCache.Enabled).To(BeTrue()) + Expect(cfg.SemanticCache.BackendType).To(BeEmpty()) // Should default to empty (memory) + Expect(cfg.SemanticCache.SimilarityThreshold).To(BeNil()) // Will fallback to BERT threshold + Expect(cfg.SemanticCache.MaxEntries).To(Equal(0)) + Expect(cfg.SemanticCache.TTLSeconds).To(Equal(0)) + Expect(cfg.SemanticCache.BackendConfigPath).To(BeEmpty()) + }) + }) + + Context("with comprehensive configuration", func() { + BeforeEach(func() { + configContent := ` +bert_model: + threshold: 0.7 + +semantic_cache: + enabled: true + backend_type: "milvus" + similarity_threshold: 0.95 + ttl_seconds: 14400 + backend_config_path: "config/cache/production_milvus.yaml" +` + err := os.WriteFile(configFile, []byte(configContent), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should parse all semantic cache fields correctly", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + Expect(cfg.SemanticCache.Enabled).To(BeTrue()) + Expect(cfg.SemanticCache.BackendType).To(Equal("milvus")) + Expect(*cfg.SemanticCache.SimilarityThreshold).To(Equal(float32(0.95))) + Expect(cfg.SemanticCache.TTLSeconds).To(Equal(14400)) + Expect(cfg.SemanticCache.BackendConfigPath).To(Equal("config/cache/production_milvus.yaml")) + + // Verify threshold resolution + threshold := cfg.GetCacheSimilarityThreshold() + Expect(threshold).To(Equal(float32(0.95))) // Should use cache threshold, not BERT + }) + }) + + Context("threshold fallback behavior", func() { + BeforeEach(func() { + configContent := ` +bert_model: + threshold: 0.75 + +semantic_cache: + enabled: true + backend_type: "memory" + max_entries: 500 + # No similarity_threshold specified +` + err := os.WriteFile(configFile, []byte(configContent), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should fall back to BERT threshold when cache threshold not specified", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + Expect(cfg.SemanticCache.SimilarityThreshold).To(BeNil()) + + // GetCacheSimilarityThreshold should return BERT threshold + threshold := cfg.GetCacheSimilarityThreshold() + Expect(threshold).To(Equal(float32(0.75))) + }) + }) + + Context("with edge case values", func() { + BeforeEach(func() { + configContent := ` +semantic_cache: + enabled: true + backend_type: 
"memory" + similarity_threshold: 1.0 + max_entries: 0 + ttl_seconds: -1 + backend_config_path: "" +` + err := os.WriteFile(configFile, []byte(configContent), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should handle edge case values correctly", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + Expect(cfg.SemanticCache.Enabled).To(BeTrue()) + Expect(cfg.SemanticCache.BackendType).To(Equal("memory")) + Expect(*cfg.SemanticCache.SimilarityThreshold).To(Equal(float32(1.0))) + Expect(cfg.SemanticCache.MaxEntries).To(Equal(0)) + Expect(cfg.SemanticCache.TTLSeconds).To(Equal(-1)) + Expect(cfg.SemanticCache.BackendConfigPath).To(BeEmpty()) + }) + }) + + Context("with unsupported backend type", func() { + BeforeEach(func() { + configContent := ` +semantic_cache: + enabled: true + backend_type: "redis" + similarity_threshold: 0.8 +` + err := os.WriteFile(configFile, []byte(configContent), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should parse unsupported backend type without error (validation happens at runtime)", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + // Configuration parsing should succeed + Expect(cfg.SemanticCache.Enabled).To(BeTrue()) + Expect(cfg.SemanticCache.BackendType).To(Equal("redis")) + + // Runtime validation will catch unsupported backend types + }) + }) + + Context("with production-like configuration", func() { + BeforeEach(func() { + configContent := ` +bert_model: + model_id: sentence-transformers/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: false + +semantic_cache: + enabled: true + backend_type: "milvus" + similarity_threshold: 0.85 + ttl_seconds: 86400 # 24 hours + backend_config_path: "config/cache/milvus.yaml" + +categories: + - name: "production" + description: "Production workload" + model_scores: + - model: "gpt-4" + score: 0.95 + +default_model: "gpt-4" +` + err := os.WriteFile(configFile, []byte(configContent), 0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should handle production-like configuration correctly", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + // Verify BERT config + Expect(cfg.BertModel.ModelID).To(Equal("sentence-transformers/all-MiniLM-L12-v2")) + Expect(cfg.BertModel.Threshold).To(Equal(float32(0.6))) + Expect(cfg.BertModel.UseCPU).To(BeFalse()) + + // Verify semantic cache config + Expect(cfg.SemanticCache.Enabled).To(BeTrue()) + Expect(cfg.SemanticCache.BackendType).To(Equal("milvus")) + Expect(*cfg.SemanticCache.SimilarityThreshold).To(Equal(float32(0.85))) + Expect(cfg.SemanticCache.TTLSeconds).To(Equal(86400)) + Expect(cfg.SemanticCache.BackendConfigPath).To(Equal("config/cache/milvus.yaml")) + + // Verify threshold resolution + threshold := cfg.GetCacheSimilarityThreshold() + Expect(threshold).To(Equal(float32(0.85))) // Should use cache threshold + + // Verify other config is still working + Expect(cfg.DefaultModel).To(Equal("gpt-4")) + Expect(cfg.Categories).To(HaveLen(1)) + }) + }) + + Context("with multiple backend configurations in comments", func() { + BeforeEach(func() { + configContent := ` +semantic_cache: + # Development configuration + enabled: true + backend_type: "memory" + similarity_threshold: 0.8 + max_entries: 1000 + ttl_seconds: 3600 + + # Production configuration (commented out) + # backend_type: "milvus" + # backend_config_path: "config/cache/milvus.yaml" + # max_entries is ignored for Milvus +` + err := os.WriteFile(configFile, []byte(configContent), 
0o644) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should parse active configuration and ignore commented alternatives", func() { + cfg, err := config.LoadConfig(configFile) + Expect(err).NotTo(HaveOccurred()) + + Expect(cfg.SemanticCache.Enabled).To(BeTrue()) + Expect(cfg.SemanticCache.BackendType).To(Equal("memory")) + Expect(*cfg.SemanticCache.SimilarityThreshold).To(Equal(float32(0.8))) + Expect(cfg.SemanticCache.MaxEntries).To(Equal(1000)) + Expect(cfg.SemanticCache.TTLSeconds).To(Equal(3600)) + Expect(cfg.SemanticCache.BackendConfigPath).To(BeEmpty()) // Comments are ignored + }) + }) + }) + Describe("PII Constants", func() { It("should have all expected PII type constants defined", func() { expectedPIITypes := []string{ diff --git a/src/semantic-router/pkg/extproc/caching_test.go b/src/semantic-router/pkg/extproc/caching_test.go index b0f3f6bd..2be6da39 100644 --- a/src/semantic-router/pkg/extproc/caching_test.go +++ b/src/semantic-router/pkg/extproc/caching_test.go @@ -29,13 +29,16 @@ var _ = Describe("Caching Functionality", func() { Expect(err).NotTo(HaveOccurred()) // Override cache with enabled configuration - cacheOptions := cache.SemanticCacheOptions{ + cacheConfig := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, Enabled: true, SimilarityThreshold: 0.9, MaxEntries: 100, TTLSeconds: 3600, } - router.Cache = cache.NewSemanticCache(cacheOptions) + cacheBackend, err := cache.NewCacheBackend(cacheConfig) + Expect(err).NotTo(HaveOccurred()) + router.Cache = cacheBackend }) It("should handle cache miss scenario", func() { @@ -207,13 +210,16 @@ var _ = Describe("Caching Functionality", func() { Context("with cache disabled", func() { BeforeEach(func() { cfg.SemanticCache.Enabled = false - cacheOptions := cache.SemanticCacheOptions{ + cacheConfig := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, Enabled: false, SimilarityThreshold: 0.9, MaxEntries: 100, TTLSeconds: 3600, } - router.Cache = cache.NewSemanticCache(cacheOptions) + cacheBackend, err := cache.NewCacheBackend(cacheConfig) + Expect(err).NotTo(HaveOccurred()) + router.Cache = cacheBackend }) It("should process requests normally without caching", func() { diff --git a/src/semantic-router/pkg/extproc/router.go b/src/semantic-router/pkg/extproc/router.go index d18b7d10..b3e3cb6b 100644 --- a/src/semantic-router/pkg/extproc/router.go +++ b/src/semantic-router/pkg/extproc/router.go @@ -27,7 +27,7 @@ type OpenAIRouter struct { CategoryDescriptions []string Classifier *classification.Classifier PIIChecker *pii.PolicyChecker - Cache *cache.SemanticCache + Cache cache.CacheBackend ToolsDatabase *tools.ToolsDatabase // Map to track pending requests and their unique IDs @@ -92,17 +92,31 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) { log.Printf("Category descriptions: %v", categoryDescriptions) // Create semantic cache with config options - cacheOptions := cache.SemanticCacheOptions{ + cacheConfig := cache.CacheConfig{ + BackendType: cache.CacheBackendType(cfg.SemanticCache.BackendType), + Enabled: cfg.SemanticCache.Enabled, SimilarityThreshold: cfg.GetCacheSimilarityThreshold(), MaxEntries: cfg.SemanticCache.MaxEntries, TTLSeconds: cfg.SemanticCache.TTLSeconds, - Enabled: cfg.SemanticCache.Enabled, + BackendConfigPath: cfg.SemanticCache.BackendConfigPath, + } + + // Use default backend type if not specified + if cacheConfig.BackendType == "" { + cacheConfig.BackendType = cache.InMemoryCacheType + } + + semanticCache, err := cache.NewCacheBackend(cacheConfig) + if err != nil { + return 
nil, fmt.Errorf("failed to create semantic cache: %w", err) } - semanticCache := cache.NewSemanticCache(cacheOptions) if semanticCache.IsEnabled() { - log.Printf("Semantic cache enabled with threshold: %.4f, max entries: %d, TTL: %d seconds", - cacheOptions.SimilarityThreshold, cacheOptions.MaxEntries, cacheOptions.TTLSeconds) + log.Printf("Semantic cache enabled (backend: %s) with threshold: %.4f, TTL: %d seconds", + cacheConfig.BackendType, cacheConfig.SimilarityThreshold, cacheConfig.TTLSeconds) + if cacheConfig.BackendType == cache.InMemoryCacheType { + log.Printf("In-memory cache max entries: %d", cacheConfig.MaxEntries) + } } else { log.Println("Semantic cache is disabled") } diff --git a/src/semantic-router/pkg/extproc/test_utils_test.go b/src/semantic-router/pkg/extproc/test_utils_test.go index 5ca4ca3b..218b10d5 100644 --- a/src/semantic-router/pkg/extproc/test_utils_test.go +++ b/src/semantic-router/pkg/extproc/test_utils_test.go @@ -130,7 +130,15 @@ func CreateTestConfig() *config.RouterConfig { }, }, DefaultModel: "model-b", - SemanticCache: config.SemanticCacheConfig{ + SemanticCache: struct { + BackendType string `yaml:"backend_type,omitempty"` + Enabled bool `yaml:"enabled"` + SimilarityThreshold *float32 `yaml:"similarity_threshold,omitempty"` + MaxEntries int `yaml:"max_entries,omitempty"` + TTLSeconds int `yaml:"ttl_seconds,omitempty"` + BackendConfigPath string `yaml:"backend_config_path,omitempty"` + }{ + BackendType: "memory", Enabled: false, // Disable for most tests SimilarityThreshold: &[]float32{0.9}[0], MaxEntries: 100, @@ -202,13 +210,17 @@ func CreateTestRouter(cfg *config.RouterConfig) (*extproc.OpenAIRouter, error) { } // Create semantic cache - cacheOptions := cache.SemanticCacheOptions{ + cacheConfig := cache.CacheConfig{ + BackendType: cache.InMemoryCacheType, + Enabled: cfg.SemanticCache.Enabled, SimilarityThreshold: cfg.GetCacheSimilarityThreshold(), MaxEntries: cfg.SemanticCache.MaxEntries, TTLSeconds: cfg.SemanticCache.TTLSeconds, - Enabled: cfg.SemanticCache.Enabled, } - semanticCache := cache.NewSemanticCache(cacheOptions) + semanticCache, err := cache.NewCacheBackend(cacheConfig) + if err != nil { + return nil, err + } // Create tools database toolsOptions := tools.ToolsDatabaseOptions{ diff --git a/src/semantic-router/pkg/metrics/metrics.go b/src/semantic-router/pkg/metrics/metrics.go index bc465848..354b13f0 100644 --- a/src/semantic-router/pkg/metrics/metrics.go +++ b/src/semantic-router/pkg/metrics/metrics.go @@ -193,6 +193,42 @@ var ( }, ) + // CacheMisses tracks cache misses + CacheMisses = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "llm_cache_misses_total", + Help: "The total number of cache misses", + }, + ) + + // CacheOperationDuration tracks the duration of cache operations by backend and operation type + CacheOperationDuration = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "llm_cache_operation_duration_seconds", + Help: "The duration of cache operations in seconds", + Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}, + }, + []string{"backend", "operation"}, + ) + + // CacheOperationTotal tracks the total number of cache operations by backend and operation type + CacheOperationTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_cache_operations_total", + Help: "The total number of cache operations", + }, + []string{"backend", "operation", "status"}, + ) + + // CacheEntriesTotal tracks the total number of entries in the cache by backend + CacheEntriesTotal = 
promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "llm_cache_entries_total", + Help: "The total number of entries in the cache", + }, + []string{"backend"}, + ) + // CategoryClassifications tracks the number of times each category is classified CategoryClassifications = promauto.NewGaugeVec( prometheus.GaugeOpts{ @@ -302,6 +338,22 @@ func RecordCacheHit() { CacheHits.Inc() } +// RecordCacheMiss records a cache miss +func RecordCacheMiss() { + CacheMisses.Inc() +} + +// RecordCacheOperation records a cache operation with duration and status +func RecordCacheOperation(backend, operation, status string, duration float64) { + CacheOperationDuration.WithLabelValues(backend, operation).Observe(duration) + CacheOperationTotal.WithLabelValues(backend, operation, status).Inc() +} + +// UpdateCacheEntries updates the current number of cache entries for a backend +func UpdateCacheEntries(backend string, count int) { + CacheEntriesTotal.WithLabelValues(backend).Set(float64(count)) +} + // RecordCategoryClassification increments the gauge for a specific category classification func RecordCategoryClassification(category string) { CategoryClassifications.WithLabelValues(category).Inc()
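
A minimal usage sketch of the cache backend factory and interface introduced in this patch (CacheConfig, NewCacheBackend, AddEntry, FindSimilar, GetStats). It assumes the candle-binding embedding model has already been initialized and that the cache package is importable at the module path used elsewhere in the diff; the values shown are illustrative, not part of the change itself.

package main

import (
	"fmt"
	"log"

	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/cache"
)

func main() {
	// Construct the in-memory backend through the new factory.
	// For Milvus, BackendType would be "milvus" with BackendConfigPath pointing
	// at config/cache/milvus.yaml (MaxEntries is ignored for that backend).
	cfg := cache.CacheConfig{
		BackendType:         cache.InMemoryCacheType,
		Enabled:             true,
		SimilarityThreshold: 0.85,
		MaxEntries:          1000, // in-memory backend only
		TTLSeconds:          3600,
	}

	backend, err := cache.NewCacheBackend(cfg)
	if err != nil {
		log.Fatalf("failed to create cache backend: %v", err)
	}
	defer backend.Close()

	// Store a completed request/response pair.
	err = backend.AddEntry("model-a", "What is AI?",
		[]byte(`{"model":"model-a"}`), []byte(`{"response":"AI info"}`))
	if err != nil {
		log.Fatalf("add entry: %v", err)
	}

	// Look up a semantically similar query for the same model.
	resp, found, err := backend.FindSimilar("model-a", "What is artificial intelligence?")
	if err != nil {
		log.Fatalf("find similar: %v", err)
	}
	if found {
		fmt.Printf("cache hit: %s\n", resp)
	}

	// Inspect hit/miss counters exposed by the backend.
	stats := backend.GetStats()
	fmt.Printf("entries=%d hits=%d misses=%d ratio=%.2f\n",
		stats.TotalEntries, stats.HitCount, stats.MissCount, stats.HitRatio)
}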