diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index d643504a..d4f7332b 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -10,7 +10,7 @@ on:
 jobs:
   pre-commit:
     runs-on: ubuntu-latest
-    name: Run pre-commit hooks on Go, Rust, JavaScripts, Markdown and Python files
+    name: Run pre-commit hooks on Go, Rust, JavaScript, Markdown, YAML and Python files
 
     steps:
     - name: Check out the repo
@@ -47,6 +47,7 @@ jobs:
           build-essential \
           pkg-config
         npm install -g markdownlint-cli
+        pip install --user yamllint
 
     - name: Cache Rust dependencies
       uses: actions/cache@v4
@@ -82,10 +83,10 @@ jobs:
     - name: Install pre-commit
       run: pip install pre-commit
 
-    - name: Run pre-commit on Go, Rust, JavaScript, Markdown and Python files
+    - name: Run pre-commit on Go, Rust, JavaScript, Markdown, YAML and Python files
       run: |
         # Find all Go, Rust, JavaScripts, Markdown and Python files (excluding vendored/generated code)
-        FILES=$(find . -type f \( -name "*.go" -o -name "*.rs" -o -name "*.py" -o -name "*.js" -o -name "*.md" \) \
+        FILES=$(find . -type f \( -name "*.go" -o -name "*.rs" -o -name "*.py" -o -name "*.js" -o -name "*.md" -o -name "*.yaml" -o -name "*.yml" \) \
           ! -path "./target/*" \
           ! -path "./candle-binding/target/*" \
           ! -path "./.git/*" \
@@ -100,7 +101,7 @@ jobs:
           echo "Running pre-commit on files: $FILES"
           pre-commit run --files $FILES
         else
-          echo "No Go, Rust, JavaScript, Markdown or Python files found to check"
+          echo "No Go, Rust, JavaScript, Markdown, YAML, or Python files found to check"
         fi
 
     - name: Show pre-commit results
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 38d36c9a..929f1ed5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,79 +2,89 @@
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
 # Basic hooks for Go, Rust, Python And JavaScript files only
-- repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v6.0.0
-  hooks:
-  - id: trailing-whitespace
-    files: \.(go|rs|py|js)$
-  - id: end-of-file-fixer
-    files: \.(go|rs|py|js)$
-  - id: check-added-large-files
-    args: ['--maxkb=500']
-    files: \.(go|rs|py|js)$
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v6.0.0
+  hooks:
+    - id: trailing-whitespace
+      files: \.(go|rs|py|js)$
+    - id: end-of-file-fixer
+      files: \.(go|rs|py|js)$
+    - id: check-added-large-files
+      args: ['--maxkb=500']
+      files: \.(go|rs|py|js)$
 
 # Go specific hooks
-- repo: local
-  hooks:
-  - id: go-fmt
-    name: go fmt
-    entry: gofmt -w
-    language: system
-    files: \.go$
+- repo: local
+  hooks:
+    - id: go-fmt
+      name: go fmt
+      entry: gofmt -w
+      language: system
+      files: \.go$
 
 # Markdown specific hooks
-- repo: local
-  hooks:
-  - id: md-fmt
-    name: md fmt
-    entry: bash -c "make markdown-lint"
-    language: system
-    files: \.md$
-    exclude: ^(\node_modules/)
+- repo: local
+  hooks:
+    - id: md-fmt
+      name: md fmt
+      entry: bash -c "make markdown-lint"
+      language: system
+      files: \.md$
+      exclude: ^(\node_modules/)
+
+# YAML specific hooks
+- repo: local
+  hooks:
+    - id: yaml-and-yml-fmt
+      name: yaml/yml fmt
+      entry: bash -c "make yaml-lint"
+      language: system
+      files: \.(yaml|yml)$
+      exclude: ^(\node_modules/)
 
 # JavaScript specific hooks
-- repo: local
-  hooks:
-  - id: js-lint
-    name: js lint
-    entry: bash -c 'cd website && npm install 2>/dev/null || true && npm run lint'
-    language: system
-    files: \.js$
-    exclude: ^(\node_modules/)
-    pass_filenames: false
+- repo: local
+  hooks:
+    - id: js-lint
+      name: js lint
+      entry: bash -c 'cd website && npm install 2>/dev/null || true && npm run lint'
+      language: system
+      files: \.js$
+      exclude: ^(\node_modules/)
+      pass_filenames: false
 
-# Rust specific hooks
-- repo: local
-  hooks:
-  - id: cargo-fmt
-    name: cargo fmt
-    entry: bash -c 'cd candle-binding && rustup component add rustfmt 2>/dev/null || true && cargo fmt'
-    language: system
-    files: \.rs$
-    pass_filenames: false
-  - id: cargo-check
-    name: cargo check
-    entry: bash -c 'cd candle-binding && cargo check'
-    language: system
-    files: \.rs$
-    pass_filenames: false
+# Rust specific hooks
+- repo: local
+  hooks:
+    - id: cargo-fmt
+      name: cargo fmt
+      entry: bash -c 'cd candle-binding && rustup component add rustfmt 2>/dev/null || true && cargo fmt'
+      language: system
+      files: \.rs$
+      pass_filenames: false
+    - id: cargo-check
+      name: cargo check
+      entry: bash -c 'cd candle-binding && cargo check'
+      language: system
+      files: \.rs$
+      pass_filenames: false
 
 # Python specific hooks
-- repo: https://github.com/psf/black
-  rev: 25.1.0
-  hooks:
-  - id: black
-    language_version: python3
-    files: \.py$
-    exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
+- repo: https://github.com/psf/black
+  rev: 25.1.0
+  hooks:
+    - id: black
+      language_version: python3
+      files: \.py$
+      exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
 
-- repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
-  hooks:
-  - id: isort
-    args: ["--profile", "black"]
-    files: \.py$
-    exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
+- repo: https://github.com/PyCQA/isort
+  rev: 6.0.1
+  hooks:
+    - id: isort
+      args: ["--profile", "black"]
+      files: \.py$
+      exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
 
 # Commented out flake8 - only reports issues, doesn't auto-fix
 # - repo: https://github.com/PyCQA/flake8
diff --git a/.yamllint b/.yamllint
new file mode 100644
index 00000000..4c7aa56f
--- /dev/null
+++ b/.yamllint
@@ -0,0 +1,57 @@
+ignore: |
+  # Directories skipped entirely: VCS metadata
+  # (.git), GitHub workflow and config files
+  # (.github), and vendored JavaScript
+  # dependencies (node_modules) are not linted.
+  .git
+  .github
+  node_modules
+
+rules:
+  braces:
+    min-spaces-inside: 0
+    max-spaces-inside: 0
+    min-spaces-inside-empty: -1
+    max-spaces-inside-empty: -1
+  brackets:
+    min-spaces-inside: 0
+    max-spaces-inside: 1
+    min-spaces-inside-empty: -1
+    max-spaces-inside-empty: -1
+  colons:
+    max-spaces-before: 0
+    max-spaces-after: 1
+  commas:
+    max-spaces-before: 1
+    min-spaces-after: 1
+    max-spaces-after: 1
+  comments:
+    level: warning
+    require-starting-space: true
+    min-spaces-from-content: 2
+  comments-indentation:
+    level: warning
+  document-end: disable
+  document-start: disable
+  empty-lines:
+    max: 2
+    max-start: 0
+    max-end: 1
+  empty-values:
+    forbid-in-block-mappings: false
+    forbid-in-flow-mappings: true
+  hyphens:
+    max-spaces-after: 1
+  indentation:
+    spaces: 2
+    indent-sequences: consistent
+    check-multi-line-strings: false
+  key-duplicates: enable
+  key-ordering: disable
+  new-line-at-end-of-file: enable
+  new-lines:
+    type: unix
+  trailing-spaces: enable
+  truthy:
+    check-keys: false
+    level: warning
diff --git a/Makefile b/Makefile
index e148ed67..026ed8c1 100644
--- a/Makefile
+++ b/Makefile
@@ -387,3 +387,7 @@ markdown-lint:
 markdown-lint-fix:
 	@echo "Fixing markdown lint issues..."
 	markdownlint -c markdownlint.yaml "**/*.md" --ignore node_modules --ignore website/node_modules --fix
+
+yaml-lint:
+	@echo "Linting YAML files..."
+	yamllint --config-file=.yamllint .
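
Note: the .yamllint rules above encode the same conventions the rest of this PR applies by hand. A minimal sketch (not part of the PR) of YAML that satisfies the configured rules:

    # 2-space indentation; sequence indenting may vary per file but must be consistent
    on:                        # truthy "check-keys: false" leaves GitHub-workflow "on:" keys alone
      push:
        branches: [main, dev]  # brackets allow 0-1 spaces inside; commas need one trailing space
    env: {}                    # empty flow mapping: no spaces inside braces

Trailing spaces, duplicate keys, or a missing final newline would all fail (trailing-spaces, key-duplicates, and new-line-at-end-of-file are enabled). `make yaml-lint` runs these checks over the whole tree, and the yaml-and-yml-fmt pre-commit hook triggers the same target when .yaml/.yml files change.
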
diff --git a/config/cache/milvus.yaml b/config/cache/milvus.yaml
index bac19b4d..0838c4e7 100644
--- a/config/cache/milvus.yaml
+++ b/config/cache/milvus.yaml
@@ -1,171 +1,170 @@
-# Milvus Vector Database Configuration for Semantic Cache
-#
-# This configuration file contains settings for using Milvus as the semantic cache backend.
-# To use this configuration:
-# 1. Set backend_type: "milvus" in your main config.yaml
-# 2. Set backend_config_path: "config/cache/milvus.yaml" in your main config.yaml
-# 3. Ensure Milvus server is running and accessible
-# 4. Build with Milvus support: go build -tags=milvus
-
-# Milvus connection settings
-connection:
-  # Milvus server host (change for production deployment)
-  host: "localhost"  # For production: use your Milvus cluster endpoint
-
-  # Milvus server port
-  port: 19530  # Standard Milvus port
-
-  # Database name (optional, defaults to "default")
-  database: "semantic_router_cache"
-
-  # Connection timeout in seconds
-  timeout: 30
-
-  # Authentication (enable for production)
-  auth:
-    enabled: false  # Set to true for production
-    username: ""  # Your Milvus username
-    password: ""  # Your Milvus password
-
-  # TLS/SSL configuration (recommended for production)
-  tls:
-    enabled: false  # Set to true for secure connections
-    cert_file: ""  # Path to client certificate
-    key_file: ""  # Path to client private key
-    ca_file: ""  # Path to CA certificate
-
-# Collection settings
-collection:
-  # Name of the collection to store cache entries
-  name: "semantic_cache"
-
-  # Description of the collection
-  description: "Semantic cache for LLM request-response pairs"
-
-  # Vector field configuration
-  vector_field:
-    # Name of the vector field
-    name: "embedding"
-
-    # Dimension of the embeddings (auto-detected from model at runtime)
-    dimension: 384  # This value is ignored - dimension is auto-detected from the embedding model
-
-    # Metric type for similarity calculation
-    metric_type: "IP"  # Inner Product (cosine similarity for normalized vectors)
-
-  # Index configuration for the vector field
-  index:
-    # Index type (HNSW is recommended for most use cases)
-    type: "HNSW"
-
-    # Index parameters
-    params:
-      M: 16  # Number of bi-directional links for each node
-      efConstruction: 64  # Search scope during index construction
-
-# Search configuration
-search:
-  # Search parameters
-  params:
-    ef: 64  # Search scope during search (should be >= topk)
-
-  # Number of top results to retrieve for similarity comparison
-  topk: 10
-
-  # Consistency level for search operations
-  consistency_level: "Session"  # Options: Strong, Session, Bounded, Eventually
-
-# Performance and resource settings
-performance:
-  # Connection pool settings
-  connection_pool:
-    # Maximum number of connections in the pool
-    max_connections: 10
-
-    # Maximum idle connections
-    max_idle_connections: 5
-
-    # Connection timeout for acquiring from pool
-    acquire_timeout: 5
-
-  # Batch operation settings
-  batch:
-    # Maximum batch size for insert operations
-    insert_batch_size: 1000
-
-    # Batch timeout in seconds
-    timeout: 30
-
-# Data management
-data_management:
-  # Automatic data expiration (TTL) settings
-  ttl:
-    # Enable automatic TTL-based cleanup (requires TTL to be set in main config)
-    enabled: true
-
-    # Field name to store timestamp for TTL calculation
-    timestamp_field: "timestamp"
-
-    # Cleanup interval in seconds (how often to run cleanup)
-    cleanup_interval: 3600  # 1 hour
-
-  # Compaction settings
-  compaction:
-    # Enable automatic compaction
-    enabled: true
-
-    # Compaction interval in seconds
-    interval: 86400  # 24 hours
-
-# Logging and monitoring
-logging:
-  # Log level for Milvus client operations (debug, info, warn, error)
-  level: "info"
-
-  # Enable query/search logging for debugging
-  enable_query_log: false
-
-  # Enable performance metrics collection
-  enable_metrics: true
-
-# Development and debugging settings
-development:
-  # Drop collection on startup (WARNING: This will delete all cached data)
-  drop_collection_on_startup: true  # Enable for development to test dynamic dimensions
-
-  # Create collection if it doesn't exist
-  auto_create_collection: true
-
-  # Print detailed error messages
-  verbose_errors: true
-
-# Example configurations for different environments:
-#
-# Local Development (Docker):
-#   connection:
-#     host: "localhost"
-#     port: 19530
-#   auth:
-#     enabled: false
-#   development:
-#     drop_collection_on_startup: true  # Clean start for development
-#
-# Production (Zilliz Cloud):
-#   connection:
-#     host: "your-cluster-endpoint.zillizcloud.com"
-#     port: 443
-#   auth:
-#     enabled: true
-#     username: "your-username"
-#     password: "your-password"
-#   tls:
-#     enabled: true
-#   development:
-#     drop_collection_on_startup: false
-#     auto_create_collection: false  # Pre-create collections in production
-#
-# Kubernetes Deployment:
-#   connection:
-#     host: "milvus-service.milvus-system.svc.cluster.local"
-#     port: 19530
-#     timeout: 60  # Longer timeout for cluster environments
+# Milvus Vector Database Configuration for Semantic Cache
+# This configuration file contains settings for using Milvus as the semantic cache backend.
+# To use this configuration:
+# 1. Set backend_type: "milvus" in your main config.yaml
+# 2. Set backend_config_path: "config/cache/milvus.yaml" in your main config.yaml
+# 3. Ensure Milvus server is running and accessible
+# 4. Build with Milvus support: go build -tags=milvus
+
+# Milvus connection settings
+connection:
+  # Milvus server host (change for production deployment)
+  host: "localhost"  # For production: use your Milvus cluster endpoint
+
+  # Milvus server port
+  port: 19530  # Standard Milvus port
+
+  # Database name (optional, defaults to "default")
+  database: "semantic_router_cache"
+
+  # Connection timeout in seconds
+  timeout: 30
+
+  # Authentication (enable for production)
+  auth:
+    enabled: false  # Set to true for production
+    username: ""  # Your Milvus username
+    password: ""  # Your Milvus password
+
+  # TLS/SSL configuration (recommended for production)
+  tls:
+    enabled: false  # Set to true for secure connections
+    cert_file: ""  # Path to client certificate
+    key_file: ""  # Path to client private key
+    ca_file: ""  # Path to CA certificate
+
+# Collection settings
+collection:
+  # Name of the collection to store cache entries
+  name: "semantic_cache"
+
+  # Description of the collection
+  description: "Semantic cache for LLM request-response pairs"
+
+  # Vector field configuration
+  vector_field:
+    # Name of the vector field
+    name: "embedding"
+
+    # Dimension of the embeddings (auto-detected from model at runtime)
+    dimension: 384  # This value is ignored - dimension is auto-detected from the embedding model
+
+    # Metric type for similarity calculation
+    metric_type: "IP"  # Inner Product (cosine similarity for normalized vectors)
+
+  # Index configuration for the vector field
+  index:
+    # Index type (HNSW is recommended for most use cases)
+    type: "HNSW"
+
+    # Index parameters
+    params:
+      M: 16  # Number of bi-directional links for each node
+      efConstruction: 64  # Search scope during index construction
+
+# Search configuration
+search:
+  # Search parameters
+  params:
+    ef: 64  # Search scope during search (should be >= topk)
+
+  # Number of top results to retrieve for similarity comparison
+  topk: 10
+
+  # Consistency level for search operations
+  consistency_level: "Session"  # Options: Strong, Session, Bounded, Eventually
+
+# Performance and resource settings
+performance:
+  # Connection pool settings
+  connection_pool:
+    # Maximum number of connections in the pool
+    max_connections: 10
+
+    # Maximum idle connections
+    max_idle_connections: 5
+
+    # Connection timeout for acquiring from pool
+    acquire_timeout: 5
+
+  # Batch operation settings
+  batch:
+    # Maximum batch size for insert operations
+    insert_batch_size: 1000
+
+    # Batch timeout in seconds
+    timeout: 30
+
+# Data management
+data_management:
+  # Automatic data expiration (TTL) settings
+  ttl:
+    # Enable automatic TTL-based cleanup (requires TTL to be set in main config)
+    enabled: true
+
+    # Field name to store timestamp for TTL calculation
+    timestamp_field: "timestamp"
+
+    # Cleanup interval in seconds (how often to run cleanup)
+    cleanup_interval: 3600  # 1 hour
+
+  # Compaction settings
+  compaction:
+    # Enable automatic compaction
+    enabled: true
+
+    # Compaction interval in seconds
+    interval: 86400  # 24 hours
+
+# Logging and monitoring
+logging:
+  # Log level for Milvus client operations (debug, info, warn, error)
+  level: "info"
+
+  # Enable query/search logging for debugging
+  enable_query_log: false
+
+  # Enable performance metrics collection
+  enable_metrics: true
+
+# Development and debugging settings
+development:
+  # Drop collection on startup (WARNING: This will delete all cached data)
+  drop_collection_on_startup: true  # Enable for development to test dynamic dimensions
+
+  # Create collection if it doesn't exist
+  auto_create_collection: true
+
+  # Print detailed error messages
+  verbose_errors: true
+
+# Example configurations for different environments:
+#
+# Local Development (Docker):
+#   connection:
+#     host: "localhost"
+#     port: 19530
+#   auth:
+#     enabled: false
+#   development:
+#     drop_collection_on_startup: true  # Clean start for development
+#
+# Production (Zilliz Cloud):
+#   connection:
+#     host: "your-cluster-endpoint.zillizcloud.com"
+#     port: 443
+#   auth:
+#     enabled: true
+#     username: "your-username"
+#     password: "your-password"
+#   tls:
+#     enabled: true
+#   development:
+#     drop_collection_on_startup: false
+#     auto_create_collection: false  # Pre-create collections in production
+#
+# Kubernetes Deployment:
+#   connection:
+#     host: "milvus-service.milvus-system.svc.cluster.local"
+#     port: 19530
+#     timeout: 60  # Longer timeout for cluster environments
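
Note: per the header comments in this file, switching the semantic cache from the default in-memory backend to Milvus is a two-key change in the main config plus a tagged build. A minimal sketch (keys as documented above):

    # config/config.yaml
    semantic_cache:
      backend_type: "milvus"
      backend_config_path: "config/cache/milvus.yaml"
      similarity_threshold: 0.8
      ttl_seconds: 3600  # pairs with the TTL cleanup settings under data_management above

    # then build with Milvus support:
    #   go build -tags=milvus
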
reasoning_description: "Business content is typically conversational" - reasoning_effort: low # Business conversations need low reasoning effort - model_scores: - - model: phi4 - score: 0.8 - - model: gemma3:27b - score: 0.4 - - model: mistral-small3.1 - score: 0.2 -- name: law - use_reasoning: false - reasoning_description: "Legal content is typically explanatory" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.6 - - model: mistral-small3.1 - score: 0.4 -- name: psychology - use_reasoning: false - reasoning_description: "Psychology content is usually explanatory" - model_scores: - - model: mistral-small3.1 - score: 0.6 - - model: gemma3:27b - score: 0.4 - - model: phi4 - score: 0.4 -- name: biology - use_reasoning: true - reasoning_description: "Biological processes benefit from structured analysis" - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 - - model: phi4 - score: 0.2 -- name: chemistry - use_reasoning: true - reasoning_description: "Chemical reactions and formulas require systematic thinking" - reasoning_effort: high # Chemistry requires high reasoning effort - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 - - model: phi4 - score: 0.6 -- name: history - use_reasoning: false - reasoning_description: "Historical content is narrative-based" - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: phi4 - score: 0.6 - - model: gemma3:27b - score: 0.4 -- name: other - use_reasoning: false - reasoning_description: "General content doesn't require reasoning" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.6 - - model: mistral-small3.1 - score: 0.6 -- name: health - use_reasoning: false - reasoning_description: "Health information is typically informational" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.8 - - model: mistral-small3.1 - score: 0.6 -- name: economics - use_reasoning: false - reasoning_description: "Economic discussions are usually explanatory" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: mistral-small3.1 - score: 0.8 - - model: phi4 - score: 0.0 -- name: math - use_reasoning: true - reasoning_description: "Mathematical problems require step-by-step reasoning" - reasoning_effort: high # Math problems need high reasoning effort - model_scores: - - model: phi4 - score: 1.0 - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 -- name: physics - use_reasoning: true - reasoning_description: "Physics concepts need logical analysis" - model_scores: - - model: gemma3:27b - score: 0.4 - - model: phi4 - score: 0.4 - - model: mistral-small3.1 - score: 0.4 -- name: computer science - use_reasoning: true - reasoning_description: "Programming and algorithms need logical reasoning" - model_scores: - - model: gemma3:27b - score: 0.6 - - model: mistral-small3.1 - score: 0.6 - - model: phi4 - score: 0.0 -- name: philosophy - use_reasoning: false - reasoning_description: "Philosophical discussions are conversational" - model_scores: - - model: phi4 - score: 0.6 - - model: gemma3:27b - score: 0.2 - - model: mistral-small3.1 - score: 0.2 -- name: engineering - use_reasoning: true - reasoning_description: "Engineering problems require systematic problem-solving" - model_scores: - - model: gemma3:27b - score: 0.6 - - model: mistral-small3.1 - score: 0.6 - - model: phi4 - score: 0.2 + - name: business + use_reasoning: false + reasoning_description: "Business content is 
typically conversational" + reasoning_effort: low # Business conversations need low reasoning effort + model_scores: + - model: phi4 + score: 0.8 + - model: gemma3:27b + score: 0.4 + - model: mistral-small3.1 + score: 0.2 + - name: law + use_reasoning: false + reasoning_description: "Legal content is typically explanatory" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.6 + - model: mistral-small3.1 + score: 0.4 + - name: psychology + use_reasoning: false + reasoning_description: "Psychology content is usually explanatory" + model_scores: + - model: mistral-small3.1 + score: 0.6 + - model: gemma3:27b + score: 0.4 + - model: phi4 + score: 0.4 + - name: biology + use_reasoning: true + reasoning_description: "Biological processes benefit from structured analysis" + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - model: phi4 + score: 0.2 + - name: chemistry + use_reasoning: true + reasoning_description: "Chemical reactions and formulas require systematic thinking" + reasoning_effort: high # Chemistry requires high reasoning effort + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - model: phi4 + score: 0.6 + - name: history + use_reasoning: false + reasoning_description: "Historical content is narrative-based" + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: phi4 + score: 0.6 + - model: gemma3:27b + score: 0.4 + - name: other + use_reasoning: false + reasoning_description: "General content doesn't require reasoning" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - name: health + use_reasoning: false + reasoning_description: "Health information is typically informational" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.8 + - model: mistral-small3.1 + score: 0.6 + - name: economics + use_reasoning: false + reasoning_description: "Economic discussions are usually explanatory" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: mistral-small3.1 + score: 0.8 + - model: phi4 + score: 0.0 + - name: math + use_reasoning: true + reasoning_description: "Mathematical problems require step-by-step reasoning" + reasoning_effort: high # Math problems need high reasoning effort + model_scores: + - model: phi4 + score: 1.0 + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - name: physics + use_reasoning: true + reasoning_description: "Physics concepts need logical analysis" + model_scores: + - model: gemma3:27b + score: 0.4 + - model: phi4 + score: 0.4 + - model: mistral-small3.1 + score: 0.4 + - name: computer science + use_reasoning: true + reasoning_description: "Programming and algorithms need logical reasoning" + model_scores: + - model: gemma3:27b + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - model: phi4 + score: 0.0 + - name: philosophy + use_reasoning: false + reasoning_description: "Philosophical discussions are conversational" + model_scores: + - model: phi4 + score: 0.6 + - model: gemma3:27b + score: 0.2 + - model: mistral-small3.1 + score: 0.2 + - name: engineering + use_reasoning: true + reasoning_description: "Engineering problems require systematic problem-solving" + model_scores: + - model: gemma3:27b + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - model: phi4 + score: 0.2 + default_model: mistral-small3.1 # API Configuration @@ -292,18 +293,18 @@ reasoning_families: deepseek: type: 
"chat_template_kwargs" parameter: "thinking" - + qwen3: type: "chat_template_kwargs" parameter: "enable_thinking" - + gpt-oss: type: "reasoning_effort" parameter: "reasoning_effort" - + gpt: type: "reasoning_effort" parameter: "reasoning_effort" # Global default reasoning effort level -default_reasoning_effort: medium # Default reasoning effort level (low, medium, high) +default_reasoning_effort: medium # Default reasoning effort level (low, medium, high) diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml index 358fb0cd..226834e7 100644 --- a/deploy/kubernetes/config.yaml +++ b/deploy/kubernetes/config.yaml @@ -67,7 +67,7 @@ model_config: # Classifier configuration for text classification classifier: category_model: - model_id: "models/category_classifier_modernbert-base_model" #TODO: Use local model for now before the code can download the entire model from huggingface + model_id: "models/category_classifier_modernbert-base_model" # TODO: Use local model for now before the code can download the entire model from huggingface use_modernbert: true threshold: 0.6 use_cpu: true @@ -79,116 +79,117 @@ classifier: use_cpu: true pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" categories: -- name: business - model_scores: - - model: phi4 - score: 0.8 - - model: gemma3:27b - score: 0.4 - - model: mistral-small3.1 - score: 0.2 -- name: law - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.6 - - model: mistral-small3.1 - score: 0.4 -- name: psychology - model_scores: - - model: mistral-small3.1 - score: 0.6 - - model: gemma3:27b - score: 0.4 - - model: phi4 - score: 0.4 -- name: biology - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 - - model: phi4 - score: 0.2 -- name: chemistry - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 - - model: phi4 - score: 0.6 -- name: history - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: phi4 - score: 0.6 - - model: gemma3:27b - score: 0.4 -- name: other - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.6 - - model: mistral-small3.1 - score: 0.6 -- name: health - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.8 - - model: mistral-small3.1 - score: 0.6 -- name: economics - model_scores: - - model: gemma3:27b - score: 0.8 - - model: mistral-small3.1 - score: 0.8 - - model: phi4 - score: 0.0 -- name: math - model_scores: - - model: phi4 - score: 1.0 - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 -- name: physics - model_scores: - - model: gemma3:27b - score: 0.4 - - model: phi4 - score: 0.4 - - model: mistral-small3.1 - score: 0.4 -- name: computer science - model_scores: - - model: gemma3:27b - score: 0.6 - - model: mistral-small3.1 - score: 0.6 - - model: phi4 - score: 0.0 -- name: philosophy - model_scores: - - model: phi4 - score: 0.6 - - model: gemma3:27b - score: 0.2 - - model: mistral-small3.1 - score: 0.2 -- name: engineering - model_scores: - - model: gemma3:27b - score: 0.6 - - model: mistral-small3.1 - score: 0.6 - - model: phi4 - score: 0.2 -default_model: mistral-small3.1 \ No newline at end of file + - name: business + model_scores: + - model: phi4 + score: 0.8 + - model: gemma3:27b + score: 0.4 + - model: mistral-small3.1 + score: 0.2 + - name: law + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.6 + - model: mistral-small3.1 + score: 
0.4 + - name: psychology + model_scores: + - model: mistral-small3.1 + score: 0.6 + - model: gemma3:27b + score: 0.4 + - model: phi4 + score: 0.4 + - name: biology + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - model: phi4 + score: 0.2 + - name: chemistry + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - model: phi4 + score: 0.6 + - name: history + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: phi4 + score: 0.6 + - model: gemma3:27b + score: 0.4 + - name: other + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - name: health + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.8 + - model: mistral-small3.1 + score: 0.6 + - name: economics + model_scores: + - model: gemma3:27b + score: 0.8 + - model: mistral-small3.1 + score: 0.8 + - model: phi4 + score: 0.0 + - name: math + model_scores: + - model: phi4 + score: 1.0 + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - name: physics + model_scores: + - model: gemma3:27b + score: 0.4 + - model: phi4 + score: 0.4 + - model: mistral-small3.1 + score: 0.4 + - name: computer science + model_scores: + - model: gemma3:27b + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - model: phi4 + score: 0.0 + - name: philosophy + model_scores: + - model: phi4 + score: 0.6 + - model: gemma3:27b + score: 0.2 + - model: mistral-small3.1 + score: 0.2 + - name: engineering + model_scores: + - model: gemma3:27b + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - model: phi4 + score: 0.2 + +default_model: mistral-small3.1 diff --git a/deploy/kubernetes/deployment.yaml b/deploy/kubernetes/deployment.yaml index c0564a6b..45ab8e98 100644 --- a/deploy/kubernetes/deployment.yaml +++ b/deploy/kubernetes/deployment.yaml @@ -23,10 +23,10 @@ spec: set -e echo "Installing Hugging Face CLI..." pip install --no-cache-dir huggingface_hub[cli] - + echo "Downloading models to persistent volume..." cd /app/models - + # Download category classifier model if [ ! -d "category_classifier_modernbert-base_model" ]; then echo "Downloading category classifier model..." @@ -34,15 +34,15 @@ spec: else echo "Category classifier model already exists, skipping..." fi - - # Download PII classifier model + + # Download PII classifier model if [ ! -d "pii_classifier_modernbert-base_model" ]; then echo "Downloading PII classifier model..." huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_model --local-dir pii_classifier_modernbert-base_model else echo "PII classifier model already exists, skipping..." fi - + # Download jailbreak classifier model if [ ! -d "jailbreak_classifier_modernbert-base_model" ]; then echo "Downloading jailbreak classifier model..." @@ -50,7 +50,7 @@ spec: else echo "Jailbreak classifier model already exists, skipping..." fi - + # Download PII token classifier model if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ]; then echo "Downloading PII token classifier model..." @@ -58,7 +58,7 @@ spec: else echo "PII token classifier model already exists, skipping..." fi - + echo "All models downloaded successfully!" 
ls -la /app/models/ env: diff --git a/deploy/kubernetes/kustomization.yaml b/deploy/kubernetes/kustomization.yaml index 90bd3006..8160564b 100644 --- a/deploy/kubernetes/kustomization.yaml +++ b/deploy/kubernetes/kustomization.yaml @@ -21,6 +21,5 @@ configMapGenerator: namespace: semantic-router images: - - name: ghcr.io/vllm-project/semantic-router/extproc - newTag: latest - +- name: ghcr.io/vllm-project/semantic-router/extproc + newTag: latest diff --git a/docker-compose.yml b/docker-compose.yml index 2b34eaf8..09f7b9ad 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,7 +16,7 @@ services: networks: - semantic-network healthcheck: - test: ["CMD", "curl","-f", "localhost:8080/health"] + test: ["CMD", "curl", "-f", "localhost:8080/health"] interval: 10s timeout: 5s retries: 5
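
Note: the docker-compose change above is purely stylistic — "curl","-f" was already two separate array elements; yamllint's commas rule (min-spaces-after: 1) simply requires a space after each comma. For reference, a slightly fuller exec-form healthcheck; start_period is a standard Compose healthcheck field, added here only as an illustration, not part of this PR:

    healthcheck:
      test: ["CMD", "curl", "-f", "localhost:8080/health"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 15s  # grace period before failed probes count toward retries
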