
Commit 858dd50

yossiovadia and claude authored
feat: add mock vLLM infrastructure for lightweight e2e testing (#228)
* feat: add mock vLLM infrastructure for lightweight e2e testing

This commit introduces a mock vLLM server infrastructure to enable e2e testing without requiring GPU resources. The mock infrastructure simulates intelligent routing behavior while maintaining compatibility with the existing semantic router.

Key changes:
- Add mock-vllm-server.py: Simulates vLLM OpenAI-compatible API with intelligent content-based routing (math queries → TinyLlama, general → Qwen)
- Add start-mock-servers.sh: Launch mock servers in foreground mode
- Update config.yaml: Add minimal vLLM endpoint configuration for Qwen (port 8000) and TinyLlama (port 8001) with smart routing preference
- Update 00-client-request-test.py: Fix import path and use configured model
- Update e2e-tests/README.md: Document mock infrastructure usage
- Update build-run-test.mk: Add mock server management targets

The mock infrastructure enables:
- Fast e2e testing without GPU dependencies
- Content-aware model selection simulation
- vLLM API compatibility testing
- Smart routing behavior validation

Signed-off-by: Yossi Ovadia <[email protected]>

* feat: replace mock vLLM infrastructure with LLM Katan package

Replace the mock vLLM server with a real FastAPI-based implementation using HuggingFace transformers and tiny models. The new LLM Katan package provides actual inference while maintaining lightweight testing benefits.

Key changes:
- Add complete LLM Katan PyPI package (v0.1.4) under e2e-tests/
- FastAPI server with OpenAI-compatible endpoints (/v1/chat/completions, /v1/models, /health, /metrics)
- Real Qwen/Qwen3-0.6B model with name aliasing for multi-model testing
- Enhanced logging and Prometheus metrics endpoint
- CLI tool with comprehensive configuration options
- Replace start-mock-servers.sh with start-llm-katan.sh
- Update e2e-tests README with new LLM Katan usage instructions
- Remove obsolete mock-vllm-server.py and start-mock-servers.sh

Co-Authored-By: Claude <[email protected]>
Signed-off-by: Yossi Ovadia <[email protected]>

* docs: add HuggingFace token setup instructions to LLM Katan README

Add comprehensive setup section covering HuggingFace token requirements with three authentication methods:
- Environment variable (HUGGINGFACE_HUB_TOKEN)
- CLI login (huggingface-cli login)
- Token file in home directory

Explains why the token is needed (private models, rate limits, reliable downloads) and provides a direct link to HuggingFace token settings.

Co-Authored-By: Claude <[email protected]>
Signed-off-by: Yossi Ovadia <[email protected]>

* fix: add Python build artifacts to .gitignore

- Add dist/, build/, *.egg-info/, *.whl to ignore Python build outputs
- Prevents accidentally committing generated files

Signed-off-by: Yossi Ovadia <[email protected]>

* refactor: separate e2e and production configs

- Create config.e2e.yaml with LLM Katan endpoints for e2e tests
- Restore config.yaml to original production endpoints (matches origin/main)
- Add run-router-e2e target to use e2e config (config/config.e2e.yaml)
- Add start-llm-katan and test-e2e-vllm targets for LLM Katan testing
- Update Makefile help with new e2e test targets
- Remove egg-info directory from git tracking (now in .gitignore)
- Keep pyproject.toml at stable version 0.1.4, always install latest via pip

This separation allows:
- Production config stays clean with real vLLM endpoints
- E2E tests use lightweight LLM Katan servers
- Clear distinction between test and production environments
- Always use latest LLM Katan features via unpinned pip installation

Signed-off-by: Yossi Ovadia <[email protected]>

* fix: update e2e test to use model from config.e2e.yaml

- Change test model from 'gemma3:27b' to 'Qwen/Qwen2-0.5B-Instruct'
- Ensures Envoy health check uses a model available in the e2e config
- Fixes 503 errors when checking if Envoy proxy is running

Signed-off-by: Yossi Ovadia <[email protected]>

* Update llm-katan package metadata

- Bump version to 0.1.6 for PyPI publishing
- Change license from MIT to Apache-2.0

🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
Signed-off-by: Yossi Ovadia <[email protected]>

* Fix Apache license classifier in pyproject.toml

- Update license classifier from MIT to Apache Software License
- Bump version to 0.1.7 for corrected license display on PyPI

🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
Signed-off-by: Yossi Ovadia <[email protected]>

* fix: resolve pre-commit hook failures

- Fix markdown linting issues (MD032, MD031, MD047) in README files
- Remove binary distribution files from git tracking
- Add Python build artifacts to .gitignore
- Auto-format Python files with black and isort
- Add CLAUDE.md exclusion to prevent future commits

🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
Signed-off-by: Yossi Ovadia <[email protected]>

* fix: update llm-katan project URLs to vllm-project repository

Update repository URLs in pyproject.toml to point to the correct vllm-project organization instead of a personal fork.

🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
Signed-off-by: Yossi Ovadia <[email protected]>

* fix: revert config.yaml to original main branch version

Revert production config.yaml to its original state from the main branch. The config modifications were not intended for this PR and should remain unchanged to preserve the production configuration.

🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
Signed-off-by: Yossi Ovadia <[email protected]>

* fix: restore config.yaml to match upstream main exactly

Copy config.yaml from upstream main to ensure it matches exactly and includes the health_check_path and other missing fields.

🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
Signed-off-by: Yossi Ovadia <[email protected]>

---------

Signed-off-by: Yossi Ovadia <[email protected]>
Co-authored-by: Claude <[email protected]>
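
The commit message describes LLM Katan as a FastAPI server exposing OpenAI-compatible endpoints (/v1/chat/completions, /v1/models, /health, /metrics) on local ports. A minimal sketch of how an e2e check might call it, using only the standard library; chat_once is an illustrative helper (not part of this PR), and the port/model pair is the qwen-endpoint from config.e2e.yaml below:

import json
import urllib.request

def chat_once(base_url: str, model: str, prompt: str) -> str:
    """POST one chat completion to an OpenAI-compatible server, return the reply text."""
    payload = {"model": model, "messages": [{"role": "user", "content": prompt}]}
    req = urllib.request.Request(
        f"{base_url}/v1/chat/completions",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["choices"][0]["message"]["content"]

if __name__ == "__main__":
    # qwen-endpoint from config.e2e.yaml: 127.0.0.1:8000 serving Qwen/Qwen2-0.5B-Instruct
    print(chat_once("http://127.0.0.1:8000", "Qwen/Qwen2-0.5B-Instruct", "What is 2 + 2?"))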
1 parent f1ec72b commit 858dd50

17 files changed (+1847, -24 lines)

.gitignore

Lines changed: 11 additions & 1 deletion
@@ -13,6 +13,13 @@ __pycache__/
 .venv/
 pii_env/
 
+# Python build artifacts
+dist/
+build/
+*.egg-info/
+*.whl
+*.tar.gz
+
 # Go
 *.exe
 *.exe~
@@ -117,4 +124,7 @@ results/
 .cursorrules.*
 
 # augment editor rules
-.augment
+.augment
+
+# Claude Code configuration (should not be committed)
+CLAUDE.md

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ repos:
         entry: bash -c "make markdown-lint"
         language: system
         files: \.md$
-        exclude: ^(\node_modules/)
+        exclude: ^(\node_modules/|CLAUDE\.md)
 
   # Yaml specific hooks
   - repo: local

config/config.e2e.yaml

Lines changed: 337 additions & 0 deletions
@@ -0,0 +1,337 @@
bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true
semantic_cache:
  enabled: true
  backend_type: "memory"  # Options: "memory" or "milvus"
  similarity_threshold: 0.8
  max_entries: 1000  # Only applies to memory backend
  ttl_seconds: 3600

# For production environments, use Milvus for scalable caching:
# backend_type: "milvus"
# backend_config_path: "config/cache/milvus.yaml"

# Development/Testing: Use in-memory cache (current configuration)
# - Fast startup and no external dependencies
# - Limited to single instance scaling
# - Data lost on restart

# Production: Use Milvus vector database
# - Horizontally scalable and persistent
# - Supports distributed deployments
# - Requires Milvus cluster setup
# - To enable: uncomment the lines above and install Milvus dependencies
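
With the memory backend above, a request is served from cache when its embedding is similar enough (similarity_threshold: 0.8) to a stored entry, capped at max_entries. A toy sketch of that lookup; MemorySemanticCache is illustrative, not the router's implementation:

import math

class MemorySemanticCache:
    """Toy in-memory semantic cache: reuse a response when embeddings are close enough."""

    def __init__(self, similarity_threshold: float = 0.8, max_entries: int = 1000):
        self.similarity_threshold = similarity_threshold
        self.max_entries = max_entries
        self.entries: list[tuple[list[float], str]] = []  # (embedding, cached response)

    @staticmethod
    def _cosine(a: list[float], b: list[float]) -> float:
        dot = sum(x * y for x, y in zip(a, b))
        norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
        return dot / norm if norm else 0.0

    def get(self, query_embedding: list[float]) -> str | None:
        if not self.entries:
            return None
        sim, response = max(
            ((self._cosine(emb, query_embedding), resp) for emb, resp in self.entries),
            key=lambda pair: pair[0],
        )
        # Cache hit only when the best match clears the configured threshold
        return response if sim >= self.similarity_threshold else None

    def put(self, query_embedding: list[float], response: str) -> None:
        if len(self.entries) >= self.max_entries:
            self.entries.pop(0)  # simple FIFO eviction for the sketch
        self.entries.append((query_embedding, response))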
tools:
  enabled: true  # Set to true to enable automatic tool selection
  top_k: 3  # Number of most relevant tools to select
  similarity_threshold: 0.2  # Threshold for tool similarity
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true  # If true, return no tools on failure; if false, return error
prompt_guard:
  enabled: true
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
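
The tools block above selects the top_k most similar tools above similarity_threshold, with fallback_to_empty deciding what happens on failure. A rough sketch of that selection rule, assuming (tool, similarity) pairs already computed by some retrieval step; select_tools is illustrative, not the router's code:

def select_tools(scored_tools, top_k=3, similarity_threshold=0.2, fallback_to_empty=True):
    """scored_tools: (tool_name, similarity) pairs from some retrieval step."""
    try:
        eligible = [(name, sim) for name, sim in scored_tools if sim >= similarity_threshold]
        eligible.sort(key=lambda pair: pair[1], reverse=True)
        return [name for name, _ in eligible[:top_k]]
    except Exception:
        # In the real system the failure would come from the tools DB lookup;
        # fallback_to_empty: true means "fail open" with no tools rather than erroring.
        if fallback_to_empty:
            return []
        raise

print(select_tools([("search", 0.91), ("calculator", 0.15), ("weather", 0.42)]))
# ['search', 'weather']  -- calculator filtered out by the 0.2 threshold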
# vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
vllm_endpoints:
  - name: "endpoint1"
    address: "127.0.0.1"
    port: 11434
    models:
      - "phi4"
      - "gemma3:27b"
    weight: 1  # Load balancing weight
    health_check_path: "/health"  # Optional health check endpoint
  - name: "endpoint2"
    address: "127.0.0.1"
    port: 11434
    models:
      - "mistral-small3.1"
    weight: 1
    health_check_path: "/health"
  - name: "endpoint3"
    address: "127.0.0.1"
    port: 11434
    models:
      - "phi4"  # Same model can be served by multiple endpoints for redundancy
      - "mistral-small3.1"
    weight: 2  # Higher weight for more powerful endpoint
  - name: "qwen-endpoint"
    address: "127.0.0.1"
    port: 8000
    models:
      - "Qwen/Qwen2-0.5B-Instruct"
    weight: 1
    health_check_path: "/health"
  - name: "tinyllama-endpoint"
    address: "127.0.0.1"
    port: 8001
    models:
      - "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    weight: 1
    health_check_path: "/health"
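
Endpoints carry a load-balancing weight (endpoint3 has weight 2), and the same model may appear on several endpoints for redundancy. A sketch of weight-proportional selection among eligible endpoints; how the router actually schedules is not shown in this diff, so treat this as an assumption:

import random

ENDPOINTS = [
    {"name": "endpoint1", "models": ["phi4", "gemma3:27b"], "weight": 1},
    {"name": "endpoint3", "models": ["phi4", "mistral-small3.1"], "weight": 2},
]

def pick_endpoint(model: str, endpoints=ENDPOINTS):
    candidates = [e for e in endpoints if model in e["models"]]
    if not candidates:
        raise LookupError(f"no endpoint serves {model!r}")
    # An endpoint with weight 2 is drawn twice as often as one with weight 1
    return random.choices(candidates, weights=[e["weight"] for e in candidates], k=1)[0]

print(pick_endpoint("phi4")["name"])  # endpoint3 about 2/3 of the time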
model_config:
  phi4:
    pricing:
      currency: USD
      prompt_per_1m: 0.07
      completion_per_1m: 0.35
    pii_policy:
      allow_by_default: false  # Deny all PII by default
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]  # Only allow these specific PII types
    # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
    preferred_endpoints: ["endpoint1", "endpoint3"]
    # Reasoning family - phi4 doesn't support reasoning, so omit this field

  # Example: DeepSeek model with custom name
  "ds-v31-custom":
    reasoning_family: "deepseek"  # This model uses DeepSeek reasoning syntax
    preferred_endpoints: ["endpoint1"]
    pii_policy:
      allow_by_default: true

  # Example: Qwen3 model with custom name
  "my-qwen3-model":
    reasoning_family: "qwen3"  # This model uses Qwen3 reasoning syntax
    preferred_endpoints: ["endpoint2"]
    pii_policy:
      allow_by_default: true

  # Example: GPT-OSS model with custom name
  "custom-gpt-oss":
    reasoning_family: "gpt-oss"  # This model uses GPT-OSS reasoning syntax
    preferred_endpoints: ["endpoint1"]
    pii_policy:
      allow_by_default: true
  "gemma3:27b":
    pricing:
      currency: USD
      prompt_per_1m: 0.067
      completion_per_1m: 0.267
    pii_policy:
      allow_by_default: false  # Deny all PII by default
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]  # Only allow these specific PII types
    preferred_endpoints: ["endpoint1"]
  "mistral-small3.1":
    pricing:
      currency: USD
      prompt_per_1m: 0.1
      completion_per_1m: 0.3
    pii_policy:
      allow_by_default: false  # Deny all PII by default
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]  # Only allow these specific PII types
    preferred_endpoints: ["endpoint2", "endpoint3"]
  "Qwen/Qwen2-0.5B-Instruct":
    reasoning_family: "qwen3"  # This model uses Qwen reasoning syntax
    preferred_endpoints: ["qwen-endpoint"]
    pii_policy:
      allow_by_default: true
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
  "TinyLlama/TinyLlama-1.1B-Chat-v1.0":
    preferred_endpoints: ["tinyllama-endpoint"]
    pii_policy:
      allow_by_default: true
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
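
Each model's pii_policy is default-deny unless allow_by_default is true, with pii_types_allowed as the whitelist. A small illustrative check (pii_allowed is hypothetical; the detected types would come from the PII classifier configured below):

def pii_allowed(detected_types: set[str], policy: dict) -> bool:
    """Return True if every detected PII type is permitted by the model's policy."""
    if policy.get("allow_by_default", False):
        return True  # default-allow: nothing is blocked
    allowed = set(policy.get("pii_types_allowed", []))
    return detected_types <= allowed  # default-deny: all detections must be whitelisted

phi4_policy = {"allow_by_default": False,
               "pii_types_allowed": ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]}
print(pii_allowed({"EMAIL_ADDRESS"}, phi4_policy))     # True
print(pii_allowed({"US_SSN", "PERSON"}, phi4_policy))  # False: US_SSN not whitelisted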
# Classifier configuration for text classification
classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"  # TODO: Use local model for now before the code can download the entire model from huggingface
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
  pii_model:
    model_id: "models/pii_classifier_modernbert-base_presidio_token_model"  # TODO: Use local model for now before the code can download the entire model from huggingface
    use_modernbert: true
    threshold: 0.7
    use_cpu: true
    pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"
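
Both classifiers gate on confidence (0.6 for categories, 0.7 for PII). One plausible reading, sketched below, is that a query whose top category clears the threshold is routed by that category, and anything else falls through to default_model; the fallback detail is an assumption, not stated in this diff:

def resolve_category(scores: dict[str, float], threshold: float = 0.6) -> str | None:
    """Return the top category if its confidence clears the threshold, else None."""
    category, confidence = max(scores.items(), key=lambda kv: kv[1])
    return category if confidence >= threshold else None

print(resolve_category({"math": 0.92, "physics": 0.05}))  # 'math'
print(resolve_category({"math": 0.35, "law": 0.30}))      # None -> caller falls back to default_model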
categories:
  - name: business
    use_reasoning: false
    reasoning_description: "Business content is typically conversational"
    reasoning_effort: low  # Business conversations need low reasoning effort
    model_scores:
      - model: phi4
        score: 0.8
      - model: gemma3:27b
        score: 0.4
      - model: mistral-small3.1
        score: 0.2
  - name: law
    use_reasoning: false
    reasoning_description: "Legal content is typically explanatory"
    model_scores:
      - model: gemma3:27b
        score: 0.8
      - model: phi4
        score: 0.6
      - model: mistral-small3.1
        score: 0.4
  - name: psychology
    use_reasoning: false
    reasoning_description: "Psychology content is usually explanatory"
    model_scores:
      - model: mistral-small3.1
        score: 0.6
      - model: gemma3:27b
        score: 0.4
      - model: phi4
        score: 0.4
  - name: biology
    use_reasoning: true
    reasoning_description: "Biological processes benefit from structured analysis"
    model_scores:
      - model: mistral-small3.1
        score: 0.8
      - model: gemma3:27b
        score: 0.6
      - model: phi4
        score: 0.2
  - name: chemistry
    use_reasoning: true
    reasoning_description: "Chemical reactions and formulas require systematic thinking"
    reasoning_effort: high  # Chemistry requires high reasoning effort
    model_scores:
      - model: mistral-small3.1
        score: 0.8
      - model: gemma3:27b
        score: 0.6
      - model: phi4
        score: 0.6
  - name: history
    use_reasoning: false
    reasoning_description: "Historical content is narrative-based"
    model_scores:
      - model: mistral-small3.1
        score: 0.8
      - model: phi4
        score: 0.6
      - model: gemma3:27b
        score: 0.4
  - name: other
    use_reasoning: false
    reasoning_description: "General content doesn't require reasoning"
    model_scores:
      - model: gemma3:27b
        score: 0.8
      - model: phi4
        score: 0.6
      - model: mistral-small3.1
        score: 0.6
  - name: health
    use_reasoning: false
    reasoning_description: "Health information is typically informational"
    model_scores:
      - model: gemma3:27b
        score: 0.8
      - model: phi4
        score: 0.8
      - model: mistral-small3.1
        score: 0.6
  - name: economics
    use_reasoning: false
    reasoning_description: "Economic discussions are usually explanatory"
    model_scores:
      - model: gemma3:27b
        score: 0.8
      - model: mistral-small3.1
        score: 0.8
      - model: phi4
        score: 0.0
  - name: math
    use_reasoning: true
    reasoning_description: "Mathematical problems require step-by-step reasoning"
    reasoning_effort: high  # Math problems need high reasoning effort
    model_scores:
      - model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
        score: 1.0
      - model: phi4
        score: 0.9
      - model: mistral-small3.1
        score: 0.8
      - model: gemma3:27b
        score: 0.6
  - name: physics
    use_reasoning: true
    reasoning_description: "Physics concepts need logical analysis"
    model_scores:
      - model: gemma3:27b
        score: 0.4
      - model: phi4
        score: 0.4
      - model: mistral-small3.1
        score: 0.4
  - name: computer science
    use_reasoning: true
    reasoning_description: "Programming and algorithms need logical reasoning"
    model_scores:
      - model: gemma3:27b
        score: 0.6
      - model: mistral-small3.1
        score: 0.6
      - model: phi4
        score: 0.0
  - name: philosophy
    use_reasoning: false
    reasoning_description: "Philosophical discussions are conversational"
    model_scores:
      - model: phi4
        score: 0.6
      - model: gemma3:27b
        score: 0.2
      - model: mistral-small3.1
        score: 0.2
  - name: engineering
    use_reasoning: true
    reasoning_description: "Engineering problems require systematic problem-solving"
    model_scores:
      - model: gemma3:27b
        score: 0.6
      - model: mistral-small3.1
        score: 0.6
      - model: phi4
        score: 0.2

default_model: mistral-small3.1
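
Within a matched category, model_scores ranks the candidates; math scoring TinyLlama at 1.0 is what steers math queries to the lightweight TinyLlama endpoint in the e2e setup. An argmax sketch with default_model as the fallback (route is illustrative, not the router's code):

CATEGORIES = {
    "math": [("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 1.0), ("phi4", 0.9),
             ("mistral-small3.1", 0.8), ("gemma3:27b", 0.6)],
    "business": [("phi4", 0.8), ("gemma3:27b", 0.4), ("mistral-small3.1", 0.2)],
}
DEFAULT_MODEL = "mistral-small3.1"

def route(category: str | None) -> str:
    scores = CATEGORIES.get(category or "", [])
    # Pick the highest-scoring model for the category, else fall back to the default
    return max(scores, key=lambda ms: ms[1])[0] if scores else DEFAULT_MODEL

print(route("math"))  # TinyLlama/TinyLlama-1.1B-Chat-v1.0
print(route(None))    # mistral-small3.1 (default_model)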
# API Configuration
api:
  batch_classification:
    # Metrics configuration for monitoring batch classification performance
    metrics:
      enabled: true  # Enable comprehensive metrics collection
      detailed_goroutine_tracking: true  # Track individual goroutine lifecycle
      high_resolution_timing: false  # Use nanosecond precision timing
      sample_rate: 1.0  # Collect metrics for all requests (1.0 = 100%, 0.5 = 50%)
      # Histogram buckets for metrics (directly configure what you need)
      duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
# Reasoning family configurations - define how different model families handle reasoning syntax
reasoning_families:
  deepseek:
    type: "chat_template_kwargs"
    parameter: "thinking"

  qwen3:
    type: "chat_template_kwargs"
    parameter: "enable_thinking"

  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

  gpt:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

# Global default reasoning effort level
default_reasoning_effort: medium  # Default reasoning effort level (low, medium, high)
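
The two reasoning-family types correspond to different request-body shapes: chat_template_kwargs families toggle a boolean template parameter, while reasoning_effort families pass an effort level. An illustrative sketch of how a request might be decorated (build_request is hypothetical; the exact wire format the router emits is not shown in this diff):

def build_request(model: str, family: dict, effort: str, messages: list) -> dict:
    body = {"model": model, "messages": messages}
    if family["type"] == "chat_template_kwargs":
        # e.g. qwen3 -> {"chat_template_kwargs": {"enable_thinking": true}}
        body["chat_template_kwargs"] = {family["parameter"]: True}
    elif family["type"] == "reasoning_effort":
        # e.g. gpt-oss -> {"reasoning_effort": "high"}
        body[family["parameter"]] = effort
    return body

qwen3 = {"type": "chat_template_kwargs", "parameter": "enable_thinking"}
print(build_request("Qwen/Qwen2-0.5B-Instruct", qwen3, "medium",
                    [{"role": "user", "content": "Prove 1 + 1 = 2."}]))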
