---
# BERT model settings: model identifier, a threshold, and a CPU toggle.
# NOTE(review): presumably the sentence-embedding model used for similarity
# matching; confirm against the consumer of this file.
bert_model:
  model_id: "sentence-transformers/all-MiniLM-L12-v2"
  threshold: 0.6
  use_cpu: true
# Semantic cache settings. The active configuration uses the in-memory backend.
semantic_cache:
  enabled: true
  backend_type: "memory"  # Options: "memory" or "milvus"
  similarity_threshold: 0.8
  max_entries: 1000  # Only applies to the memory backend
  ttl_seconds: 3600

  # Development/testing: in-memory cache (current configuration)
  #   - Fast startup and no external dependencies
  #   - Limited to single-instance scaling
  #   - Data lost on restart
  #
  # Production: Milvus vector database for scalable caching
  #   - Horizontally scalable and persistent
  #   - Supports distributed deployments
  #   - Requires a Milvus cluster setup and the Milvus dependencies
  #
  # To switch to Milvus, change backend_type above to "milvus" (do not add a
  # second backend_type key: duplicate keys are invalid YAML and most parsers
  # silently keep only the last one), then uncomment the line below:
  # backend_config_path: "config/cache/milvus.yaml"
# Automatic tool selection settings.
tools:
  enabled: true  # Set to true to enable automatic tool selection
  top_k: 3  # Number of most relevant tools to select
  similarity_threshold: 0.2  # Threshold for tool similarity
  tools_db_path: "config/tools_db.json"
  # If true, return no tools on failure; if false, return an error.
  fallback_to_empty: true
# Prompt guard (jailbreak classifier) settings.
# model_id and jailbreak_mapping_path both point into the same local model
# directory, so they must be updated together if the model is relocated.
prompt_guard:
  enabled: true
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# vLLM endpoints configuration - supports multiple endpoints, and each endpoint
# can serve multiple models.
# NOTE(review): endpoint1, endpoint2 and endpoint3 all use 127.0.0.1:11434 in
# this file; confirm that is intended and not a copy/paste placeholder.
vllm_endpoints:
  - name: "endpoint1"
    address: "127.0.0.1"
    port: 11434
    models:
      - "phi4"
      - "gemma3:27b"
    weight: 1  # Load balancing weight
    health_check_path: "/health"  # Optional health check endpoint
  - name: "endpoint2"
    address: "127.0.0.1"
    port: 11434
    models:
      - "mistral-small3.1"
    weight: 1
    health_check_path: "/health"
  # endpoint3 sets no health_check_path (the key is marked optional above).
  - name: "endpoint3"
    address: "127.0.0.1"
    port: 11434
    models:
      - "phi4"  # Same model can be served by multiple endpoints for redundancy
      - "mistral-small3.1"
    weight: 2  # Higher weight for more powerful endpoint
  - name: "qwen-endpoint"
    address: "127.0.0.1"
    port: 8000
    models:
      - "Qwen/Qwen2-0.5B-Instruct"
    weight: 1
    health_check_path: "/health"
  - name: "tinyllama-endpoint"
    address: "127.0.0.1"
    port: 8001
    models:
      - "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    weight: 1
    health_check_path: "/health"

# Per-model settings, keyed by model name. Each entry may carry pricing, a PII
# policy, the endpoints preferred for the model, and a reasoning family.
# Model names are quoted throughout because several contain ':', '/' or '.'.
model_config:
  "phi4":
    pricing:
      currency: USD
      prompt_per_1m: 0.07
      completion_per_1m: 0.35
    pii_policy:
      allow_by_default: false  # Deny all PII by default
      # Only allow these specific PII types
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
    # Specify which endpoints can serve this model (optional - if not
    # specified, uses all endpoints that list this model)
    preferred_endpoints: ["endpoint1", "endpoint3"]
    # Reasoning family - phi4 doesn't support reasoning, so omit this field

  # Example: DeepSeek model with custom name
  # NOTE(review): endpoint1 does not list "ds-v31-custom" under its models in
  # vllm_endpoints; confirm whether the consumer requires that.
  "ds-v31-custom":
    reasoning_family: "deepseek"  # This model uses DeepSeek reasoning syntax
    preferred_endpoints: ["endpoint1"]
    pii_policy:
      allow_by_default: true

  # Example: Qwen3 model with custom name
  # NOTE(review): endpoint2 does not list "my-qwen3-model" under its models.
  "my-qwen3-model":
    reasoning_family: "qwen3"  # This model uses Qwen3 reasoning syntax
    preferred_endpoints: ["endpoint2"]
    pii_policy:
      allow_by_default: true

  # Example: GPT-OSS model with custom name
  # NOTE(review): endpoint1 does not list "custom-gpt-oss" under its models.
  "custom-gpt-oss":
    reasoning_family: "gpt-oss"  # This model uses GPT-OSS reasoning syntax
    preferred_endpoints: ["endpoint1"]
    pii_policy:
      allow_by_default: true

  "gemma3:27b":
    pricing:
      currency: USD
      prompt_per_1m: 0.067
      completion_per_1m: 0.267
    pii_policy:
      allow_by_default: false  # Deny all PII by default
      # Only allow these specific PII types
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
    preferred_endpoints: ["endpoint1"]

  "mistral-small3.1":
    pricing:
      currency: USD
      prompt_per_1m: 0.1
      completion_per_1m: 0.3
    pii_policy:
      allow_by_default: false  # Deny all PII by default
      # Only allow these specific PII types
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]
    preferred_endpoints: ["endpoint2", "endpoint3"]

  "Qwen/Qwen2-0.5B-Instruct":
    reasoning_family: "qwen3"  # This model uses Qwen reasoning syntax
    preferred_endpoints: ["qwen-endpoint"]
    pii_policy:
      # NOTE(review): allow_by_default is true and an allow-list is also
      # given; presumably the list is redundant in that case - confirm.
      allow_by_default: true
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]

  "TinyLlama/TinyLlama-1.1B-Chat-v1.0":
    preferred_endpoints: ["tinyllama-endpoint"]
    pii_policy:
      # NOTE(review): same allow_by_default + allow-list combination as the
      # Qwen entry above - confirm the intended semantics.
      allow_by_default: true
      pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]

# Classifier configuration for text classification.
# Each model entry pairs a local model directory with a mapping JSON file
# located inside that same directory.
classifier:
  category_model:
    # TODO: Use local model for now before the code can download the entire
    # model from huggingface
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
  pii_model:
    # TODO: Use local model for now before the code can download the entire
    # model from huggingface
    model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
    use_modernbert: true
    threshold: 0.7
    use_cpu: true
    pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"
# Per-category routing table. Each category declares whether reasoning is used,
# a human-readable rationale, an optional reasoning_effort, and a score per
# candidate model.
# NOTE(review): presumably the highest-scoring model is preferred for a
# category - confirm against the consumer of this file.
categories:
  - name: business
    use_reasoning: false
    reasoning_description: "Business content is typically conversational"
    # Business conversations need low reasoning effort.
    # NOTE(review): use_reasoning is false here, so confirm this is consulted.
    reasoning_effort: low
    model_scores:
      - model: "phi4"
        score: 0.8
      - model: "gemma3:27b"
        score: 0.4
      - model: "mistral-small3.1"
        score: 0.2
  - name: law
    use_reasoning: false
    reasoning_description: "Legal content is typically explanatory"
    model_scores:
      - model: "gemma3:27b"
        score: 0.8
      - model: "phi4"
        score: 0.6
      - model: "mistral-small3.1"
        score: 0.4
  - name: psychology
    use_reasoning: false
    reasoning_description: "Psychology content is usually explanatory"
    model_scores:
      - model: "mistral-small3.1"
        score: 0.6
      - model: "gemma3:27b"
        score: 0.4
      - model: "phi4"
        score: 0.4
  - name: biology
    use_reasoning: true
    reasoning_description: "Biological processes benefit from structured analysis"
    model_scores:
      - model: "mistral-small3.1"
        score: 0.8
      - model: "gemma3:27b"
        score: 0.6
      - model: "phi4"
        score: 0.2
  - name: chemistry
    use_reasoning: true
    reasoning_description: "Chemical reactions and formulas require systematic thinking"
    reasoning_effort: high  # Chemistry requires high reasoning effort
    model_scores:
      - model: "mistral-small3.1"
        score: 0.8
      - model: "gemma3:27b"
        score: 0.6
      - model: "phi4"
        score: 0.6
  - name: history
    use_reasoning: false
    reasoning_description: "Historical content is narrative-based"
    model_scores:
      - model: "mistral-small3.1"
        score: 0.8
      - model: "phi4"
        score: 0.6
      - model: "gemma3:27b"
        score: 0.4
  - name: other
    use_reasoning: false
    reasoning_description: "General content doesn't require reasoning"
    model_scores:
      - model: "gemma3:27b"
        score: 0.8
      - model: "phi4"
        score: 0.6
      - model: "mistral-small3.1"
        score: 0.6
  - name: health
    use_reasoning: false
    reasoning_description: "Health information is typically informational"
    model_scores:
      - model: "gemma3:27b"
        score: 0.8
      - model: "phi4"
        score: 0.8
      - model: "mistral-small3.1"
        score: 0.6
  - name: economics
    use_reasoning: false
    reasoning_description: "Economic discussions are usually explanatory"
    model_scores:
      - model: "gemma3:27b"
        score: 0.8
      - model: "mistral-small3.1"
        score: 0.8
      - model: "phi4"
        score: 0.0
  - name: math
    use_reasoning: true
    reasoning_description: "Mathematical problems require step-by-step reasoning"
    reasoning_effort: high  # Math problems need high reasoning effort
    model_scores:
      - model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        score: 1.0
      - model: "phi4"
        score: 0.9
      - model: "mistral-small3.1"
        score: 0.8
      - model: "gemma3:27b"
        score: 0.6
  - name: physics
    use_reasoning: true
    reasoning_description: "Physics concepts need logical analysis"
    model_scores:
      - model: "gemma3:27b"
        score: 0.4
      - model: "phi4"
        score: 0.4
      - model: "mistral-small3.1"
        score: 0.4
  - name: computer science
    use_reasoning: true
    reasoning_description: "Programming and algorithms need logical reasoning"
    model_scores:
      - model: "gemma3:27b"
        score: 0.6
      - model: "mistral-small3.1"
        score: 0.6
      - model: "phi4"
        score: 0.0
  - name: philosophy
    use_reasoning: false
    reasoning_description: "Philosophical discussions are conversational"
    model_scores:
      - model: "phi4"
        score: 0.6
      - model: "gemma3:27b"
        score: 0.2
      - model: "mistral-small3.1"
        score: 0.2
  - name: engineering
    use_reasoning: true
    reasoning_description: "Engineering problems require systematic problem-solving"
    model_scores:
      - model: "gemma3:27b"
        score: 0.6
      - model: "mistral-small3.1"
        score: 0.6
      - model: "phi4"
        score: 0.2

# Default model name; it matches a key in model_config and a model listed
# under vllm_endpoints in this file.
default_model: "mistral-small3.1"

# API configuration
api:
  batch_classification:
    # Metrics configuration for monitoring batch classification performance
    metrics:
      enabled: true  # Enable comprehensive metrics collection
      detailed_goroutine_tracking: true  # Track individual goroutine lifecycle
      high_resolution_timing: false  # Use nanosecond precision timing
      # Fraction of requests to collect metrics for (1.0 = 100%, 0.5 = 50%)
      sample_rate: 1.0
      # Histogram buckets for metrics (directly configure what you need)
      duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

# Reasoning family configurations - define how different model families handle
# reasoning syntax. Each family names a mechanism (type) and the parameter
# used with it; model_config entries refer to these families by key through
# reasoning_family.
reasoning_families:
  deepseek:
    type: "chat_template_kwargs"
    parameter: "thinking"

  qwen3:
    type: "chat_template_kwargs"
    parameter: "enable_thinking"

  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

  gpt:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

# Global default reasoning effort level (low, medium, high).
# NOTE(review): presumably applies to categories that set no reasoning_effort
# of their own - confirm against the consumer of this file.
default_reasoning_effort: medium
# 0 commit comments