diff --git a/config/config.yaml b/config/config.yaml index 7db18883..5ad0dbd5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,258 +1,266 @@ -bert_model: - model_id: sentence-transformers/all-MiniLM-L12-v2 - threshold: 0.6 - use_cpu: true -semantic_cache: - enabled: true - similarity_threshold: 0.8 - max_entries: 1000 - ttl_seconds: 3600 -tools: - enabled: true # Set to true to enable automatic tool selection - top_k: 3 # Number of most relevant tools to select - similarity_threshold: 0.2 # Threshold for tool similarity - tools_db_path: "config/tools_db.json" - fallback_to_empty: true # If true, return no tools on failure; if false, return error -prompt_guard: - enabled: true - use_modernbert: true - model_id: "models/jailbreak_classifier_modernbert-base_model" - threshold: 0.7 - use_cpu: true - jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" -gpu_config: - flops: 312000000000000 # 312e12 fp16 - hbm: 2000000000000 # 2e12 (2 TB/s) - description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf - -# vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models -vllm_endpoints: - - name: "endpoint1" - address: "127.0.0.1" - port: 11434 - models: - - "phi4" - - "gemma3:27b" - weight: 1 # Load balancing weight - health_check_path: "/health" # Optional health check endpoint - - name: "endpoint2" - address: "127.0.0.1" - port: 11434 - models: - - "mistral-small3.1" - weight: 1 - health_check_path: "/health" - - name: "endpoint3" - address: "127.0.0.1" - port: 11434 - models: - - "phi4" # Same model can be served by multiple endpoints for redundancy - - "mistral-small3.1" - weight: 2 # Higher weight for more powerful endpoint - -model_config: - phi4: - param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4 - batch_size: 512.0 # vLLM default batch size - context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4 - pii_policy: - allow_by_default: false # Deny all PII by default - pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types - # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model) - preferred_endpoints: ["endpoint1", "endpoint3"] - gemma3:27b: - param_count: 27000000000 # 27B parameters (base version) - batch_size: 512.0 - context_size: 16384.0 - pii_policy: - allow_by_default: false # Deny all PII by default - pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types - preferred_endpoints: ["endpoint1"] - "mistral-small3.1": - param_count: 22000000000 - batch_size: 512.0 - context_size: 16384.0 - pii_policy: - allow_by_default: false # Deny all PII by default - pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types - preferred_endpoints: ["endpoint2", "endpoint3"] - -# Classifier configuration for text classification -classifier: - category_model: - model_id: "models/category_classifier_modernbert-base_model" #TODO: Use local model for now before the code can download the entire model from huggingface - use_modernbert: true - threshold: 0.6 - use_cpu: true - category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" - pii_model: - model_id: "models/pii_classifier_modernbert-base_presidio_token_model" #TODO: Use local 
model for now before the code can download the entire model from huggingface - use_modernbert: true - threshold: 0.7 - use_cpu: true - pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" - load_aware: false -categories: -- name: business - use_reasoning: false - reasoning_description: "Business content is typically conversational" - reasoning_effort: low # Business conversations need low reasoning effort - model_scores: - - model: phi4 - score: 0.8 - - model: gemma3:27b - score: 0.4 - - model: mistral-small3.1 - score: 0.2 -- name: law - use_reasoning: false - reasoning_description: "Legal content is typically explanatory" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.6 - - model: mistral-small3.1 - score: 0.4 -- name: psychology - use_reasoning: false - reasoning_description: "Psychology content is usually explanatory" - model_scores: - - model: mistral-small3.1 - score: 0.6 - - model: gemma3:27b - score: 0.4 - - model: phi4 - score: 0.4 -- name: biology - use_reasoning: true - reasoning_description: "Biological processes benefit from structured analysis" - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 - - model: phi4 - score: 0.2 -- name: chemistry - use_reasoning: true - reasoning_description: "Chemical reactions and formulas require systematic thinking" - reasoning_effort: high # Chemistry requires high reasoning effort - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 - - model: phi4 - score: 0.6 -- name: history - use_reasoning: false - reasoning_description: "Historical content is narrative-based" - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: phi4 - score: 0.6 - - model: gemma3:27b - score: 0.4 -- name: other - use_reasoning: false - reasoning_description: "General content doesn't require reasoning" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.6 - - model: mistral-small3.1 - score: 0.6 -- name: health - use_reasoning: false - reasoning_description: "Health information is typically informational" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.8 - - model: mistral-small3.1 - score: 0.6 -- name: economics - use_reasoning: false - reasoning_description: "Economic discussions are usually explanatory" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: mistral-small3.1 - score: 0.8 - - model: phi4 - score: 0.0 -- name: math - use_reasoning: true - reasoning_description: "Mathematical problems require step-by-step reasoning" - reasoning_effort: high # Math problems need high reasoning effort - model_scores: - - model: phi4 - score: 1.0 - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 -- name: physics - use_reasoning: true - reasoning_description: "Physics concepts need logical analysis" - model_scores: - - model: gemma3:27b - score: 0.4 - - model: phi4 - score: 0.4 - - model: mistral-small3.1 - score: 0.4 -- name: computer science - use_reasoning: true - reasoning_description: "Programming and algorithms need logical reasoning" - model_scores: - - model: gemma3:27b - score: 0.6 - - model: mistral-small3.1 - score: 0.6 - - model: phi4 - score: 0.0 -- name: philosophy - use_reasoning: false - reasoning_description: "Philosophical discussions are conversational" - model_scores: - - model: phi4 - score: 0.6 - - model: gemma3:27b - score: 0.2 - - model: mistral-small3.1 - score: 0.2 -- name: engineering - 
use_reasoning: true - reasoning_description: "Engineering problems require systematic problem-solving" - model_scores: - - model: gemma3:27b - score: 0.6 - - model: mistral-small3.1 - score: 0.6 - - model: phi4 - score: 0.2 -default_model: mistral-small3.1 -default_reasoning_effort: medium # Default reasoning effort level (low, medium, high) - -# API Configuration -api: - batch_classification: - max_batch_size: 100 # Maximum number of texts in a single batch - concurrency_threshold: 5 # Switch to concurrent processing when batch size > this value - max_concurrency: 8 # Maximum number of concurrent goroutines - - # Metrics configuration for monitoring batch classification performance - metrics: - enabled: true # Enable comprehensive metrics collection - detailed_goroutine_tracking: true # Track individual goroutine lifecycle - high_resolution_timing: false # Use nanosecond precision timing - sample_rate: 1.0 # Collect metrics for all requests (1.0 = 100%, 0.5 = 50%) - - # Histogram buckets for metrics (directly configure what you need) - duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] - size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] +bert_model: + model_id: sentence-transformers/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true +semantic_cache: + enabled: true + similarity_threshold: 0.8 + max_entries: 1000 + ttl_seconds: 3600 +tools: + enabled: true # Set to true to enable automatic tool selection + top_k: 3 # Number of most relevant tools to select + similarity_threshold: 0.2 # Threshold for tool similarity + tools_db_path: "config/tools_db.json" + fallback_to_empty: true # If true, return no tools on failure; if false, return error +prompt_guard: + enabled: true + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" +gpu_config: + flops: 312000000000000 # 312e12 fp16 + hbm: 2000000000000 # 2e12 (2 TB/s) + description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf + +# vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models +vllm_endpoints: + - name: "endpoint1" + address: "127.0.0.1" + port: 11434 + models: + - "phi4" + - "gemma3:27b" + weight: 1 # Load balancing weight + health_check_path: "/health" # Optional health check endpoint + - name: "endpoint2" + address: "127.0.0.1" + port: 11434 + models: + - "mistral-small3.1" + weight: 1 + health_check_path: "/health" + - name: "endpoint3" + address: "127.0.0.1" + port: 11434 + models: + - "phi4" # Same model can be served by multiple endpoints for redundancy + - "mistral-small3.1" + weight: 2 # Higher weight for more powerful endpoint + +model_config: + phi4: + param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4 + batch_size: 512.0 # vLLM default batch size + context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4 + pricing: + prompt_usd_per_1m: 200.0 + completion_usd_per_1m: 600.0 + pii_policy: + allow_by_default: false # Deny all PII by default + pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types + # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model) + preferred_endpoints: ["endpoint1", "endpoint3"] + gemma3:27b: + param_count: 27000000000 # 27B 
parameters (base version) + batch_size: 512.0 + context_size: 16384.0 + pricing: + prompt_usd_per_1m: 500.0 + completion_usd_per_1m: 1500.0 + pii_policy: + allow_by_default: false # Deny all PII by default + pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types + preferred_endpoints: ["endpoint1"] + "mistral-small3.1": + param_count: 22000000000 + batch_size: 512.0 + context_size: 16384.0 + pricing: + prompt_usd_per_1m: 300.0 + completion_usd_per_1m: 900.0 + pii_policy: + allow_by_default: false # Deny all PII by default + pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types + preferred_endpoints: ["endpoint2", "endpoint3"] + +# Classifier configuration for text classification +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" #TODO: Use local model for now before the code can download the entire model from huggingface + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" #TODO: Use local model for now before the code can download the entire model from huggingface + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + load_aware: false +categories: +- name: business + use_reasoning: false + reasoning_description: "Business content is typically conversational" + reasoning_effort: low # Business conversations need low reasoning effort + model_scores: + - model: phi4 + score: 0.8 + - model: gemma3:27b + score: 0.4 + - model: mistral-small3.1 + score: 0.2 +- name: law + use_reasoning: false + reasoning_description: "Legal content is typically explanatory" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.6 + - model: mistral-small3.1 + score: 0.4 +- name: psychology + use_reasoning: false + reasoning_description: "Psychology content is usually explanatory" + model_scores: + - model: mistral-small3.1 + score: 0.6 + - model: gemma3:27b + score: 0.4 + - model: phi4 + score: 0.4 +- name: biology + use_reasoning: true + reasoning_description: "Biological processes benefit from structured analysis" + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - model: phi4 + score: 0.2 +- name: chemistry + use_reasoning: true + reasoning_description: "Chemical reactions and formulas require systematic thinking" + reasoning_effort: high # Chemistry requires high reasoning effort + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - model: phi4 + score: 0.6 +- name: history + use_reasoning: false + reasoning_description: "Historical content is narrative-based" + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: phi4 + score: 0.6 + - model: gemma3:27b + score: 0.4 +- name: other + use_reasoning: false + reasoning_description: "General content doesn't require reasoning" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.6 + - model: mistral-small3.1 + score: 0.6 +- name: health + use_reasoning: false + reasoning_description: "Health information is typically informational" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.8 + - model: mistral-small3.1 + score: 0.6 +- name: economics + use_reasoning: 
false + reasoning_description: "Economic discussions are usually explanatory" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: mistral-small3.1 + score: 0.8 + - model: phi4 + score: 0.0 +- name: math + use_reasoning: true + reasoning_description: "Mathematical problems require step-by-step reasoning" + reasoning_effort: high # Math problems need high reasoning effort + model_scores: + - model: phi4 + score: 1.0 + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 +- name: physics + use_reasoning: true + reasoning_description: "Physics concepts need logical analysis" + model_scores: + - model: gemma3:27b + score: 0.4 + - model: phi4 + score: 0.4 + - model: mistral-small3.1 + score: 0.4 +- name: computer science + use_reasoning: true + reasoning_description: "Programming and algorithms need logical reasoning" + model_scores: + - model: gemma3:27b + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - model: phi4 + score: 0.0 +- name: philosophy + use_reasoning: false + reasoning_description: "Philosophical discussions are conversational" + model_scores: + - model: phi4 + score: 0.6 + - model: gemma3:27b + score: 0.2 + - model: mistral-small3.1 + score: 0.2 +- name: engineering + use_reasoning: true + reasoning_description: "Engineering problems require systematic problem-solving" + model_scores: + - model: gemma3:27b + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - model: phi4 + score: 0.2 +default_model: mistral-small3.1 +default_reasoning_effort: medium # Default reasoning effort level (low, medium, high) + +# API Configuration +api: + batch_classification: + max_batch_size: 100 # Maximum number of texts in a single batch + concurrency_threshold: 5 # Switch to concurrent processing when batch size > this value + max_concurrency: 8 # Maximum number of concurrent goroutines + + # Metrics configuration for monitoring batch classification performance + metrics: + enabled: true # Enable comprehensive metrics collection + detailed_goroutine_tracking: true # Track individual goroutine lifecycle + high_resolution_timing: false # Use nanosecond precision timing + sample_rate: 1.0 # Collect metrics for all requests (1.0 = 100%, 0.5 = 50%) + # Histogram buckets for metrics (directly configure what you need) + duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] + size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index 4a8aa580..cc0fb15c 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -192,6 +192,13 @@ type VLLMEndpoint struct { } // ModelParams represents configuration for model-specific parameters +type ModelPricing struct { + // Price in USD per 1M prompt tokens + PromptUSDPer1M float64 `yaml:"prompt_usd_per_1m,omitempty"` + // Price in USD per 1M completion tokens + CompletionUSDPer1M float64 `yaml:"completion_usd_per_1m,omitempty"` +} + type ModelParams struct { // Number of parameters in the model ParamCount float64 `yaml:"param_count"` @@ -207,6 +214,9 @@ type ModelParams struct { // Preferred endpoints for this model (optional) PreferredEndpoints []string `yaml:"preferred_endpoints,omitempty"` + + // Optional pricing used for cost computation + Pricing ModelPricing `yaml:"pricing,omitempty"` } // PIIPolicy represents the PII (Personally Identifiable Information) policy for a model @@ -364,6 +374,17 @@ func (c *RouterConfig) GetModelContextSize(modelName string, 
defaultValue float6 return defaultValue } +// GetModelPricing returns pricing in USD per 1M tokens for prompt and completion. +func (c *RouterConfig) GetModelPricing(modelName string) (promptUSDPer1M float64, completionUSDPer1M float64, ok bool) { + if modelConfig, okc := c.ModelConfig[modelName]; okc { + p := modelConfig.Pricing + if p.PromptUSDPer1M != 0 || p.CompletionUSDPer1M != 0 { + return p.PromptUSDPer1M, p.CompletionUSDPer1M, true + } + } + return 0, 0, false +} + // GetModelPIIPolicy returns the PII policy for a given model // If the model is not found in the config, returns a default policy that allows all PII func (c *RouterConfig) GetModelPIIPolicy(modelName string) PIIPolicy { diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 31db6d7a..0bfa4431 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -14,6 +14,7 @@ import ( "github.com/vllm-project/semantic-router/semantic-router/pkg/cache" "github.com/vllm-project/semantic-router/semantic-router/pkg/metrics" + "github.com/vllm-project/semantic-router/semantic-router/pkg/observability" "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/http" "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/pii" ) @@ -173,7 +174,7 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo userContent, nonUserMessages := extractUserAndNonUserContent(openAIRequest) // Perform security checks - if response, shouldReturn := r.performSecurityChecks(userContent, nonUserMessages); shouldReturn { + if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages); shouldReturn { return response, nil } @@ -187,7 +188,7 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo } // performSecurityChecks performs PII and jailbreak detection -func (r *OpenAIRouter) performSecurityChecks(userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) { +func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) { // Perform PII classification on all message content allContent := pii.ExtractAllContent(userContent, nonUserMessages) @@ -212,6 +213,13 @@ func (r *OpenAIRouter) performSecurityChecks(userContent string, nonUserMessages log.Printf("JAILBREAK ATTEMPT BLOCKED: %s (confidence: %.3f)", jailbreakType, confidence) // Return immediate jailbreak violation response + // Structured log for security block + observability.LogEvent("security_block", map[string]interface{}{ + "reason_code": "jailbreak_detected", + "jailbreak_type": jailbreakType, + "confidence": confidence, + "request_id": ctx.RequestID, + }) jailbreakResponse := http.CreateJailbreakViolationResponse(jailbreakType, confidence) return jailbreakResponse, true } else { @@ -241,6 +249,13 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingR if err != nil { log.Printf("Error searching cache: %v", err) } else if found { + // Record and log cache hit + metrics.RecordCacheHit() + observability.LogEvent("cache_hit", map[string]interface{}{ + "request_id": ctx.RequestID, + "model": requestModel, + "query": requestQuery, + }) // Return immediate response from cache response := http.CreateCacheHitResponse(cachedResponse) return response, true @@ -313,6 +328,8 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest 
*openai.ChatCompletionNe // Select the best allowed model from this category matchedModel = r.Classifier.SelectBestModelFromList(allowedModels, categoryName) log.Printf("Selected alternative model %s that passes PII policy", matchedModel) + // Record reason code for selecting alternative due to PII + metrics.RecordRoutingReasonCode("pii_policy_alternative_selected", matchedModel) } else { log.Printf("No models in category %s pass PII policy, using default", categoryName) matchedModel = r.Config.DefaultModel @@ -320,12 +337,24 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe defaultAllowed, defaultDeniedPII, _ := r.PIIChecker.CheckPolicy(matchedModel, detectedPII) if !defaultAllowed { log.Printf("Default model also violates PII policy, returning error") + observability.LogEvent("routing_block", map[string]interface{}{ + "reason_code": "pii_policy_denied_default_model", + "request_id": ctx.RequestID, + "model": matchedModel, + "denied_pii": defaultDeniedPII, + }) piiResponse := http.CreatePIIViolationResponse(matchedModel, defaultDeniedPII) return piiResponse, nil } } } else { log.Printf("Could not determine category, returning PII violation for model %s", matchedModel) + observability.LogEvent("routing_block", map[string]interface{}{ + "reason_code": "pii_policy_denied", + "request_id": ctx.RequestID, + "model": matchedModel, + "denied_pii": deniedPII, + }) piiResponse := http.CreatePIIViolationResponse(matchedModel, deniedPII) return piiResponse, nil } @@ -424,6 +453,20 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe } log.Printf("Use new model: %s", matchedModel) + + // Structured log for routing decision (auto) + observability.LogEvent("routing_decision", map[string]interface{}{ + "reason_code": "auto_routing", + "request_id": ctx.RequestID, + "original_model": originalModel, + "selected_model": matchedModel, + "category": categoryName, + "reasoning_enabled": useReasoning, + "reasoning_effort": effortForMetrics, + "selected_endpoint": selectedEndpoint, + "routing_latency_ms": time.Since(ctx.ProcessingStartTime).Milliseconds(), + }) + metrics.RecordRoutingReasonCode("auto_routing", matchedModel) } } } else if originalModel != "auto" { @@ -438,6 +481,12 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe // Continue with request on error } else if !allowed { log.Printf("Model %s violates PII policy, returning error", originalModel) + observability.LogEvent("routing_block", map[string]interface{}{ + "reason_code": "pii_policy_denied", + "request_id": ctx.RequestID, + "model": originalModel, + "denied_pii": deniedPII, + }) piiResponse := http.CreatePIIViolationResponse(originalModel, deniedPII) return piiResponse, nil } @@ -472,6 +521,19 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe }, }, } + // Structured log for routing decision (explicit model) + observability.LogEvent("routing_decision", map[string]interface{}{ + "reason_code": "model_specified", + "request_id": ctx.RequestID, + "original_model": originalModel, + "selected_model": originalModel, + "category": "", + "reasoning_enabled": false, + "reasoning_effort": "", + "selected_endpoint": selectedEndpoint, + "routing_latency_ms": time.Since(ctx.ProcessingStartTime).Milliseconds(), + }) + metrics.RecordRoutingReasonCode("model_specified", originalModel) } // Save the actual model that will be used for token tracking diff --git a/src/semantic-router/pkg/extproc/response_handler.go 
b/src/semantic-router/pkg/extproc/response_handler.go index 7015acc6..1fdbe7bf 100644 --- a/src/semantic-router/pkg/extproc/response_handler.go +++ b/src/semantic-router/pkg/extproc/response_handler.go @@ -9,6 +9,7 @@ import ( "github.com/openai/openai-go" "github.com/vllm-project/semantic-router/semantic-router/pkg/metrics" + "github.com/vllm-project/semantic-router/semantic-router/pkg/observability" ) // handleResponseHeaders processes the response headers @@ -52,6 +53,35 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response ) metrics.RecordModelCompletionLatency(ctx.RequestModel, completionLatency.Seconds()) r.Classifier.DecrementModelLoad(ctx.RequestModel) + + // Compute and record cost if pricing is configured + if r.Config != nil { + promptRatePer1M, completionRatePer1M, ok := r.Config.GetModelPricing(ctx.RequestModel) + if ok { + costUSD := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0 + metrics.RecordModelCostUSD(ctx.RequestModel, costUSD) + observability.LogEvent("llm_usage", map[string]interface{}{ + "request_id": ctx.RequestID, + "model": ctx.RequestModel, + "prompt_tokens": promptTokens, + "completion_tokens": completionTokens, + "total_tokens": promptTokens + completionTokens, + "completion_latency_ms": completionLatency.Milliseconds(), + "cost_usd": costUSD, + }) + } else { + observability.LogEvent("llm_usage", map[string]interface{}{ + "request_id": ctx.RequestID, + "model": ctx.RequestModel, + "prompt_tokens": promptTokens, + "completion_tokens": completionTokens, + "total_tokens": promptTokens + completionTokens, + "completion_latency_ms": completionLatency.Milliseconds(), + "cost_usd": 0.0, + "pricing": "not_configured", + }) + } + } } // Check if this request has a pending cache entry diff --git a/src/semantic-router/pkg/metrics/metrics.go b/src/semantic-router/pkg/metrics/metrics.go index 42a1fb6b..cd1b1553 100644 --- a/src/semantic-router/pkg/metrics/metrics.go +++ b/src/semantic-router/pkg/metrics/metrics.go @@ -102,6 +102,15 @@ var ( []string{"model"}, ) + // ModelCostUSD tracks the total USD cost attributed to each model + ModelCostUSD = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_model_cost_usd_total", + Help: "The total USD cost attributed to each LLM model", + }, + []string{"model"}, + ) + // ModelTokens tracks the number of tokens used by each model ModelTokens = promauto.NewCounterVec( prometheus.CounterOpts{ @@ -138,6 +147,15 @@ var ( []string{"source_model", "target_model"}, ) + // RoutingReasonCodes tracks routing decisions by reason_code and model + RoutingReasonCodes = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_routing_reason_codes_total", + Help: "The total number of routing decisions by reason code and model", + }, + []string{"reason_code", "model"}, + ) + // ModelCompletionLatency tracks the latency of completions by model ModelCompletionLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ @@ -238,6 +256,25 @@ func RecordModelTokens(model string, tokens float64) { ModelTokens.WithLabelValues(model).Add(tokens) } +// RecordModelCostUSD adds the dollar cost attributed to a specific model +func RecordModelCostUSD(model string, usd float64) { + if usd < 0 { + return + } + ModelCostUSD.WithLabelValues(model).Add(usd) +} + +// RecordRoutingReasonCode increments the counter for a routing decision reason code and model +func RecordRoutingReasonCode(reasonCode, model string) { + if reasonCode == "" { + reasonCode = "unknown" + } + 
if model == "" { + model = "unknown" + } + RoutingReasonCodes.WithLabelValues(reasonCode, model).Inc() +} + // RecordModelTokensDetailed records detailed token usage (prompt and completion) func RecordModelTokensDetailed(model string, promptTokens, completionTokens float64) { // Record in both the aggregated and detailed metrics diff --git a/src/semantic-router/pkg/observability/logging.go b/src/semantic-router/pkg/observability/logging.go new file mode 100644 index 00000000..ad44ae31 --- /dev/null +++ b/src/semantic-router/pkg/observability/logging.go @@ -0,0 +1,28 @@ +package observability + +import ( + "encoding/json" + "log" + "time" +) + +// LogEvent emits a structured JSON log line with a standard envelope +// Fields provided by callers take precedence and will not be overwritten. +func LogEvent(event string, fields map[string]interface{}) { + if fields == nil { + fields = map[string]interface{}{} + } + if _, ok := fields["event"]; !ok { + fields["event"] = event + } + if _, ok := fields["ts"]; !ok { + fields["ts"] = time.Now().UTC().Format(time.RFC3339Nano) + } + b, err := json.Marshal(fields) + if err != nil { + // Fallback to regular log on marshal error + log.Printf("event=%s marshal_error=%v fields_len=%d", event, err, len(fields)) + return + } + log.Println(string(b)) +} diff --git a/website/docs/api/router.md b/website/docs/api/router.md index 26a1e557..b76986c9 100644 --- a/website/docs/api/router.md +++ b/website/docs/api/router.md @@ -218,6 +218,55 @@ sum by (family, effort) ( ) ``` +### Cost and Routing Metrics + +The router exposes additional metrics for cost accounting and routing decisions. + +- `llm_model_cost_usd_total{model}` + - Description: Total accumulated USD cost attributed to each model (computed from token usage and per-1M pricing). + - Labels: + - model: model name used for the request + +- `llm_routing_reason_codes_total{reason_code, model}` + - Description: Count of routing decisions by reason code and selected model. + - Labels: + - reason_code: why a routing decision happened (e.g., auto_routing, model_specified, pii_policy_alternative_selected) + - model: final selected model + +Example PromQL: + +```prometheus +# Cost by model over the last hour +sum by (model) (increase(llm_model_cost_usd_total[1h])) + +# Routing decisions by reason code over the last 15 minutes +sum by (reason_code) (increase(llm_routing_reason_codes_total[15m])) +``` + +### Pricing Configuration + +Provide per-1M pricing for your models so the router can compute request cost and emit metrics/logs. + +```yaml +model_config: + phi4: + pricing: + prompt_usd_per_1m: 200.0 + completion_usd_per_1m: 600.0 + "mistral-small3.1": + pricing: + prompt_usd_per_1m: 300.0 + completion_usd_per_1m: 900.0 + gemma3:27b: + pricing: + prompt_usd_per_1m: 500.0 + completion_usd_per_1m: 1500.0 +``` + +Notes: +- Pricing is optional; if omitted, cost is treated as 0 and only token metrics are emitted. +- Cost is computed as: (prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000. 
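+- Example: with phi4 priced at 200.0 / 600.0 USD per 1M tokens, a request using 1,200 prompt tokens and 350 completion tokens costs (1200 * 200.0 + 350 * 600.0) / 1_000_000 = 0.45 USD.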
+ ## gRPC ExtProc API For direct integration with the ExtProc protocol: diff --git a/website/docs/getting-started/configuration.md b/website/docs/getting-started/configuration.md index dbba7606..074793a9 100644 --- a/website/docs/getting-started/configuration.md +++ b/website/docs/getting-started/configuration.md @@ -124,6 +124,29 @@ model_config: preferred_endpoints: ["my_endpoint"] ``` +### Pricing (Optional) + +If you want the router to compute USD cost per request and expose Prometheus cost metrics, add per-1M token pricing under each model in `model_config`. + +```yaml +model_config: + phi4: + pricing: + prompt_usd_per_1m: 0.07 + completion_usd_per_1m: 0.35 + "mistral-small3.1": + pricing: + prompt_usd_per_1m: 0.1 + completion_usd_per_1m: 0.3 + gemma3:27b: + pricing: + prompt_usd_per_1m: 0.067 + completion_usd_per_1m: 0.267 +``` + +- Cost formula: `(prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000`. +- When not configured, the router still reports token and latency metrics; cost is treated as 0. + ### Classification Models Configure the BERT classification models: @@ -658,4 +681,4 @@ This workflow ensures your configuration is: - **[Quick Start Guide](installation.md)** - Basic usage examples - **[API Documentation](../api/router.md)** - Complete API reference -The configuration system is designed to be simple yet powerful. Start with the basic configuration and gradually enable advanced features as needed. \ No newline at end of file +The configuration system is designed to be simple yet powerful. Start with the basic configuration and gradually enable advanced features as needed.
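Taken together, the pricing block in `config.yaml`, the new `GetModelPricing` accessor, and the cost computation in `handleResponseBody` reduce to one small formula. The sketch below is illustrative only: the `computeCostUSD` helper and the sample token counts are assumptions and not part of this patch; the formula and the phi4 pricing of 200.0 / 600.0 USD per 1M tokens come from the diff above.

```go
package main

import "fmt"

// computeCostUSD mirrors the cost formula used in handleResponseBody:
// (prompt_tokens*prompt_usd_per_1m + completion_tokens*completion_usd_per_1m) / 1_000_000
func computeCostUSD(promptTokens, completionTokens int, promptUSDPer1M, completionUSDPer1M float64) float64 {
	return (float64(promptTokens)*promptUSDPer1M + float64(completionTokens)*completionUSDPer1M) / 1_000_000.0
}

func main() {
	// phi4 pricing from config/config.yaml in this patch: 200.0 prompt / 600.0 completion USD per 1M tokens
	cost := computeCostUSD(1200, 350, 200.0, 600.0)
	fmt.Printf("cost_usd=%.2f\n", cost) // cost_usd=0.45
}
```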