diff --git a/config/config.yaml b/config/config.yaml
index 5ad0dbd5..d72624e8 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -56,8 +56,9 @@ model_config:
     batch_size: 512.0 # vLLM default batch size
     context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pricing:
-      prompt_usd_per_1m: 200.0
-      completion_usd_per_1m: 600.0
+      currency: USD
+      prompt_per_1m: 0.07
+      completion_per_1m: 0.35
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
@@ -68,8 +69,9 @@ model_config:
     batch_size: 512.0
     context_size: 16384.0
     pricing:
-      prompt_usd_per_1m: 500.0
-      completion_usd_per_1m: 1500.0
+      currency: USD
+      prompt_per_1m: 0.067
+      completion_per_1m: 0.267
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
@@ -79,8 +81,9 @@ model_config:
     batch_size: 512.0
     context_size: 16384.0
     pricing:
-      prompt_usd_per_1m: 300.0
-      completion_usd_per_1m: 900.0
+      currency: USD
+      prompt_per_1m: 0.1
+      completion_per_1m: 0.3
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go
index cc0fb15c..85ee09f1 100644
--- a/src/semantic-router/pkg/config/config.go
+++ b/src/semantic-router/pkg/config/config.go
@@ -193,10 +193,12 @@ type VLLMEndpoint struct {
 
 // ModelParams represents configuration for model-specific parameters
 type ModelPricing struct {
-	// Price in USD per 1M prompt tokens
-	PromptUSDPer1M float64 `yaml:"prompt_usd_per_1m,omitempty"`
-	// Price in USD per 1M completion tokens
-	CompletionUSDPer1M float64 `yaml:"completion_usd_per_1m,omitempty"`
+	// ISO currency code for the pricing (e.g., "USD"). Defaults to "USD" when omitted.
+	Currency string `yaml:"currency,omitempty"`
+
+	// Price per 1M tokens (unit: /1_000_000 tokens)
+	PromptPer1M     float64 `yaml:"prompt_per_1m,omitempty"`
+	CompletionPer1M float64 `yaml:"completion_per_1m,omitempty"`
 }
 
 type ModelParams struct {
@@ -374,15 +376,20 @@ func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float6
 	return defaultValue
 }
 
-// GetModelPricing returns pricing in USD per 1M tokens for prompt and completion.
-func (c *RouterConfig) GetModelPricing(modelName string) (promptUSDPer1M float64, completionUSDPer1M float64, ok bool) {
+// GetModelPricing returns pricing per 1M tokens and its currency for the given model.
+// The currency indicates the unit of the returned rates (e.g., "USD").
+func (c *RouterConfig) GetModelPricing(modelName string) (promptPer1M float64, completionPer1M float64, currency string, ok bool) {
 	if modelConfig, okc := c.ModelConfig[modelName]; okc {
 		p := modelConfig.Pricing
-		if p.PromptUSDPer1M != 0 || p.CompletionUSDPer1M != 0 {
-			return p.PromptUSDPer1M, p.CompletionUSDPer1M, true
+		if p.PromptPer1M != 0 || p.CompletionPer1M != 0 {
+			cur := p.Currency
+			if cur == "" {
+				cur = "USD"
+			}
+			return p.PromptPer1M, p.CompletionPer1M, cur, true
 		}
 	}
-	return 0, 0, false
+	return 0, 0, "", false
 }
 
 // GetModelPIIPolicy returns the PII policy for a given model
diff --git a/src/semantic-router/pkg/extproc/response_handler.go b/src/semantic-router/pkg/extproc/response_handler.go
index 1fdbe7bf..a4cd3290 100644
--- a/src/semantic-router/pkg/extproc/response_handler.go
+++ b/src/semantic-router/pkg/extproc/response_handler.go
@@ -56,10 +56,13 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response
 
 	// Compute and record cost if pricing is configured
 	if r.Config != nil {
-		promptRatePer1M, completionRatePer1M, ok := r.Config.GetModelPricing(ctx.RequestModel)
+		promptRatePer1M, completionRatePer1M, currency, ok := r.Config.GetModelPricing(ctx.RequestModel)
 		if ok {
-			costUSD := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0
-			metrics.RecordModelCostUSD(ctx.RequestModel, costUSD)
+			costAmount := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0
+			if currency == "" {
+				currency = "USD"
+			}
+			metrics.RecordModelCost(ctx.RequestModel, currency, costAmount)
 			observability.LogEvent("llm_usage", map[string]interface{}{
 				"request_id":            ctx.RequestID,
 				"model":                 ctx.RequestModel,
@@ -67,7 +70,8 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response
 				"completion_tokens":     completionTokens,
 				"total_tokens":          promptTokens + completionTokens,
 				"completion_latency_ms": completionLatency.Milliseconds(),
-				"cost_usd":              costUSD,
+				"cost":                  costAmount,
+				"currency":              currency,
 			})
 		} else {
 			observability.LogEvent("llm_usage", map[string]interface{}{
@@ -77,7 +81,8 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response
 				"completion_tokens":     completionTokens,
 				"total_tokens":          promptTokens + completionTokens,
 				"completion_latency_ms": completionLatency.Milliseconds(),
-				"cost_usd":              0.0,
+				"cost":                  0.0,
+				"currency":              "unknown",
 				"pricing":               "not_configured",
 			})
 		}
diff --git a/src/semantic-router/pkg/metrics/metrics.go b/src/semantic-router/pkg/metrics/metrics.go
index cd1b1553..7be69000 100644
--- a/src/semantic-router/pkg/metrics/metrics.go
+++ b/src/semantic-router/pkg/metrics/metrics.go
@@ -102,13 +102,13 @@ var (
 		[]string{"model"},
 	)
 
-	// ModelCostUSD tracks the total USD cost attributed to each model
-	ModelCostUSD = promauto.NewCounterVec(
+	// ModelCost tracks the total cost attributed to each model by currency
+	ModelCost = promauto.NewCounterVec(
 		prometheus.CounterOpts{
-			Name: "llm_model_cost_usd_total",
-			Help: "The total USD cost attributed to each LLM model",
+			Name: "llm_model_cost_total",
+			Help: "The total cost attributed to each LLM model, labeled by currency",
 		},
-		[]string{"model"},
+		[]string{"model", "currency"},
 	)
 
 	// ModelTokens tracks the number of tokens used by each model
@@ -256,12 +256,15 @@ func RecordModelTokens(model string, tokens float64) {
 	ModelTokens.WithLabelValues(model).Add(tokens)
 }
 
-// RecordModelCostUSD adds the dollar cost attributed to a specific model
-func RecordModelCostUSD(model string, usd float64) {
-	if usd < 0 {
+// RecordModelCost records the cost attributed to a specific model with a currency label
+func RecordModelCost(model string, currency string, amount float64) {
+	if amount < 0 {
 		return
 	}
-	ModelCostUSD.WithLabelValues(model).Add(usd)
+	if currency == "" {
+		currency = "USD"
+	}
+	ModelCost.WithLabelValues(model, currency).Add(amount)
 }
 
 // RecordRoutingReasonCode increments the counter for a routing decision reason code and model
diff --git a/website/docs/api/router.md b/website/docs/api/router.md
index b76986c9..9795ac5c 100644
--- a/website/docs/api/router.md
+++ b/website/docs/api/router.md
@@ -222,10 +222,11 @@ sum by (family, effort) (
 
 The router exposes additional metrics for cost accounting and routing decisions.
 
-- `llm_model_cost_usd_total{model}`
-  - Description: Total accumulated USD cost attributed to each model (computed from token usage and per-1M pricing).
+- `llm_model_cost_total{model, currency}`
+  - Description: Total accumulated cost attributed to each model (computed from token usage and per-1M pricing), labeled by currency.
   - Labels:
     - model: model name used for the request
+    - currency: currency code (e.g., "USD")
 
 - `llm_routing_reason_codes_total{reason_code, model}`
   - Description: Count of routing decisions by reason code and selected model.
@@ -236,8 +237,11 @@ The router exposes additional metrics for cost accounting and routing decisions.
 Example PromQL:
 
 ```prometheus
-# Cost by model over the last hour
-sum by (model) (increase(llm_model_cost_usd_total[1h]))
+# Cost by model and currency over the last hour
+sum by (model, currency) (increase(llm_model_cost_total[1h]))
+
+# Or, if you only use USD, a common query is:
+sum by (model) (increase(llm_model_cost_total{currency="USD"}[1h]))
 
 # Routing decisions by reason code over the last 15 minutes
 sum by (reason_code) (increase(llm_routing_reason_codes_total[15m]))
@@ -251,21 +255,24 @@ Provide per-1M pricing for your models so the router can compute request cost an
 model_config:
   phi4:
     pricing:
-      prompt_usd_per_1m: 200.0
-      completion_usd_per_1m: 600.0
+      currency: USD
+      prompt_per_1m: 0.07
+      completion_per_1m: 0.35
   "mistral-small3.1":
     pricing:
-      prompt_usd_per_1m: 300.0
-      completion_usd_per_1m: 900.0
+      currency: USD
+      prompt_per_1m: 0.1
+      completion_per_1m: 0.3
   gemma3:27b:
     pricing:
-      prompt_usd_per_1m: 500.0
-      completion_usd_per_1m: 1500.0
+      currency: USD
+      prompt_per_1m: 0.067
+      completion_per_1m: 0.267
 ```
 
 Notes:
 
 - Pricing is optional; if omitted, cost is treated as 0 and only token metrics are emitted.
-- Cost is computed as: (prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000.
+- Cost is computed as: (prompt_tokens * prompt_per_1m + completion_tokens * completion_per_1m) / 1_000_000 (in the configured currency).
 
 ## gRPC ExtProc API
diff --git a/website/docs/getting-started/configuration.md b/website/docs/getting-started/configuration.md
index 074793a9..51f42175 100644
--- a/website/docs/getting-started/configuration.md
+++ b/website/docs/getting-started/configuration.md
@@ -126,25 +126,28 @@ model_config:
 
 ### Pricing (Optional)
 
-If you want the router to compute USD cost per request and expose Prometheus cost metrics, add per-1M token pricing under each model in `model_config`.
+If you want the router to compute request cost and expose Prometheus cost metrics, add per-1M token pricing and currency under each model in `model_config`.
 
 ```yaml
 model_config:
   phi4:
     pricing:
-      prompt_usd_per_1m: 0.07
-      completion_usd_per_1m: 0.35
+      currency: USD
+      prompt_per_1m: 0.07
+      completion_per_1m: 0.35
   "mistral-small3.1":
     pricing:
-      prompt_usd_per_1m: 0.1
-      completion_usd_per_1m: 0.3
+      currency: USD
+      prompt_per_1m: 0.1
+      completion_per_1m: 0.3
   gemma3:27b:
     pricing:
-      prompt_usd_per_1m: 0.067
-      completion_usd_per_1m: 0.267
+      currency: USD
+      prompt_per_1m: 0.067
+      completion_per_1m: 0.267
 ```
 
-- Cost formula: `(prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000`.
+- Cost formula: `(prompt_tokens * prompt_per_1m + completion_tokens * completion_per_1m) / 1_000_000` (in the given currency).
 - When not configured, the router still reports token and latency metrics; cost is treated as 0.
 
 ### Classification Models
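
As a quick sanity check of the cost formula above, here is a small standalone sketch (not part of the patch): the token counts are hypothetical, and the rates mirror the phi4 entry from config.yaml in this diff.

```go
// Worked example of the per-1M cost computation introduced by this patch.
// Illustrative only: token counts are made up; rates are the phi4 values above.
package main

import "fmt"

func main() {
	promptTokens := 1000
	completionTokens := 500

	// phi4 pricing: currency USD, prompt_per_1m 0.07, completion_per_1m 0.35
	promptRatePer1M := 0.07
	completionRatePer1M := 0.35

	// Same formula as handleResponseBody: per-1M rates scaled down by 1,000,000.
	cost := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0

	fmt.Printf("cost = %.6f USD\n", cost) // prints: cost = 0.000245 USD
}
```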