15 changes: 9 additions & 6 deletions config/config.yaml
@@ -56,8 +56,9 @@ model_config:
batch_size: 512.0 # vLLM default batch size
context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
pricing:
prompt_usd_per_1m: 200.0
completion_usd_per_1m: 600.0
currency: USD
prompt_per_1m: 0.07
completion_per_1m: 0.35
pii_policy:
allow_by_default: false # Deny all PII by default
pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
@@ -68,8 +69,9 @@ model_config:
batch_size: 512.0
context_size: 16384.0
pricing:
prompt_usd_per_1m: 500.0
completion_usd_per_1m: 1500.0
currency: USD
prompt_per_1m: 0.067
completion_per_1m: 0.267
pii_policy:
allow_by_default: false # Deny all PII by default
pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
@@ -79,8 +81,9 @@ model_config:
batch_size: 512.0
context_size: 16384.0
pricing:
prompt_usd_per_1m: 300.0
completion_usd_per_1m: 900.0
currency: USD
prompt_per_1m: 0.1
completion_per_1m: 0.3
pii_policy:
allow_by_default: false # Deny all PII by default
pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
25 changes: 16 additions & 9 deletions src/semantic-router/pkg/config/config.go
@@ -193,10 +193,12 @@ type VLLMEndpoint struct {

// ModelParams represents configuration for model-specific parameters
type ModelPricing struct {
// Price in USD per 1M prompt tokens
PromptUSDPer1M float64 `yaml:"prompt_usd_per_1m,omitempty"`
// Price in USD per 1M completion tokens
CompletionUSDPer1M float64 `yaml:"completion_usd_per_1m,omitempty"`
// ISO currency code for the pricing (e.g., "USD"). Defaults to "USD" when omitted.
Currency string `yaml:"currency,omitempty"`

// Price per 1M tokens (unit: <currency>/1_000_000 tokens)
PromptPer1M float64 `yaml:"prompt_per_1m,omitempty"`
CompletionPer1M float64 `yaml:"completion_per_1m,omitempty"`
}

type ModelParams struct {
@@ -374,15 +376,20 @@ func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float6
return defaultValue
}

// GetModelPricing returns pricing in USD per 1M tokens for prompt and completion.
func (c *RouterConfig) GetModelPricing(modelName string) (promptUSDPer1M float64, completionUSDPer1M float64, ok bool) {
// GetModelPricing returns pricing per 1M tokens and its currency for the given model.
// The currency indicates the unit of the returned rates (e.g., "USD").
func (c *RouterConfig) GetModelPricing(modelName string) (promptPer1M float64, completionPer1M float64, currency string, ok bool) {
if modelConfig, okc := c.ModelConfig[modelName]; okc {
p := modelConfig.Pricing
if p.PromptUSDPer1M != 0 || p.CompletionUSDPer1M != 0 {
return p.PromptUSDPer1M, p.CompletionUSDPer1M, true
if p.PromptPer1M != 0 || p.CompletionPer1M != 0 {
cur := p.Currency
if cur == "" {
cur = "USD"
}
return p.PromptPer1M, p.CompletionPer1M, cur, true
}
}
return 0, 0, false
return 0, 0, "", false
}

// GetModelPIIPolicy returns the PII policy for a given model
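To illustrate the default-currency behavior of the new `GetModelPricing` signature, here is a minimal test sketch. It assumes `ModelConfig` is declared as `map[string]ModelParams` and that `ModelParams` exposes the `Pricing` field used above; adjust the construction to the actual types in `config.go`.

```go
package config

import "testing"

// Sketch: pricing configured without an explicit currency should report "USD".
func TestGetModelPricingDefaultsToUSD(t *testing.T) {
	cfg := &RouterConfig{
		// Assumption: ModelConfig is a map[string]ModelParams keyed by model name.
		ModelConfig: map[string]ModelParams{
			"phi4": {Pricing: ModelPricing{PromptPer1M: 0.07, CompletionPer1M: 0.35}},
		},
	}
	prompt, completion, currency, ok := cfg.GetModelPricing("phi4")
	if !ok {
		t.Fatal("expected pricing to be configured for phi4")
	}
	if currency != "USD" {
		t.Fatalf("expected default currency USD, got %q", currency)
	}
	if prompt != 0.07 || completion != 0.35 {
		t.Fatalf("unexpected rates: prompt=%v completion=%v", prompt, completion)
	}
}
```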
15 changes: 10 additions & 5 deletions src/semantic-router/pkg/extproc/response_handler.go
@@ -56,18 +56,22 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response

// Compute and record cost if pricing is configured
if r.Config != nil {
promptRatePer1M, completionRatePer1M, ok := r.Config.GetModelPricing(ctx.RequestModel)
promptRatePer1M, completionRatePer1M, currency, ok := r.Config.GetModelPricing(ctx.RequestModel)
if ok {
costUSD := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0
metrics.RecordModelCostUSD(ctx.RequestModel, costUSD)
costAmount := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0
if currency == "" {
currency = "USD"
}
metrics.RecordModelCost(ctx.RequestModel, currency, costAmount)
observability.LogEvent("llm_usage", map[string]interface{}{
"request_id": ctx.RequestID,
"model": ctx.RequestModel,
"prompt_tokens": promptTokens,
"completion_tokens": completionTokens,
"total_tokens": promptTokens + completionTokens,
"completion_latency_ms": completionLatency.Milliseconds(),
"cost_usd": costUSD,
"cost": costAmount,
"currency": currency,
})
} else {
observability.LogEvent("llm_usage", map[string]interface{}{
@@ -77,7 +81,8 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response
"completion_tokens": completionTokens,
"total_tokens": promptTokens + completionTokens,
"completion_latency_ms": completionLatency.Milliseconds(),
"cost_usd": 0.0,
"cost": 0.0,
"currency": "unknown",
"pricing": "not_configured",
})
}
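For reference, a priced request now logs the cost together with its currency instead of a hard-coded `cost_usd` field. Rendered as JSON, the field map passed to `observability.LogEvent` would look roughly like this (values illustrative; the actual encoding depends on the observability package):

```json
{
  "request_id": "req-123",
  "model": "phi4",
  "prompt_tokens": 1000,
  "completion_tokens": 500,
  "total_tokens": 1500,
  "completion_latency_ms": 420,
  "cost": 0.000245,
  "currency": "USD"
}
```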
21 changes: 12 additions & 9 deletions src/semantic-router/pkg/metrics/metrics.go
@@ -102,13 +102,13 @@ var (
[]string{"model"},
)

// ModelCostUSD tracks the total USD cost attributed to each model
ModelCostUSD = promauto.NewCounterVec(
// ModelCost tracks the total cost attributed to each model by currency
ModelCost = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "llm_model_cost_usd_total",
Help: "The total USD cost attributed to each LLM model",
Name: "llm_model_cost_total",
Help: "The total cost attributed to each LLM model, labeled by currency",
},
[]string{"model"},
[]string{"model", "currency"},
)

// ModelTokens tracks the number of tokens used by each model
@@ -256,12 +256,15 @@ func RecordModelTokens(model string, tokens float64) {
ModelTokens.WithLabelValues(model).Add(tokens)
}

// RecordModelCostUSD adds the dollar cost attributed to a specific model
func RecordModelCostUSD(model string, usd float64) {
if usd < 0 {
// RecordModelCost records the cost attributed to a specific model with a currency label
func RecordModelCost(model string, currency string, amount float64) {
if amount < 0 {
return
}
ModelCostUSD.WithLabelValues(model).Add(usd)
if currency == "" {
currency = "USD"
}
ModelCost.WithLabelValues(model, currency).Add(amount)
}

// RecordRoutingReasonCode increments the counter for a routing decision reason code and model
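With the new `currency` label, the renamed counter appears on the `/metrics` endpoint along these lines (sample value only, computed from the phi4 rates in this PR for a 1,000-prompt/500-completion-token request):

```text
# HELP llm_model_cost_total The total cost attributed to each LLM model, labeled by currency
# TYPE llm_model_cost_total counter
llm_model_cost_total{currency="USD",model="phi4"} 0.000245
```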
29 changes: 18 additions & 11 deletions website/docs/api/router.md
@@ -222,10 +222,11 @@ sum by (family, effort) (

The router exposes additional metrics for cost accounting and routing decisions.

- `llm_model_cost_usd_total{model}`
- Description: Total accumulated USD cost attributed to each model (computed from token usage and per-1M pricing).
- `llm_model_cost_total{model, currency}`
- Description: Total accumulated cost attributed to each model (computed from token usage and per-1M pricing), labeled by currency.
- Labels:
- model: model name used for the request
- currency: currency code (e.g., "USD")

- `llm_routing_reason_codes_total{reason_code, model}`
- Description: Count of routing decisions by reason code and selected model.
@@ -236,8 +237,11 @@ The router exposes additional metrics for cost accounting and routing decisions.
Example PromQL:

```prometheus
# Cost by model over the last hour
sum by (model) (increase(llm_model_cost_usd_total[1h]))
# Cost by model and currency over the last hour
sum by (model, currency) (increase(llm_model_cost_total[1h]))

# Or, if you only use USD, a common query is:
sum by (model) (increase(llm_model_cost_total{currency="USD"}[1h]))

# Routing decisions by reason code over the last 15 minutes
sum by (reason_code) (increase(llm_routing_reason_codes_total[15m]))
@@ -251,21 +255,24 @@ Provide per-1M pricing for your models so the router can compute request cost an
model_config:
phi4:
pricing:
prompt_usd_per_1m: 200.0
completion_usd_per_1m: 600.0
currency: USD
prompt_per_1m: 0.07
completion_per_1m: 0.35
"mistral-small3.1":
pricing:
prompt_usd_per_1m: 300.0
completion_usd_per_1m: 900.0
currency: USD
prompt_per_1m: 0.1
completion_per_1m: 0.3
gemma3:27b:
pricing:
prompt_usd_per_1m: 500.0
completion_usd_per_1m: 1500.0
currency: USD
prompt_per_1m: 0.067
completion_per_1m: 0.267
```

Notes:
- Pricing is optional; if omitted, cost is treated as 0 and only token metrics are emitted.
- Cost is computed as: (prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000.
- Cost is computed as: (prompt_tokens * prompt_per_1m + completion_tokens * completion_per_1m) / 1_000_000 (in the configured currency).

## gRPC ExtProc API

19 changes: 11 additions & 8 deletions website/docs/getting-started/configuration.md
@@ -126,25 +126,28 @@ model_config:

### Pricing (Optional)

If you want the router to compute USD cost per request and expose Prometheus cost metrics, add per-1M token pricing under each model in `model_config`.
If you want the router to compute request cost and expose Prometheus cost metrics, add per-1M token pricing and currency under each model in `model_config`.

```yaml
model_config:
phi4:
pricing:
prompt_usd_per_1m: 0.07
completion_usd_per_1m: 0.35
currency: USD
prompt_per_1m: 0.07
completion_per_1m: 0.35
"mistral-small3.1":
pricing:
prompt_usd_per_1m: 0.1
completion_usd_per_1m: 0.3
currency: USD
prompt_per_1m: 0.1
completion_per_1m: 0.3
gemma3:27b:
pricing:
prompt_usd_per_1m: 0.067
completion_usd_per_1m: 0.267
currency: USD
prompt_per_1m: 0.067
completion_per_1m: 0.267
```

- Cost formula: `(prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000`.
- Cost formula: `(prompt_tokens * prompt_per_1m + completion_tokens * completion_per_1m) / 1_000_000` (in the given currency).
- When not configured, the router still reports token and latency metrics; cost is treated as 0.
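- For example, with phi4's rates above, a request that consumes 1,000 prompt tokens and 500 completion tokens costs `(1000 * 0.07 + 500 * 0.35) / 1_000_000 = 0.000245` USD.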

### Classification Models