Commit 5f5dc20

Merge branch 'main' into 0907-yuluo/remove-docker-compose-verison

2 parents 009563e + 89dfce7
File tree

8 files changed: +519 −261 lines changed

config/config.yaml

Lines changed: 266 additions & 258 deletions
Large diffs are not rendered by default.

src/semantic-router/pkg/config/config.go

Lines changed: 21 additions & 0 deletions
```diff
@@ -192,6 +192,13 @@ type VLLMEndpoint struct {
 }
 
 // ModelParams represents configuration for model-specific parameters
+type ModelPricing struct {
+	// Price in USD per 1M prompt tokens
+	PromptUSDPer1M float64 `yaml:"prompt_usd_per_1m,omitempty"`
+	// Price in USD per 1M completion tokens
+	CompletionUSDPer1M float64 `yaml:"completion_usd_per_1m,omitempty"`
+}
+
 type ModelParams struct {
 	// Number of parameters in the model
 	ParamCount float64 `yaml:"param_count"`
@@ -207,6 +214,9 @@ type ModelParams struct {
 
 	// Preferred endpoints for this model (optional)
 	PreferredEndpoints []string `yaml:"preferred_endpoints,omitempty"`
+
+	// Optional pricing used for cost computation
+	Pricing ModelPricing `yaml:"pricing,omitempty"`
 }
 
 // PIIPolicy represents the PII (Personally Identifiable Information) policy for a model
@@ -364,6 +374,17 @@ func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float64
 	return defaultValue
 }
 
+// GetModelPricing returns pricing in USD per 1M tokens for prompt and completion.
+func (c *RouterConfig) GetModelPricing(modelName string) (promptUSDPer1M float64, completionUSDPer1M float64, ok bool) {
+	if modelConfig, okc := c.ModelConfig[modelName]; okc {
+		p := modelConfig.Pricing
+		if p.PromptUSDPer1M != 0 || p.CompletionUSDPer1M != 0 {
+			return p.PromptUSDPer1M, p.CompletionUSDPer1M, true
+		}
+	}
+	return 0, 0, false
+}
+
 // GetModelPIIPolicy returns the PII policy for a given model
 // If the model is not found in the config, returns a default policy that allows all PII
 func (c *RouterConfig) GetModelPIIPolicy(modelName string) PIIPolicy {
```
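Worth noting from the new helper: pricing counts as configured only when at least one rate is non-zero, so a model with an all-zero `pricing` block behaves as if it had none. A minimal standalone sketch of that contract (trimmed stand-in types, not the repo's full config structs):

```go
package main

import "fmt"

// Trimmed stand-ins for the config types in the diff above.
type ModelPricing struct {
	PromptUSDPer1M     float64
	CompletionUSDPer1M float64
}

type ModelParams struct {
	Pricing ModelPricing
}

type RouterConfig struct {
	ModelConfig map[string]ModelParams
}

// Same logic as GetModelPricing above: ok is true only when the model exists
// and at least one of its two rates is non-zero.
func (c *RouterConfig) GetModelPricing(model string) (float64, float64, bool) {
	if mc, found := c.ModelConfig[model]; found {
		p := mc.Pricing
		if p.PromptUSDPer1M != 0 || p.CompletionUSDPer1M != 0 {
			return p.PromptUSDPer1M, p.CompletionUSDPer1M, true
		}
	}
	return 0, 0, false
}

func main() {
	cfg := RouterConfig{ModelConfig: map[string]ModelParams{
		"phi4": {Pricing: ModelPricing{PromptUSDPer1M: 200, CompletionUSDPer1M: 600}},
	}}
	prompt, completion, ok := cfg.GetModelPricing("phi4")
	fmt.Println(prompt, completion, ok) // 200 600 true

	_, _, ok = cfg.GetModelPricing("unpriced-model")
	fmt.Println(ok) // false: callers skip cost computation entirely
}
```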

src/semantic-router/pkg/extproc/request_handler.go

Lines changed: 64 additions & 2 deletions
```diff
@@ -14,6 +14,7 @@ import (
 
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/cache"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/metrics"
+	"github.com/vllm-project/semantic-router/semantic-router/pkg/observability"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/http"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/pii"
 )
@@ -173,7 +174,7 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
 	userContent, nonUserMessages := extractUserAndNonUserContent(openAIRequest)
 
 	// Perform security checks
-	if response, shouldReturn := r.performSecurityChecks(userContent, nonUserMessages); shouldReturn {
+	if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages); shouldReturn {
 		return response, nil
 	}
 
@@ -187,7 +188,7 @@
 }
 
 // performSecurityChecks performs PII and jailbreak detection
-func (r *OpenAIRouter) performSecurityChecks(userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) {
+func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) {
 	// Perform PII classification on all message content
 	allContent := pii.ExtractAllContent(userContent, nonUserMessages)
 
@@ -212,6 +213,13 @@ func (r *OpenAIRouter) performSecurityChecks(userContent string, nonUserMessages
 		log.Printf("JAILBREAK ATTEMPT BLOCKED: %s (confidence: %.3f)", jailbreakType, confidence)
 
 		// Return immediate jailbreak violation response
+		// Structured log for security block
+		observability.LogEvent("security_block", map[string]interface{}{
+			"reason_code":    "jailbreak_detected",
+			"jailbreak_type": jailbreakType,
+			"confidence":     confidence,
+			"request_id":     ctx.RequestID,
+		})
 		jailbreakResponse := http.CreateJailbreakViolationResponse(jailbreakType, confidence)
 		return jailbreakResponse, true
 	} else {
@@ -241,6 +249,13 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingR
 	if err != nil {
 		log.Printf("Error searching cache: %v", err)
 	} else if found {
+		// Record and log cache hit
+		metrics.RecordCacheHit()
+		observability.LogEvent("cache_hit", map[string]interface{}{
+			"request_id": ctx.RequestID,
+			"model":      requestModel,
+			"query":      requestQuery,
+		})
 		// Return immediate response from cache
 		response := http.CreateCacheHitResponse(cachedResponse)
 		return response, true
@@ -313,19 +328,33 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 				// Select the best allowed model from this category
 				matchedModel = r.Classifier.SelectBestModelFromList(allowedModels, categoryName)
 				log.Printf("Selected alternative model %s that passes PII policy", matchedModel)
+				// Record reason code for selecting alternative due to PII
+				metrics.RecordRoutingReasonCode("pii_policy_alternative_selected", matchedModel)
 			} else {
 				log.Printf("No models in category %s pass PII policy, using default", categoryName)
 				matchedModel = r.Config.DefaultModel
 				// Check if default model passes policy
 				defaultAllowed, defaultDeniedPII, _ := r.PIIChecker.CheckPolicy(matchedModel, detectedPII)
 				if !defaultAllowed {
 					log.Printf("Default model also violates PII policy, returning error")
+					observability.LogEvent("routing_block", map[string]interface{}{
+						"reason_code": "pii_policy_denied_default_model",
+						"request_id":  ctx.RequestID,
+						"model":       matchedModel,
+						"denied_pii":  defaultDeniedPII,
+					})
 					piiResponse := http.CreatePIIViolationResponse(matchedModel, defaultDeniedPII)
 					return piiResponse, nil
 				}
 			}
 		} else {
 			log.Printf("Could not determine category, returning PII violation for model %s", matchedModel)
+			observability.LogEvent("routing_block", map[string]interface{}{
+				"reason_code": "pii_policy_denied",
+				"request_id":  ctx.RequestID,
+				"model":       matchedModel,
+				"denied_pii":  deniedPII,
+			})
 			piiResponse := http.CreatePIIViolationResponse(matchedModel, deniedPII)
 			return piiResponse, nil
 		}
@@ -424,6 +453,20 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 				}
 
 				log.Printf("Use new model: %s", matchedModel)
+
+				// Structured log for routing decision (auto)
+				observability.LogEvent("routing_decision", map[string]interface{}{
+					"reason_code":        "auto_routing",
+					"request_id":         ctx.RequestID,
+					"original_model":     originalModel,
+					"selected_model":     matchedModel,
+					"category":           categoryName,
+					"reasoning_enabled":  useReasoning,
+					"reasoning_effort":   effortForMetrics,
+					"selected_endpoint":  selectedEndpoint,
+					"routing_latency_ms": time.Since(ctx.ProcessingStartTime).Milliseconds(),
+				})
+				metrics.RecordRoutingReasonCode("auto_routing", matchedModel)
 			}
 		}
 	} else if originalModel != "auto" {
@@ -438,6 +481,12 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 			// Continue with request on error
 		} else if !allowed {
 			log.Printf("Model %s violates PII policy, returning error", originalModel)
+			observability.LogEvent("routing_block", map[string]interface{}{
+				"reason_code": "pii_policy_denied",
+				"request_id":  ctx.RequestID,
+				"model":       originalModel,
+				"denied_pii":  deniedPII,
+			})
 			piiResponse := http.CreatePIIViolationResponse(originalModel, deniedPII)
 			return piiResponse, nil
 		}
@@ -472,6 +521,19 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 				},
 			},
 		}
+		// Structured log for routing decision (explicit model)
+		observability.LogEvent("routing_decision", map[string]interface{}{
+			"reason_code":        "model_specified",
+			"request_id":         ctx.RequestID,
+			"original_model":     originalModel,
+			"selected_model":     originalModel,
+			"category":           "",
+			"reasoning_enabled":  false,
+			"reasoning_effort":   "",
+			"selected_endpoint":  selectedEndpoint,
+			"routing_latency_ms": time.Since(ctx.ProcessingStartTime).Milliseconds(),
+		})
+		metrics.RecordRoutingReasonCode("model_specified", originalModel)
 	}
 
 	// Save the actual model that will be used for token tracking
```
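The string literals sprinkled through this file define a small reason-code taxonomy. Collected here as a hypothetical constant block for reference (the diff itself passes raw strings; these names are not in the repository):

```go
package extproc

// Hypothetical named constants for the reason codes used in this diff.
const (
	ReasonAutoRouting                  = "auto_routing"                    // "auto" request routed via classification
	ReasonModelSpecified               = "model_specified"                 // caller pinned a concrete model
	ReasonPIIPolicyAlternativeSelected = "pii_policy_alternative_selected" // category model swapped to satisfy PII policy
	ReasonPIIPolicyDenied              = "pii_policy_denied"               // request blocked by PII policy
	ReasonPIIPolicyDeniedDefaultModel  = "pii_policy_denied_default_model" // the default model failed PII policy too
	ReasonJailbreakDetected            = "jailbreak_detected"              // logged as a security_block event, not a routing metric
)
```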

src/semantic-router/pkg/extproc/response_handler.go

Lines changed: 30 additions & 0 deletions
```diff
@@ -9,6 +9,7 @@ import (
 
 	"github.com/openai/openai-go"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/metrics"
+	"github.com/vllm-project/semantic-router/semantic-router/pkg/observability"
 )
 
 // handleResponseHeaders processes the response headers
@@ -52,6 +53,35 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response
 		)
 		metrics.RecordModelCompletionLatency(ctx.RequestModel, completionLatency.Seconds())
 		r.Classifier.DecrementModelLoad(ctx.RequestModel)
+
+		// Compute and record cost if pricing is configured
+		if r.Config != nil {
+			promptRatePer1M, completionRatePer1M, ok := r.Config.GetModelPricing(ctx.RequestModel)
+			if ok {
+				costUSD := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0
+				metrics.RecordModelCostUSD(ctx.RequestModel, costUSD)
+				observability.LogEvent("llm_usage", map[string]interface{}{
+					"request_id":            ctx.RequestID,
+					"model":                 ctx.RequestModel,
+					"prompt_tokens":         promptTokens,
+					"completion_tokens":     completionTokens,
+					"total_tokens":          promptTokens + completionTokens,
+					"completion_latency_ms": completionLatency.Milliseconds(),
+					"cost_usd":              costUSD,
+				})
+			} else {
+				observability.LogEvent("llm_usage", map[string]interface{}{
+					"request_id":            ctx.RequestID,
+					"model":                 ctx.RequestModel,
+					"prompt_tokens":         promptTokens,
+					"completion_tokens":     completionTokens,
+					"total_tokens":          promptTokens + completionTokens,
+					"completion_latency_ms": completionLatency.Milliseconds(),
+					"cost_usd":              0.0,
+					"pricing":               "not_configured",
+				})
+			}
+		}
 	}
 
 	// Check if this request has a pending cache entry
```
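As a sanity check on the formula in `handleResponseBody`, a small self-contained example using the phi4 rates from the pricing example in website/docs/api/router.md and hypothetical token counts:

```go
package main

import "fmt"

func main() {
	// phi4 rates from the docs: $200 / 1M prompt tokens, $600 / 1M completion tokens
	promptRatePer1M := 200.0
	completionRatePer1M := 600.0

	// Hypothetical usage for a single request
	promptTokens, completionTokens := 1200, 800

	// Same computation as the handler above
	costUSD := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0
	fmt.Printf("cost_usd=%.4f\n", costUSD) // cost_usd=0.7200
}
```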

src/semantic-router/pkg/metrics/metrics.go

Lines changed: 37 additions & 0 deletions
```diff
@@ -102,6 +102,15 @@ var (
 		[]string{"model"},
 	)
 
+	// ModelCostUSD tracks the total USD cost attributed to each model
+	ModelCostUSD = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "llm_model_cost_usd_total",
+			Help: "The total USD cost attributed to each LLM model",
+		},
+		[]string{"model"},
+	)
+
 	// ModelTokens tracks the number of tokens used by each model
 	ModelTokens = promauto.NewCounterVec(
 		prometheus.CounterOpts{
@@ -138,6 +147,15 @@ var (
 		[]string{"source_model", "target_model"},
 	)
 
+	// RoutingReasonCodes tracks routing decisions by reason_code and model
+	RoutingReasonCodes = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "llm_routing_reason_codes_total",
+			Help: "The total number of routing decisions by reason code and model",
+		},
+		[]string{"reason_code", "model"},
+	)
+
 	// ModelCompletionLatency tracks the latency of completions by model
 	ModelCompletionLatency = promauto.NewHistogramVec(
 		prometheus.HistogramOpts{
@@ -238,6 +256,25 @@ func RecordModelTokens(model string, tokens float64) {
 	ModelTokens.WithLabelValues(model).Add(tokens)
 }
 
+// RecordModelCostUSD adds the dollar cost attributed to a specific model
+func RecordModelCostUSD(model string, usd float64) {
+	if usd < 0 {
+		return
+	}
+	ModelCostUSD.WithLabelValues(model).Add(usd)
+}
+
+// RecordRoutingReasonCode increments the counter for a routing decision reason code and model
+func RecordRoutingReasonCode(reasonCode, model string) {
+	if reasonCode == "" {
+		reasonCode = "unknown"
+	}
+	if model == "" {
+		model = "unknown"
+	}
+	RoutingReasonCodes.WithLabelValues(reasonCode, model).Inc()
+}
+
 // RecordModelTokensDetailed records detailed token usage (prompt and completion)
 func RecordModelTokensDetailed(model string, promptTokens, completionTokens float64) {
 	// Record in both the aggregated and detailed metrics
```
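Since both collectors are created with `promauto`, they register themselves on the Prometheus default registry at package init and appear on whatever `/metrics` endpoint the router already serves. A standalone sketch of the same pattern (the port and `main` function are illustrative, not the router's actual wiring):

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Same promauto.NewCounterVec pattern as the diff above.
var routingReasonCodes = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: "llm_routing_reason_codes_total",
		Help: "The total number of routing decisions by reason code and model",
	},
	[]string{"reason_code", "model"},
)

func main() {
	routingReasonCodes.WithLabelValues("auto_routing", "phi4").Inc()

	// Scraping http://localhost:2112/metrics then shows:
	// llm_routing_reason_codes_total{model="phi4",reason_code="auto_routing"} 1
	http.Handle("/metrics", promhttp.Handler())
	http.ListenAndServe(":2112", nil)
}
```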
src/semantic-router/pkg/observability/ (new file)

Lines changed: 28 additions & 0 deletions

```diff
@@ -0,0 +1,28 @@
+package observability
+
+import (
+	"encoding/json"
+	"log"
+	"time"
+)
+
+// LogEvent emits a structured JSON log line with a standard envelope
+// Fields provided by callers take precedence and will not be overwritten.
+func LogEvent(event string, fields map[string]interface{}) {
+	if fields == nil {
+		fields = map[string]interface{}{}
+	}
+	if _, ok := fields["event"]; !ok {
+		fields["event"] = event
+	}
+	if _, ok := fields["ts"]; !ok {
+		fields["ts"] = time.Now().UTC().Format(time.RFC3339Nano)
+	}
+	b, err := json.Marshal(fields)
+	if err != nil {
+		// Fallback to regular log on marshal error
+		log.Printf("event=%s marshal_error=%v fields_len=%d", event, err, len(fields))
+		return
+	}
+	log.Println(string(b))
+}
```
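A hypothetical caller, to show the envelope `LogEvent` produces. Note that `json.Marshal` sorts map keys alphabetically, and the standard logger prepends its own date/time prefix to the JSON line:

```go
package main

// Import path matches the one added in request_handler.go above;
// the call site and field values here are made up for illustration.
import "github.com/vllm-project/semantic-router/semantic-router/pkg/observability"

func main() {
	observability.LogEvent("cache_hit", map[string]interface{}{
		"request_id": "req-123",
		"model":      "phi4",
	})
	// Output (timestamp varies):
	// {"event":"cache_hit","model":"phi4","request_id":"req-123","ts":"2025-01-01T00:00:00.000000000Z"}
}
```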

website/docs/api/router.md

Lines changed: 49 additions & 0 deletions
````diff
@@ -218,6 +218,55 @@ sum by (family, effort) (
 )
 ```
 
+### Cost and Routing Metrics
+
+The router exposes additional metrics for cost accounting and routing decisions.
+
+- `llm_model_cost_usd_total{model}`
+  - Description: Total accumulated USD cost attributed to each model (computed from token usage and per-1M pricing).
+  - Labels:
+    - model: model name used for the request
+
+- `llm_routing_reason_codes_total{reason_code, model}`
+  - Description: Count of routing decisions by reason code and selected model.
+  - Labels:
+    - reason_code: why a routing decision happened (e.g., auto_routing, model_specified, pii_policy_alternative_selected)
+    - model: final selected model
+
+Example PromQL:
+
+```prometheus
+# Cost by model over the last hour
+sum by (model) (increase(llm_model_cost_usd_total[1h]))
+
+# Routing decisions by reason code over the last 15 minutes
+sum by (reason_code) (increase(llm_routing_reason_codes_total[15m]))
+```
+
+### Pricing Configuration
+
+Provide per-1M pricing for your models so the router can compute request cost and emit metrics/logs.
+
+```yaml
+model_config:
+  phi4:
+    pricing:
+      prompt_usd_per_1m: 200.0
+      completion_usd_per_1m: 600.0
+  "mistral-small3.1":
+    pricing:
+      prompt_usd_per_1m: 300.0
+      completion_usd_per_1m: 900.0
+  gemma3:27b:
+    pricing:
+      prompt_usd_per_1m: 500.0
+      completion_usd_per_1m: 1500.0
+```
+
+Notes:
+- Pricing is optional; if omitted, cost is treated as 0 and only token metrics are emitted.
+- Cost is computed as: (prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000.
+
 ## gRPC ExtProc API
 
 For direct integration with the ExtProc protocol:
````
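To see how the `yaml` tags from config.go line up with the pricing block above, a small decoding sketch (assuming a gopkg.in/yaml.v3-style decoder; the type and field set are trimmed stand-ins, not the repo's actual config loader):

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Trimmed mirror of the pricing-related fields and tags from config.go.
type modelPricing struct {
	PromptUSDPer1M     float64 `yaml:"prompt_usd_per_1m,omitempty"`
	CompletionUSDPer1M float64 `yaml:"completion_usd_per_1m,omitempty"`
}

type modelParams struct {
	Pricing modelPricing `yaml:"pricing,omitempty"`
}

type routerConfig struct {
	ModelConfig map[string]modelParams `yaml:"model_config"`
}

const doc = `
model_config:
  phi4:
    pricing:
      prompt_usd_per_1m: 200.0
      completion_usd_per_1m: 600.0
`

func main() {
	var cfg routerConfig
	if err := yaml.Unmarshal([]byte(doc), &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", cfg.ModelConfig["phi4"].Pricing)
	// {PromptUSDPer1M:200 CompletionUSDPer1M:600}
}
```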
