Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions deploy/llm-router-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "sum by(category) (llm_category_classifications_total)",
"expr": "sum by(category) (llm_category_classifications_count)",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
Expand Down Expand Up @@ -440,4 +440,4 @@
"uid": "llm-router-metrics",
"version": 12,
"weekStart": ""
}
}
5 changes: 5 additions & 0 deletions src/semantic-router/pkg/consts/consts.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package consts

// UnknownLabel is the canonical placeholder recorded in metric labels and log
// fields whenever a more specific value (model, category, reason, ...) cannot
// be determined.
const (
	UnknownLabel = "unknown"
)
15 changes: 12 additions & 3 deletions src/semantic-router/pkg/extproc/metrics_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ var _ = Describe("Metrics recording", func() {
StartTime: time.Now().Add(-1 * time.Second),
}

before := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
beforeTPOT := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)

beforePrompt := getHistogramSampleCount("llm_prompt_tokens_per_request", ctx.RequestModel)
beforeCompletion := getHistogramSampleCount("llm_completion_tokens_per_request", ctx.RequestModel)

openAIResponse := map[string]interface{}{
"id": "chatcmpl-xyz",
Expand Down Expand Up @@ -111,7 +114,13 @@ var _ = Describe("Metrics recording", func() {
Expect(err).NotTo(HaveOccurred())
Expect(response.GetResponseBody()).NotTo(BeNil())

after := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
Expect(after).To(BeNumerically(">", before))
afterTPOT := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
Expect(afterTPOT).To(BeNumerically(">", beforeTPOT))

// New per-request token histograms should also be recorded
afterPrompt := getHistogramSampleCount("llm_prompt_tokens_per_request", ctx.RequestModel)
afterCompletion := getHistogramSampleCount("llm_completion_tokens_per_request", ctx.RequestModel)
Expect(afterPrompt).To(BeNumerically(">", beforePrompt))
Expect(afterCompletion).To(BeNumerically(">", beforeCompletion))
})
})
5 changes: 3 additions & 2 deletions src/semantic-router/pkg/extproc/reason_mode_selector.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strings"

"github.com/vllm-project/semantic-router/src/semantic-router/pkg/config"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/consts"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/metrics"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/utils/entropy"
Expand Down Expand Up @@ -132,7 +133,7 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
}

// Determine model for kwargs and logging
model := "unknown"
model := consts.UnknownLabel
if modelValue, ok := requestMap["model"]; ok {
if modelStr, ok := modelValue.(string); ok {
model = modelStr
Expand Down Expand Up @@ -191,7 +192,7 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
// Record metrics for template usage and effort when enabled
if enabled {
familyConfig := r.getModelReasoningFamily(model)
modelFamily := "unknown"
modelFamily := consts.UnknownLabel
templateParam := "reasoning_effort" // default fallback

if familyConfig != nil {
Expand Down
78 changes: 55 additions & 23 deletions src/semantic-router/pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/vllm-project/semantic-router/src/semantic-router/pkg/config"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/consts"
)

// Minimal fallback bucket configurations - used only when configuration is completely missing
Expand Down Expand Up @@ -147,6 +148,26 @@ var (
[]string{"model"},
)

	// PromptTokensPerRequest tracks the distribution of prompt tokens per request by model.
	// Unlike the running ModelPromptTokens counter, each request contributes exactly one
	// observation (see RecordModelTokensDetailed), so percentiles per request are derivable.
	PromptTokensPerRequest = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "llm_prompt_tokens_per_request",
			Help: "Distribution of prompt tokens per request by model",
			// Power-of-two token buckets; the leading 0 bucket isolates empty prompts.
			Buckets: []float64{0, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384},
		},
		[]string{"model"},
	)

	// CompletionTokensPerRequest tracks the distribution of completion tokens per request by model.
	// Observed once per request alongside PromptTokensPerRequest in RecordModelTokensDetailed.
	CompletionTokensPerRequest = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "llm_completion_tokens_per_request",
			Help: "Distribution of completion tokens per request by model",
			// Same power-of-two buckets as the prompt-token histogram for easy comparison.
			Buckets: []float64{0, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384},
		},
		[]string{"model"},
	)

// ModelRoutingModifications tracks when a model is changed from one to another
ModelRoutingModifications = promauto.NewCounterVec(
prometheus.CounterOpts{
Expand Down Expand Up @@ -258,11 +279,12 @@ var (
[]string{"backend"},
)

// CategoryClassifications tracks the number of times each category is classified
CategoryClassifications = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "llm_category_classifications_total",
Help: "The total number of times each category is classified",
// CategoryClassificationsCount is an alias with a name preferred by the issue request.
// It mirrors CategoryClassifications and is incremented alongside it for compatibility.
CategoryClassificationsCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "llm_category_classifications_count",
Help: "The total number of times each category is classified (alias metric)",
},
[]string{"category"},
)
Expand Down Expand Up @@ -363,18 +385,18 @@ var (
// RecordModelRequest increments the counter for requests to a specific model.
// A blank model name is normalized to the canonical unknown label so the
// metric never carries an empty label value.
func RecordModelRequest(model string) {
	label := model
	if label == "" {
		label = consts.UnknownLabel
	}
	ModelRequests.WithLabelValues(label).Inc()
}

// RecordRequestError increments request error counters labeled by model and normalized reason
func RecordRequestError(model, reason string) {
if model == "" {
model = "unknown"
model = consts.UnknownLabel
}
if reason == "" {
reason = "unknown"
reason = consts.UnknownLabel
}
// Normalize a few common variants to canonical reasons
switch reason {
Expand Down Expand Up @@ -414,10 +436,10 @@ func RecordModelCost(model string, currency string, amount float64) {
// RecordRoutingReasonCode increments the counter for a routing decision
// reason code and model. Blank inputs are normalized to the canonical
// unknown label before being used as label values.
func RecordRoutingReasonCode(reasonCode, model string) {
	code, mdl := reasonCode, model
	if code == "" {
		code = consts.UnknownLabel
	}
	if mdl == "" {
		mdl = consts.UnknownLabel
	}
	RoutingReasonCodes.WithLabelValues(code, mdl).Inc()
}
Expand All @@ -429,6 +451,13 @@ func RecordModelTokensDetailed(model string, promptTokens, completionTokens floa
ModelTokens.WithLabelValues(model).Add(totalTokens)
ModelPromptTokens.WithLabelValues(model).Add(promptTokens)
ModelCompletionTokens.WithLabelValues(model).Add(completionTokens)

// Also record per-request histograms for visibility into distribution
if model == "" {
model = consts.UnknownLabel
}
PromptTokensPerRequest.WithLabelValues(model).Observe(promptTokens)
CompletionTokensPerRequest.WithLabelValues(model).Observe(completionTokens)
}

// RecordModelCompletionLatency records the latency of a model completion
Expand All @@ -442,7 +471,7 @@ func RecordModelTTFT(model string, seconds float64) {
return
}
if model == "" {
model = "unknown"
model = consts.UnknownLabel
}
ModelTTFT.WithLabelValues(model).Observe(seconds)
}
Expand All @@ -453,7 +482,7 @@ func RecordModelTPOT(model string, secondsPerToken float64) {
return
}
if model == "" {
model = "unknown"
model = consts.UnknownLabel
}
ModelTPOT.WithLabelValues(model).Observe(secondsPerToken)
}
Expand Down Expand Up @@ -484,9 +513,12 @@ func UpdateCacheEntries(backend string, count int) {
CacheEntriesTotal.WithLabelValues(backend).Set(float64(count))
}

// RecordCategoryClassification increments the gauge for a specific category classification
// RecordCategoryClassification increments the counter for a specific category classification
func RecordCategoryClassification(category string) {
CategoryClassifications.WithLabelValues(category).Inc()
if category == "" {
category = consts.UnknownLabel
}
CategoryClassificationsCount.WithLabelValues(category).Inc()
}

// RecordPIIViolation records a PII policy violation for a specific model and PII data type
Expand Down Expand Up @@ -544,7 +576,7 @@ func GetBatchSizeRange(size int) string {
}

// Fallback for unexpected cases
return "unknown"
return consts.UnknownLabel
}

// GetBatchSizeRangeFromBuckets generates range labels based on size buckets
Expand Down Expand Up @@ -725,7 +757,7 @@ func RecordReasoningDecision(category, model string, enabled bool, effort string
// RecordReasoningTemplateUsage records usage of a model-family-specific template parameter
func RecordReasoningTemplateUsage(family, param string) {
if family == "" {
family = "unknown"
family = consts.UnknownLabel
}
if param == "" {
param = "none"
Expand All @@ -736,7 +768,7 @@ func RecordReasoningTemplateUsage(family, param string) {
// RecordReasoningEffortUsage records the effort usage by model family
func RecordReasoningEffortUsage(family, effort string) {
if family == "" {
family = "unknown"
family = consts.UnknownLabel
}
if effort == "" {
effort = "unspecified"
Expand All @@ -747,7 +779,7 @@ func RecordReasoningEffortUsage(family, effort string) {
// RecordEntropyClassificationDecision records an entropy-based classification decision
func RecordEntropyClassificationDecision(uncertaintyLevel string, reasoningEnabled bool, decisionReason string, topCategory string) {
if uncertaintyLevel == "" {
uncertaintyLevel = "unknown"
uncertaintyLevel = consts.UnknownLabel
}
if decisionReason == "" {
decisionReason = "unspecified"
Expand All @@ -767,7 +799,7 @@ func RecordEntropyClassificationDecision(uncertaintyLevel string, reasoningEnabl
// RecordEntropyValue records the entropy value for a classification
func RecordEntropyValue(category string, classificationType string, entropyValue float64) {
if category == "" {
category = "unknown"
category = consts.UnknownLabel
}
if classificationType == "" {
classificationType = "standard"
Expand All @@ -779,7 +811,7 @@ func RecordEntropyValue(category string, classificationType string, entropyValue
// RecordClassificationConfidence records the confidence score from classification
func RecordClassificationConfidence(category string, classificationMethod string, confidence float64) {
if category == "" {
category = "unknown"
category = consts.UnknownLabel
}
if classificationMethod == "" {
classificationMethod = "traditional"
Expand All @@ -796,10 +828,10 @@ func RecordEntropyClassificationLatency(seconds float64) {
// RecordProbabilityDistributionQuality records quality checks for probability distributions
func RecordProbabilityDistributionQuality(qualityCheck string, status string) {
if qualityCheck == "" {
qualityCheck = "unknown"
qualityCheck = consts.UnknownLabel
}
if status == "" {
status = "unknown"
status = consts.UnknownLabel
}

ProbabilityDistributionQuality.WithLabelValues(qualityCheck, status).Inc()
Expand All @@ -808,7 +840,7 @@ func RecordProbabilityDistributionQuality(qualityCheck string, status string) {
// RecordEntropyFallback records when entropy-based routing falls back to traditional methods
func RecordEntropyFallback(fallbackReason string, fallbackStrategy string) {
if fallbackReason == "" {
fallbackReason = "unknown"
fallbackReason = consts.UnknownLabel
}
if fallbackStrategy == "" {
fallbackStrategy = "unspecified"
Expand Down
Loading