diff --git a/deploy/llm-router-dashboard.json b/deploy/llm-router-dashboard.json
index a23a21e6..44bfb9a4 100644
--- a/deploy/llm-router-dashboard.json
+++ b/deploy/llm-router-dashboard.json
@@ -94,7 +94,7 @@
         },
         "disableTextWrap": false,
         "editorMode": "builder",
-        "expr": "sum by(category) (llm_category_classifications_total)",
+        "expr": "sum by(category) (llm_category_classifications_count)",
         "fullMetaSearch": false,
         "includeNullMetadata": true,
         "instant": false,
@@ -440,4 +440,4 @@
   "uid": "llm-router-metrics",
   "version": 12,
   "weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/src/semantic-router/pkg/consts/consts.go b/src/semantic-router/pkg/consts/consts.go
new file mode 100644
index 00000000..cf1486bf
--- /dev/null
+++ b/src/semantic-router/pkg/consts/consts.go
@@ -0,0 +1,5 @@
+package consts
+
+// UnknownLabel is a canonical fallback label value used across the codebase
+// when a more specific value (e.g., model, category, reason) is not available.
+const UnknownLabel = "unknown"
diff --git a/src/semantic-router/pkg/extproc/metrics_integration_test.go b/src/semantic-router/pkg/extproc/metrics_integration_test.go
index 397318a8..addf21c2 100644
--- a/src/semantic-router/pkg/extproc/metrics_integration_test.go
+++ b/src/semantic-router/pkg/extproc/metrics_integration_test.go
@@ -81,7 +81,10 @@ var _ = Describe("Metrics recording", func() {
 				StartTime: time.Now().Add(-1 * time.Second),
 			}
 
-			before := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
+			beforeTPOT := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
+
+			beforePrompt := getHistogramSampleCount("llm_prompt_tokens_per_request", ctx.RequestModel)
+			beforeCompletion := getHistogramSampleCount("llm_completion_tokens_per_request", ctx.RequestModel)
 
 			openAIResponse := map[string]interface{}{
 				"id": "chatcmpl-xyz",
@@ -111,7 +114,13 @@
 			Expect(err).NotTo(HaveOccurred())
 			Expect(response.GetResponseBody()).NotTo(BeNil())
 
-			after := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
-			Expect(after).To(BeNumerically(">", before))
+			afterTPOT := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
+			Expect(afterTPOT).To(BeNumerically(">", beforeTPOT))
+
+			// New per-request token histograms should also be recorded
+			afterPrompt := getHistogramSampleCount("llm_prompt_tokens_per_request", ctx.RequestModel)
+			afterCompletion := getHistogramSampleCount("llm_completion_tokens_per_request", ctx.RequestModel)
+			Expect(afterPrompt).To(BeNumerically(">", beforePrompt))
+			Expect(afterCompletion).To(BeNumerically(">", beforeCompletion))
 		})
 	})
diff --git a/src/semantic-router/pkg/extproc/reason_mode_selector.go b/src/semantic-router/pkg/extproc/reason_mode_selector.go
index 7380f54b..58f880a3 100644
--- a/src/semantic-router/pkg/extproc/reason_mode_selector.go
+++ b/src/semantic-router/pkg/extproc/reason_mode_selector.go
@@ -6,6 +6,7 @@ import (
 	"strings"
 
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/config"
+	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/consts"
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/metrics"
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability"
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/utils/entropy"
@@ -132,7 +133,7 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
 	}
 
 	// Determine model for kwargs and logging
-	model := "unknown"
+	model := consts.UnknownLabel
 	if modelValue, ok := requestMap["model"]; ok {
 		if modelStr, ok := modelValue.(string); ok {
 			model = modelStr
@@ -191,7 +192,7 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
 	// Record metrics for template usage and effort when enabled
 	if enabled {
 		familyConfig := r.getModelReasoningFamily(model)
-		modelFamily := "unknown"
+		modelFamily := consts.UnknownLabel
 		templateParam := "reasoning_effort" // default fallback
 
 		if familyConfig != nil {
diff --git a/src/semantic-router/pkg/metrics/metrics.go b/src/semantic-router/pkg/metrics/metrics.go
index f5e2db21..50fdd637 100644
--- a/src/semantic-router/pkg/metrics/metrics.go
+++ b/src/semantic-router/pkg/metrics/metrics.go
@@ -9,6 +9,7 @@ import (
 	"github.com/prometheus/client_golang/prometheus/promauto"
 
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/config"
+	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/consts"
 )
 
 // Minimal fallback bucket configurations - used only when configuration is completely missing
@@ -147,6 +148,26 @@ var (
 		[]string{"model"},
 	)
 
+	// PromptTokensPerRequest tracks the distribution of prompt tokens per request by model
+	PromptTokensPerRequest = promauto.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "llm_prompt_tokens_per_request",
+			Help:    "Distribution of prompt tokens per request by model",
+			Buckets: []float64{0, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384},
+		},
+		[]string{"model"},
+	)
+
+	// CompletionTokensPerRequest tracks the distribution of completion tokens per request by model
+	CompletionTokensPerRequest = promauto.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "llm_completion_tokens_per_request",
+			Help:    "Distribution of completion tokens per request by model",
+			Buckets: []float64{0, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384},
+		},
+		[]string{"model"},
+	)
+
 	// ModelRoutingModifications tracks when a model is changed from one to another
 	ModelRoutingModifications = promauto.NewCounterVec(
 		prometheus.CounterOpts{
@@ -258,11 +279,12 @@ var (
 		[]string{"backend"},
 	)
 
-	// CategoryClassifications tracks the number of times each category is classified
-	CategoryClassifications = promauto.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Name: "llm_category_classifications_total",
-			Help: "The total number of times each category is classified",
+	// CategoryClassificationsCount is an alias with a name preferred by the issue request.
+	// It mirrors CategoryClassifications and is incremented alongside it for compatibility.
+	CategoryClassificationsCount = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "llm_category_classifications_count",
+			Help: "The total number of times each category is classified (alias metric)",
 		},
 		[]string{"category"},
 	)
@@ -363,7 +385,7 @@ var (
 // RecordModelRequest increments the counter for requests to a specific model
 func RecordModelRequest(model string) {
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	ModelRequests.WithLabelValues(model).Inc()
 }
@@ -371,10 +393,10 @@ func RecordModelRequest(model string) {
 // RecordRequestError increments request error counters labeled by model and normalized reason
 func RecordRequestError(model, reason string) {
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	if reason == "" {
-		reason = "unknown"
+		reason = consts.UnknownLabel
 	}
 	// Normalize a few common variants to canonical reasons
 	switch reason {
@@ -414,10 +436,10 @@ func RecordModelCost(model string, currency string, amount float64) {
 // RecordRoutingReasonCode increments the counter for a routing decision reason code and model
 func RecordRoutingReasonCode(reasonCode, model string) {
 	if reasonCode == "" {
-		reasonCode = "unknown"
+		reasonCode = consts.UnknownLabel
 	}
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	RoutingReasonCodes.WithLabelValues(reasonCode, model).Inc()
 }
@@ -429,6 +451,13 @@ func RecordModelTokensDetailed(model string, promptTokens, completionTokens floa
 	ModelTokens.WithLabelValues(model).Add(totalTokens)
 	ModelPromptTokens.WithLabelValues(model).Add(promptTokens)
 	ModelCompletionTokens.WithLabelValues(model).Add(completionTokens)
+
+	// Also record per-request histograms for visibility into distribution
+	if model == "" {
+		model = consts.UnknownLabel
+	}
+	PromptTokensPerRequest.WithLabelValues(model).Observe(promptTokens)
+	CompletionTokensPerRequest.WithLabelValues(model).Observe(completionTokens)
 }
 
 // RecordModelCompletionLatency records the latency of a model completion
@@ -442,7 +471,7 @@ func RecordModelTTFT(model string, seconds float64) {
 		return
 	}
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	ModelTTFT.WithLabelValues(model).Observe(seconds)
 }
@@ -453,7 +482,7 @@ func RecordModelTPOT(model string, secondsPerToken float64) {
 		return
 	}
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	ModelTPOT.WithLabelValues(model).Observe(secondsPerToken)
 }
@@ -484,9 +513,12 @@ func UpdateCacheEntries(backend string, count int) {
 	CacheEntriesTotal.WithLabelValues(backend).Set(float64(count))
 }
 
-// RecordCategoryClassification increments the gauge for a specific category classification
+// RecordCategoryClassification increments the counter for a specific category classification
 func RecordCategoryClassification(category string) {
-	CategoryClassifications.WithLabelValues(category).Inc()
+	if category == "" {
+		category = consts.UnknownLabel
+	}
+	CategoryClassificationsCount.WithLabelValues(category).Inc()
 }
 
 // RecordPIIViolation records a PII policy violation for a specific model and PII data type
@@ -544,7 +576,7 @@ func GetBatchSizeRange(size int) string {
 	}
 
 	// Fallback for unexpected cases
-	return "unknown"
+	return consts.UnknownLabel
 }
 
 // GetBatchSizeRangeFromBuckets generates range labels based on size buckets
@@ -725,7 +757,7 @@ func RecordReasoningDecision(category, model string, enabled bool, effort string
 // RecordReasoningTemplateUsage records usage of a model-family-specific template parameter
 func RecordReasoningTemplateUsage(family, param string) {
 	if family == "" {
-		family = "unknown"
+		family = consts.UnknownLabel
 	}
 	if param == "" {
 		param = "none"
@@ -736,7 +768,7 @@ func RecordReasoningTemplateUsage(family, param string) {
 // RecordReasoningEffortUsage records the effort usage by model family
 func RecordReasoningEffortUsage(family, effort string) {
 	if family == "" {
-		family = "unknown"
+		family = consts.UnknownLabel
 	}
 	if effort == "" {
 		effort = "unspecified"
@@ -747,7 +779,7 @@ func RecordReasoningEffortUsage(family, effort string) {
 // RecordEntropyClassificationDecision records an entropy-based classification decision
 func RecordEntropyClassificationDecision(uncertaintyLevel string, reasoningEnabled bool, decisionReason string, topCategory string) {
 	if uncertaintyLevel == "" {
-		uncertaintyLevel = "unknown"
+		uncertaintyLevel = consts.UnknownLabel
 	}
 	if decisionReason == "" {
 		decisionReason = "unspecified"
@@ -767,7 +799,7 @@ func RecordEntropyClassificationDecision(uncertaintyLevel string, reasoningEnabl
 // RecordEntropyValue records the entropy value for a classification
 func RecordEntropyValue(category string, classificationType string, entropyValue float64) {
 	if category == "" {
-		category = "unknown"
+		category = consts.UnknownLabel
 	}
 	if classificationType == "" {
 		classificationType = "standard"
@@ -779,7 +811,7 @@ func RecordEntropyValue(category string, classificationType string, entropyValue
 // RecordClassificationConfidence records the confidence score from classification
 func RecordClassificationConfidence(category string, classificationMethod string, confidence float64) {
 	if category == "" {
-		category = "unknown"
+		category = consts.UnknownLabel
 	}
 	if classificationMethod == "" {
 		classificationMethod = "traditional"
@@ -796,10 +828,10 @@ func RecordEntropyClassificationLatency(seconds float64) {
 // RecordProbabilityDistributionQuality records quality checks for probability distributions
 func RecordProbabilityDistributionQuality(qualityCheck string, status string) {
 	if qualityCheck == "" {
-		qualityCheck = "unknown"
+		qualityCheck = consts.UnknownLabel
 	}
 	if status == "" {
-		status = "unknown"
+		status = consts.UnknownLabel
 	}
 
 	ProbabilityDistributionQuality.WithLabelValues(qualityCheck, status).Inc()
@@ -808,7 +840,7 @@ func RecordProbabilityDistributionQuality(qualityCheck string, status string) {
 // RecordEntropyFallback records when entropy-based routing falls back to traditional methods
 func RecordEntropyFallback(fallbackReason string, fallbackStrategy string) {
 	if fallbackReason == "" {
-		fallbackReason = "unknown"
+		fallbackReason = consts.UnknownLabel
 	}
 	if fallbackStrategy == "" {
 		fallbackStrategy = "unspecified"
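
Usage sketch (not part of the patch): a minimal Go program showing how a process that imports the updated metrics package would exercise the new per-request token histograms and the renamed category counter, and how the data could be queried from the dashboard. The exporter setup, the :9190 port, and the model/category label values are illustrative assumptions; it also assumes RecordModelTokensDetailed keeps the (model string, promptTokens, completionTokens float64) signature implied by the hunk above.

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/metrics"
)

func main() {
	// Hypothetical label values, for illustration only. This updates the existing
	// token counters and, with this change, also observes the new
	// llm_prompt_tokens_per_request / llm_completion_tokens_per_request histograms.
	metrics.RecordModelTokensDetailed("my-model", 512, 128)

	// Increments llm_category_classifications_count, the series name the
	// updated dashboard panel now queries.
	metrics.RecordCategoryClassification("math")

	// promauto registers the collectors in the default registry, so exposing it is enough.
	// Example dashboard query against the new histogram:
	//   histogram_quantile(0.95, sum by (le, model) (rate(llm_prompt_tokens_per_request_bucket[5m])))
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":9190", nil))
}
```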