metrics: Add request-level token histograms (vllm-project#157)

tao12345666333 · yossiovadia · commit e3ac4734a312 · 2025-09-22T10:15:13.000-07:00
* metrics: Add request-level token histograms

Signed-off-by: Jintao Zhang &lt;zhangjintao9020@gmail.com&gt;

* add unknown const

Signed-off-by: Jintao Zhang &lt;zhangjintao9020@gmail.com&gt;

---------

Signed-off-by: Jintao Zhang &lt;zhangjintao9020@gmail.com&gt;
diff --git a/deploy/llm-router-dashboard.json b/deploy/llm-router-dashboard.json
@@ -94,7 +94,7 @@
           },
           "disableTextWrap": false,
           "editorMode": "builder",
-          "expr": "sum by(category) (llm_category_classifications_total)",
+          "expr": "sum by(category) (llm_category_classifications_count)",
           "fullMetaSearch": false,
           "includeNullMetadata": true,
           "instant": false,
@@ -440,4 +440,4 @@
   "uid": "llm-router-metrics",
   "version": 12,
   "weekStart": ""
-}
+}
diff --git a/src/semantic-router/pkg/consts/consts.go b/src/semantic-router/pkg/consts/consts.go
@@ -0,0 +1,5 @@
+package consts
+
+// UnknownLabel is a canonical fallback label value used across the codebase
+// when a more specific value (e.g., model, category, reason) is not available.
+const UnknownLabel = "unknown"
diff --git a/src/semantic-router/pkg/extproc/metrics_integration_test.go b/src/semantic-router/pkg/extproc/metrics_integration_test.go
@@ -81,7 +81,10 @@ var _ = Describe("Metrics recording", func() {
 			StartTime:    time.Now().Add(-1 * time.Second),
 		}
 
-		before := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
+		beforeTPOT := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
+
+		beforePrompt := getHistogramSampleCount("llm_prompt_tokens_per_request", ctx.RequestModel)
+		beforeCompletion := getHistogramSampleCount("llm_completion_tokens_per_request", ctx.RequestModel)
 
 		openAIResponse := map[string]interface{}{
 			"id":      "chatcmpl-xyz",
@@ -111,7 +114,13 @@ var _ = Describe("Metrics recording", func() {
 		Expect(err).NotTo(HaveOccurred())
 		Expect(response.GetResponseBody()).NotTo(BeNil())
 
-		after := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
-		Expect(after).To(BeNumerically(">", before))
+		afterTPOT := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
+		Expect(afterTPOT).To(BeNumerically(">", beforeTPOT))
+
+		// New per-request token histograms should also be recorded
+		afterPrompt := getHistogramSampleCount("llm_prompt_tokens_per_request", ctx.RequestModel)
+		afterCompletion := getHistogramSampleCount("llm_completion_tokens_per_request", ctx.RequestModel)
+		Expect(afterPrompt).To(BeNumerically(">", beforePrompt))
+		Expect(afterCompletion).To(BeNumerically(">", beforeCompletion))
 	})
 })
diff --git a/src/semantic-router/pkg/extproc/reason_mode_selector.go b/src/semantic-router/pkg/extproc/reason_mode_selector.go
@@ -6,6 +6,7 @@ import (
 	"strings"
 
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/config"
+	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/consts"
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/metrics"
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability"
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/utils/entropy"
@@ -132,7 +133,7 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
 	}
 
 	// Determine model for kwargs and logging
-	model := "unknown"
+	model := consts.UnknownLabel
 	if modelValue, ok := requestMap["model"]; ok {
 		if modelStr, ok := modelValue.(string); ok {
 			model = modelStr
@@ -191,7 +192,7 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
 	// Record metrics for template usage and effort when enabled
 	if enabled {
 		familyConfig := r.getModelReasoningFamily(model)
-		modelFamily := "unknown"
+		modelFamily := consts.UnknownLabel
 		templateParam := "reasoning_effort" // default fallback
 
 		if familyConfig != nil {
diff --git a/src/semantic-router/pkg/metrics/metrics.go b/src/semantic-router/pkg/metrics/metrics.go
@@ -9,6 +9,7 @@ import (
 	"github.com/prometheus/client_golang/prometheus/promauto"
 
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/config"
+	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/consts"
 )
 
 // Minimal fallback bucket configurations - used only when configuration is completely missing
@@ -147,6 +148,26 @@ var (
 		[]string{"model"},
 	)
 
+	// PromptTokensPerRequest tracks the distribution of prompt tokens per request by model
+	PromptTokensPerRequest = promauto.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "llm_prompt_tokens_per_request",
+			Help:    "Distribution of prompt tokens per request by model",
+			Buckets: []float64{0, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384},
+		},
+		[]string{"model"},
+	)
+
+	// CompletionTokensPerRequest tracks the distribution of completion tokens per request by model
+	CompletionTokensPerRequest = promauto.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "llm_completion_tokens_per_request",
+			Help:    "Distribution of completion tokens per request by model",
+			Buckets: []float64{0, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384},
+		},
+		[]string{"model"},
+	)
+
 	// ModelRoutingModifications tracks when a model is changed from one to another
 	ModelRoutingModifications = promauto.NewCounterVec(
 		prometheus.CounterOpts{
@@ -258,11 +279,12 @@ var (
 		[]string{"backend"},
 	)
 
-	// CategoryClassifications tracks the number of times each category is classified
-	CategoryClassifications = promauto.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Name: "llm_category_classifications_total",
-			Help: "The total number of times each category is classified",
+	// CategoryClassificationsCount is an alias with a name preferred by the issue request.
+	// It mirrors CategoryClassifications and is incremented alongside it for compatibility.
+	CategoryClassificationsCount = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "llm_category_classifications_count",
+			Help: "The total number of times each category is classified (alias metric)",
 		},
 		[]string{"category"},
 	)
@@ -363,18 +385,18 @@ var (
 // RecordModelRequest increments the counter for requests to a specific model
 func RecordModelRequest(model string) {
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	ModelRequests.WithLabelValues(model).Inc()
 }
 
 // RecordRequestError increments request error counters labeled by model and normalized reason
 func RecordRequestError(model, reason string) {
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	if reason == "" {
-		reason = "unknown"
+		reason = consts.UnknownLabel
 	}
 	// Normalize a few common variants to canonical reasons
 	switch reason {
@@ -414,10 +436,10 @@ func RecordModelCost(model string, currency string, amount float64) {
 // RecordRoutingReasonCode increments the counter for a routing decision reason code and model
 func RecordRoutingReasonCode(reasonCode, model string) {
 	if reasonCode == "" {
-		reasonCode = "unknown"
+		reasonCode = consts.UnknownLabel
 	}
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	RoutingReasonCodes.WithLabelValues(reasonCode, model).Inc()
 }
@@ -429,6 +451,13 @@ func RecordModelTokensDetailed(model string, promptTokens, completionTokens floa
 	ModelTokens.WithLabelValues(model).Add(totalTokens)
 	ModelPromptTokens.WithLabelValues(model).Add(promptTokens)
 	ModelCompletionTokens.WithLabelValues(model).Add(completionTokens)
+
+	// Also record per-request histograms for visibility into distribution
+	if model == "" {
+		model = consts.UnknownLabel
+	}
+	PromptTokensPerRequest.WithLabelValues(model).Observe(promptTokens)
+	CompletionTokensPerRequest.WithLabelValues(model).Observe(completionTokens)
 }
 
 // RecordModelCompletionLatency records the latency of a model completion
@@ -442,7 +471,7 @@ func RecordModelTTFT(model string, seconds float64) {
 		return
 	}
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	ModelTTFT.WithLabelValues(model).Observe(seconds)
 }
@@ -453,7 +482,7 @@ func RecordModelTPOT(model string, secondsPerToken float64) {
 		return
 	}
 	if model == "" {
-		model = "unknown"
+		model = consts.UnknownLabel
 	}
 	ModelTPOT.WithLabelValues(model).Observe(secondsPerToken)
 }
@@ -484,9 +513,12 @@ func UpdateCacheEntries(backend string, count int) {
 	CacheEntriesTotal.WithLabelValues(backend).Set(float64(count))
 }
 
-// RecordCategoryClassification increments the gauge for a specific category classification
+// RecordCategoryClassification increments the counter for a specific category classification
 func RecordCategoryClassification(category string) {
-	CategoryClassifications.WithLabelValues(category).Inc()
+	if category == "" {
+		category = consts.UnknownLabel
+	}
+	CategoryClassificationsCount.WithLabelValues(category).Inc()
 }
 
 // RecordPIIViolation records a PII policy violation for a specific model and PII data type
@@ -544,7 +576,7 @@ func GetBatchSizeRange(size int) string {
 	}
 
 	// Fallback for unexpected cases
-	return "unknown"
+	return consts.UnknownLabel
 }
 
 // GetBatchSizeRangeFromBuckets generates range labels based on size buckets
@@ -725,7 +757,7 @@ func RecordReasoningDecision(category, model string, enabled bool, effort string
 // RecordReasoningTemplateUsage records usage of a model-family-specific template parameter
 func RecordReasoningTemplateUsage(family, param string) {
 	if family == "" {
-		family = "unknown"
+		family = consts.UnknownLabel
 	}
 	if param == "" {
 		param = "none"
@@ -736,7 +768,7 @@ func RecordReasoningTemplateUsage(family, param string) {
 // RecordReasoningEffortUsage records the effort usage by model family
 func RecordReasoningEffortUsage(family, effort string) {
 	if family == "" {
-		family = "unknown"
+		family = consts.UnknownLabel
 	}
 	if effort == "" {
 		effort = "unspecified"
@@ -747,7 +779,7 @@ func RecordReasoningEffortUsage(family, effort string) {
 // RecordEntropyClassificationDecision records an entropy-based classification decision
 func RecordEntropyClassificationDecision(uncertaintyLevel string, reasoningEnabled bool, decisionReason string, topCategory string) {
 	if uncertaintyLevel == "" {
-		uncertaintyLevel = "unknown"
+		uncertaintyLevel = consts.UnknownLabel
 	}
 	if decisionReason == "" {
 		decisionReason = "unspecified"
@@ -767,7 +799,7 @@ func RecordEntropyClassificationDecision(uncertaintyLevel string, reasoningEnabl
 // RecordEntropyValue records the entropy value for a classification
 func RecordEntropyValue(category string, classificationType string, entropyValue float64) {
 	if category == "" {
-		category = "unknown"
+		category = consts.UnknownLabel
 	}
 	if classificationType == "" {
 		classificationType = "standard"
@@ -779,7 +811,7 @@ func RecordEntropyValue(category string, classificationType string, entropyValue
 // RecordClassificationConfidence records the confidence score from classification
 func RecordClassificationConfidence(category string, classificationMethod string, confidence float64) {
 	if category == "" {
-		category = "unknown"
+		category = consts.UnknownLabel
 	}
 	if classificationMethod == "" {
 		classificationMethod = "traditional"
@@ -796,10 +828,10 @@ func RecordEntropyClassificationLatency(seconds float64) {
 // RecordProbabilityDistributionQuality records quality checks for probability distributions
 func RecordProbabilityDistributionQuality(qualityCheck string, status string) {
 	if qualityCheck == "" {
-		qualityCheck = "unknown"
+		qualityCheck = consts.UnknownLabel
 	}
 	if status == "" {
-		status = "unknown"
+		status = consts.UnknownLabel
 	}
 
 	ProbabilityDistributionQuality.WithLabelValues(qualityCheck, status).Inc()
@@ -808,7 +840,7 @@ func RecordProbabilityDistributionQuality(qualityCheck string, status string) {
 // RecordEntropyFallback records when entropy-based routing falls back to traditional methods
 func RecordEntropyFallback(fallbackReason string, fallbackStrategy string) {
 	if fallbackReason == "" {
-		fallbackReason = "unknown"
+		fallbackReason = consts.UnknownLabel
 	}
 	if fallbackStrategy == "" {
 		fallbackStrategy = "unspecified"