From dca51492bb19831be376da81a3f79693a3508865 Mon Sep 17 00:00:00 2001 From: Jintao Zhang Date: Fri, 5 Sep 2025 06:23:38 +0800 Subject: [PATCH 1/2] feat: reasoning model controller Signed-off-by: Jintao Zhang --- .../pkg/extproc/reason_mode_selector.go | 44 ++++++++++++++- .../pkg/extproc/request_handler.go | 3 + src/semantic-router/pkg/metrics/metrics.go | 56 +++++++++++++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/src/semantic-router/pkg/extproc/reason_mode_selector.go b/src/semantic-router/pkg/extproc/reason_mode_selector.go index 00b97792..2d41dc25 100644 --- a/src/semantic-router/pkg/extproc/reason_mode_selector.go +++ b/src/semantic-router/pkg/extproc/reason_mode_selector.go @@ -5,6 +5,8 @@ import ( "fmt" "log" "strings" + + "github.com/vllm-project/semantic-router/semantic-router/pkg/metrics" ) // shouldUseReasoningMode determines if reasoning mode should be enabled based on the query category @@ -45,6 +47,25 @@ func (r *OpenAIRouter) getReasoningModeAndCategory(query string) (bool, string) return false, categoryName } +// getModelFamilyAndTemplateParam returns a normalized model family name and the template param to be used (if any) +func getModelFamilyAndTemplateParam(model string) (string, string) { + lower := strings.ToLower(strings.TrimSpace(model)) + if strings.Contains(lower, "qwen3") { + return "qwen3", "enable_thinking" + } + if strings.Contains(lower, "deepseek") || strings.Contains(lower, "ds") { + return "deepseek", "thinking" + } + // GPT-OSS family and generic GPT fall back to using reasoning_effort (OpenAI-compatible field) + if strings.Contains(lower, "gpt-oss") || strings.Contains(lower, "gpt_oss") { + return "gpt-oss", "reasoning_effort" + } + if strings.Contains(lower, "gpt") { + return "gpt", "reasoning_effort" + } + return "unknown", "" +} + // getChatTemplateKwargs returns the appropriate chat template kwargs based on model and reasoning mode func getChatTemplateKwargs(model string, useReasoning bool) map[string]interface{} { lower := strings.ToLower(strings.TrimSpace(model)) @@ -83,8 +104,11 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled } } + family, param := getModelFamilyAndTemplateParam(model) + // Add chat_template_kwargs for reasoning mode - if kwargs := getChatTemplateKwargs(model, enabled); kwargs != nil { + kwargs := getChatTemplateKwargs(model, enabled) + if kwargs != nil { requestMap["chat_template_kwargs"] = kwargs } else { delete(requestMap, "chat_template_kwargs") @@ -96,17 +120,35 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled // This seems to be the default for openai/gpt-oss models originalReasoningEffort = "low" } + var appliedEffort string if enabled { // Use configurable reasoning effort based on category effort := r.getReasoningEffort(categoryName) requestMap["reasoning_effort"] = effort + appliedEffort = effort } else { requestMap["reasoning_effort"] = originalReasoningEffort + if s, ok := originalReasoningEffort.(string); ok { + appliedEffort = s + } } log.Printf("Original reasoning effort: %s", originalReasoningEffort) log.Printf("Added reasoning mode (enabled: %v) and reasoning effort (%s) to request for model: %s", enabled, requestMap["reasoning_effort"], model) + // Record metrics for template usage and effort when enabled + if enabled { + // If we applied a known template param, record its usage + if kwargs != nil && param != "" { + metrics.RecordReasoningTemplateUsage(family, param) + } else if kwargs == nil && param == "reasoning_effort" { + // For GPT/GPT-OSS, we only set reasoning_effort + metrics.RecordReasoningTemplateUsage(family, param) + } + // Record which effort level was used for this family + metrics.RecordReasoningEffortUsage(family, appliedEffort) + } + // Serialize back to JSON modifiedBody, err := json.Marshal(requestMap) if err != nil { diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 09e40480..1e8ff7b5 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -337,6 +337,9 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe // Check reasoning mode for this category useReasoning, categoryName := r.getReasoningModeAndCategory(userContent) log.Printf("Reasoning mode decision for this query: %v on [%s] model", useReasoning, matchedModel) + // Record reasoning decision metric with the effort that will be applied if enabled + effortForMetrics := r.getReasoningEffort(categoryName) + metrics.RecordReasoningDecision(categoryName, useReasoning, effortForMetrics) // Track the model load for the selected model r.Classifier.IncrementModelLoad(matchedModel) diff --git a/src/semantic-router/pkg/metrics/metrics.go b/src/semantic-router/pkg/metrics/metrics.go index ad3f9b41..ce3c1d63 100644 --- a/src/semantic-router/pkg/metrics/metrics.go +++ b/src/semantic-router/pkg/metrics/metrics.go @@ -192,6 +192,33 @@ var ( }, []string{"model", "pii_type"}, ) + + // ReasoningDecisions tracks the reasoning mode decision outcome by category, model, and effort + ReasoningDecisions = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_reasoning_decisions_total", + Help: "The total number of reasoning mode decisions by category, model, and effort", + }, + []string{"category", "model", "enabled", "effort"}, + ) + + // ReasoningTemplateUsage tracks usage of model-family-specific template parameters + ReasoningTemplateUsage = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_reasoning_template_usage_total", + Help: "The total number of times a model family template parameter was applied", + }, + []string{"family", "param"}, + ) + + // ReasoningEffortUsage tracks the distribution of reasoning efforts by model family + ReasoningEffortUsage = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_reasoning_effort_usage_total", + Help: "The total number of times a reasoning effort level was set per model family", + }, + []string{"family", "effort"}, + ) ) // RecordModelRequest increments the counter for requests to a specific model @@ -462,4 +489,33 @@ func InitializeBatchMetrics(config BatchMetricsConfig) { []string{"processing_type"}, ) }) +// RecordReasoningDecision records a reasoning-mode decision for a category, model and effort +func RecordReasoningDecision(category, model string, enabled bool, effort string) { + status := "false" + if enabled { + status = "true" + } + ReasoningDecisions.WithLabelValues(category, model, status, effort).Inc() +} + +// RecordReasoningTemplateUsage records usage of a model-family-specific template parameter +func RecordReasoningTemplateUsage(family, param string) { + if family == "" { + family = "unknown" + } + if param == "" { + param = "none" + } + ReasoningTemplateUsage.WithLabelValues(family, param).Inc() +} + +// RecordReasoningEffortUsage records the effort usage by model family +func RecordReasoningEffortUsage(family, effort string) { + if family == "" { + family = "unknown" + } + if effort == "" { + effort = "unspecified" + } + ReasoningEffortUsage.WithLabelValues(family, effort).Inc() } From 467f36a07b227bc31e13280051259e76589bc557 Mon Sep 17 00:00:00 2001 From: Jintao Zhang Date: Fri, 5 Sep 2025 06:56:51 +0800 Subject: [PATCH 2/2] add model name for metrics Signed-off-by: Jintao Zhang --- src/semantic-router/pkg/extproc/request_handler.go | 2 +- src/semantic-router/pkg/metrics/metrics.go | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 1e8ff7b5..31db6d7a 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -339,7 +339,7 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe log.Printf("Reasoning mode decision for this query: %v on [%s] model", useReasoning, matchedModel) // Record reasoning decision metric with the effort that will be applied if enabled effortForMetrics := r.getReasoningEffort(categoryName) - metrics.RecordReasoningDecision(categoryName, useReasoning, effortForMetrics) + metrics.RecordReasoningDecision(categoryName, matchedModel, useReasoning, effortForMetrics) // Track the model load for the selected model r.Classifier.IncrementModelLoad(matchedModel) diff --git a/src/semantic-router/pkg/metrics/metrics.go b/src/semantic-router/pkg/metrics/metrics.go index ce3c1d63..42a1fb6b 100644 --- a/src/semantic-router/pkg/metrics/metrics.go +++ b/src/semantic-router/pkg/metrics/metrics.go @@ -489,6 +489,8 @@ func InitializeBatchMetrics(config BatchMetricsConfig) { []string{"processing_type"}, ) }) +} + // RecordReasoningDecision records a reasoning-mode decision for a category, model and effort func RecordReasoningDecision(category, model string, enabled bool, effort string) { status := "false"