Skip to content

Commit ddc86e3

Browse files
authored
Merge pull request #56 from tao12345666333/feat-reasoning-mode-ctl
feat: reasoning model controller
2 parents 945fb5c + 467f36a commit ddc86e3

File tree

3 files changed

+104
-1
lines changed

3 files changed

+104
-1
lines changed

src/semantic-router/pkg/extproc/reason_mode_selector.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"fmt"
66
"log"
77
"strings"
8+
9+
"github.com/vllm-project/semantic-router/semantic-router/pkg/metrics"
810
)
911

1012
// shouldUseReasoningMode determines if reasoning mode should be enabled based on the query category
@@ -45,6 +47,25 @@ func (r *OpenAIRouter) getReasoningModeAndCategory(query string) (bool, string)
4547
return false, categoryName
4648
}
4749

50+
// getModelFamilyAndTemplateParam returns a normalized model family name and the
// request parameter used to control reasoning for that family (empty if none).
//
// Matching is case-insensitive and ignores surrounding whitespace. Families are
// checked in order and the first match wins:
//   - "qwen3": name contains "qwen3"; param "enable_thinking"
//   - "deepseek": name contains "deepseek", or has "ds" as a standalone token
//     (e.g. "ds-r1", "org/ds_v3"); param "thinking"
//   - "gpt-oss": name contains "gpt-oss" or "gpt_oss"; param "reasoning_effort"
//   - "gpt": name contains "gpt"; param "reasoning_effort"
//
// Any other name, including the empty string, yields ("unknown", "").
func getModelFamilyAndTemplateParam(model string) (string, string) {
	lower := strings.ToLower(strings.TrimSpace(model))

	// The "ds" shorthand has to be a whole token. A plain substring test would
	// also match unrelated names such as "birds-7b" or "gpt-4-odds" and report
	// them under the deepseek family.
	isSeparator := func(r rune) bool {
		return (r < 'a' || r > 'z') && (r < '0' || r > '9')
	}
	hasDSToken := false
	for _, token := range strings.FieldsFunc(lower, isSeparator) {
		if token == "ds" {
			hasDSToken = true
			break
		}
	}

	switch {
	case strings.Contains(lower, "qwen3"):
		return "qwen3", "enable_thinking"
	case strings.Contains(lower, "deepseek") || hasDSToken:
		return "deepseek", "thinking"
	case strings.Contains(lower, "gpt-oss") || strings.Contains(lower, "gpt_oss"):
		// GPT-OSS is checked before generic GPT so it keeps its own family label;
		// both use reasoning_effort (OpenAI-compatible field).
		return "gpt-oss", "reasoning_effort"
	case strings.Contains(lower, "gpt"):
		return "gpt", "reasoning_effort"
	default:
		return "unknown", ""
	}
}
68+
4869
// getChatTemplateKwargs returns the appropriate chat template kwargs based on model and reasoning mode
4970
func getChatTemplateKwargs(model string, useReasoning bool) map[string]interface{} {
5071
lower := strings.ToLower(strings.TrimSpace(model))
@@ -83,8 +104,11 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
83104
}
84105
}
85106

107+
family, param := getModelFamilyAndTemplateParam(model)
108+
86109
// Add chat_template_kwargs for reasoning mode
87-
if kwargs := getChatTemplateKwargs(model, enabled); kwargs != nil {
110+
kwargs := getChatTemplateKwargs(model, enabled)
111+
if kwargs != nil {
88112
requestMap["chat_template_kwargs"] = kwargs
89113
} else {
90114
delete(requestMap, "chat_template_kwargs")
@@ -96,17 +120,35 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
96120
// This seems to be the default for openai/gpt-oss models
97121
originalReasoningEffort = "low"
98122
}
123+
var appliedEffort string
99124
if enabled {
100125
// Use configurable reasoning effort based on category
101126
effort := r.getReasoningEffort(categoryName)
102127
requestMap["reasoning_effort"] = effort
128+
appliedEffort = effort
103129
} else {
104130
requestMap["reasoning_effort"] = originalReasoningEffort
131+
if s, ok := originalReasoningEffort.(string); ok {
132+
appliedEffort = s
133+
}
105134
}
106135

107136
log.Printf("Original reasoning effort: %s", originalReasoningEffort)
108137
log.Printf("Added reasoning mode (enabled: %v) and reasoning effort (%s) to request for model: %s", enabled, requestMap["reasoning_effort"], model)
109138

139+
// Record metrics for template usage and effort when enabled
140+
if enabled {
141+
// If we applied a known template param, record its usage
142+
if kwargs != nil && param != "" {
143+
metrics.RecordReasoningTemplateUsage(family, param)
144+
} else if kwargs == nil && param == "reasoning_effort" {
145+
// For GPT/GPT-OSS, we only set reasoning_effort
146+
metrics.RecordReasoningTemplateUsage(family, param)
147+
}
148+
// Record which effort level was used for this family
149+
metrics.RecordReasoningEffortUsage(family, appliedEffort)
150+
}
151+
110152
// Serialize back to JSON
111153
modifiedBody, err := json.Marshal(requestMap)
112154
if err != nil {

src/semantic-router/pkg/extproc/request_handler.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,9 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
337337
// Check reasoning mode for this category
338338
useReasoning, categoryName := r.getReasoningModeAndCategory(userContent)
339339
log.Printf("Reasoning mode decision for this query: %v on [%s] model", useReasoning, matchedModel)
340+
// Record reasoning decision metric with the effort that will be applied if enabled
341+
effortForMetrics := r.getReasoningEffort(categoryName)
342+
metrics.RecordReasoningDecision(categoryName, matchedModel, useReasoning, effortForMetrics)
340343

341344
// Track the model load for the selected model
342345
r.Classifier.IncrementModelLoad(matchedModel)

src/semantic-router/pkg/metrics/metrics.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,33 @@ var (
192192
},
193193
[]string{"model", "pii_type"},
194194
)
195+
196+
// ReasoningDecisions tracks the reasoning mode decision outcome by category, model, and effort
197+
ReasoningDecisions = promauto.NewCounterVec(
198+
prometheus.CounterOpts{
199+
Name: "llm_reasoning_decisions_total",
200+
Help: "The total number of reasoning mode decisions by category, model, and effort",
201+
},
202+
[]string{"category", "model", "enabled", "effort"},
203+
)
204+
205+
// ReasoningTemplateUsage tracks usage of model-family-specific template parameters
206+
ReasoningTemplateUsage = promauto.NewCounterVec(
207+
prometheus.CounterOpts{
208+
Name: "llm_reasoning_template_usage_total",
209+
Help: "The total number of times a model family template parameter was applied",
210+
},
211+
[]string{"family", "param"},
212+
)
213+
214+
// ReasoningEffortUsage tracks the distribution of reasoning efforts by model family
215+
ReasoningEffortUsage = promauto.NewCounterVec(
216+
prometheus.CounterOpts{
217+
Name: "llm_reasoning_effort_usage_total",
218+
Help: "The total number of times a reasoning effort level was set per model family",
219+
},
220+
[]string{"family", "effort"},
221+
)
195222
)
196223

197224
// RecordModelRequest increments the counter for requests to a specific model
@@ -463,3 +490,34 @@ func InitializeBatchMetrics(config BatchMetricsConfig) {
463490
)
464491
})
465492
}
493+
494+
// RecordReasoningDecision records a reasoning-mode decision for a category, model and effort
495+
func RecordReasoningDecision(category, model string, enabled bool, effort string) {
496+
status := "false"
497+
if enabled {
498+
status = "true"
499+
}
500+
ReasoningDecisions.WithLabelValues(category, model, status, effort).Inc()
501+
}
502+
503+
// RecordReasoningTemplateUsage records usage of a model-family-specific template parameter
504+
func RecordReasoningTemplateUsage(family, param string) {
505+
if family == "" {
506+
family = "unknown"
507+
}
508+
if param == "" {
509+
param = "none"
510+
}
511+
ReasoningTemplateUsage.WithLabelValues(family, param).Inc()
512+
}
513+
514+
// RecordReasoningEffortUsage records the effort usage by model family
515+
func RecordReasoningEffortUsage(family, effort string) {
516+
if family == "" {
517+
family = "unknown"
518+
}
519+
if effort == "" {
520+
effort = "unspecified"
521+
}
522+
ReasoningEffortUsage.WithLabelValues(family, effort).Inc()
523+
}

0 commit comments

Comments
 (0)