55 "fmt"
66 "log"
77 "strings"
8+
9+ "github.com/vllm-project/semantic-router/semantic-router/pkg/metrics"
810)
911
1012// shouldUseReasoningMode determines if reasoning mode should be enabled based on the query category
@@ -45,6 +47,25 @@ func (r *OpenAIRouter) getReasoningModeAndCategory(query string) (bool, string)
4547 return false , categoryName
4648}
4749
// getModelFamilyAndTemplateParam classifies a model name into a normalized
// family label and the name of the request parameter that controls reasoning
// for that family.
//
// The name is trimmed and lower-cased, then matched by substring. The first
// matching rule wins, in this order:
//
//	"qwen3"                -> ("qwen3", "enable_thinking")
//	"deepseek" or "ds"     -> ("deepseek", "thinking")
//	"gpt-oss" or "gpt_oss" -> ("gpt-oss", "reasoning_effort")
//	"gpt"                  -> ("gpt", "reasoning_effort")
//
// Anything else yields ("unknown", ""). Note that the bare "ds" rule is a
// plain substring test, so any name containing "ds" is reported as deepseek,
// and it takes precedence over the GPT rules.
func getModelFamilyAndTemplateParam(model string) (string, string) {
	name := strings.ToLower(strings.TrimSpace(model))
	has := func(fragment string) bool {
		return strings.Contains(name, fragment)
	}

	switch {
	case has("qwen3"):
		return "qwen3", "enable_thinking"
	case has("deepseek"), has("ds"):
		return "deepseek", "thinking"
	case has("gpt-oss"), has("gpt_oss"):
		// GPT-OSS is checked before plain "gpt" so it keeps its own family label;
		// both use the OpenAI-compatible reasoning_effort field.
		return "gpt-oss", "reasoning_effort"
	case has("gpt"):
		return "gpt", "reasoning_effort"
	default:
		return "unknown", ""
	}
}
68+
4869// getChatTemplateKwargs returns the appropriate chat template kwargs based on model and reasoning mode
4970func getChatTemplateKwargs (model string , useReasoning bool ) map [string ]interface {} {
5071 lower := strings .ToLower (strings .TrimSpace (model ))
@@ -83,8 +104,11 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
83104 }
84105 }
85106
107+ family , param := getModelFamilyAndTemplateParam (model )
108+
86109 // Add chat_template_kwargs for reasoning mode
87- if kwargs := getChatTemplateKwargs (model , enabled ); kwargs != nil {
110+ kwargs := getChatTemplateKwargs (model , enabled )
111+ if kwargs != nil {
88112 requestMap ["chat_template_kwargs" ] = kwargs
89113 } else {
90114 delete (requestMap , "chat_template_kwargs" )
@@ -96,17 +120,35 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
96120 // This seems to be the default for openai/gpt-oss models
97121 originalReasoningEffort = "low"
98122 }
123+ var appliedEffort string
99124 if enabled {
100125 // Use configurable reasoning effort based on category
101126 effort := r .getReasoningEffort (categoryName )
102127 requestMap ["reasoning_effort" ] = effort
128+ appliedEffort = effort
103129 } else {
104130 requestMap ["reasoning_effort" ] = originalReasoningEffort
131+ if s , ok := originalReasoningEffort .(string ); ok {
132+ appliedEffort = s
133+ }
105134 }
106135
107136 log .Printf ("Original reasoning effort: %s" , originalReasoningEffort )
108137 log .Printf ("Added reasoning mode (enabled: %v) and reasoning effort (%s) to request for model: %s" , enabled , requestMap ["reasoning_effort" ], model )
109138
139+ // Record metrics for template usage and effort when enabled
140+ if enabled {
141+ // If we applied a known template param, record its usage
142+ if kwargs != nil && param != "" {
143+ metrics .RecordReasoningTemplateUsage (family , param )
144+ } else if kwargs == nil && param == "reasoning_effort" {
145+ // For GPT/GPT-OSS, we only set reasoning_effort
146+ metrics .RecordReasoningTemplateUsage (family , param )
147+ }
148+ // Record which effort level was used for this family
149+ metrics .RecordReasoningEffortUsage (family , appliedEffort )
150+ }
151+
110152 // Serialize back to JSON
111153 modifiedBody , err := json .Marshal (requestMap )
112154 if err != nil {
0 commit comments