From f36fe38f2bba702b452131e4212c841d2711ad7f Mon Sep 17 00:00:00 2001
From: Huamin Chen
Date: Mon, 8 Sep 2025 16:51:27 -0400
Subject: [PATCH] fix: don't set reasoning effort for non-reasoning models

Signed-off-by: Huamin Chen
---
 .../pkg/extproc/reason_mode_selector.go       |  69 ++++++----
 .../pkg/extproc/reason_mode_selector_test.go  | 129 +++++++++++++++++-
 2 files changed, 172 insertions(+), 26 deletions(-)

diff --git a/src/semantic-router/pkg/extproc/reason_mode_selector.go b/src/semantic-router/pkg/extproc/reason_mode_selector.go
index aa17cf0e..b3efe53a 100644
--- a/src/semantic-router/pkg/extproc/reason_mode_selector.go
+++ b/src/semantic-router/pkg/extproc/reason_mode_selector.go
@@ -94,7 +94,7 @@ func getChatTemplateKwargs(model string, useReasoning bool) map[string]interface
     }
     // DeepSeek v3 family: use thinking true/false
-    if strings.Contains(lower, "deepseek") || strings.Contains(lower, "ds") {
+    if strings.Contains(lower, "deepseek") || hasDeepSeekAlias(lower) {
         return map[string]interface{}{
             "thinking": useReasoning,
         }
     }
@@ -122,47 +122,66 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled

     family, param := getModelFamilyAndTemplateParam(model)

-    // Add chat_template_kwargs for reasoning mode
-    kwargs := getChatTemplateKwargs(model, enabled)
-    if kwargs != nil {
-        requestMap["chat_template_kwargs"] = kwargs
-    } else {
-        delete(requestMap, "chat_template_kwargs")
-    }
-    // Also set Reasoning-Effort in openai request
-    // This is a hack to get the reasoning mode for openai/gpt-oss-20b to work
+    // Handle reasoning_effort first
     originalReasoningEffort, ok := requestMap["reasoning_effort"]
     if !ok {
         // This seems to be the default for openai/gpt-oss models
         originalReasoningEffort = "low"
     }
     var appliedEffort string
+
     if enabled {
-        // Use configurable reasoning effort based on category
-        effort := r.getReasoningEffort(categoryName)
-        requestMap["reasoning_effort"] = effort
-        appliedEffort = effort
+        // When reasoning is enabled
+        if family == "gpt-oss" || family == "gpt" {
+            // For GPT models, use reasoning_effort
+            effort := r.getReasoningEffort(categoryName)
+            requestMap["reasoning_effort"] = effort
+            appliedEffort = effort
+            // Remove chat_template_kwargs for GPT models
+            delete(requestMap, "chat_template_kwargs")
+        } else {
+            // For other models (DeepSeek, Qwen3), use chat_template_kwargs
+            kwargs := getChatTemplateKwargs(model, enabled)
+            if kwargs != nil {
+                requestMap["chat_template_kwargs"] = kwargs
+            }
+            // Remove reasoning_effort for non-GPT models
+            delete(requestMap, "reasoning_effort")
+            appliedEffort = "N/A"
+        }
     } else {
-        requestMap["reasoning_effort"] = originalReasoningEffort
-        if s, ok := originalReasoningEffort.(string); ok {
-            appliedEffort = s
+        // When reasoning is disabled
+        if family == "gpt-oss" {
+            // For GPT-OSS, preserve original reasoning_effort
+            requestMap["reasoning_effort"] = originalReasoningEffort
+            if s, ok := originalReasoningEffort.(string); ok {
+                appliedEffort = s
+            }
+        } else {
+            // For other models, remove reasoning_effort
+            delete(requestMap, "reasoning_effort")
+            appliedEffort = "N/A"
         }
+        // Always remove chat_template_kwargs when reasoning is disabled
+        delete(requestMap, "chat_template_kwargs")
     }

     log.Printf("Original reasoning effort: %s", originalReasoningEffort)
-    log.Printf("Added reasoning mode (enabled: %v) and reasoning effort (%s) to request for model: %s", enabled, requestMap["reasoning_effort"], model)
+    log.Printf("Added reasoning mode (enabled: %v) and reasoning effort (%s) to request for model: %s", enabled, appliedEffort, model)

     // Record metrics for template usage and effort when enabled
     if enabled {
-        // If we applied a known template param, record its usage
-        if kwargs != nil && param != "" {
-            metrics.RecordReasoningTemplateUsage(family, param)
-        } else if kwargs == nil && param == "reasoning_effort" {
-            // For GPT/GPT-OSS, we only set reasoning_effort
+        // Record template param usage based on what we actually applied
+        if family == "gpt-oss" || family == "gpt" {
+            // For GPT models, we used reasoning_effort
+            metrics.RecordReasoningTemplateUsage(family, "reasoning_effort")
+            metrics.RecordReasoningEffortUsage(family, appliedEffort)
+        } else if param != "" {
+            // For other models (DeepSeek, Qwen3), we used chat_template_kwargs
             metrics.RecordReasoningTemplateUsage(family, param)
+            // For non-GPT models, effort is N/A
+            metrics.RecordReasoningEffortUsage(family, appliedEffort)
         }
-        // Record which effort level was used for this family
-        metrics.RecordReasoningEffortUsage(family, appliedEffort)
     }

     // Serialize back to JSON
diff --git a/src/semantic-router/pkg/extproc/reason_mode_selector_test.go b/src/semantic-router/pkg/extproc/reason_mode_selector_test.go
index 527dd1db..1a5cefaa 100644
--- a/src/semantic-router/pkg/extproc/reason_mode_selector_test.go
+++ b/src/semantic-router/pkg/extproc/reason_mode_selector_test.go
@@ -1,6 +1,11 @@
 package extproc

-import "testing"
+import (
+    "encoding/json"
+    "testing"
+
+    "github.com/vllm-project/semantic-router/semantic-router/pkg/config"
+)

 // TestGetModelFamilyAndTemplateParam verifies model-family detection and template parameter mapping
 func TestGetModelFamilyAndTemplateParam(t *testing.T) {
@@ -75,3 +75,125 @@ func TestGetModelFamilyAndTemplateParam(t *testing.T) {
         })
     }
 }
+
+// TestSetReasoningModeToRequestBody verifies that reasoning_effort is handled correctly for different model families
+func TestSetReasoningModeToRequestBody(t *testing.T) {
+    // Create a minimal router for testing
+    router := &OpenAIRouter{
+        Config: &config.RouterConfig{
+            DefaultReasoningEffort: "medium",
+        },
+    }
+
+    testCases := []struct {
+        name                       string
+        model                      string
+        enabled                    bool
+        initialReasoningEffort     interface{}
+        expectReasoningEffortKey   bool
+        expectedReasoningEffort    interface{}
+        expectedChatTemplateKwargs bool
+    }{
+        {
+            name:                       "GPT-OSS model with reasoning disabled - preserve reasoning_effort",
+            model:                      "gpt-oss-20b",
+            enabled:                    false,
+            initialReasoningEffort:     "low",
+            expectReasoningEffortKey:   true,
+            expectedReasoningEffort:    "low",
+            expectedChatTemplateKwargs: false,
+        },
+        {
+            name:                       "Phi4 model with reasoning disabled - remove reasoning_effort",
+            model:                      "phi4",
+            enabled:                    false,
+            initialReasoningEffort:     "low",
+            expectReasoningEffortKey:   false,
+            expectedReasoningEffort:    nil,
+            expectedChatTemplateKwargs: false,
+        },
+        {
+            name:                       "DeepSeek model with reasoning disabled - remove reasoning_effort",
+            model:                      "deepseek-v31",
+            enabled:                    false,
+            initialReasoningEffort:     "low",
+            expectReasoningEffortKey:   false,
+            expectedReasoningEffort:    nil,
+            expectedChatTemplateKwargs: false,
+        },
+        {
+            name:                       "GPT-OSS model with reasoning enabled - set reasoning_effort",
+            model:                      "gpt-oss-20b",
+            enabled:                    true,
+            initialReasoningEffort:     "low",
+            expectReasoningEffortKey:   true,
+            expectedReasoningEffort:    "medium",
+            expectedChatTemplateKwargs: false,
+        },
+        {
+            name:                       "DeepSeek model with reasoning enabled - set chat_template_kwargs",
+            model:                      "deepseek-v31",
+            enabled:                    true,
+            initialReasoningEffort:     "low",
+            expectReasoningEffortKey:   false,
+            expectedReasoningEffort:    nil,
+            expectedChatTemplateKwargs: true,
+        },
+    }
+
+    for _, tc := range testCases {
+        t.Run(tc.name, func(t *testing.T) {
+            // Prepare initial request body
+            requestBody := map[string]interface{}{
+                "model": tc.model,
+                "messages": []map[string]string{
+                    {"role": "user", "content": "test message"},
+                },
+            }
+            if tc.initialReasoningEffort != nil {
+                requestBody["reasoning_effort"] = tc.initialReasoningEffort
+            }
+
+            requestBytes, err := json.Marshal(requestBody)
+            if err != nil {
+                t.Fatalf("Failed to marshal request body: %v", err)
+            }
+
+            // Call the function under test
+            modifiedBytes, err := router.setReasoningModeToRequestBody(requestBytes, tc.enabled, "test-category")
+            if err != nil {
+                t.Fatalf("setReasoningModeToRequestBody failed: %v", err)
+            }
+
+            // Parse the modified request body
+            var modifiedRequest map[string]interface{}
+            if err := json.Unmarshal(modifiedBytes, &modifiedRequest); err != nil {
+                t.Fatalf("Failed to unmarshal modified request body: %v", err)
+            }
+
+            // Check reasoning_effort handling
+            reasoningEffort, hasReasoningEffort := modifiedRequest["reasoning_effort"]
+            if tc.expectReasoningEffortKey != hasReasoningEffort {
+                t.Fatalf("Expected reasoning_effort key presence: %v, got: %v", tc.expectReasoningEffortKey, hasReasoningEffort)
+            }
+            if tc.expectReasoningEffortKey && reasoningEffort != tc.expectedReasoningEffort {
+                t.Fatalf("Expected reasoning_effort: %v, got: %v", tc.expectedReasoningEffort, reasoningEffort)
+            }
+
+            // Check chat_template_kwargs handling
+            chatTemplateKwargs, hasChatTemplateKwargs := modifiedRequest["chat_template_kwargs"]
+            if tc.expectedChatTemplateKwargs != hasChatTemplateKwargs {
+                t.Fatalf("Expected chat_template_kwargs key presence: %v, got: %v", tc.expectedChatTemplateKwargs, hasChatTemplateKwargs)
+            }
+            if tc.expectedChatTemplateKwargs {
+                kwargs, ok := chatTemplateKwargs.(map[string]interface{})
+                if !ok {
+                    t.Fatalf("Expected chat_template_kwargs to be a map")
+                }
+                if len(kwargs) == 0 {
+                    t.Fatalf("Expected non-empty chat_template_kwargs")
+                }
+            }
+        })
+    }
+}