Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 44 additions & 25 deletions src/semantic-router/pkg/extproc/reason_mode_selector.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func getChatTemplateKwargs(model string, useReasoning bool) map[string]interface
}

// DeepSeek v3 family: use thinking true/false
if strings.Contains(lower, "deepseek") || strings.Contains(lower, "ds") {
if strings.Contains(lower, "deepseek") || hasDeepSeekAlias(lower) {
return map[string]interface{}{
"thinking": useReasoning,
}
Expand Down Expand Up @@ -122,47 +122,66 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled

family, param := getModelFamilyAndTemplateParam(model)

// Add chat_template_kwargs for reasoning mode
kwargs := getChatTemplateKwargs(model, enabled)
if kwargs != nil {
requestMap["chat_template_kwargs"] = kwargs
} else {
delete(requestMap, "chat_template_kwargs")
}
// Also set Reasoning-Effort in openai request
// This is a hack to get the reasoning mode for openai/gpt-oss-20b to work
// Handle reasoning_effort first
originalReasoningEffort, ok := requestMap["reasoning_effort"]
if !ok {
// This seems to be the default for openai/gpt-oss models
originalReasoningEffort = "low"
}
var appliedEffort string

if enabled {
// Use configurable reasoning effort based on category
effort := r.getReasoningEffort(categoryName)
requestMap["reasoning_effort"] = effort
appliedEffort = effort
// When reasoning is enabled
if family == "gpt-oss" || family == "gpt" {
// For GPT models, use reasoning_effort
effort := r.getReasoningEffort(categoryName)
requestMap["reasoning_effort"] = effort
appliedEffort = effort
// Remove chat_template_kwargs for GPT models
delete(requestMap, "chat_template_kwargs")
} else {
// For other models (DeepSeek, Qwen3), use chat_template_kwargs
kwargs := getChatTemplateKwargs(model, enabled)
if kwargs != nil {
requestMap["chat_template_kwargs"] = kwargs
}
// Remove reasoning_effort for non-GPT models
delete(requestMap, "reasoning_effort")
appliedEffort = "N/A"
}
} else {
requestMap["reasoning_effort"] = originalReasoningEffort
if s, ok := originalReasoningEffort.(string); ok {
appliedEffort = s
// When reasoning is disabled
if family == "gpt-oss" {
// For GPT-OSS, preserve original reasoning_effort
requestMap["reasoning_effort"] = originalReasoningEffort
if s, ok := originalReasoningEffort.(string); ok {
appliedEffort = s
}
} else {
// For other models, remove reasoning_effort
delete(requestMap, "reasoning_effort")
appliedEffort = "N/A"
}
// Always remove chat_template_kwargs when reasoning is disabled
delete(requestMap, "chat_template_kwargs")
}

log.Printf("Original reasoning effort: %s", originalReasoningEffort)
log.Printf("Added reasoning mode (enabled: %v) and reasoning effort (%s) to request for model: %s", enabled, requestMap["reasoning_effort"], model)
log.Printf("Added reasoning mode (enabled: %v) and reasoning effort (%s) to request for model: %s", enabled, appliedEffort, model)

// Record metrics for template usage and effort when enabled
if enabled {
// If we applied a known template param, record its usage
if kwargs != nil && param != "" {
metrics.RecordReasoningTemplateUsage(family, param)
} else if kwargs == nil && param == "reasoning_effort" {
// For GPT/GPT-OSS, we only set reasoning_effort
// Record template param usage based on what we actually applied
if family == "gpt-oss" || family == "gpt" {
// For GPT models, we used reasoning_effort
metrics.RecordReasoningTemplateUsage(family, "reasoning_effort")
metrics.RecordReasoningEffortUsage(family, appliedEffort)
} else if param != "" {
// For other models (DeepSeek, Qwen3), we used chat_template_kwargs
metrics.RecordReasoningTemplateUsage(family, param)
// For non-GPT models, effort is N/A
metrics.RecordReasoningEffortUsage(family, appliedEffort)
}
// Record which effort level was used for this family
metrics.RecordReasoningEffortUsage(family, appliedEffort)
}

// Serialize back to JSON
Expand Down
129 changes: 128 additions & 1 deletion src/semantic-router/pkg/extproc/reason_mode_selector_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
package extproc

import "testing"
import (
"encoding/json"
"testing"

"github.com/vllm-project/semantic-router/semantic-router/pkg/config"
)

// TestGetModelFamilyAndTemplateParam verifies model-family detection and template parameter mapping
func TestGetModelFamilyAndTemplateParam(t *testing.T) {
Expand Down Expand Up @@ -75,3 +80,125 @@ func TestGetModelFamilyAndTemplateParam(t *testing.T) {
})
}
}

// TestSetReasoningModeToRequestBody verifies that reasoning_effort is handled correctly for different model families
func TestSetReasoningModeToRequestBody(t *testing.T) {
// Create a minimal router for testing
router := &OpenAIRouter{
Config: &config.RouterConfig{
DefaultReasoningEffort: "medium",
},
}

testCases := []struct {
name string
model string
enabled bool
initialReasoningEffort interface{}
expectReasoningEffortKey bool
expectedReasoningEffort interface{}
expectedChatTemplateKwargs bool
}{
{
name: "GPT-OSS model with reasoning disabled - preserve reasoning_effort",
model: "gpt-oss-20b",
enabled: false,
initialReasoningEffort: "low",
expectReasoningEffortKey: true,
expectedReasoningEffort: "low",
expectedChatTemplateKwargs: false,
},
{
name: "Phi4 model with reasoning disabled - remove reasoning_effort",
model: "phi4",
enabled: false,
initialReasoningEffort: "low",
expectReasoningEffortKey: false,
expectedReasoningEffort: nil,
expectedChatTemplateKwargs: false,
},
{
name: "DeepSeek model with reasoning disabled - remove reasoning_effort",
model: "deepseek-v31",
enabled: false,
initialReasoningEffort: "low",
expectReasoningEffortKey: false,
expectedReasoningEffort: nil,
expectedChatTemplateKwargs: false,
},
{
name: "GPT-OSS model with reasoning enabled - set reasoning_effort",
model: "gpt-oss-20b",
enabled: true,
initialReasoningEffort: "low",
expectReasoningEffortKey: true,
expectedReasoningEffort: "medium",
expectedChatTemplateKwargs: false,
},
{
name: "DeepSeek model with reasoning enabled - set chat_template_kwargs",
model: "deepseek-v31",
enabled: true,
initialReasoningEffort: "low",
expectReasoningEffortKey: false,
expectedReasoningEffort: nil,
expectedChatTemplateKwargs: true,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
// Prepare initial request body
requestBody := map[string]interface{}{
"model": tc.model,
"messages": []map[string]string{
{"role": "user", "content": "test message"},
},
}
if tc.initialReasoningEffort != nil {
requestBody["reasoning_effort"] = tc.initialReasoningEffort
}

requestBytes, err := json.Marshal(requestBody)
if err != nil {
t.Fatalf("Failed to marshal request body: %v", err)
}

// Call the function under test
modifiedBytes, err := router.setReasoningModeToRequestBody(requestBytes, tc.enabled, "test-category")
if err != nil {
t.Fatalf("setReasoningModeToRequestBody failed: %v", err)
}

// Parse the modified request body
var modifiedRequest map[string]interface{}
if err := json.Unmarshal(modifiedBytes, &modifiedRequest); err != nil {
t.Fatalf("Failed to unmarshal modified request body: %v", err)
}

// Check reasoning_effort handling
reasoningEffort, hasReasoningEffort := modifiedRequest["reasoning_effort"]
if tc.expectReasoningEffortKey != hasReasoningEffort {
t.Fatalf("Expected reasoning_effort key presence: %v, got: %v", tc.expectReasoningEffortKey, hasReasoningEffort)
}
if tc.expectReasoningEffortKey && reasoningEffort != tc.expectedReasoningEffort {
t.Fatalf("Expected reasoning_effort: %v, got: %v", tc.expectedReasoningEffort, reasoningEffort)
}

// Check chat_template_kwargs handling
chatTemplateKwargs, hasChatTemplateKwargs := modifiedRequest["chat_template_kwargs"]
if tc.expectedChatTemplateKwargs != hasChatTemplateKwargs {
t.Fatalf("Expected chat_template_kwargs key presence: %v, got: %v", tc.expectedChatTemplateKwargs, hasChatTemplateKwargs)
}
if tc.expectedChatTemplateKwargs {
kwargs, ok := chatTemplateKwargs.(map[string]interface{})
if !ok {
t.Fatalf("Expected chat_template_kwargs to be a map")
}
if len(kwargs) == 0 {
t.Fatalf("Expected non-empty chat_template_kwargs")
}
}
})
}
}
Loading