Skip to content

Commit 47b83d2

Browse files
committed
feat: support qwen3 reasoning
Signed-off-by: bitliu <[email protected]>
1 parent 9ee8ad4 commit 47b83d2

File tree

4 files changed

+130
-35
lines changed

4 files changed

+130
-35
lines changed

bench/router_reason_bench.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,14 @@ def build_extra_body_for_model(
         # Base: do not set thinking for DeepSeek
         return None

+    # Qwen3 family
+    if "qwen3" in lower:
+        if reasoning is True:
+            return {"chat_template_kwargs": {"enable_thinking": True}}
+        if reasoning is False:
+            return {"chat_template_kwargs": {"enable_thinking": False}}
+        return None
+
     # GPT OSS family
     if "gpt-oss" in lower or "openai/gpt-oss" in lower or "gpt_oss" in lower:
         # Base -> low effort, On -> provided effort (e.g., high)
@@ -527,9 +535,7 @@ def run_variants(q: Dict[str, Any]) -> List[Dict[str, Any]]:
527535

528536
with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
529537
futures = [executor.submit(run_variants, q) for q in questions_data]
530-
for future in tqdm(
531-
futures, total=len(futures), desc=f"Evaluating {model} (vLLM modes)"
532-
):
538+
for future in tqdm(futures, total=len(futures), desc=f"Evaluating {model} (vLLM modes)"):
533539
results.extend(future.result())
534540

535541
return pd.DataFrame(results)
@@ -578,9 +584,7 @@ def run_all_modes(q: Dict[str, Any]) -> List[Dict[str, Any]]:
578584

579585
with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
580586
futures = [executor.submit(run_all_modes, q) for q in questions]
581-
for future in tqdm(
582-
futures, total=len(futures), desc=f"Evaluating {model} (policies)"
583-
):
587+
for future in tqdm(futures, total=len(futures), desc=f"Evaluating {model} (policies)"):
584588
per_call_records.extend(future.result())
585589

586590
calls_df = pd.DataFrame(per_call_records)

src/semantic-router/pkg/extproc/endpoint_selection_test.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,22 @@ var _ = Describe("Endpoint Selection", func() {
7878
if header.Header.Key == "x-semantic-destination-endpoint" {
7979
endpointHeaderFound = true
8080
// Should be one of the configured endpoint addresses
81-
Expect(header.Header.Value).To(BeElementOf("127.0.0.1:8000", "127.0.0.1:8001"))
81+
// Check both Value and RawValue since implementation uses RawValue
82+
headerValue := header.Header.Value
83+
if headerValue == "" && len(header.Header.RawValue) > 0 {
84+
headerValue = string(header.Header.RawValue)
85+
}
86+
Expect(headerValue).To(BeElementOf("127.0.0.1:8000", "127.0.0.1:8001"))
8287
}
8388
if header.Header.Key == "x-selected-model" {
8489
modelHeaderFound = true
8590
// Should be one of the configured models
86-
Expect(header.Header.Value).To(BeElementOf("model-a", "model-b"))
91+
// Check both Value and RawValue since implementation may use either
92+
headerValue := header.Header.Value
93+
if headerValue == "" && len(header.Header.RawValue) > 0 {
94+
headerValue = string(header.Header.RawValue)
95+
}
96+
Expect(headerValue).To(BeElementOf("model-a", "model-b"))
8797
}
8898
}
8999

@@ -141,7 +151,11 @@ var _ = Describe("Endpoint Selection", func() {
141151
for _, header := range headerMutation.SetHeaders {
142152
if header.Header.Key == "x-semantic-destination-endpoint" {
143153
endpointHeaderFound = true
154+
// Check both Value and RawValue since implementation uses RawValue
144155
selectedEndpoint = header.Header.Value
156+
if selectedEndpoint == "" && len(header.Header.RawValue) > 0 {
157+
selectedEndpoint = string(header.Header.RawValue)
158+
}
145159
break
146160
}
147161
}
@@ -200,7 +214,11 @@ var _ = Describe("Endpoint Selection", func() {
200214
for _, header := range headerMutation.SetHeaders {
201215
if header.Header.Key == "x-semantic-destination-endpoint" {
202216
endpointHeaderFound = true
217+
// Check both Value and RawValue since implementation uses RawValue
203218
selectedEndpoint = header.Header.Value
219+
if selectedEndpoint == "" && len(header.Header.RawValue) > 0 {
220+
selectedEndpoint = string(header.Header.RawValue)
221+
}
204222
break
205223
}
206224
}

src/semantic-router/pkg/extproc/reason_mode_selector.go

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,25 @@ func (r *OpenAIRouter) shouldUseReasoningMode(query string) bool {
 	return false
 }

-// getChatTemplateKwargs returns the appropriate chat template kwargs based on reasoning mode and streaming
-func getChatTemplateKwargs(useReasoning bool) map[string]interface{} {
-	if useReasoning {
+// getChatTemplateKwargs returns the appropriate chat template kwargs based on model and reasoning mode
+func getChatTemplateKwargs(model string, useReasoning bool) map[string]interface{} {
+	lower := strings.ToLower(strings.TrimSpace(model))
+
+	// Qwen3: use enable_thinking true/false
+	if strings.Contains(lower, "qwen3") {
+		return map[string]interface{}{
+			"enable_thinking": useReasoning,
+		}
+	}
+
+	// DeepSeek v3 family: use thinking true/false
+	if strings.Contains(lower, "deepseek") || strings.Contains(lower, "ds") {
 		return map[string]interface{}{
 			"thinking": useReasoning,
 		}
 	}
+
+	// Default: no chat template kwargs
 	return nil
 }

@@ -57,8 +69,20 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
 		return nil, fmt.Errorf("failed to parse request body: %w", err)
 	}

+	// Determine model for kwargs and logging
+	model := "unknown"
+	if modelValue, ok := requestMap["model"]; ok {
+		if modelStr, ok := modelValue.(string); ok {
+			model = modelStr
+		}
+	}
+
 	// Add chat_template_kwargs for reasoning mode
-	requestMap["chat_template_kwargs"] = getChatTemplateKwargs(enabled)
+	if kwargs := getChatTemplateKwargs(model, enabled); kwargs != nil {
+		requestMap["chat_template_kwargs"] = kwargs
+	} else {
+		delete(requestMap, "chat_template_kwargs")
+	}
 	// Also set Reasoning-Effort in openai request
 	// This is a hack to get the reasoning mode for openai/gpt-oss-20b to work
 	originalReasoningEffort, ok := requestMap["reasoning_effort"]
@@ -73,16 +97,8 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
 		requestMap["reasoning_effort"] = originalReasoningEffort
 	}

-	// Get the model name for logging
-	model := "unknown"
-	if modelValue, ok := requestMap["model"]; ok {
-		if modelStr, ok := modelValue.(string); ok {
-			model = modelStr
-		}
-	}
-
 	log.Printf("Original reasoning effort: %s", originalReasoningEffort)
-	log.Printf("Added reasoning mode (thinking: %v) and reasoning effort (%s) to request for model: %s", enabled, requestMap["reasoning_effort"], model)
+	log.Printf("Added reasoning mode (enabled: %v) and reasoning effort (%s) to request for model: %s", enabled, requestMap["reasoning_effort"], model)

 	// Serialize back to JSON
 	modifiedBody, err := json.Marshal(requestMap)

src/semantic-router/pkg/extproc/reasoning_integration_test.go

Lines changed: 71 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,9 @@ func TestReasoningModeIntegration(t *testing.T) {
6565

6666
// Test case 3: Test addReasoningModeToRequestBody function
6767
t.Run("addReasoningModeToRequestBody adds correct fields", func(t *testing.T) {
68+
// Test with DeepSeek model (which supports chat_template_kwargs)
6869
originalRequest := map[string]interface{}{
69-
"model": "phi4",
70+
"model": "deepseek-v31",
7071
"messages": []map[string]interface{}{
7172
{"role": "user", "content": "What is 2 + 2?"},
7273
},
@@ -88,23 +89,23 @@ func TestReasoningModeIntegration(t *testing.T) {
8889
t.Fatalf("Failed to unmarshal modified request: %v", err)
8990
}
9091

91-
// Check if chat_template_kwargs was added
92+
// Check if chat_template_kwargs was added for DeepSeek model
9293
chatTemplateKwargs, exists := modifiedRequest["chat_template_kwargs"]
9394
if !exists {
94-
t.Error("chat_template_kwargs not found in modified request")
95+
t.Error("chat_template_kwargs not found in modified request for DeepSeek model")
9596
}
9697

97-
// Check if thinking: true was set
98+
// Check if thinking: true was set for DeepSeek model
9899
if kwargs, ok := chatTemplateKwargs.(map[string]interface{}); ok {
99100
if thinking, hasThinking := kwargs["thinking"]; hasThinking {
100101
if thinkingBool, isBool := thinking.(bool); !isBool || !thinkingBool {
101-
t.Errorf("Expected thinking: true, got %v", thinking)
102+
t.Errorf("Expected thinking: true for DeepSeek model, got %v", thinking)
102103
}
103104
} else {
104-
t.Error("thinking field not found in chat_template_kwargs")
105+
t.Error("thinking field not found in chat_template_kwargs for DeepSeek model")
105106
}
106107
} else {
107-
t.Errorf("chat_template_kwargs is not a map, got %T", chatTemplateKwargs)
108+
t.Errorf("chat_template_kwargs is not a map for DeepSeek model, got %T", chatTemplateKwargs)
108109
}
109110

110111
// Verify original fields are preserved
@@ -114,24 +115,80 @@ func TestReasoningModeIntegration(t *testing.T) {
114115
t.Errorf("Original field '%s' was lost", field)
115116
}
116117
}
118+
119+
// Test with unsupported model (phi4) - should not add chat_template_kwargs
120+
originalRequestPhi4 := map[string]interface{}{
121+
"model": "phi4",
122+
"messages": []map[string]interface{}{
123+
{"role": "user", "content": "What is 2 + 2?"},
124+
},
125+
"stream": false,
126+
}
127+
128+
originalBodyPhi4, err := json.Marshal(originalRequestPhi4)
129+
if err != nil {
130+
t.Fatalf("Failed to marshal phi4 request: %v", err)
131+
}
132+
133+
modifiedBodyPhi4, err := router.setReasoningModeToRequestBody(originalBodyPhi4, true)
134+
if err != nil {
135+
t.Fatalf("Failed to process phi4 request: %v", err)
136+
}
137+
138+
var modifiedRequestPhi4 map[string]interface{}
139+
if err := json.Unmarshal(modifiedBodyPhi4, &modifiedRequestPhi4); err != nil {
140+
t.Fatalf("Failed to unmarshal phi4 request: %v", err)
141+
}
142+
143+
// For phi4, chat_template_kwargs should not be added (since it's not supported)
144+
if _, exists := modifiedRequestPhi4["chat_template_kwargs"]; exists {
145+
t.Error("chat_template_kwargs should not be added for unsupported model phi4")
146+
}
147+
148+
// But reasoning_effort should still be set
149+
if reasoningEffort, exists := modifiedRequestPhi4["reasoning_effort"]; !exists {
150+
t.Error("reasoning_effort should be set for phi4 model")
151+
} else if reasoningEffort != "high" {
152+
t.Errorf("Expected reasoning_effort: high for phi4 model, got %v", reasoningEffort)
153+
}
117154
})
118155

119156
// Test case 4: Test getChatTemplateKwargs function
120157
t.Run("getChatTemplateKwargs returns correct values", func(t *testing.T) {
121-
// Test with reasoning enabled
122-
kwargs := getChatTemplateKwargs(true)
158+
// Test with DeepSeek model and reasoning enabled
159+
kwargs := getChatTemplateKwargs("deepseek-v31", true)
123160
if kwargs == nil {
124-
t.Error("Expected non-nil kwargs for reasoning enabled")
161+
t.Error("Expected non-nil kwargs for DeepSeek model with reasoning enabled")
125162
}
126163

127164
if thinking, ok := kwargs["thinking"]; !ok || thinking != true {
128-
t.Errorf("Expected thinking: true, got %v", thinking)
165+
t.Errorf("Expected thinking: true for DeepSeek model, got %v", thinking)
166+
}
167+
168+
// Test with DeepSeek model and reasoning disabled
169+
kwargs = getChatTemplateKwargs("deepseek-v31", false)
170+
if kwargs == nil {
171+
t.Error("Expected non-nil kwargs for DeepSeek model with reasoning disabled")
172+
}
173+
174+
if thinking, ok := kwargs["thinking"]; !ok || thinking != false {
175+
t.Errorf("Expected thinking: false for DeepSeek model, got %v", thinking)
176+
}
177+
178+
// Test with Qwen3 model and reasoning enabled
179+
kwargs = getChatTemplateKwargs("qwen3-7b", true)
180+
if kwargs == nil {
181+
t.Error("Expected non-nil kwargs for Qwen3 model with reasoning enabled")
182+
}
183+
184+
if enableThinking, ok := kwargs["enable_thinking"]; !ok || enableThinking != true {
185+
t.Errorf("Expected enable_thinking: true for Qwen3 model, got %v", enableThinking)
129186
}
130187

131-
// Test with reasoning disabled
132-
kwargs = getChatTemplateKwargs(false)
188+
// Test with unknown model (should return nil)
189+
kwargs = getChatTemplateKwargs("unknown-model", true)
133190
if kwargs != nil {
134-
t.Errorf("Expected nil kwargs for reasoning disabled, got %v", kwargs)
191+
t.Errorf("Expected nil kwargs for unknown model, got %v", kwargs)
135192
}
136193
})
137194

0 commit comments

Comments (0)