Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion src/semantic-router/pkg/extproc/reason_mode_selector.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"fmt"
"log"
"strings"

"github.com/vllm-project/semantic-router/semantic-router/pkg/metrics"
)

// shouldUseReasoningMode determines if reasoning mode should be enabled based on the query category
Expand Down Expand Up @@ -45,6 +47,25 @@ func (r *OpenAIRouter) getReasoningModeAndCategory(query string) (bool, string)
return false, categoryName
}

// getModelFamilyAndTemplateParam returns a normalized model family name and the
// name of the request parameter used to toggle reasoning for that family.
//
// The model name is trimmed and lower-cased before matching. Families are
// checked in this order, first match wins:
//
//	qwen3     -> ("qwen3", "enable_thinking")
//	deepseek  -> ("deepseek", "thinking")
//	gpt-oss   -> ("gpt-oss", "reasoning_effort")
//	gpt       -> ("gpt", "reasoning_effort")
//
// If nothing matches it returns ("unknown", "").
//
// "ds" is accepted as a DeepSeek alias because deployments commonly name
// DeepSeek models that way (e.g. "ds-v3"). It is matched only as a delimited
// token, never as a raw substring, so names such as "words-7b" or
// "cloudstack-gpt-4" are not misclassified as DeepSeek. More robust model name
// filtering is tracked in issue #61.
//
// NOTE(review): getChatTemplateKwargs is not fully visible from here; if it
// still matches "ds" as a raw substring, align it with this rule so the
// recorded family agrees with the template kwargs that are actually applied.
func getModelFamilyAndTemplateParam(model string) (string, string) {
	lower := strings.ToLower(strings.TrimSpace(model))

	switch {
	case strings.Contains(lower, "qwen3"):
		return "qwen3", "enable_thinking"
	case strings.Contains(lower, "deepseek") || hasDeepSeekAlias(lower):
		return "deepseek", "thinking"
	// GPT-OSS family and generic GPT fall back to using reasoning_effort
	// (OpenAI-compatible field). GPT-OSS must be tested before plain "gpt".
	case strings.Contains(lower, "gpt-oss") || strings.Contains(lower, "gpt_oss"):
		return "gpt-oss", "reasoning_effort"
	case strings.Contains(lower, "gpt"):
		return "gpt", "reasoning_effort"
	default:
		return "unknown", ""
	}
}

// hasDeepSeekAlias reports whether the already lower-cased model name contains
// the "ds" shorthand as a standalone token. The name is split on every
// character that is not an ASCII letter or digit; a token qualifies when it is
// exactly "ds", or "ds" followed by an optional 'v'/'r' and then digits only
// (e.g. "ds3", "dsv3", "dsr1").
func hasDeepSeekAlias(lower string) bool {
	tokens := strings.FieldsFunc(lower, func(r rune) bool {
		isLetter := r >= 'a' && r <= 'z'
		isDigit := r >= '0' && r <= '9'
		return !isLetter && !isDigit
	})
	for _, tok := range tokens {
		if !strings.HasPrefix(tok, "ds") {
			continue
		}
		rest := tok[2:]
		if rest == "" {
			return true
		}
		// Allow a single version marker letter before the digits.
		if rest[0] == 'v' || rest[0] == 'r' {
			rest = rest[1:]
		}
		if rest != "" && strings.Trim(rest, "0123456789") == "" {
			return true
		}
	}
	return false
}

// getChatTemplateKwargs returns the appropriate chat template kwargs based on model and reasoning mode
func getChatTemplateKwargs(model string, useReasoning bool) map[string]interface{} {
lower := strings.ToLower(strings.TrimSpace(model))
Expand Down Expand Up @@ -83,8 +104,11 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
}
}

family, param := getModelFamilyAndTemplateParam(model)

// Add chat_template_kwargs for reasoning mode
if kwargs := getChatTemplateKwargs(model, enabled); kwargs != nil {
kwargs := getChatTemplateKwargs(model, enabled)
if kwargs != nil {
requestMap["chat_template_kwargs"] = kwargs
} else {
delete(requestMap, "chat_template_kwargs")
Expand All @@ -96,17 +120,35 @@ func (r *OpenAIRouter) setReasoningModeToRequestBody(requestBody []byte, enabled
// This seems to be the default for openai/gpt-oss models
originalReasoningEffort = "low"
}
var appliedEffort string
if enabled {
// Use configurable reasoning effort based on category
effort := r.getReasoningEffort(categoryName)
requestMap["reasoning_effort"] = effort
appliedEffort = effort
} else {
requestMap["reasoning_effort"] = originalReasoningEffort
if s, ok := originalReasoningEffort.(string); ok {
appliedEffort = s
}
}

log.Printf("Original reasoning effort: %s", originalReasoningEffort)
log.Printf("Added reasoning mode (enabled: %v) and reasoning effort (%s) to request for model: %s", enabled, requestMap["reasoning_effort"], model)

// Record metrics for template usage and effort when enabled
if enabled {
// If we applied a known template param, record its usage
if kwargs != nil && param != "" {
metrics.RecordReasoningTemplateUsage(family, param)
} else if kwargs == nil && param == "reasoning_effort" {
// For GPT/GPT-OSS, we only set reasoning_effort
metrics.RecordReasoningTemplateUsage(family, param)
}
// Record which effort level was used for this family
metrics.RecordReasoningEffortUsage(family, appliedEffort)
}

// Serialize back to JSON
modifiedBody, err := json.Marshal(requestMap)
if err != nil {
Expand Down
3 changes: 3 additions & 0 deletions src/semantic-router/pkg/extproc/request_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,9 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
// Check reasoning mode for this category
useReasoning, categoryName := r.getReasoningModeAndCategory(userContent)
log.Printf("Reasoning mode decision for this query: %v on [%s] model", useReasoning, matchedModel)
// Record reasoning decision metric with the effort that will be applied if enabled
effortForMetrics := r.getReasoningEffort(categoryName)
metrics.RecordReasoningDecision(categoryName, matchedModel, useReasoning, effortForMetrics)

// Track the model load for the selected model
r.Classifier.IncrementModelLoad(matchedModel)
Expand Down
58 changes: 58 additions & 0 deletions src/semantic-router/pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,33 @@ var (
},
[]string{"model", "pii_type"},
)

// ReasoningDecisions tracks the reasoning mode decision outcome by category, model, and effort
ReasoningDecisions = promauto.NewCounterVec(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please open a follow-up PR to add these metrics to the documentation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I opened an issue to track this: #62

prometheus.CounterOpts{
Name: "llm_reasoning_decisions_total",
Help: "The total number of reasoning mode decisions by category, model, and effort",
},
[]string{"category", "model", "enabled", "effort"},
)

// ReasoningTemplateUsage tracks usage of model-family-specific template parameters
ReasoningTemplateUsage = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "llm_reasoning_template_usage_total",
Help: "The total number of times a model family template parameter was applied",
},
[]string{"family", "param"},
)

// ReasoningEffortUsage tracks the distribution of reasoning efforts by model family
ReasoningEffortUsage = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "llm_reasoning_effort_usage_total",
Help: "The total number of times a reasoning effort level was set per model family",
},
[]string{"family", "effort"},
)
)

// RecordModelRequest increments the counter for requests to a specific model
Expand Down Expand Up @@ -463,3 +490,34 @@ func InitializeBatchMetrics(config BatchMetricsConfig) {
)
})
}

// RecordReasoningDecision increments the ReasoningDecisions counter for the
// given category, model and effort. The enabled flag is rendered as the label
// value "true" or "false".
func RecordReasoningDecision(category, model string, enabled bool, effort string) {
	var enabledLabel string
	switch enabled {
	case true:
		enabledLabel = "true"
	default:
		enabledLabel = "false"
	}
	counter := ReasoningDecisions.WithLabelValues(category, model, enabledLabel, effort)
	counter.Inc()
}

// RecordReasoningTemplateUsage increments the ReasoningTemplateUsage counter
// for a model family and template parameter. An empty family is recorded as
// "unknown" and an empty param as "none".
func RecordReasoningTemplateUsage(family, param string) {
	familyLabel, paramLabel := family, param
	if len(familyLabel) == 0 {
		familyLabel = "unknown"
	}
	if len(paramLabel) == 0 {
		paramLabel = "none"
	}
	counter := ReasoningTemplateUsage.WithLabelValues(familyLabel, paramLabel)
	counter.Inc()
}

// RecordReasoningEffortUsage increments the ReasoningEffortUsage counter for a
// model family and effort level. An empty family is recorded as "unknown" and
// an empty effort as "unspecified".
func RecordReasoningEffortUsage(family, effort string) {
	familyLabel, effortLabel := family, effort
	if len(familyLabel) == 0 {
		familyLabel = "unknown"
	}
	if len(effortLabel) == 0 {
		effortLabel = "unspecified"
	}
	counter := ReasoningEffortUsage.WithLabelValues(familyLabel, effortLabel)
	counter.Inc()
}
Loading