-# Example Configuration for MCP-Based Category Classifier
+# Example Configuration for MCP-Based Category Classifier (HTTP Transport)
 #
 # This configuration demonstrates how to use an external MCP (Model Context Protocol)
-# service for category classification instead of the built-in Candle/ModernBERT models.
+# service via HTTP for category classification instead of the built-in Candle/ModernBERT models.
 #
 # Use cases:
-# - Offload classification to a remote service
+# - Offload classification to a remote HTTP service
 # - Use custom classification models not supported in-tree
 # - Scale classification independently from the router
-# - Integrate with existing ML infrastructure
+# - Integrate with existing ML infrastructure via REST API
+#
+# Note: This example uses HTTP transport. The MCP server should expose an HTTP endpoint
+# that implements the MCP protocol (e.g., http://localhost:8090/mcp, matching the
+# url configured below)
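+#
+# For reference (illustrative only, not part of the config schema): MCP over HTTP
+# carries JSON-RPC 2.0 messages, so a classification call would look roughly like
+#   POST http://localhost:8090/mcp
+#   {"jsonrpc": "2.0", "id": 1, "method": "tools/call",
+#    "params": {"name": "classify_text", "arguments": {"text": "..."}}}
+# (the argument name "text" is assumed; check your MCP server's tool schema)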
 
 # BERT model for semantic caching and tool selection
 bert_model:
-  model_id: "models/all-MiniLM-L6-v2"
+  model_id: "sentence-transformers/all-MiniLM-L6-v2"
   threshold: 0.85
   use_cpu: true
 
@@ -20,156 +23,85 @@ classifier:
   # Disable in-tree category classifier (leave model_id empty)
   category_model:
     model_id: ""  # Empty = disabled
-    threshold: 0.6
-    use_cpu: true
-    use_modernbert: false
-    category_mapping_path: ""
 
-  # Enable MCP-based category classifier
+  # Enable MCP-based category classifier (HTTP transport only)
   mcp_category_model:
     enabled: true  # Enable MCP classifier
-    transport_type: "stdio"  # "stdio" or "http"
-
-    # For stdio transport: run a local Python MCP server
-    command: "python"
-    args: ["-m", "mcp_category_classifier"]
-    env:
-      PYTHONPATH: "/opt/ml/models"
-      MODEL_PATH: "/opt/ml/models/category_classifier"
-      LOG_LEVEL: "INFO"
+    transport_type: "http"  # HTTP transport
+    url: "http://localhost:8090/mcp"  # MCP server endpoint
 
-    # For http transport: use this instead
-    # transport_type: "http"
-    # url: "http://localhost:8080/mcp"
-
-    tool_name: "classify_text"  # MCP tool name to call
+    tool_name: "classify_text"  # MCP tool name to call
     threshold: 0.6  # Confidence threshold
     timeout_seconds: 30  # Request timeout
 
-  # PII model configuration (unchanged)
-  pii_model:
-    model_id: "models/pii_classifier"
-    threshold: 0.7
-    use_cpu: true
-    pii_mapping_path: "models/pii_classifier/pii_type_mapping.json"
-
-  # Prompt guard configuration (unchanged)
-  prompt_guard:
-    enabled: true
-    model_id: "models/jailbreak_classifier"
-    threshold: 0.8
-    use_cpu: true
-    use_modernbert: true
-    jailbreak_mapping_path: "models/jailbreak_classifier/jailbreak_mapping.json"
-
 # Categories for routing queries
-categories:
-  - name: "math"
-    description: "Mathematical problems, equations, calculus, algebra, statistics"
-    model_scores:
-      - model: "deepseek/deepseek-r1:70b"
-        score: 0.95
-        use_reasoning: true
-      - model: "qwen/qwen3-235b"
-        score: 0.90
-        use_reasoning: true
-    mmlu_categories:
-      - "mathematics"
-      - "statistics"
-
-  - name: "coding"
-    description: "Programming, software development, debugging, algorithms"
-    model_scores:
-      - model: "deepseek/deepseek-r1-coder:33b"
-        score: 0.95
-        use_reasoning: true
-      - model: "meta/llama3.1-70b"
-        score: 0.85
-        use_reasoning: false
-    mmlu_categories:
-      - "computer_science"
-      - "engineering"
-
-  - name: "general"
-    description: "General knowledge, conversation, misc queries"
-    model_scores:
-      - model: "meta/llama3.1-70b"
-        score: 0.90
-        use_reasoning: false
-      - model: "qwen/qwen3-235b"
-        score: 0.85
-        use_reasoning: false
+#
+# Categories are automatically loaded from the MCP server via the 'list_categories' tool.
+# The MCP server controls BOTH classification AND routing decisions.
+#
+# How it works:
+# 1. Router connects to the MCP server at startup
+# 2. Calls the 'list_categories' tool: MCP returns {"categories": ["business", "law", ...]}
+# 3. For each request, calls the 'classify_text' tool, which returns:
+#    {
+#      "class": 3,
+#      "confidence": 0.85,
+#      "model": "openai/gpt-oss-20b",  # MCP decides which model to use
+#      "use_reasoning": true           # MCP decides whether to use reasoning
+#    }
+# 4. Router uses the model and reasoning settings from the MCP response
+#
+# BENEFITS:
+# - MCP server makes intelligent routing decisions per query
+# - No hardcoded routing rules needed in config
+# - MCP can adapt routing based on query complexity, content, etc.
+# - Centralized routing logic in the MCP server
+#
+# FALLBACK:
+# - If MCP doesn't return model/use_reasoning, the router uses default_model below
+# - Category-specific overrides can also be added here if needed (see the sketch
+#   after this section)
+#
+categories: []
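+#
+# Hypothetical non-empty override, reusing the schema the removed in-tree
+# classifier config used (shown only as a sketch; not required in MCP mode):
+# categories:
+#   - name: "math"
+#     model_scores:
+#       - model: "openai/gpt-oss-20b"
+#         score: 0.95
+#         use_reasoning: true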
 
 # Default model to use when category can't be determined
-default_model: "meta/llama3.1-70b"
+default_model: openai/gpt-oss-20b
 
 # vLLM endpoints configuration
 vllm_endpoints:
-  - name: "deepseek-endpoint"
-    address: "10.0.1.10"
+  - name: endpoint1
+    address: 127.0.0.1
     port: 8000
     models:
-      - "deepseek/deepseek-r1:70b"
-      - "deepseek/deepseek-r1-coder:33b"
-    weight: 100
-
-  - name: "qwen-endpoint"
-    address: "10.0.1.11"
-    port: 8000
-    models:
-      - "qwen/qwen3-235b"
-    weight: 100
-
-  - name: "llama-endpoint"
-    address: "10.0.1.12"
-    port: 8000
-    models:
-      - "meta/llama3.1-70b"
-    weight: 100
-
-# Semantic cache configuration (optional)
-semantic_cache:
-  enabled: true
-  backend_type: "in-memory"
-  similarity_threshold: 0.90
-  max_entries: 1000
-  ttl_seconds: 3600
-  eviction_policy: "lru"
+      - openai/gpt-oss-20b
+    weight: 1
+    health_check_path: /health
 
 # Model-specific configuration
 model_config:
-  "deepseek/deepseek-r1:70b":
-    reasoning_family: "deepseek"
-    pii_policy:
-      allow_by_default: false
-      pii_types_allowed: []
-
-  "deepseek/deepseek-r1-coder:33b":
-    reasoning_family: "deepseek"
-    pii_policy:
-      allow_by_default: false
-      pii_types_allowed: []
-
-  "qwen/qwen3-235b":
-    reasoning_family: "qwen3"
-    pii_policy:
-      allow_by_default: true
-
-  "meta/llama3.1-70b":
+  openai/gpt-oss-20b:
+    reasoning_family: gpt-oss
+    preferred_endpoints:
+      - endpoint1
     pii_policy:
       allow_by_default: true
 
 # Reasoning family configurations
 reasoning_families:
   deepseek:
-    type: "chat_template_kwargs"
-    parameter: "thinking"
+    type: chat_template_kwargs
+    parameter: thinking
   qwen3:
-    type: "reasoning_effort"
-    parameter: "reasoning_effort"
+    type: chat_template_kwargs
+    parameter: enable_thinking
   gpt-oss:
-    type: "chat_template_kwargs"
-    parameter: "enable_thinking"
+    type: reasoning_effort
+    parameter: reasoning_effort
+  gpt:
+    type: reasoning_effort
+    parameter: reasoning_effort
+
+# Default reasoning effort level
+default_reasoning_effort: high
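+#
+# Illustration (assumed behavior): a family's type/parameter pair determines what
+# the router injects into the model request. For the gpt-oss family above this
+# would be roughly
+#   {"reasoning_effort": "high"}
+# while the deepseek family would set
+#   {"chat_template_kwargs": {"thinking": true}}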
 
 # Tools configuration (optional)
 tools:
@@ -182,9 +114,37 @@ tools:
 # API configuration
 api:
   batch_classification:
+    max_batch_size: 100
+    concurrency_threshold: 5
+    max_concurrency: 8
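+    # (Assumed semantics: batches above concurrency_threshold are processed with
+    # up to max_concurrency parallel workers; consult the router docs to confirm)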
     metrics:
       enabled: true
+      detailed_goroutine_tracking: true
+      high_resolution_timing: false
       sample_rate: 1.0
+      duration_buckets:
+        - 0.001
+        - 0.005
+        - 0.01
+        - 0.025
+        - 0.05
+        - 0.1
+        - 0.25
+        - 0.5
+        - 1
+        - 2.5
+        - 5
+        - 10
+        - 30
+      size_buckets:
+        - 1
+        - 2
+        - 5
+        - 10
+        - 20
+        - 50
+        - 100
+        - 200
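+      # (Bucket units, presumably: duration_buckets are Prometheus-style histogram
+      # bounds in seconds; size_buckets are items per batch)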
 
 # Observability configuration
 observability: