Skip to content

Commit 02d619a

Browse files
Xunzhuo and rootfs authored
[Feat][Memory] Add OpenAI Response API support (#802)
* feat(router): add OpenAI Response API support Implement OpenAI Response API endpoints for the semantic-router extproc: - POST /v1/responses - Create response with translation to Chat Completions - GET /v1/responses/{id} - Retrieve stored response - DELETE /v1/responses/{id} - Delete stored response - GET /v1/responses/{id}/input_items - List input items for a response Key features: - Request translation: Response API format -> Chat Completions format - Response translation: Chat Completions format -> Response API format - Path rewriting: /v1/responses -> /v1/chat/completions for backend - Conversation chaining via previous_response_id - Pluggable storage backend (memory store implemented) - Session-based response storage with TTL support New packages: - pkg/responseapi: Types, translator, and ID generation - pkg/responsestore: Storage interface and memory implementation - pkg/extproc/req_filter_response_api.go: Response API filter Config example: config/testing/config.response-api.yaml Signed-off-by: bitliu <[email protected]> * docs: add Router Memory guide for Response API Add documentation for the Router Memory feature under intelligent routing: - Overview of Response API implementation - Architecture diagram showing request flow - Supported endpoints (POST, GET, DELETE, input_items) - Configuration examples for memory, Milvus, and Redis backends - Usage examples with curl commands - Conversation chaining explanation - API translation table - Storage backends comparison - Roadmap with links to related issues Signed-off-by: bitliu <[email protected]> * docs: simplify Router Memory guide with real examples - Replace ASCII architecture with Mermaid diagram - Use real request/response examples - Remove unimplemented Milvus/Redis config sections - Simplify content for better readability Signed-off-by: bitliu <[email protected]> * docs: add detailed request flow diagram to Router Memory guide Signed-off-by: bitliu <[email protected]> * docs: add cross-model stateful 
conversation overview Highlight Semantic Router as unified brain for multiple LLM backends that only support Chat Completions API, enabling cross-model stateful conversations. Signed-off-by: bitliu <[email protected]> * update Signed-off-by: bitliu <[email protected]> --------- Signed-off-by: bitliu <[email protected]> Co-authored-by: Huamin Chen <[email protected]>
1 parent 5b412a8 commit 02d619a

File tree

18 files changed

+2361
-7
lines changed

18 files changed

+2361
-7
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Response API testing configuration
2+
# Based on config.e2e.yaml with Response API enabled
3+
4+
bert_model:
5+
model_id: models/all-MiniLM-L12-v2
6+
threshold: 0.6
7+
use_cpu: true
8+
9+
semantic_cache:
10+
enabled: true
11+
backend_type: "memory"
12+
similarity_threshold: 0.8
13+
max_entries: 1000
14+
ttl_seconds: 3600
15+
16+
# Response API Configuration - NEW
17+
response_api:
18+
enabled: true
19+
store_backend: "memory" # Use in-memory store for testing
20+
ttl_seconds: 86400 # 24 hours
21+
max_responses: 1000
22+
23+
# vLLM Endpoints Configuration
24+
vllm_endpoints:
25+
- name: "test-endpoint"
26+
address: "0.0.0.0"
27+
port: 8000
28+
weight: 1
29+
30+
model_config:
31+
"openai/gpt-oss-120b":
32+
use_reasoning: false
33+
preferred_endpoints: ["test-endpoint"]
34+
35+
# Minimal classifier configuration
36+
classifier:
37+
category_model:
38+
model_id: "models/lora_intent_classifier_bert-base-uncased_model"
39+
use_modernbert: false
40+
threshold: 0.6
41+
use_cpu: true
42+
category_mapping_path: "models/lora_intent_classifier_bert-base-uncased_model/category_mapping.json"
43+
44+
categories:
45+
- name: other
46+
description: "General knowledge and miscellaneous topics"
47+
mmlu_categories: ["other"]
48+
49+
strategy: "priority"
50+
51+
decisions:
52+
- name: "default_decision"
53+
description: "Default catch-all decision"
54+
priority: 1
55+
rules:
56+
operator: "OR"
57+
conditions:
58+
- type: "domain"
59+
name: "other"
60+
modelRefs:
61+
- model: "openai/gpt-oss-120b"
62+
use_reasoning: false
63+
64+
default_model: "openai/gpt-oss-120b"
65+
66+
# Reasoning family configurations
67+
reasoning_families:
68+
qwen3:
69+
type: "chat_template_kwargs"
70+
parameter: "enable_thinking"
71+

src/semantic-router/pkg/config/config.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ type RouterConfig struct {
3232
InlineModels `yaml:",inline"`
3333
// Semantic cache configuration
3434
SemanticCache `yaml:"semantic_cache"`
35+
// Response API configuration for stateful conversations
36+
ResponseAPI ResponseAPIConfig `yaml:"response_api"`
3537
// LLMObservability for LLM tracing, metrics, and logging
3638
LLMObservability `yaml:",inline"`
3739
// API server configuration
@@ -224,6 +226,43 @@ type SemanticCache struct {
224226
EmbeddingModel string `yaml:"embedding_model,omitempty"`
225227
}
226228

229+
// ResponseAPIConfig configures the Response API for stateful conversations.
// The Response API provides OpenAI-compatible /v1/responses endpoints
// that support conversation chaining via previous_response_id.
// Requests are translated to Chat Completions format and routed through Envoy.
type ResponseAPIConfig struct {
	// Enabled toggles the Response API endpoints.
	Enabled bool `yaml:"enabled"`

	// StoreBackend selects the storage backend type: "memory", "milvus".
	// Default: "memory".
	StoreBackend string `yaml:"store_backend,omitempty"`

	// TTLSeconds is the time-to-live for stored responses in seconds
	// (0 = 30 days default).
	TTLSeconds int `yaml:"ttl_seconds,omitempty"`

	// MaxResponses is the maximum number of responses to store
	// (for the memory backend).
	MaxResponses int `yaml:"max_responses,omitempty"`

	// BackendConfigPath is the path to backend-specific configuration
	// (for milvus).
	BackendConfigPath string `yaml:"backend_config_path,omitempty"`

	// Milvus holds Milvus settings (used when store_backend is "milvus").
	Milvus ResponseAPIMilvusConfig `yaml:"milvus,omitempty"`
}
253+
254+
// ResponseAPIMilvusConfig configures Milvus storage for the Response API.
type ResponseAPIMilvusConfig struct {
	// Address is the Milvus server address (e.g., "localhost:19530").
	Address string `yaml:"address"`

	// Database is the Milvus database name.
	Database string `yaml:"database,omitempty"`

	// Collection is the collection name for storing responses.
	Collection string `yaml:"collection,omitempty"`
}
265+
227266
// KeywordRule defines a rule for keyword-based classification.
228267
type KeywordRule struct {
229268
Name string `yaml:"name"` // Name is also used as category

src/semantic-router/pkg/extproc/processor_req_body.go

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,36 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
2525
// Save the original request body
2626
ctx.OriginalRequestBody = v.RequestBody.GetBody()
2727

28+
// Handle Response API translation if this is a /v1/responses request
29+
requestBody := ctx.OriginalRequestBody
30+
if ctx.ResponseAPICtx != nil && ctx.ResponseAPICtx.IsResponseAPIRequest && r.ResponseAPIFilter != nil {
31+
respCtx, translatedBody, err := r.ResponseAPIFilter.TranslateRequest(ctx.TraceContext, requestBody)
32+
if err != nil {
33+
logging.Errorf("Response API translation error: %v", err)
34+
return r.createErrorResponse(400, "Invalid Response API request: "+err.Error()), nil
35+
}
36+
if respCtx != nil && translatedBody != nil {
37+
// Update context with full Response API context
38+
ctx.ResponseAPICtx = respCtx
39+
requestBody = translatedBody
40+
logging.Infof("Response API: Translated to Chat Completions format")
41+
} else {
42+
// Translation returned nil - this means the request is missing required fields (e.g., 'input')
43+
// Return error since the request was sent to /v1/responses but is not valid Response API format
44+
logging.Errorf("Response API: Request to /v1/responses missing required 'input' field")
45+
return r.createErrorResponse(400, "Invalid Response API request: 'input' field is required. Use 'input' instead of 'messages' for Response API."), nil
46+
}
47+
}
48+
2849
// Extract stream parameter from original request and update ExpectStreamingResponse if needed
29-
hasStreamParam := extractStreamParam(ctx.OriginalRequestBody)
50+
hasStreamParam := extractStreamParam(requestBody)
3051
if hasStreamParam {
3152
logging.Infof("Original request contains stream parameter: true")
3253
ctx.ExpectStreamingResponse = true // Set this if stream param is found
3354
}
3455

3556
// Parse the OpenAI request using SDK types
36-
openAIRequest, err := parseOpenAIRequest(ctx.OriginalRequestBody)
57+
openAIRequest, err := parseOpenAIRequest(requestBody)
3758
if err != nil {
3859
logging.Errorf("Error parsing OpenAI request: %v", err)
3960
// Attempt to determine model for labeling (may be unknown here)
@@ -186,7 +207,7 @@ func (r *OpenAIRouter) handleSpecifiedModelRouting(openAIRequest *openai.ChatCom
186207
selectedEndpoint := r.selectEndpointForModel(ctx, originalModel)
187208

188209
// Create response with headers
189-
response := r.createSpecifiedModelResponse(originalModel, selectedEndpoint)
210+
response := r.createSpecifiedModelResponse(originalModel, selectedEndpoint, ctx)
190211

191212
// Handle route cache clearing
192213
if r.shouldClearRouteCache() {
@@ -300,6 +321,17 @@ func (r *OpenAIRouter) createRoutingResponse(model string, endpoint string, modi
300321
})
301322
}
302323

324+
// For Response API requests, modify :path to /v1/chat/completions
325+
if ctx.ResponseAPICtx != nil && ctx.ResponseAPICtx.IsResponseAPIRequest {
326+
setHeaders = append(setHeaders, &core.HeaderValueOption{
327+
Header: &core.HeaderValue{
328+
Key: ":path",
329+
RawValue: []byte("/v1/chat/completions"),
330+
},
331+
})
332+
logging.Infof("Response API: Rewriting path to /v1/chat/completions")
333+
}
334+
303335
// Apply header mutations from decision's header_mutation plugin
304336
if ctx.VSRSelectedDecision != nil {
305337
pluginSetHeaders, pluginRemoveHeaders := r.buildHeaderMutations(ctx.VSRSelectedDecision)
@@ -332,8 +364,10 @@ func (r *OpenAIRouter) createRoutingResponse(model string, endpoint string, modi
332364
}
333365

334366
// createSpecifiedModelResponse creates a response for specified model routing
335-
func (r *OpenAIRouter) createSpecifiedModelResponse(model string, endpoint string) *ext_proc.ProcessingResponse {
367+
func (r *OpenAIRouter) createSpecifiedModelResponse(model string, endpoint string, ctx *RequestContext) *ext_proc.ProcessingResponse {
336368
setHeaders := []*core.HeaderValueOption{}
369+
removeHeaders := []string{}
370+
337371
if endpoint != "" {
338372
setHeaders = append(setHeaders, &core.HeaderValueOption{
339373
Header: &core.HeaderValue{
@@ -350,14 +384,38 @@ func (r *OpenAIRouter) createSpecifiedModelResponse(model string, endpoint strin
350384
},
351385
})
352386

387+
// For Response API requests, modify :path to /v1/chat/completions and use translated body
388+
var bodyMutation *ext_proc.BodyMutation
389+
if ctx != nil && ctx.ResponseAPICtx != nil && ctx.ResponseAPICtx.IsResponseAPIRequest {
390+
setHeaders = append(setHeaders, &core.HeaderValueOption{
391+
Header: &core.HeaderValue{
392+
Key: ":path",
393+
RawValue: []byte("/v1/chat/completions"),
394+
},
395+
})
396+
removeHeaders = append(removeHeaders, "content-length")
397+
398+
// Use the translated body from Response API context
399+
if len(ctx.ResponseAPICtx.TranslatedBody) > 0 {
400+
bodyMutation = &ext_proc.BodyMutation{
401+
Mutation: &ext_proc.BodyMutation_Body{
402+
Body: ctx.ResponseAPICtx.TranslatedBody,
403+
},
404+
}
405+
}
406+
logging.Infof("Response API: Rewriting path to /v1/chat/completions (specified model)")
407+
}
408+
353409
return &ext_proc.ProcessingResponse{
354410
Response: &ext_proc.ProcessingResponse_RequestBody{
355411
RequestBody: &ext_proc.BodyResponse{
356412
Response: &ext_proc.CommonResponse{
357413
Status: ext_proc.CommonResponse_CONTINUE,
358414
HeaderMutation: &ext_proc.HeaderMutation{
359-
SetHeaders: setHeaders,
415+
SetHeaders: setHeaders,
416+
RemoveHeaders: removeHeaders,
360417
},
418+
BodyMutation: bodyMutation,
361419
},
362420
},
363421
},

src/semantic-router/pkg/extproc/processor_req_header.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ type RequestContext struct {
4747

4848
// Tracing context
4949
TraceContext context.Context // OpenTelemetry trace context for span propagation
50+
51+
// Response API context
52+
ResponseAPICtx *ResponseAPIContext // Non-nil if this is a Response API request
5053
}
5154

5255
// handleRequestHeaders processes the request headers
@@ -117,6 +120,42 @@ func (r *OpenAIRouter) handleRequestHeaders(v *ext_proc.ProcessingRequest_Reques
117120
return r.handleModelsRequest(path)
118121
}
119122

123+
// Handle Response API endpoints
124+
if r.ResponseAPIFilter != nil && r.ResponseAPIFilter.IsEnabled() && strings.HasPrefix(path, "/v1/responses") {
125+
// GET /v1/responses/{id}/input_items - Get input items for a response
126+
if method == "GET" && strings.HasSuffix(path, "/input_items") {
127+
responseID := extractResponseIDFromInputItemsPath(path)
128+
if responseID != "" {
129+
logging.Infof("Handling GET /v1/responses/%s/input_items", responseID)
130+
return r.ResponseAPIFilter.HandleGetInputItems(ctx.TraceContext, responseID)
131+
}
132+
}
133+
134+
// GET /v1/responses/{id} - Get a response
135+
if method == "GET" {
136+
responseID := extractResponseIDFromPath(path)
137+
if responseID != "" {
138+
logging.Infof("Handling GET /v1/responses/%s", responseID)
139+
return r.ResponseAPIFilter.HandleGetResponse(ctx.TraceContext, responseID)
140+
}
141+
}
142+
143+
// DELETE /v1/responses/{id} - Delete a response
144+
if method == "DELETE" {
145+
responseID := extractResponseIDFromPath(path)
146+
if responseID != "" {
147+
logging.Infof("Handling DELETE /v1/responses/%s", responseID)
148+
return r.ResponseAPIFilter.HandleDeleteResponse(ctx.TraceContext, responseID)
149+
}
150+
}
151+
152+
// POST /v1/responses - Create response (mark for body phase processing)
153+
if method == "POST" {
154+
ctx.ResponseAPICtx = &ResponseAPIContext{IsResponseAPIRequest: true}
155+
logging.Infof("Detected Response API POST request: %s", path)
156+
}
157+
}
158+
120159
// Prepare base response
121160
response := &ext_proc.ProcessingResponse{
122161
Response: &ext_proc.ProcessingResponse_RequestHeaders{
@@ -135,3 +174,60 @@ func (r *OpenAIRouter) handleRequestHeaders(v *ext_proc.ProcessingRequest_Reques
135174

136175
return response, nil
137176
}
177+
178+
// extractResponseIDFromPath extracts the response ID from a path like
// /v1/responses/{id}. It returns "" when the path does not match, when the
// remaining segment targets a sub-resource (contains a slash), or when the
// ID does not start with the "resp_" marker.
func extractResponseIDFromPath(path string) string {
	// Drop any query string before matching.
	path, _, _ = strings.Cut(path, "?")

	// Expected format: /v1/responses/{id}
	const prefix = "/v1/responses/"
	if !strings.HasPrefix(path, prefix) {
		return ""
	}

	// Strip the route prefix and a single trailing slash, if any.
	id := strings.TrimSuffix(strings.TrimPrefix(path, prefix), "/")

	switch {
	case strings.Contains(id, "/"):
		// A remaining slash means this is a sub-resource request
		// (e.g. .../input_items), not a bare response ID.
		return ""
	case strings.HasPrefix(id, "resp_"):
		// Looks like a valid response ID.
		return id
	default:
		return ""
	}
}
207+
208+
// extractResponseIDFromInputItemsPath extracts the response ID from a path
// like /v1/responses/{id}/input_items.
//
// It returns "" when the path does not have the expected prefix/suffix shape,
// when the extracted segment contains a slash (a nested path such as
// /v1/responses/{id}/extra/input_items), or when the ID does not start with
// the "resp_" marker.
func extractResponseIDFromInputItemsPath(path string) string {
	// Remove query string if present.
	if idx := strings.Index(path, "?"); idx != -1 {
		path = path[:idx]
	}

	// Expected format: /v1/responses/{id}/input_items
	const (
		prefix = "/v1/responses/"
		suffix = "/input_items"
	)
	if !strings.HasPrefix(path, prefix) || !strings.HasSuffix(path, suffix) {
		return ""
	}

	// Extract the ID between prefix and suffix.
	id := strings.TrimSuffix(strings.TrimPrefix(path, prefix), suffix)

	// Reject nested paths, mirroring the slash check in
	// extractResponseIDFromPath so both extractors agree on what a valid
	// response ID segment looks like.
	if strings.Contains(id, "/") {
		return ""
	}

	// Validate it looks like a response ID (should start with "resp_").
	if strings.HasPrefix(id, "resp_") {
		return id
	}
	return ""
}

0 commit comments

Comments
 (0)