Skip to content

Commit 02d619a

Browse files
Xunzhuo and rootfs authored
[Feat][Memory] Add OpenAI Response API support (#802)
* feat(router): add OpenAI Response API support Implement OpenAI Response API endpoints for the semantic-router extproc: - POST /v1/responses - Create response with translation to Chat Completions - GET /v1/responses/{id} - Retrieve stored response - DELETE /v1/responses/{id} - Delete stored response - GET /v1/responses/{id}/input_items - List input items for a response Key features: - Request translation: Response API format -> Chat Completions format - Response translation: Chat Completions format -> Response API format - Path rewriting: /v1/responses -> /v1/chat/completions for backend - Conversation chaining via previous_response_id - Pluggable storage backend (memory store implemented) - Session-based response storage with TTL support New packages: - pkg/responseapi: Types, translator, and ID generation - pkg/responsestore: Storage interface and memory implementation - pkg/extproc/req_filter_response_api.go: Response API filter Config example: config/testing/config.response-api.yaml Signed-off-by: bitliu <[email protected]> * docs: add Router Memory guide for Response API Add documentation for the Router Memory feature under intelligent routing: - Overview of Response API implementation - Architecture diagram showing request flow - Supported endpoints (POST, GET, DELETE, input_items) - Configuration examples for memory, Milvus, and Redis backends - Usage examples with curl commands - Conversation chaining explanation - API translation table - Storage backends comparison - Roadmap with links to related issues Signed-off-by: bitliu <[email protected]> * docs: simplify Router Memory guide with real examples - Replace ASCII architecture with Mermaid diagram - Use real request/response examples - Remove unimplemented Milvus/Redis config sections - Simplify content for better readability Signed-off-by: bitliu <[email protected]> * docs: add detailed request flow diagram to Router Memory guide Signed-off-by: bitliu <[email protected]> * docs: add cross-model stateful 
conversation overview Highlight Semantic Router as unified brain for multiple LLM backends that only support Chat Completions API, enabling cross-model stateful conversations. Signed-off-by: bitliu <[email protected]> * update Signed-off-by: bitliu <[email protected]> --------- Signed-off-by: bitliu <[email protected]> Co-authored-by: Huamin Chen <[email protected]>
1 parent 5b412a8 commit 02d619a

File tree

18 files changed

+2361
-7
lines changed

18 files changed

+2361
-7
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Response API testing configuration
2+
# Based on config.e2e.yaml with Response API enabled
3+
4+
bert_model:
5+
model_id: models/all-MiniLM-L12-v2
6+
threshold: 0.6
7+
use_cpu: true
8+
9+
semantic_cache:
10+
enabled: true
11+
backend_type: "memory"
12+
similarity_threshold: 0.8
13+
max_entries: 1000
14+
ttl_seconds: 3600
15+
16+
# Response API Configuration - NEW
17+
response_api:
18+
enabled: true
19+
store_backend: "memory" # Use in-memory store for testing
20+
ttl_seconds: 86400 # 24 hours
21+
max_responses: 1000
22+
23+
# vLLM Endpoints Configuration
24+
vllm_endpoints:
25+
- name: "test-endpoint"
26+
address: "0.0.0.0"
27+
port: 8000
28+
weight: 1
29+
30+
model_config:
31+
"openai/gpt-oss-120b":
32+
use_reasoning: false
33+
preferred_endpoints: ["test-endpoint"]
34+
35+
# Minimal classifier configuration
36+
classifier:
37+
category_model:
38+
model_id: "models/lora_intent_classifier_bert-base-uncased_model"
39+
use_modernbert: false
40+
threshold: 0.6
41+
use_cpu: true
42+
category_mapping_path: "models/lora_intent_classifier_bert-base-uncased_model/category_mapping.json"
43+
44+
categories:
45+
- name: other
46+
description: "General knowledge and miscellaneous topics"
47+
mmlu_categories: ["other"]
48+
49+
strategy: "priority"
50+
51+
decisions:
52+
- name: "default_decision"
53+
description: "Default catch-all decision"
54+
priority: 1
55+
rules:
56+
operator: "OR"
57+
conditions:
58+
- type: "domain"
59+
name: "other"
60+
modelRefs:
61+
- model: "openai/gpt-oss-120b"
62+
use_reasoning: false
63+
64+
default_model: "openai/gpt-oss-120b"
65+
66+
# Reasoning family configurations
67+
reasoning_families:
68+
qwen3:
69+
type: "chat_template_kwargs"
70+
parameter: "enable_thinking"
71+

src/semantic-router/pkg/config/config.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ type RouterConfig struct {
3232
InlineModels `yaml:",inline"`
3333
// Semantic cache configuration
3434
SemanticCache `yaml:"semantic_cache"`
35+
// Response API configuration for stateful conversations
36+
ResponseAPI ResponseAPIConfig `yaml:"response_api"`
3537
// LLMObservability for LLM tracing, metrics, and logging
3638
LLMObservability `yaml:",inline"`
3739
// API server configuration
@@ -224,6 +226,43 @@ type SemanticCache struct {
224226
EmbeddingModel string `yaml:"embedding_model,omitempty"`
225227
}
226228

229+
// ResponseAPIConfig configures the Response API for stateful conversations.
// The Response API provides OpenAI-compatible /v1/responses endpoints
// that support conversation chaining via previous_response_id.
// Requests are translated to Chat Completions format and routed through Envoy.
type ResponseAPIConfig struct {
	// Enabled toggles the Response API endpoints.
	Enabled bool `yaml:"enabled"`

	// StoreBackend selects the storage backend type: "memory", "milvus".
	// Default: "memory".
	StoreBackend string `yaml:"store_backend,omitempty"`

	// TTLSeconds is the time-to-live for stored responses in seconds
	// (0 = 30 days default).
	TTLSeconds int `yaml:"ttl_seconds,omitempty"`

	// MaxResponses is the maximum number of responses to store
	// (for the memory backend).
	MaxResponses int `yaml:"max_responses,omitempty"`

	// BackendConfigPath is the path to backend-specific configuration
	// (for milvus).
	BackendConfigPath string `yaml:"backend_config_path,omitempty"`

	// Milvus holds Milvus settings (used when store_backend is "milvus").
	Milvus ResponseAPIMilvusConfig `yaml:"milvus,omitempty"`
}
253+
254+
// ResponseAPIMilvusConfig configures Milvus storage for the Response API.
type ResponseAPIMilvusConfig struct {
	// Address is the Milvus server address (e.g., "localhost:19530").
	Address string `yaml:"address"`

	// Database is the Milvus database name.
	Database string `yaml:"database,omitempty"`

	// Collection is the collection name for storing responses.
	Collection string `yaml:"collection,omitempty"`
}
265+
227266
// KeywordRule defines a rule for keyword-based classification.
228267
type KeywordRule struct {
229268
Name string `yaml:"name"` // Name is also used as category

src/semantic-router/pkg/extproc/processor_req_body.go

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,36 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
2525
// Save the original request body
2626
ctx.OriginalRequestBody = v.RequestBody.GetBody()
2727

28+
// Handle Response API translation if this is a /v1/responses request
29+
requestBody := ctx.OriginalRequestBody
30+
if ctx.ResponseAPICtx != nil && ctx.ResponseAPICtx.IsResponseAPIRequest && r.ResponseAPIFilter != nil {
31+
respCtx, translatedBody, err := r.ResponseAPIFilter.TranslateRequest(ctx.TraceContext, requestBody)
32+
if err != nil {
33+
logging.Errorf("Response API translation error: %v", err)
34+
return r.createErrorResponse(400, "Invalid Response API request: "+err.Error()), nil
35+
}
36+
if respCtx != nil && translatedBody != nil {
37+
// Update context with full Response API context
38+
ctx.ResponseAPICtx = respCtx
39+
requestBody = translatedBody
40+
logging.Infof("Response API: Translated to Chat Completions format")
41+
} else {
42+
// Translation returned nil - this means the request is missing required fields (e.g., 'input')
43+
// Return error since the request was sent to /v1/responses but is not valid Response API format
44+
logging.Errorf("Response API: Request to /v1/responses missing required 'input' field")
45+
return r.createErrorResponse(400, "Invalid Response API request: 'input' field is required. Use 'input' instead of 'messages' for Response API."), nil
46+
}
47+
}
48+
2849
// Extract stream parameter from original request and update ExpectStreamingResponse if needed
29-
hasStreamParam := extractStreamParam(ctx.OriginalRequestBody)
50+
hasStreamParam := extractStreamParam(requestBody)
3051
if hasStreamParam {
3152
logging.Infof("Original request contains stream parameter: true")
3253
ctx.ExpectStreamingResponse = true // Set this if stream param is found
3354
}
3455

3556
// Parse the OpenAI request using SDK types
36-
openAIRequest, err := parseOpenAIRequest(ctx.OriginalRequestBody)
57+
openAIRequest, err := parseOpenAIRequest(requestBody)
3758
if err != nil {
3859
logging.Errorf("Error parsing OpenAI request: %v", err)
3960
// Attempt to determine model for labeling (may be unknown here)
@@ -186,7 +207,7 @@ func (r *OpenAIRouter) handleSpecifiedModelRouting(openAIRequest *openai.ChatCom
186207
selectedEndpoint := r.selectEndpointForModel(ctx, originalModel)
187208

188209
// Create response with headers
189-
response := r.createSpecifiedModelResponse(originalModel, selectedEndpoint)
210+
response := r.createSpecifiedModelResponse(originalModel, selectedEndpoint, ctx)
190211

191212
// Handle route cache clearing
192213
if r.shouldClearRouteCache() {
@@ -300,6 +321,17 @@ func (r *OpenAIRouter) createRoutingResponse(model string, endpoint string, modi
300321
})
301322
}
302323

324+
// For Response API requests, modify :path to /v1/chat/completions
325+
if ctx.ResponseAPICtx != nil && ctx.ResponseAPICtx.IsResponseAPIRequest {
326+
setHeaders = append(setHeaders, &core.HeaderValueOption{
327+
Header: &core.HeaderValue{
328+
Key: ":path",
329+
RawValue: []byte("/v1/chat/completions"),
330+
},
331+
})
332+
logging.Infof("Response API: Rewriting path to /v1/chat/completions")
333+
}
334+
303335
// Apply header mutations from decision's header_mutation plugin
304336
if ctx.VSRSelectedDecision != nil {
305337
pluginSetHeaders, pluginRemoveHeaders := r.buildHeaderMutations(ctx.VSRSelectedDecision)
@@ -332,8 +364,10 @@ func (r *OpenAIRouter) createRoutingResponse(model string, endpoint string, modi
332364
}
333365

334366
// createSpecifiedModelResponse creates a response for specified model routing
335-
func (r *OpenAIRouter) createSpecifiedModelResponse(model string, endpoint string) *ext_proc.ProcessingResponse {
367+
func (r *OpenAIRouter) createSpecifiedModelResponse(model string, endpoint string, ctx *RequestContext) *ext_proc.ProcessingResponse {
336368
setHeaders := []*core.HeaderValueOption{}
369+
removeHeaders := []string{}
370+
337371
if endpoint != "" {
338372
setHeaders = append(setHeaders, &core.HeaderValueOption{
339373
Header: &core.HeaderValue{
@@ -350,14 +384,38 @@ func (r *OpenAIRouter) createSpecifiedModelResponse(model string, endpoint strin
350384
},
351385
})
352386

387+
// For Response API requests, modify :path to /v1/chat/completions and use translated body
388+
var bodyMutation *ext_proc.BodyMutation
389+
if ctx != nil && ctx.ResponseAPICtx != nil && ctx.ResponseAPICtx.IsResponseAPIRequest {
390+
setHeaders = append(setHeaders, &core.HeaderValueOption{
391+
Header: &core.HeaderValue{
392+
Key: ":path",
393+
RawValue: []byte("/v1/chat/completions"),
394+
},
395+
})
396+
removeHeaders = append(removeHeaders, "content-length")
397+
398+
// Use the translated body from Response API context
399+
if len(ctx.ResponseAPICtx.TranslatedBody) > 0 {
400+
bodyMutation = &ext_proc.BodyMutation{
401+
Mutation: &ext_proc.BodyMutation_Body{
402+
Body: ctx.ResponseAPICtx.TranslatedBody,
403+
},
404+
}
405+
}
406+
logging.Infof("Response API: Rewriting path to /v1/chat/completions (specified model)")
407+
}
408+
353409
return &ext_proc.ProcessingResponse{
354410
Response: &ext_proc.ProcessingResponse_RequestBody{
355411
RequestBody: &ext_proc.BodyResponse{
356412
Response: &ext_proc.CommonResponse{
357413
Status: ext_proc.CommonResponse_CONTINUE,
358414
HeaderMutation: &ext_proc.HeaderMutation{
359-
SetHeaders: setHeaders,
415+
SetHeaders: setHeaders,
416+
RemoveHeaders: removeHeaders,
360417
},
418+
BodyMutation: bodyMutation,
361419
},
362420
},
363421
},

src/semantic-router/pkg/extproc/processor_req_header.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ type RequestContext struct {
4747

4848
// Tracing context
4949
TraceContext context.Context // OpenTelemetry trace context for span propagation
50+
51+
// Response API context
52+
ResponseAPICtx *ResponseAPIContext // Non-nil if this is a Response API request
5053
}
5154

5255
// handleRequestHeaders processes the request headers
@@ -117,6 +120,42 @@ func (r *OpenAIRouter) handleRequestHeaders(v *ext_proc.ProcessingRequest_Reques
117120
return r.handleModelsRequest(path)
118121
}
119122

123+
// Handle Response API endpoints
124+
if r.ResponseAPIFilter != nil && r.ResponseAPIFilter.IsEnabled() && strings.HasPrefix(path, "/v1/responses") {
125+
// GET /v1/responses/{id}/input_items - Get input items for a response
126+
if method == "GET" && strings.HasSuffix(path, "/input_items") {
127+
responseID := extractResponseIDFromInputItemsPath(path)
128+
if responseID != "" {
129+
logging.Infof("Handling GET /v1/responses/%s/input_items", responseID)
130+
return r.ResponseAPIFilter.HandleGetInputItems(ctx.TraceContext, responseID)
131+
}
132+
}
133+
134+
// GET /v1/responses/{id} - Get a response
135+
if method == "GET" {
136+
responseID := extractResponseIDFromPath(path)
137+
if responseID != "" {
138+
logging.Infof("Handling GET /v1/responses/%s", responseID)
139+
return r.ResponseAPIFilter.HandleGetResponse(ctx.TraceContext, responseID)
140+
}
141+
}
142+
143+
// DELETE /v1/responses/{id} - Delete a response
144+
if method == "DELETE" {
145+
responseID := extractResponseIDFromPath(path)
146+
if responseID != "" {
147+
logging.Infof("Handling DELETE /v1/responses/%s", responseID)
148+
return r.ResponseAPIFilter.HandleDeleteResponse(ctx.TraceContext, responseID)
149+
}
150+
}
151+
152+
// POST /v1/responses - Create response (mark for body phase processing)
153+
if method == "POST" {
154+
ctx.ResponseAPICtx = &ResponseAPIContext{IsResponseAPIRequest: true}
155+
logging.Infof("Detected Response API POST request: %s", path)
156+
}
157+
}
158+
120159
// Prepare base response
121160
response := &ext_proc.ProcessingResponse{
122161
Response: &ext_proc.ProcessingResponse_RequestHeaders{
@@ -135,3 +174,60 @@ func (r *OpenAIRouter) handleRequestHeaders(v *ext_proc.ProcessingRequest_Reques
135174

136175
return response, nil
137176
}
177+
178+
// extractResponseIDFromPath extracts the response ID from a path like
// /v1/responses/{id}. It returns "" when the path does not match, when the
// remaining segment targets a sub-resource (contains a slash), or when the
// ID does not start with the "resp_" marker.
func extractResponseIDFromPath(path string) string {
	// Drop any query string before matching.
	path, _, _ = strings.Cut(path, "?")

	// Expected format: /v1/responses/{id}
	const prefix = "/v1/responses/"
	if !strings.HasPrefix(path, prefix) {
		return ""
	}

	// Strip the route prefix and a single trailing slash, if any.
	id := strings.TrimSuffix(strings.TrimPrefix(path, prefix), "/")

	switch {
	case strings.Contains(id, "/"):
		// A remaining slash means this is a sub-resource request
		// (e.g. .../input_items), not a bare response ID.
		return ""
	case strings.HasPrefix(id, "resp_"):
		// Looks like a valid response ID.
		return id
	default:
		return ""
	}
}
207+
208+
// extractResponseIDFromInputItemsPath extracts the response ID from a path
// like /v1/responses/{id}/input_items.
//
// It returns "" when the path does not have the expected prefix/suffix shape,
// when the extracted segment contains a slash (a nested path such as
// /v1/responses/{id}/extra/input_items), or when the ID does not start with
// the "resp_" marker.
func extractResponseIDFromInputItemsPath(path string) string {
	// Remove query string if present.
	if idx := strings.Index(path, "?"); idx != -1 {
		path = path[:idx]
	}

	// Expected format: /v1/responses/{id}/input_items
	const (
		prefix = "/v1/responses/"
		suffix = "/input_items"
	)
	if !strings.HasPrefix(path, prefix) || !strings.HasSuffix(path, suffix) {
		return ""
	}

	// Extract the ID between prefix and suffix.
	id := strings.TrimSuffix(strings.TrimPrefix(path, prefix), suffix)

	// Reject nested paths, mirroring the slash check in
	// extractResponseIDFromPath so both extractors agree on what a valid
	// response ID segment looks like.
	if strings.Contains(id, "/") {
		return ""
	}

	// Validate it looks like a response ID (should start with "resp_").
	if strings.HasPrefix(id, "resp_") {
		return id
	}
	return ""
}

0 commit comments

Comments
 (0)