
Commit b774c48

response mapping init
Signed-off-by: JaredforReal <[email protected]>
1 parent ee6e87e commit b774c48

File tree

8 files changed: +1906 -62 lines changed


config/config.development.yaml

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
# Development Configuration Example with Stdout Tracing
# This configuration enables distributed tracing with stdout exporter
# for local development and debugging.

bert_model:
  model_id: models/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 100
  ttl_seconds: 600
  eviction_policy: "fifo"
  use_hnsw: true # Enable HNSW for faster search
  hnsw_m: 16
  hnsw_ef_construction: 200

tools:
  enabled: false
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: false

vllm_endpoints:
  - name: "local-endpoint"
    address: "127.0.0.1"
    port: 8000
    weight: 1

model_config:
  "test-model":
    pii_policy:
      allow_by_default: true

classifier:
  category_model:
    model_id: "models/category_classifier_modernbert-base_model"
    use_modernbert: true
    threshold: 0.6
    use_cpu: true
    category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"

categories:
  - name: test
    system_prompt: "You are a test assistant."
    # Example: Category-level cache settings
    # semantic_cache_enabled: true
    # semantic_cache_similarity_threshold: 0.85
    model_scores:
      - model: test-model
        score: 1.0
        use_reasoning: false

default_model: test-model

# Enable OpenAI Responses API adapter (experimental)
enable_responses_adapter: true

# Auto model name for automatic model selection (optional)
# Uncomment and set to customize the model name for automatic routing
# auto_model_name: "MoM"

api:
  batch_classification:
    max_batch_size: 10
    metrics:
      enabled: true

# Observability Configuration - Development with Stdout
observability:
  tracing:
    # Enable tracing for development/debugging
    enabled: true

    # OpenTelemetry provider
    provider: "opentelemetry"

    exporter:
      # Stdout exporter prints traces to console (great for debugging)
      type: "stdout"

      # No endpoint needed for stdout
      # endpoint: ""
      # insecure: true

    sampling:
      # Always sample in development to see all traces
      type: "always_on"

      # Rate not used for always_on
      # rate: 1.0

    resource:
      # Service name for trace identification
      service_name: "vllm-semantic-router-dev"

      # Version for development
      service_version: "dev"

      # Environment identifier
      deployment_environment: "development"

config/config.yaml

Lines changed: 3 additions & 0 deletions
@@ -480,6 +480,9 @@ reasoning_families:
 # Global default reasoning effort level
 default_reasoning_effort: high
 
+# Enable OpenAI Responses API adapter (experimental)
+enable_responses_adapter: false
+
 # API Configuration
 api:
   batch_classification:

src/semantic-router/pkg/config/config.go

Lines changed: 6 additions & 0 deletions
@@ -247,6 +247,12 @@ type EmbeddingRule struct {
 	SimilarityThreshold float32 `yaml:"threshold"`
 	Candidates []string `yaml:"candidates"` // Renamed from Keywords
 	AggregationMethodConfiged AggregationMethod `yaml:"aggregation_method"`
+	// Gateway route cache clearing
+	ClearRouteCache bool `yaml:"clear_route_cache"`
+
+	// EnableResponsesAdapter enables the compatibility shim for OpenAI Responses API (/v1/responses)
+	// When enabled, POST /v1/responses requests are adapted to legacy /v1/chat/completions.
+	EnableResponsesAdapter bool `yaml:"enable_responses_adapter"`
 }
 
 // APIConfig represents configuration for API endpoints
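
For reference, a minimal deserialization sketch showing how the two new yaml tags bind when loaded with gopkg.in/yaml.v3. The adapterFlags struct and the standalone program are hypothetical; only the field names and yaml tags come from this diff.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// adapterFlags is a hypothetical stand-in for the config struct above;
// the yaml tags match the two fields added in this commit.
type adapterFlags struct {
	ClearRouteCache        bool `yaml:"clear_route_cache"`
	EnableResponsesAdapter bool `yaml:"enable_responses_adapter"`
}

func main() {
	data := []byte("enable_responses_adapter: true\nclear_route_cache: false\n")
	var f adapterFlags
	if err := yaml.Unmarshal(data, &f); err != nil {
		panic(err)
	}
	fmt.Printf("adapter enabled: %v\n", f.EnableResponsesAdapter) // adapter enabled: true
}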
Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
package extproc

import (
	"encoding/json"
	"fmt"
	"strings"
)

// mapResponsesRequestToChatCompletions converts a minimal OpenAI Responses API request
// into a legacy Chat Completions request JSON. Supports only text input for PR1.
func mapResponsesRequestToChatCompletions(original []byte) ([]byte, error) {
	var req map[string]interface{}
	if err := json.Unmarshal(original, &req); err != nil {
		return nil, err
	}

	// Extract model
	model, _ := req["model"].(string)
	if model == "" {
		return nil, fmt.Errorf("missing model")
	}

	// Derive user content
	var userContent string
	if input, ok := req["input"]; ok {
		switch v := input.(type) {
		case string:
			userContent = v
		case []interface{}:
			// Join any string elements; ignore non-string for now
			var parts []string
			for _, it := range v {
				if s, ok := it.(string); ok {
					parts = append(parts, s)
				} else if m, ok := it.(map[string]interface{}); ok {
					// Try common shapes: {type:"input_text"|"text", text:"..."}
					if t, _ := m["type"].(string); t == "input_text" || t == "text" {
						if txt, _ := m["text"].(string); txt != "" {
							parts = append(parts, txt)
						}
					}
				}
			}
			userContent = strings.TrimSpace(strings.Join(parts, " "))
		default:
			// unsupported multimodal
			return nil, fmt.Errorf("unsupported input type")
		}
	} else if msgs, ok := req["messages"].([]interface{}); ok {
		// Fallback: if caller already provided messages, pass them through
		// This enables easy migration from chat/completions
		mapped := map[string]interface{}{
			"model":    model,
			"messages": msgs,
		}
		// Map basic params
		if v, ok := req["temperature"]; ok {
			mapped["temperature"] = v
		}
		if v, ok := req["top_p"]; ok {
			mapped["top_p"] = v
		}
		if v, ok := req["max_output_tokens"]; ok {
			mapped["max_tokens"] = v
		}
		return json.Marshal(mapped)
	}

	if userContent == "" {
		return nil, fmt.Errorf("empty input")
	}

	// Build minimal Chat Completions request
	mapped := map[string]interface{}{
		"model": model,
		"messages": []map[string]interface{}{
			{"role": "user", "content": userContent},
		},
	}
	// Map basic params
	if v, ok := req["temperature"]; ok {
		mapped["temperature"] = v
	}
	if v, ok := req["top_p"]; ok {
		mapped["top_p"] = v
	}
	if v, ok := req["max_output_tokens"]; ok {
		mapped["max_tokens"] = v
	}

	return json.Marshal(mapped)
}

// mapChatCompletionToResponses converts an OpenAI ChatCompletion JSON
// into a minimal Responses API JSON (non-streaming only) for PR1.
func mapChatCompletionToResponses(chatCompletionJSON []byte) ([]byte, error) {
	var parsed struct {
		ID      string `json:"id"`
		Object  string `json:"object"`
		Created int64  `json:"created"`
		Model   string `json:"model"`
		Choices []struct {
			Index        int    `json:"index"`
			FinishReason string `json:"finish_reason"`
			Message      struct {
				Role    string `json:"role"`
				Content string `json:"content"`
			} `json:"message"`
		} `json:"choices"`
		Usage struct {
			PromptTokens     int `json:"prompt_tokens"`
			CompletionTokens int `json:"completion_tokens"`
			TotalTokens      int `json:"total_tokens"`
		} `json:"usage"`
	}
	if err := json.Unmarshal(chatCompletionJSON, &parsed); err != nil {
		return nil, err
	}

	content := ""
	stopReason := "stop"
	if len(parsed.Choices) > 0 {
		content = parsed.Choices[0].Message.Content
		if parsed.Choices[0].FinishReason != "" {
			stopReason = parsed.Choices[0].FinishReason
		}
	}

	out := map[string]interface{}{
		"id":      parsed.ID,
		"object":  "response",
		"created": parsed.Created,
		"model":   parsed.Model,
		"output": []map[string]interface{}{
			{"type": "message", "role": "assistant", "content": content},
		},
		"stop_reason": stopReason,
		"usage": map[string]int{
			"input_tokens":  parsed.Usage.PromptTokens,
			"output_tokens": parsed.Usage.CompletionTokens,
			"total_tokens":  parsed.Usage.TotalTokens,
		},
	}

	return json.Marshal(out)
}
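
The wiring between the config flag and the mapper is not part of this commit; below is a minimal sketch assuming only that the effective flag value and the request path are available at the interception point. adaptIfResponsesRequest is a hypothetical helper built on the mapper above.

// adaptIfResponsesRequest is a hypothetical helper (not in this commit)
// sketching how enable_responses_adapter could gate the request mapping:
// when the flag is set and the request targets /v1/responses, the body is
// rewritten into a legacy Chat Completions payload before routing continues.
func adaptIfResponsesRequest(enabled bool, path string, body []byte) ([]byte, bool, error) {
	if !enabled || path != "/v1/responses" {
		return body, false, nil // pass the body through unchanged
	}
	mapped, err := mapResponsesRequestToChatCompletions(body)
	if err != nil {
		return nil, false, err
	}
	return mapped, true, nil
}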
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
package extproc

import (
	"encoding/json"
	"testing"
)

func TestMapResponsesRequestToChatCompletions_TextInput(t *testing.T) {
	in := []byte(`{"model":"gpt-test","input":"Hello world","temperature":0.2,"top_p":0.9,"max_output_tokens":128}`)
	out, err := mapResponsesRequestToChatCompletions(in)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	var m map[string]interface{}
	if err := json.Unmarshal(out, &m); err != nil {
		t.Fatalf("unmarshal mapped: %v", err)
	}
	if m["model"].(string) != "gpt-test" {
		t.Fatalf("model not mapped")
	}
	if _, ok := m["messages"].([]interface{}); !ok {
		t.Fatalf("messages missing")
	}
}

func TestMapChatCompletionToResponses_Minimal(t *testing.T) {
	in := []byte(`{
		"id":"chatcmpl-1","object":"chat.completion","created":123,"model":"gpt-test",
		"choices":[{"index":0,"finish_reason":"stop","message":{"role":"assistant","content":"hi"}}],
		"usage":{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2}
	}`)
	out, err := mapChatCompletionToResponses(in)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	var m map[string]interface{}
	if err := json.Unmarshal(out, &m); err != nil {
		t.Fatalf("unmarshal mapped: %v", err)
	}
	if m["object"].(string) != "response" {
		t.Fatalf("object not 'response'")
	}
	if m["stop_reason"].(string) == "" {
		t.Fatalf("stop_reason missing")
	}
}
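
For completeness, a hypothetical round-trip test (not in this commit) that exercises both mappers in sequence, assuming it lives alongside the tests above:

// TestResponsesRoundTrip_Sketch is a hypothetical addition to the test file
// above: Responses request in, Chat Completions request out, then a
// Chat Completions response mapped back into a Responses response.
func TestResponsesRoundTrip_Sketch(t *testing.T) {
	chatReq, err := mapResponsesRequestToChatCompletions(
		[]byte(`{"model":"gpt-test","input":"Hello","max_output_tokens":64}`))
	if err != nil {
		t.Fatalf("request mapping: %v", err)
	}
	t.Logf("adapted request: %s", chatReq)

	respOut, err := mapChatCompletionToResponses(
		[]byte(`{"id":"chatcmpl-1","object":"chat.completion","created":123,"model":"gpt-test","choices":[{"index":0,"finish_reason":"stop","message":{"role":"assistant","content":"hi"}}],"usage":{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2}}`))
	if err != nil {
		t.Fatalf("response mapping: %v", err)
	}
	t.Logf("adapted response: %s", respOut)
}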
