
Commit 736bd4d

Add ensemble factory and integrate with extproc

Copilot and rootfs committed
Co-authored-by: rootfs <[email protected]>
1 parent 618b286 · commit 736bd4d

10 files changed: +710 −0 lines changed

config/config.yaml — 13 additions, 0 deletions

@@ -504,6 +504,19 @@ embedding_models:
   gemma_model_path: "models/embeddinggemma-300m"
   use_cpu: true # Set to false for GPU acceleration (requires CUDA)
 
+# Ensemble Configuration
+# Enables multi-model inference with configurable aggregation strategies
+ensemble:
+  enabled: false # Enable ensemble mode (disabled by default)
+  default_strategy: "voting" # voting, weighted, first_success, score_averaging, reranking
+  default_min_responses: 2 # Minimum number of successful responses required
+  timeout_seconds: 30 # Maximum time to wait for model responses
+  max_concurrent_requests: 10 # Limit parallel model queries
+  endpoint_mappings: # Map model names to OpenAI-compatible API endpoints
+    # Example:
+    # model-a: "http://localhost:8001/v1/chat/completions"
+    # model-b: "http://localhost:8002/v1/chat/completions"
+
 # Observability Configuration
 observability:
   tracing:
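
For orientation, here is a minimal sketch of how the new ensemble block could be parsed, assuming gopkg.in/yaml.v3 and an inline mirror struct. The endpoint URLs are the ones suggested in the commented example above; none of this code is part of the commit (the actual struct is added in config.go below).

package main

import (
	"fmt"
	"log"

	"gopkg.in/yaml.v3"
)

// sample mirrors the new ensemble section, with the example endpoints filled in.
const sample = `
ensemble:
  enabled: true
  default_strategy: "voting"
  default_min_responses: 2
  timeout_seconds: 30
  max_concurrent_requests: 10
  endpoint_mappings:
    model-a: "http://localhost:8001/v1/chat/completions"
    model-b: "http://localhost:8002/v1/chat/completions"
`

func main() {
	// An inline struct standing in for the real EnsembleConfig (see config.go below).
	var cfg struct {
		Ensemble struct {
			Enabled               bool              `yaml:"enabled"`
			DefaultStrategy       string            `yaml:"default_strategy"`
			DefaultMinResponses   int               `yaml:"default_min_responses"`
			TimeoutSeconds        int               `yaml:"timeout_seconds"`
			MaxConcurrentRequests int               `yaml:"max_concurrent_requests"`
			EndpointMappings      map[string]string `yaml:"endpoint_mappings"`
		} `yaml:"ensemble"`
	}
	if err := yaml.Unmarshal([]byte(sample), &cfg); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("ensemble enabled=%v strategy=%s endpoints=%d\n",
		cfg.Ensemble.Enabled, cfg.Ensemble.DefaultStrategy, len(cfg.Ensemble.EndpointMappings))
}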

src/semantic-router/pkg/config/config.go — 26 additions, 0 deletions

@@ -81,6 +81,9 @@ type RouterOptions struct {
 
 	// Gateway route cache clearing
 	ClearRouteCache bool `yaml:"clear_route_cache"`
+
+	// Ensemble configuration for multi-model inference
+	Ensemble EnsembleConfig `yaml:"ensemble,omitempty"`
 }
 
 // InlineModels represents the configuration for models that are built into the binary

@@ -812,3 +815,26 @@ type PIIDetectionPolicy struct {
 	// If nil, uses the global threshold from Classifier.PIIModel.Threshold
 	PIIThreshold *float32 `yaml:"pii_threshold,omitempty"`
 }
+
+// EnsembleConfig represents configuration for ensemble orchestration
+type EnsembleConfig struct {
+	// Enabled controls whether ensemble mode is available
+	Enabled bool `yaml:"enabled"`
+
+	// DefaultStrategy is the default aggregation strategy
+	// Values: "voting", "weighted", "first_success", "score_averaging", "reranking"
+	DefaultStrategy string `yaml:"default_strategy,omitempty"`
+
+	// DefaultMinResponses is the default minimum number of responses required
+	DefaultMinResponses int `yaml:"default_min_responses,omitempty"`
+
+	// TimeoutSeconds is the maximum time to wait for model responses
+	TimeoutSeconds int `yaml:"timeout_seconds,omitempty"`
+
+	// MaxConcurrentRequests limits parallel model queries
+	MaxConcurrentRequests int `yaml:"max_concurrent_requests,omitempty"`
+
+	// EndpointMappings maps model names to their OpenAI-compatible API endpoints
+	// Example: {"model-a": "http://localhost:8001/v1/chat/completions"}
+	EndpointMappings map[string]string `yaml:"endpoint_mappings,omitempty"`
+}
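
To show how these fields might drive the extproc side of this change, a hedged sketch follows. The fanOut helper and the postJSON callback are hypothetical names, not the commit's actual ensemble factory; the fallback values simply mirror the sample config above, and the function is assumed to sit alongside EnsembleConfig in the config package for brevity.

package config

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// EnsembleResult holds one model's raw response (or its error).
type EnsembleResult struct {
	Model string
	Body  string
	Err   error
}

// fanOut queries every endpoint in EndpointMappings in parallel, bounded by
// MaxConcurrentRequests and TimeoutSeconds, and fails when fewer than
// DefaultMinResponses calls succeed. Aggregation (voting, reranking, ...) is out of scope here.
func fanOut(cfg EnsembleConfig, postJSON func(ctx context.Context, url string) (string, error)) ([]EnsembleResult, error) {
	timeout := time.Duration(cfg.TimeoutSeconds) * time.Second
	if timeout <= 0 {
		timeout = 30 * time.Second // assumed fallback, mirroring the sample config
	}
	limit := cfg.MaxConcurrentRequests
	if limit <= 0 {
		limit = len(cfg.EndpointMappings)
	}

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	sem := make(chan struct{}, limit) // limit parallel model queries
	results := make([]EnsembleResult, 0, len(cfg.EndpointMappings))
	var mu sync.Mutex
	var wg sync.WaitGroup

	for model, url := range cfg.EndpointMappings {
		wg.Add(1)
		go func(model, url string) {
			defer wg.Done()
			sem <- struct{}{}
			defer func() { <-sem }()
			body, err := postJSON(ctx, url)
			mu.Lock()
			results = append(results, EnsembleResult{Model: model, Body: body, Err: err})
			mu.Unlock()
		}(model, url)
	}
	wg.Wait()

	ok := 0
	for _, r := range results {
		if r.Err == nil {
			ok++
		}
	}
	if ok < cfg.DefaultMinResponses {
		return nil, fmt.Errorf("ensemble: only %d of the required %d responses succeeded", ok, cfg.DefaultMinResponses)
	}
	return results, nil
}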
