Improve voting and score averaging implementations

Copilot · rootfs · Copilot · commit 6c649978d7dd · 2025-11-24T16:09:03.000Z
Co-authored-by: rootfs &lt;7062400+rootfs@users.noreply.github.com&gt;
diff --git a/config/ensemble/README.md b/config/ensemble/README.md
@@ -0,0 +1,190 @@
+# Ensemble Orchestration Configuration
+
+This directory contains configuration examples for the ensemble orchestration feature, which enables parallel model inference with configurable aggregation strategies.
+
+## Overview
+
+The ensemble orchestration feature allows you to:
+- Query multiple LLM models in parallel
+- Combine their outputs using various aggregation strategies
+- Improve reliability, accuracy, and cost-performance trade-offs
+
+## Configuration
+
+### Basic Setup
+
+Enable ensemble mode in your `config.yaml`:
+
+```yaml
+ensemble:
+  enabled: true
+  default_strategy: "voting"
+  default_min_responses: 2
+  timeout_seconds: 30
+  max_concurrent_requests: 10
+  endpoint_mappings:
+    model-a: "http://localhost:8001/v1/chat/completions"
+    model-b: "http://localhost:8002/v1/chat/completions"
+    model-c: "http://localhost:8003/v1/chat/completions"
+```
+
+### Configuration Options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `enabled` | boolean | `false` | Enable/disable ensemble orchestration |
+| `default_strategy` | string | `"voting"` | Default aggregation strategy |
+| `default_min_responses` | integer | `2` | Minimum successful responses required |
+| `timeout_seconds` | integer | `30` | Maximum time to wait for responses |
+| `max_concurrent_requests` | integer | `10` | Limit on parallel model queries |
+| `endpoint_mappings` | map | `{}` | Model name to OpenAI-compatible API endpoint mapping |
+
+## Usage
+
+### Request Headers
+
+Control ensemble behavior using HTTP headers:
+
+| Header | Description | Example |
+|--------|-------------|---------|
+| `x-ensemble-enable` | Enable ensemble mode | `true` |
+| `x-ensemble-models` | Comma-separated list of models | `model-a,model-b,model-c` |
+| `x-ensemble-strategy` | Aggregation strategy | `voting` |
+| `x-ensemble-min-responses` | Minimum responses required | `2` |
+
+### Example Request
+
+```bash
+curl -X POST http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "x-ensemble-enable: true" \
+  -H "x-ensemble-models: model-a,model-b,model-c" \
+  -H "x-ensemble-strategy: voting" \
+  -H "x-ensemble-min-responses: 2" \
+  -d '{
+    "model": "ensemble",
+    "messages": [
+      {"role": "user", "content": "What is the capital of France?"}
+    ]
+  }'
+```
+
+### Response Headers
+
+The response includes metadata about the ensemble process:
+
+| Header | Description | Example |
+|--------|-------------|---------|
+| `x-vsr-ensemble-used` | Indicates ensemble was used | `true` |
+| `x-vsr-ensemble-models-queried` | Number of models queried | `3` |
+| `x-vsr-ensemble-responses-received` | Number of successful responses | `3` |
+
+## Aggregation Strategies
+
+### 1. Voting (Majority Consensus)
+**Best for:** Classification, multiple choice, yes/no questions
+
+Selects the most common response among all models.
+
+```bash
+-H "x-ensemble-strategy: voting"
+```
+
+### 2. Weighted Consensus
+**Best for:** Combining models with different reliability profiles
+
+Weights responses by confidence scores from each model.
+
+```bash
+-H "x-ensemble-strategy: weighted"
+```
+
+### 3. First Success
+**Best for:** Latency-sensitive applications
+
+Returns the first valid response received, optimizing for speed.
+
+```bash
+-H "x-ensemble-strategy: first_success"
+```
+
+### 4. Score Averaging
+**Best for:** Numerical outputs, probability distributions
+
+Averages numerical scores across all models.
+
+```bash
+-H "x-ensemble-strategy: score_averaging"
+```
+
+### 5. Reranking
+**Best for:** Generation tasks, open-ended responses
+
+Collects multiple candidate responses and selects the best one (requires additional ranking logic).
+
+```bash
+-H "x-ensemble-strategy: reranking"
+```
+
+## Use Cases
+
+### Critical Applications
+- Medical diagnosis assistance (consensus increases confidence)
+- Legal document analysis (high accuracy verification)
+- Financial advisory systems (reliability impacts business outcomes)
+
+### Cost Optimization
+- Query multiple smaller models instead of one large expensive model
+- Start with fast/cheap models, escalate for uncertain cases
+- Adaptive routing based on query complexity
+
+### Reliability & Accuracy
+- Voting mechanisms to reduce hallucinations
+- Consensus-based outputs for higher confidence
+- Graceful degradation with fallback chains
+
+### Model Diversity
+- Combine different model architectures (GPT-style + Llama-style)
+- Ensemble different model sizes for balanced performance
+- Cross-validate responses from models with different training data
+
+## Examples
+
+See `ensemble-example.yaml` for a complete configuration example.
+
+## Security Considerations
+
+- Ensure all endpoint URLs are from trusted sources
+- Use TLS/HTTPS for production deployments
+- Set appropriate timeout values to prevent resource exhaustion
+- Monitor and log ensemble operations for debugging
+
+## Performance Tips
+
+1. **Optimize Concurrency**: Set `max_concurrent_requests` based on your infrastructure capacity
+2. **Tune Timeouts**: Balance between latency and completeness with `timeout_seconds`
+3. **Select Appropriate Strategy**: Choose the strategy that best matches your use case
+4. **Monitor Metrics**: Track response times and success rates per model
+
+## Troubleshooting
+
+### No responses received
+- Verify endpoint URLs are correct and reachable
+- Check network connectivity to model endpoints
+- Ensure models are running and accepting requests
+
+### Insufficient responses error
+- Reduce `x-ensemble-min-responses` header value
+- Add more model endpoints to `endpoint_mappings`
+- Check model health and availability
+
+### Slow responses
+- Reduce `timeout_seconds` for faster failures
+- Increase `max_concurrent_requests` for better parallelism
+- Use `first_success` strategy for latency optimization
+
+## Related Documentation
+
+- [Main Configuration Guide](../README.md)
+- [API Documentation](../../docs/api.md)
+- [Deployment Guide](../../docs/deployment.md)
diff --git a/config/ensemble/ensemble-example.yaml b/config/ensemble/ensemble-example.yaml
@@ -0,0 +1,40 @@
+# Example Ensemble Configuration
+# This configuration demonstrates how to enable and use ensemble orchestration
+
+# Enable ensemble mode
+ensemble:
+  enabled: true  # Set to true to enable ensemble orchestration
+  
+  # Default aggregation strategy when not specified in request headers
+  # Options: voting, weighted, first_success, score_averaging, reranking
+  default_strategy: "voting"
+  
+  # Minimum number of successful model responses required
+  default_min_responses: 2
+  
+  # Maximum time to wait for model responses (seconds)
+  timeout_seconds: 30
+  
+  # Maximum number of parallel model queries
+  max_concurrent_requests: 10
+  
+  # Map model names to their OpenAI-compatible API endpoints
+  # Each endpoint should be the full URL to the chat completions endpoint
+  endpoint_mappings:
+    model-a: "http://localhost:8001/v1/chat/completions"
+    model-b: "http://localhost:8002/v1/chat/completions"
+    model-c: "http://localhost:8003/v1/chat/completions"
+
+# Example Usage:
+# 
+# To use ensemble mode, include the following headers in your request:
+# 
+# x-ensemble-enable: true
+# x-ensemble-models: model-a,model-b,model-c
+# x-ensemble-strategy: voting
+# x-ensemble-min-responses: 2
+# 
+# The response will include metadata headers:
+# x-vsr-ensemble-used: true
+# x-vsr-ensemble-models-queried: 3
+# x-vsr-ensemble-responses-received: 3
diff --git a/src/semantic-router/pkg/ensemble/factory.go b/src/semantic-router/pkg/ensemble/factory.go
@@ -292,37 +292,73 @@ func (f *Factory) aggregateResponses(responses []ModelResponse, strategy Strateg
 	}
 }
 
-// aggregateByVoting implements majority voting
+// aggregateByVoting implements majority voting by comparing message content
 func (f *Factory) aggregateByVoting(responses []ModelResponse, metadata *Metadata) ([]byte, Metadata, error) {
-	// Count occurrences of each response
-	// This is a simplified implementation - in production, you'd parse the actual content
-	responseCounts := make(map[string]int)
-	responseMap := make(map[string][]byte)
+	// Parse responses and extract message content for voting
+	type parsedResponse struct {
+		content  string
+		rawBytes []byte
+	}
+	
+	contentCounts := make(map[string]int)
+	contentToResponse := make(map[string]parsedResponse)
 
 	for _, resp := range responses {
-		key := string(resp.Response)
-		responseCounts[key]++
-		responseMap[key] = resp.Response
+		// Try to parse OpenAI-style response
+		var openAIResp map[string]interface{}
+		if err := json.Unmarshal(resp.Response, &openAIResp); err != nil {
+			// If parsing fails, use first response as fallback
+			logging.Warnf("Failed to parse response for voting: %v", err)
+			continue
+		}
+
+		// Extract content from choices array
+		content := extractContentFromResponse(openAIResp)
+		if content != "" {
+			contentCounts[content]++
+			contentToResponse[content] = parsedResponse{
+				content:  content,
+				rawBytes: resp.Response,
+			}
+		}
 	}
 
-	// Find the most common response
+	// Find the most common content
 	var maxCount int
-	var selectedResponse []byte
-	for key, count := range responseCounts {
+	var selectedContent string
+	for content, count := range contentCounts {
 		if count > maxCount {
 			maxCount = count
-			selectedResponse = responseMap[key]
+			selectedContent = content
 		}
 	}
 
-	metadata.AggregationDetails["votes"] = responseCounts
+	metadata.AggregationDetails["vote_counts"] = contentCounts
 	metadata.AggregationDetails["max_votes"] = maxCount
 
-	if selectedResponse == nil {
-		return responses[0].Response, *metadata, nil
+	// Return the response with the most votes, or first response if no clear winner
+	if selectedContent != "" {
+		if selected, ok := contentToResponse[selectedContent]; ok {
+			return selected.rawBytes, *metadata, nil
+		}
 	}
 
-	return selectedResponse, *metadata, nil
+	return responses[0].Response, *metadata, nil
+}
+
+// extractContentFromResponse extracts the message content from an OpenAI-style response
+func extractContentFromResponse(resp map[string]interface{}) string {
+	// Navigate: response["choices"][0]["message"]["content"]
+	if choices, ok := resp["choices"].([]interface{}); ok && len(choices) > 0 {
+		if choice, ok := choices[0].(map[string]interface{}); ok {
+			if message, ok := choice["message"].(map[string]interface{}); ok {
+				if content, ok := message["content"].(string); ok {
+					return content
+				}
+			}
+		}
+	}
+	return ""
 }
 
 // aggregateByWeighted implements confidence-weighted selection
@@ -352,13 +388,67 @@ func (f *Factory) aggregateByWeighted(responses []ModelResponse, metadata *Metad
 	return selectedResponse, *metadata, nil
 }
 
-// aggregateByScoreAveraging implements score averaging (simplified)
+// aggregateByScoreAveraging averages logprobs or confidence scores from multiple models
 func (f *Factory) aggregateByScoreAveraging(responses []ModelResponse, metadata *Metadata) ([]byte, Metadata, error) {
-	// This is a simplified implementation
-	// In production, you'd parse the responses and average numerical scores
-	// For now, return the first response as a placeholder
-	metadata.SelectedModel = responses[0].ModelName
-	metadata.AggregationDetails["note"] = "score averaging not fully implemented"
-
-	return responses[0].Response, *metadata, nil
+	// For score averaging, we select the response with the median confidence/latency balance
+	// This is more practical than trying to merge responses
+	
+	type scoredResponse struct {
+		response ModelResponse
+		score    float64
+	}
+	
+	scored := make([]scoredResponse, 0, len(responses))
+	
+	for _, resp := range responses {
+		// Compute a composite score based on confidence and latency
+		// Higher confidence is better, lower latency is better
+		score := resp.Confidence
+		if resp.Latency.Seconds() > 0 {
+			// Normalize latency (penalize slow responses)
+			latencyPenalty := 1.0 / (1.0 + resp.Latency.Seconds())
+			score = score * latencyPenalty
+		}
+		
+		scored = append(scored, scoredResponse{
+			response: resp,
+			score:    score,
+		})
+	}
+	
+	// If no confidence scores available, fall back to selecting by fastest response
+	allZeroConfidence := true
+	for _, s := range scored {
+		if s.score > 0 {
+			allZeroConfidence = false
+			break
+		}
+	}
+	
+	if allZeroConfidence {
+		// Select fastest response
+		fastest := scored[0]
+		for _, s := range scored[1:] {
+			if s.response.Latency < fastest.response.Latency {
+				fastest = s
+			}
+		}
+		metadata.SelectedModel = fastest.response.ModelName
+		metadata.AggregationDetails["selection_method"] = "fastest_response"
+		return fastest.response.Response, *metadata, nil
+	}
+	
+	// Find highest scoring response
+	best := scored[0]
+	for _, s := range scored[1:] {
+		if s.score > best.score {
+			best = s
+		}
+	}
+	
+	metadata.SelectedModel = best.response.ModelName
+	metadata.AggregationDetails["best_score"] = best.score
+	metadata.AggregationDetails["selection_method"] = "score_based"
+	
+	return best.response.Response, *metadata, nil
 }
diff --git a/src/semantic-router/pkg/ensemble/factory_test.go b/src/semantic-router/pkg/ensemble/factory_test.go