Update majority_voting_plugin.py

codelion · codelion · commit c5ca7959db9a · 2025-07-23T22:09:35.000+08:00
diff --git a/optillm/plugins/majority_voting_plugin.py b/optillm/plugins/majority_voting_plugin.py
@@ -1,19 +1,14 @@
 """
-Majority Voting Plugin V2 for OptILLM
+Majority Voting Plugin for OptILLM
 
-Enhanced version with:
-- Category-aware answer extraction
-- Adaptive temperature control
-- Improved answer normalization
-- Response quality filtering
-- Smart fallback strategies
+Generic implementation that generates multiple candidates and selects
+the most common response through simple voting.
 """
 
 import re
 import logging
 from typing import Tuple, Dict, Any, List, Optional
 from collections import Counter
-import json
 
 logger = logging.getLogger(__name__)
 
@@ -24,89 +19,58 @@
 DEFAULT_K = 8
 DEFAULT_TEMPERATURE = 0.6  # Unified temperature for consistency
 
-def detect_category(query: str) -> str:
+
+def normalize_response(response: str) -> str:
     """
-    Try to detect the problem category from the query.
-    
-    Returns:
-        Category string or 'default' if unknown
+    Basic normalization for comparing responses.
+    Drops thinking blocks, lowercases, strips trailing punctuation, collapses whitespace.
     """
-    query_lower = query.lower()
+    if not response:
+        return ""
     
-    # GSM8K patterns
-    if "###" in query or ("calculate" in query_lower and any(word in query_lower for word in ["total", "sum", "difference", "product"])):
-        return "gsm8k"
+    # Remove thinking blocks if present
+    response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
     
-    # MMLU patterns (multiple choice)
-    if re.search(r'\b[A-E]\s*[:\)]\s*', query) or "which of the following" in query_lower:
-        return "mmlu_math"
+    # Basic normalization
+    response = response.strip()
+    response = response.lower()
     
-    # BoolQ patterns
-    if query_lower.strip().endswith("?") and any(word in query_lower for word in ["is", "are", "was", "were", "does", "do", "did", "can", "could", "will", "would"]):
-        return "boolq"
+    # Remove trailing punctuation
+    response = response.rstrip('.,;:!?')
     
-    # AQUA-RAT patterns
-    if re.search(r'options?:\s*[A-E]', query, re.IGNORECASE):
-        return "aqua_rat"
+    # Normalize whitespace
+    response = ' '.join(response.split())
     
-    return "default"
+    return response
 
 
-
-
-def extract_answer_simple(response: str, category: str) -> Optional[str]:
+def extract_final_answer(response: str) -> str:
     """
-    Extract answer using same logic as evaluation script for consistency.
+    Try to extract just the final answer from a response.
+    Tries common answer patterns in order; otherwise returns the text minus thinking blocks.
     """
     if not response:
-        return None
+        return response
     
-    # Remove thinking blocks if present
+    # Remove thinking blocks
     response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
     
-    if category == "gsm8k":
-        # Extract number after ###
-        match = re.search(r'###\s*(-?\d*\.?\d+)', response)
+    # Look for common answer patterns
+    patterns = [
+        r'(?:final answer|answer):\s*(.+?)(?:\n|$)',
+        r'(?:the answer is|answer is)\s*(.+?)(?:\n|$)',
+        r'###\s*(.+?)(?:\n|$)',  # Common in math problems
+        r'^([A-E])\b',  # Single letter at start of any line (MULTILINE, case-insensitive)
+        r'\b([A-E])\b\s*$',  # Single letter at end of any line (MULTILINE, case-insensitive)
+    ]
+    
+    for pattern in patterns:
+        match = re.search(pattern, response, re.IGNORECASE | re.MULTILINE)
         if match:
-            return match.group(1)
+            return match.group(1).strip()
     
-    elif category == "aqua_rat":
-        # For AQUA-RAT, be more flexible in extraction
-        response_upper = response.upper()
-        
-        # Try to find letter choices (A-E)
-        patterns = [
-            r'\b([A-E])\b(?!\w)',  # Single letter not part of word
-            r'(?:answer|choice|option)\s*:?\s*([A-E])\b',
-            r'\(([A-E])\)',  # Letter in parentheses
-            r'^([A-E])$',  # Just the letter
-        ]
-        
-        for pattern in patterns:
-            match = re.search(pattern, response_upper, re.IGNORECASE | re.MULTILINE)
-            if match:
-                return match.group(1)
-        
-        # If no letter found, check for common wrong patterns
-        # Map true/false/yes/no/numbers to letters (this is a heuristic)
-        if re.search(r'\b(true|yes|1)\b', response.lower()):
-            return "A"  # Default mapping
-        elif re.search(r'\b(false|no|0)\b', response.lower()):
-            return "B"  # Default mapping
-    
-    elif category == "boolq":
-        response_lower = response.lower()
-        if 'yes' in response_lower:
-            return 'yes'
-        elif 'no' in response_lower:
-            return 'no'
-    
-    elif category == "mmlu_math":
-        # For MMLU, just return the cleaned response
-        return response.strip()
-    
-    # Default: return cleaned response
-    return response.strip()
+    # If no pattern found, return the whole response
+    return response
 
 
 def run(
@@ -117,20 +81,16 @@ def run(
     request_config: Dict[str, Any] = None
 ) -> Tuple[str, int]:
     """
-    Simplified majority voting using consistent evaluation logic.
+    Generic majority voting implementation.
     """
     logger.info("Starting majority voting process")
     
-    # Detect category
-    category = detect_category(initial_query)
-    logger.info(f"Detected category: {category}")
-    
     # Extract parameters
     k = request_config.get('k', DEFAULT_K) if request_config else DEFAULT_K
     temperature = request_config.get('temperature', DEFAULT_TEMPERATURE) if request_config else DEFAULT_TEMPERATURE
     max_tokens = request_config.get('max_tokens', 4096) if request_config else 4096
     
-    logger.info(f"Generating {k} candidates with temperature={temperature} for category={category}")
+    logger.info(f"Generating {k} candidates with temperature={temperature}")
     
     # Prepare messages
     messages = [
@@ -175,40 +135,36 @@ def run(
     if not candidates:
         return "Error: Could not generate any candidates", 0
     
-    # Extract answers and count votes
+    # Extract and normalize answers for voting
     answer_votes = Counter()
     answer_to_responses = {}
     
     for i, candidate in enumerate(candidates):
-        answer = extract_answer_simple(candidate, category)
-        if answer:
-            # Normalize answer for voting
-            if category == "aqua_rat":
-                answer = answer.upper()  # Ensure letters are uppercase
-            elif category == "boolq":
-                answer = answer.lower()  # Ensure yes/no are lowercase
-            elif category == "gsm8k":
-                # Try to normalize numbers
-                try:
-                    answer = str(float(answer))
-                except:
-                    pass
+        # Try to extract just the answer part
+        answer = extract_final_answer(candidate)
+        
+        # Normalize for comparison
+        normalized = normalize_response(answer)
+        
+        if normalized:
+            answer_votes[normalized] += 1
+            
+            # Keep track of original responses for each normalized answer
+            if normalized not in answer_to_responses:
+                answer_to_responses[normalized] = []
+            answer_to_responses[normalized].append(candidate)
             
-            answer_votes[answer] += 1
-            if answer not in answer_to_responses:
-                answer_to_responses[answer] = []
-            answer_to_responses[answer].append(candidate)
-            logger.debug(f"Candidate {i+1}: extracted '{answer}'")
+            logger.debug(f"Candidate {i+1}: '{answer}' -> '{normalized}'")
         else:
-            logger.warning(f"Could not extract answer from candidate {i+1}")
+            logger.warning(f"Could not extract/normalize answer from candidate {i+1}")
     
     # Select the most voted answer
     if answer_votes:
-        most_common_answer, count = answer_votes.most_common(1)[0]
-        logger.info(f"Most common answer: '{most_common_answer}' with {count}/{k} votes")
+        most_common_normalized, count = answer_votes.most_common(1)[0]
+        logger.info(f"Most common answer: '{most_common_normalized}' with {count}/{k} votes")
         
-        # Return the first response that gave this answer
-        winning_responses = answer_to_responses[most_common_answer]
+        # Return the first original response that mapped to this answer
+        winning_responses = answer_to_responses[most_common_normalized]
         return winning_responses[0], total_tokens
     else:
         # If no answers could be extracted, return the first candidate