fixes

codelion · codelion · commit 761c880692a0 · 2025-05-16T08:42:27.000+08:00
diff --git a/examples/matrix_multiplication/evaluate.py b/examples/matrix_multiplication/evaluate.py
@@ -137,14 +137,15 @@ def evaluate_correctness(matrix_multiply) -> float:
     Returns:
         Correctness score (0.0 to 1.0)
     """
-    # Define test cases
+    # Define test cases focused on smaller matrices (as in the paper)
     test_sizes = [
         (2, 2, 2),
+        (2, 3, 2),
         (3, 3, 3),
-        (4, 4, 4),
-        (10, 10, 10),
         (3, 4, 5),
-        (7, 3, 8),
+        (4, 4, 4),
+        (4, 5, 3),
+        (5, 5, 5),
     ]
     
     passed = 0
@@ -181,26 +182,41 @@ def evaluate_performance(matrix_multiply) -> float:
     Returns:
         Performance score (0.0 to 1.0)
     """
-    # Define benchmark sizes
+    # Define benchmark sizes focused on smaller matrices (as in the paper)
     benchmark_sizes = [
-        (10, 10, 10),
-        (20, 20, 20),
-        (30, 30, 30),
-        (40, 40, 40),
+        (2, 2, 2),
+        (3, 3, 3),
+        (4, 4, 4),
+        (5, 5, 5),
+        (3, 4, 5),
+        (4, 3, 5),
     ]
     
-    # Define baseline times (naive implementation)
-    # These would be measured in advance for the baseline implementation
+    # Define baseline times for the naive triple-loop implementation
+    # These are the reference times that our initial implementation should achieve
     baseline_times = {
-        "10x10x10": 0.0015,  # seconds
-        "20x20x20": 0.0120,  # seconds
-        "30x30x30": 0.0400,  # seconds
-        "40x40x40": 0.0950,  # seconds
+        "2x2x2": 0.0001,
+        "3x3x3": 0.0003,
+        "4x4x4": 0.0007,
+        "5x5x5": 0.0015,
+        "3x4x5": 0.0007,
+        "4x3x5": 0.0007,
+    }
+    
+    # Define target speedups (what we're aiming for)
+    # Based on Strassen's algorithm and other optimized approaches
+    target_speedups = {
+        "2x2x2": 1.5,  # 50% faster than naive
+        "3x3x3": 1.7,  # 70% faster than naive
+        "4x4x4": 2.0,  # 2x faster than naive
+        "5x5x5": 2.2,  # 2.2x faster than naive
+        "3x4x5": 1.7,  # 70% faster than naive
+        "4x3x5": 1.7,  # 70% faster than naive
     }
     
     # Run benchmark
     results = {}
-    runs = 3
+    runs = 5  # More runs for better accuracy
     
     for m, n, p in benchmark_sizes:
         size_key = f"{m}x{n}x{p}"
@@ -221,38 +237,56 @@ def evaluate_performance(matrix_multiply) -> float:
                 end_time = time.time()
                 times.append(end_time - start_time)
             
-            # Record average time
-            avg_time = sum(times) / runs
+            # Record average time (remove fastest and slowest)
+            times.sort()
+            if len(times) > 2:
+                times = times[1:-1]  # Remove extremes
+            avg_time = sum(times) / len(times)
             results[size_key] = avg_time
         except Exception as e:
             logger.warning(f"Error in performance test for sizes {(m, n, p)}: {str(e)}")
             results[size_key] = baseline_times[size_key] * 2  # Penalize errors
     
-    # Calculate speedups
+    # Calculate speedups relative to baseline
     speedups = {}
     for size, time_taken in results.items():
         if time_taken > 0:
             speedups[size] = baseline_times[size] / time_taken
         else:
             speedups[size] = 0
     
-    # Calculate overall score (geometric mean of speedups)
-    if not speedups:
-        return 0.0
+    # Calculate relative performance to targets
+    target_percentages = {}
+    for size, speedup in speedups.items():
+        target = target_speedups[size]
+        # If speedup is below 1.0, it's worse than baseline (score 0.0-0.2)
+        # If speedup equals baseline, score is 0.2
+        # If speedup is between baseline and target, score is 0.2-0.8
+        # If speedup reaches target, score is 0.8
+        # If speedup exceeds target, score is 0.8-1.0
+        if speedup < 1.0:
+            target_percentages[size] = 0.2 * speedup
+        elif speedup < target:
+            # Linear interpolation between 0.2 and 0.8
+            progress = (speedup - 1.0) / (target - 1.0)
+            target_percentages[size] = 0.2 + 0.6 * progress
+        else:
+            # Speedup reached or exceeded target
+            bonus = min((speedup - target) / target, 0.5)  # Cap bonus at 0.5
+            target_percentages[size] = 0.8 + 0.2 * bonus
     
-    # Remove any zero speedups
-    valid_speedups = [s for s in speedups.values() if s > 0]
-    if not valid_speedups:
+    # Calculate overall score (average of target percentages)
+    if not target_percentages:
         return 0.0
     
-    # Calculate geometric mean
-    import math
-    log_sum = sum(math.log(s) for s in valid_speedups)
-    geom_mean = math.exp(log_sum / len(valid_speedups))
+    # Calculate average score
+    avg_score = sum(target_percentages.values()) / len(target_percentages)
     
-    # Normalize to 0.0-1.0 range (assuming baseline = 1.0)
-    # Values above 1.0 indicate improvement, below 1.0 indicate regression
-    # Cap at 5.0x speedup for scoring purposes
-    normalized_score = min(geom_mean / 5.0, 1.0)
+    # Log detailed results for debugging
+    logger.info(f"Performance results:")
+    for size in benchmark_sizes:
+        size_key = f"{size[0]}x{size[1]}x{size[2]}"
+        if size_key in results and size_key in speedups and size_key in target_percentages:
+            logger.info(f"  {size_key}: time={results[size_key]:.6f}s, speedup={speedups[size_key]:.2f}x, score={target_percentages[size_key]:.2f}")
     
-    return normalized_score
+    return avg_score
diff --git a/examples/matrix_multiplication/optimize.py b/examples/matrix_multiplication/optimize.py
@@ -54,6 +54,38 @@ async def main():
     config.diff_based_evolution = True
     config.allow_full_rewrites = False
     
+    # Create specialized template for matrix multiplication
+    from openevolve.prompt.templates import TemplateManager
+    
+    # Modify prompt templates to use specialized ones for matrix multiplication
+    from openevolve.prompt.sampler import PromptSampler
+    original_build_prompt = PromptSampler.build_prompt
+    
+    def custom_build_prompt(self, *args, **kwargs):
+        # Get template key from kwargs or use default
+        template_key = kwargs.pop('template_key', 'diff_user') if 'template_key' in kwargs else 'diff_user'
+        
+        # Use specialized template for matrix multiplication
+        if template_key == 'diff_user':
+            template_key = 'matmul_diff_user'
+        
+        # Use specialized system message
+        if args and len(args) >= 1:
+            result = original_build_prompt(self, *args, **kwargs)
+            if 'system' in result:
+                template_manager = TemplateManager()
+                result['system'] = template_manager.get_template('matmul_system')
+            return result
+        else:
+            kwargs['template_key'] = template_key
+            return original_build_prompt(self, *args, **kwargs)
+    
+    # Apply the patch
+    PromptSampler.build_prompt = custom_build_prompt
+    
+    # Increase temperature for more creative solutions
+    config.llm.temperature = 0.9
+    
     # Initialize OpenEvolve with the custom config
     openevolve = OpenEvolve(
         initial_program_path=str(initial_program_path),
@@ -65,6 +97,7 @@ async def main():
     
     # Run evolution
     print(f"Starting evolution for {args.iterations} iterations...")
+    print(f"Focus on optimizing matrix multiplication for small matrices (2x2 to 5x5)")
     best_program = await openevolve.run(iterations=args.iterations)
     
     print(f"\nEvolution complete!")
diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py
@@ -21,8 +21,28 @@ def __init__(self, config: PromptConfig):
         # Initialize the random number generator
         random.seed()
         
+        # Store custom template mappings
+        self.system_template_override = None
+        self.user_template_override = None
+        
         logger.info("Initialized prompt sampler")
     
+    def set_templates(
+        self,
+        system_template: Optional[str] = None,
+        user_template: Optional[str] = None
+    ) -> None:
+        """
+        Set custom templates to use for this sampler
+        
+        Args:
+            system_template: Template name for system message
+            user_template: Template name for user message
+        """
+        self.system_template_override = system_template
+        self.user_template_override = user_template
+        logger.info(f"Set custom templates: system={system_template}, user={user_template}")
+    
     def build_prompt(
         self,
         current_program: str,
@@ -33,6 +53,7 @@ def build_prompt(
         language: str = "python",
         evolution_round: int = 0,
         allow_full_rewrite: bool = False,
+        template_key: Optional[str] = None,
     ) -> Dict[str, str]:
         """
         Build a prompt for the LLM
@@ -46,14 +67,33 @@ def build_prompt(
             language: Programming language
             evolution_round: Current evolution round
             allow_full_rewrite: Whether to allow a full rewrite
+            template_key: Optional override for template key
             
         Returns:
             Dictionary with 'system' and 'user' keys
         """
-        # Select template based on whether we want a full rewrite
-        template_key = "full_rewrite_user" if allow_full_rewrite else "diff_user"
-        user_template = self.template_manager.get_template(template_key)
-        system_template = self.config.system_message
+        # Select template based on whether we want a full rewrite (with overrides)
+        if template_key:
+            # Use explicitly provided template key
+            user_template_key = template_key
+        elif self.user_template_override:
+            # Use the override set with set_templates
+            user_template_key = self.user_template_override
+        else:
+            # Default behavior
+            user_template_key = "full_rewrite_user" if allow_full_rewrite else "diff_user"
+        
+        # Get the template
+        user_template = self.template_manager.get_template(user_template_key)
+        
+        # Use system template override if set
+        if self.system_template_override:
+            system_message = self.template_manager.get_template(self.system_template_override)
+        else:
+            system_message = self.config.system_message
+            # If system_message is a template name rather than content, get the template
+            if system_message in self.template_manager.templates:
+                system_message = self.template_manager.get_template(system_message)
         
         # Format metrics
         metrics_str = self._format_metrics(program_metrics)
@@ -82,7 +122,7 @@ def build_prompt(
         )
         
         return {
-            "system": system_template,
+            "system": system_message,
             "user": user_message,
         }
     
diff --git a/openevolve/prompt/templates.py b/openevolve/prompt/templates.py
@@ -11,6 +11,13 @@
 Focus on making targeted changes that will increase the program's performance metrics.
 """
 
+# Matrix multiplication system template
+MATMUL_SYSTEM_TEMPLATE = """You are an expert algorithm engineer specialized in numerical computing and matrix operations.
+Your task is to optimize matrix multiplication algorithms for better performance while maintaining correctness.
+Apply techniques like loop reordering, blocking, recursion, and mathematical insights to reduce the number of operations.
+Focus on making improvements for smaller matrix sizes (2x2 to 5x5) where algorithmic innovations like Strassen's algorithm can make a difference.
+"""
+
 # User message template for diff-based evolution
 DIFF_USER_TEMPLATE = """# Current Program Information
 - Current performance metrics: {metrics}
@@ -26,20 +33,86 @@
 
 # Task
 Suggest improvements to the program that will lead to better performance on the specified metrics.
-Use the SEARCH/REPLACE diff format to indicate changes:
+
+You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes:
 
 <<<<<<< SEARCH
-# Code to find and replace
+# Original code to find and replace (must match exactly)
 =======
 # New replacement code
 >>>>>>> REPLACE
 
-You can suggest multiple changes. Make sure each SEARCH section exactly matches code in the current program.
-Be thoughtful about your changes and explain your reasoning.
+Example of valid diff format:
+<<<<<<< SEARCH
+for i in range(m):
+    for j in range(p):
+        for k in range(n):
+            C[i, j] += A[i, k] * B[k, j]
+=======
+# Reorder loops for better memory access pattern
+for i in range(m):
+    for k in range(n):
+        for j in range(p):
+            C[i, j] += A[i, k] * B[k, j]
+>>>>>>> REPLACE
+
+You can suggest multiple changes. Each SEARCH section must exactly match code in the current program.
+Be thoughtful about your changes and explain your reasoning thoroughly.
 
 IMPORTANT: Do not rewrite the entire program - focus on targeted improvements.
 """
 
+# Matrix multiplication specific template
+MATMUL_DIFF_USER_TEMPLATE = """# Matrix Multiplication Optimization Task
+- Current performance metrics: {metrics}
+- Areas identified for improvement: {improvement_areas}
+
+# Program Evolution History
+{evolution_history}
+
+# Current Program
+```{language}
+{current_program}
+```
+
+# Task
+Optimize the matrix multiplication algorithm for better performance while maintaining correctness.
+Focus on smaller matrix sizes (2x2 to 5x5) where algorithmic innovations can make a significant difference.
+
+Consider these optimization strategies:
+1. Loop reordering for better cache locality
+2. Loop unrolling to reduce loop overhead
+3. Blocking/tiling for better memory access patterns
+4. Algorithmic improvements like Strassen's algorithm for recursive decomposition
+5. Special case handling for specific matrix sizes
+6. Vectorization hints and SIMD-friendly operations
+
+You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes:
+
+<<<<<<< SEARCH
+# Original code to find and replace (must match exactly)
+=======
+# New replacement code
+>>>>>>> REPLACE
+
+Example of valid diff format:
+<<<<<<< SEARCH
+for i in range(m):
+    for j in range(p):
+        for k in range(n):
+            C[i, j] += A[i, k] * B[k, j]
+=======
+# Reorder loops for better memory access pattern
+for i in range(m):
+    for k in range(n):
+        for j in range(p):
+            C[i, j] += A[i, k] * B[k, j]
+>>>>>>> REPLACE
+
+You can suggest multiple changes. Each SEARCH section must exactly match code in the current program.
+Explain the reasoning behind your optimizations.
+"""
+
 # User message template for full rewrite
 FULL_REWRITE_USER_TEMPLATE = """# Current Program Information
 - Current performance metrics: {metrics}
@@ -93,7 +166,9 @@
 # Default templates dictionary
 DEFAULT_TEMPLATES = {
     "system_message": BASE_SYSTEM_TEMPLATE,
+    "matmul_system": MATMUL_SYSTEM_TEMPLATE,
     "diff_user": DIFF_USER_TEMPLATE,
+    "matmul_diff_user": MATMUL_DIFF_USER_TEMPLATE,
     "full_rewrite_user": FULL_REWRITE_USER_TEMPLATE,
     "evolution_history": EVOLUTION_HISTORY_TEMPLATE,
     "previous_attempt": PREVIOUS_ATTEMPT_TEMPLATE,