@@ -80,9 +80,9 @@ def optimized_attention_kernel(
     if value.dtype != mx.float32:
         value = value.astype(mx.float32)
 
-    # Determine scale factor
+    # Determine scale factor - make sure it matches reference implementation
     if scale_strategy == "sqrt_dk":
-        scale = 1.0 / math.sqrt(d_model)
+        scale = 1.0 / math.sqrt(d_model)  # This should match reference
     elif scale_strategy == "learned":
         # Slightly different scale as a heuristic
         scale = 0.9 / math.sqrt(d_model)
@@ -92,24 +92,20 @@ def optimized_attention_kernel(
     # For now, implement basic attention to ensure correctness
     # More complex optimizations will be evolved
 
-    # Compute attention scores
-    scores = mx.matmul(query, mx.transpose(key, axes=(0, 2, 1)))
-
-    # Apply scaling
-    scores = scores * scale
+    # Compute attention scores - match reference implementation exactly
+    if scale_strategy == "sqrt_dk":
+        # Match reference exactly: scores = matmul(...) / sqrt(d_k)
+        scores = mx.matmul(query, mx.transpose(key, axes=(0, 2, 1))) / math.sqrt(d_model)
+    else:
+        # For other strategies, compute separately
+        scores = mx.matmul(query, mx.transpose(key, axes=(0, 2, 1)))
+        scores = scores * scale
 
-    # Apply mask if provided
+    # Apply mask if provided - match reference implementation
     if mask is not None:
-        # Ensure mask has the right shape and dtype
-        if mask.shape != scores.shape:
-            # Handle different mask shapes - broadcast if needed
-            if len(mask.shape) == 2:  # [seq_len, seq_len]
-                mask = mx.broadcast_to(mask[None, :, :], scores.shape)
-            elif len(mask.shape) == 3 and mask.shape[0] == 1:  # [1, seq_len, seq_len]
-                mask = mx.broadcast_to(mask, scores.shape)
-
-        mask_value = -1e9 if compute_dtype == mx.float32 else -1e4
-        scores = scores + mask * mask_value
+        # Reference implementation does: scores = scores + mask
+        # So mask should already contain the large negative values
+        scores = scores + mask
 
     # Compute attention weights (always use high precision initially)
     attention_weights = mx.softmax(scores, axis=-1)
@@ -138,18 +134,13 @@ def _chunked_attention(
     """
     # For now, fall back to standard attention to ensure correctness
     # Evolution will implement proper chunking
-    scores = mx.matmul(query, mx.transpose(key, axes=(0, 2, 1)))
-    scores = scores * scale
+    d_model = query.shape[-1]
+
+    # Match reference implementation exactly
+    scores = mx.matmul(query, mx.transpose(key, axes=(0, 2, 1))) / math.sqrt(d_model)
 
     if mask is not None:
-        if mask.shape != scores.shape:
-            if len(mask.shape) == 2:  # [seq_len, seq_len]
-                mask = mx.broadcast_to(mask[None, :, :], scores.shape)
-            elif len(mask.shape) == 3 and mask.shape[0] == 1:  # [1, seq_len, seq_len]
-                mask = mx.broadcast_to(mask, scores.shape)
-
-        mask_value = -1e9 if scores.dtype == mx.float32 else -1e4
-        scores = scores + mask * mask_value
+        scores = scores + mask
 
     attention_weights = mx.softmax(scores, axis=-1)
     output = mx.matmul(attention_weights, value)
@@ -230,7 +221,7 @@ def benchmark_attention(
 
     # Create causal mask for decoder attention
     mask = mx.triu(mx.ones((seq_len, seq_len)), k=1) * -1e9
-    mask = mx.broadcast_to(mask[None, None, :, :], (batch_size, 1, seq_len, seq_len))
+    mask = mx.broadcast_to(mask[None, :, :], (batch_size, seq_len, seq_len))
 
     # Warmup
     _ = optimized_attention_kernel(query, key, value, mask, **config)
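
The diff repeatedly aligns with a "reference implementation" that is not shown here. For context, below is a minimal sketch of the standard scaled dot-product attention it appears to match (scores divided by sqrt(d_k), additive mask that already carries large negative values). The function name `reference_attention` and the sample shapes are illustrative assumptions, not part of the PR.

import math
import mlx.core as mx

def reference_attention(query, key, value, mask=None):
    # Sketch only: scores = QK^T / sqrt(d_k); the mask is additive and
    # already holds large negative values (e.g. -1e9) at masked positions.
    d_k = query.shape[-1]
    scores = mx.matmul(query, mx.transpose(key, axes=(0, 2, 1))) / math.sqrt(d_k)
    if mask is not None:
        scores = scores + mask
    weights = mx.softmax(scores, axis=-1)
    return mx.matmul(weights, value)

# Usage mirroring the benchmark setup above: an additive causal mask
# broadcast to [batch_size, seq_len, seq_len], matching the fixed broadcast.
batch_size, seq_len, d_model = 2, 8, 16
query = mx.random.normal((batch_size, seq_len, d_model))
key = mx.random.normal((batch_size, seq_len, d_model))
value = mx.random.normal((batch_size, seq_len, d_model))
mask = mx.triu(mx.ones((seq_len, seq_len)), k=1) * -1e9
mask = mx.broadcast_to(mask[None, :, :], (batch_size, seq_len, seq_len))
out = reference_attention(query, key, value, mask)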