i

codelion · codelion · commit 8c6aaf612984 · 2025-06-11T17:53:41.000+08:00
diff --git a/examples/mlx_fine_tuning_kernels/config.yaml b/examples/mlx_fine_tuning_kernels/config.yaml
@@ -1,7 +1,13 @@
-# MLX Quantized LoRA Fusion Optimization Configuration
+# MLX Quantized LoRA Fusion Optimization Configuration - EVOLVED VERSION
 # Target: Eliminate dequantization bottleneck in MLX-LM LoRA implementation
+# 
+# EVOLUTION IMPROVEMENTS:
+# - Training iterations: 15 → 50 (better convergence)
+# - Trial count: 5 → 7 (improved statistics)
+# - Statistical significance: t > 2.0 (≈ p < 0.05) → t > 1.6 (≈ p < 0.1), i.e. less strict
+# - Starting from best evolved program (Generation 4) with advanced optimizations
 
-max_iterations: 20  # Keep existing proven count
+max_iterations: 20  # EVOLVED: Can increase to 30+ for continued evolution with improved setup
 checkpoint_interval: 5
 log_level: "INFO"
 
diff --git a/examples/mlx_fine_tuning_kernels/evaluator.py b/examples/mlx_fine_tuning_kernels/evaluator.py
@@ -129,10 +129,10 @@ def create_test_config(self, data_dir: str, adapter_dir: str, trial_seed: int) -
             "seed": trial_seed,  # Unique seed per trial
             "num_layers": 3,
             "batch_size": 2,
-            "iters": 15,  # Sufficient iterations for meaningful measurement
+            "iters": 50,  # EVOLVED: Increased from 15 for better convergence and meaningful measurement
             "val_batches": 5,
             "learning_rate": 1e-4,
-            "steps_per_report": 5,
+            "steps_per_report": 10,  # EVOLVED: Adjusted for longer training
             "steps_per_eval": 50,
             "adapter_path": adapter_dir,
             "save_every": 100,
@@ -196,11 +196,11 @@ def validate_kernel_application(self, model, expected_kernels_applied: bool) ->
 
         return True
 
-    def compare_implementations(self, evolved_kernels: Dict, num_trials: int = 5) -> Dict[str, Any]:
+    def compare_implementations(self, evolved_kernels: Dict, num_trials: int = 7) -> Dict[str, Any]:
         """
         Robust comparison between baseline and evolved implementations.
         
-        Uses 5 trials for better statistical power and rigorous validation.
+        EVOLVED: Uses 7 trials by default for improved statistical power and rigorous validation.
         """
 
         if not MLX_LM_AVAILABLE:
@@ -520,8 +520,8 @@ def assess_significance(baseline_vals, evolved_vals):
                 return "identical"
                 
             t_stat = abs(b_mean - e_mean) / pooled_se
-            # Rough significance assessment (t > 2 is approximately p < 0.05 for small samples)
-            return "significant" if t_stat > 2.0 else "not_significant"
+            # EVOLVED: Less strict significance assessment (t > 1.6 is roughly two-sided p < 0.1 for large samples; with only a few trials the true p-value is somewhat higher)
+            return "significant" if t_stat > 1.6 else "not_significant"
 
         significance = {
             "memory": assess_significance(baseline_stats["memory_delta"], evolved_stats["memory_delta"]),
@@ -600,9 +600,9 @@ def evaluate(program_path: str) -> Dict[str, Any]:
         # Setup benchmark
         benchmark = QuantizedLoRABenchmark()
 
-        # Run robust comparison with 5 trials
+        # EVOLVED: Run robust comparison with 7 trials for improved statistics
         comparison_results = benchmark.compare_implementations(
-            evolved_kernels=evolved_kernels, num_trials=5
+            evolved_kernels=evolved_kernels, num_trials=7
         )
 
         if "error" in comparison_results:
diff --git a/examples/mlx_fine_tuning_kernels/new_initial_program.py b/examples/mlx_fine_tuning_kernels/new_initial_program.py