@@ -547,13 +547,17 @@ def _create_maximum_context_prompt(self) -> str:
         return extended_context
 
     def run_single_benchmark(self, config: BenchmarkConfig) -> BenchmarkResult:
-        """Run a single benchmark configuration"""
+        """Run a single benchmark configuration with proper warmup"""
         print(f"\n{'=' * 60}")
         print(f"Running: {config.name}")
         print(f"Description: {config.description}")
         print(f"Max tokens: {config.max_tokens}")
         print(f"{'=' * 60}")
 
+        # Performance measurement parameters
+        WARMUP_RUNS = 2  # Warmup runs to eliminate cold start effects
+        MEASUREMENT_RUNS = 3  # Multiple measurement runs for reliability
+
         # Create temporary prompt file
         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
            f.write(config.prompt)
@@ -571,102 +575,183 @@ def run_single_benchmark(self, config: BenchmarkConfig) -> BenchmarkResult:
                 config.prompt,
                 "--max-tokens",
                 str(config.max_tokens),
-                # Remove --verbose flag as it requires an argument in newer mlx-lm
             ]
 
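The hunk enters the `cmd` list partway through, so only its tail is visible. For orientation, a hedged sketch of the invocation this list presumably assembles is below; the head of the list sits above the hunk, so the model id and variable names here are hypothetical placeholders, not the author's actual values:

```python
# Sketch under assumptions: mlx-lm exposes `python -m mlx_lm.generate`
# with --model/--prompt/--max-tokens flags; these values are made up.
model_path = "mlx-community/Example-Model-4bit"  # hypothetical model id
prompt = "Hello"
max_tokens = 64

cmd = [
    "python", "-m", "mlx_lm.generate",
    "--model", model_path,
    "--prompt", prompt,
    "--max-tokens", str(max_tokens),
]
```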
-            # Record memory before
+            # Clear MLX cache before starting
+            print(f"🧹 Clearing MLX cache...")
             mx.clear_cache()
-            initial_memory = mx.get_active_memory()
-
-            # Run benchmark
-            start_time = time.perf_counter()
-            result = subprocess.run(
-                cmd, capture_output=True, text=True, timeout=300  # 5 minute timeout
-            )
-            end_time = time.perf_counter()
-
-            if result.returncode != 0:
-                print(f"Error running benchmark: {result.stderr}")
-                raise RuntimeError(f"Benchmark failed: {result.stderr}")
-
-            # Parse output
-            output_lines = result.stdout.strip().split("\n")
-
-            # Find the generated text (between ========== markers)
-            generated_text = ""
-            in_generation = False
-            prompt_tokens = 0
-            generation_tokens = 0
-            prompt_speed = 0.0
-            generation_speed = 0.0
-            peak_memory_str = ""
-
-            for line in output_lines:
-                if line.strip() == "==========":
-                    in_generation = not in_generation
-                elif in_generation:
-                    generated_text += line + "\n"
-                elif "Prompt:" in line and "tokens-per-sec" in line:
-                    # Parse: "Prompt: 13 tokens, 310.367 tokens-per-sec"
-                    parts = line.split(",")
-                    prompt_tokens = int(parts[0].split(":")[1].strip().split()[0])
-                    prompt_speed = float(parts[1].strip().split()[0])
-                elif "Generation:" in line and "tokens-per-sec" in line:
-                    # Parse: "Generation: 468 tokens, 69.860 tokens-per-sec"
-                    parts = line.split(",")
-                    generation_tokens = int(parts[0].split(":")[1].strip().split()[0])
-                    generation_speed = float(parts[1].strip().split()[0])
-                elif "Peak memory:" in line:
-                    peak_memory_str = line.split(":")[1].strip()
-
-            # Parse peak memory
-            peak_memory_gb = 0.0
-            if peak_memory_str:
-                if "GB" in peak_memory_str:
-                    peak_memory_gb = float(peak_memory_str.replace("GB", "").strip())
-                elif "MB" in peak_memory_str:
-                    peak_memory_gb = float(peak_memory_str.replace("MB", "").strip()) / 1024
-
-            # Calculate overall tokens per second
-            total_tokens = generation_tokens
-            total_time = end_time - start_time
-            total_tokens_per_sec = total_tokens / total_time if total_time > 0 else 0
-
-            # Create result
-            benchmark_result = BenchmarkResult(
+
+            # Warmup runs - don't measure these
+            print(f"🔥 Running {WARMUP_RUNS} warmup runs to eliminate cold start effects...")
+            for i in range(WARMUP_RUNS):
+                try:
+                    print(f"  Warmup run {i + 1}/{WARMUP_RUNS}...")
+                    warmup_result = subprocess.run(
+                        cmd, capture_output=True, text=True, timeout=300
+                    )
+                    if warmup_result.returncode != 0:
+                        print(f"  ⚠️ Warmup run {i + 1} failed: {warmup_result.stderr[:100]}...")
+                    else:
+                        print(f"  ✅ Warmup run {i + 1} completed")
+
+                    # Clear cache between warmup runs
+                    mx.clear_cache()
+
+                except subprocess.TimeoutExpired:
+                    print(f"  ⏰ Warmup run {i + 1} timed out")
+                except Exception as e:
+                    print(f"  ❌ Warmup run {i + 1} error: {e}")
+
+            print(f"📊 Running {MEASUREMENT_RUNS} measurement runs...")
+
+            # Measurement runs
+            successful_results = []
+            for run_idx in range(MEASUREMENT_RUNS):
+                try:
+                    print(f"  Measurement run {run_idx + 1}/{MEASUREMENT_RUNS}...")
+
+                    # Clear cache before each measurement run for consistency
+                    mx.clear_cache()
+                    initial_memory = mx.get_active_memory()
+
+                    # Run benchmark
+                    start_time = time.perf_counter()
+                    result = subprocess.run(
+                        cmd, capture_output=True, text=True, timeout=300
+                    )
+                    end_time = time.perf_counter()
+
+                    if result.returncode != 0:
+                        print(f"  ❌ Measurement run {run_idx + 1} failed: {result.stderr[:100]}...")
+                        continue
+
+                    # Parse output
+                    parsed_result = self._parse_benchmark_output(
+                        result.stdout, config, end_time - start_time
+                    )
+
+                    if parsed_result:
+                        successful_results.append(parsed_result)
+                        print(f"  ✅ Run {run_idx + 1}: {parsed_result.decode_tokens_per_sec:.1f} tokens/sec")
+                    else:
+                        print(f"  ❌ Run {run_idx + 1}: Failed to parse output")
+
+                except subprocess.TimeoutExpired:
+                    print(f"  ⏰ Measurement run {run_idx + 1} timed out")
+                except Exception as e:
+                    print(f"  ❌ Measurement run {run_idx + 1} error: {e}")
+
+            # Require at least 2 successful runs for reliable results
+            if len(successful_results) < 2:
+                print(f"❌ Only {len(successful_results)}/{MEASUREMENT_RUNS} measurement runs succeeded")
+                print(f"❌ Need at least 2 successful runs for reliable results")
+                raise RuntimeError(f"Insufficient successful runs: {len(successful_results)}/{MEASUREMENT_RUNS}")
+
+            # Calculate statistics from multiple runs
+            decode_speeds = [r.decode_tokens_per_sec for r in successful_results]
+            prefill_speeds = [r.prefill_tokens_per_sec for r in successful_results]
+            memories = [r.peak_memory_gb for r in successful_results]
+            times = [r.total_time_sec for r in successful_results]
+
+            # Use median for more robust results (less sensitive to outliers)
+            final_result = BenchmarkResult(
                 name=config.name,
-                prompt_tokens=prompt_tokens,
-                generated_tokens=generation_tokens,
-                prefill_tokens_per_sec=prompt_speed,
-                decode_tokens_per_sec=generation_speed,
-                total_tokens_per_sec=total_tokens_per_sec,
-                peak_memory_gb=peak_memory_gb,
-                total_time_sec=total_time,
+                prompt_tokens=int(np.median([r.prompt_tokens for r in successful_results])),
+                generated_tokens=int(np.median([r.generated_tokens for r in successful_results])),
+                prefill_tokens_per_sec=float(np.median(prefill_speeds)),
+                decode_tokens_per_sec=float(np.median(decode_speeds)),
+                total_tokens_per_sec=float(np.median([r.total_tokens_per_sec for r in successful_results])),
+                peak_memory_gb=float(np.median(memories)),
+                total_time_sec=float(np.median(times)),
                 prompt=config.prompt[:200] + "..." if len(config.prompt) > 200 else config.prompt,
-                generated_text=(
-                    generated_text.strip()[:200] + "..."
-                    if len(generated_text.strip()) > 200
-                    else generated_text.strip()
-                ),
+                generated_text=successful_results[0].generated_text,  # Use first result's text
             )
 
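The added code leans on `np` (numpy), which is presumably imported at the top of the file and not visible in this hunk. A minimal standalone sketch of why the diff's comment prefers the median over the mean when one run is slowed by background load:

```python
import numpy as np

# Three clean runs plus one outlier dragged down by background load
decode_speeds = [70.0, 70.4, 68.8, 41.2]

print(np.mean(decode_speeds))    # ≈62.6 — pulled toward the outlier
print(np.median(decode_speeds))  # ≈69.4 — stays near a typical run
```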
-            # Print results
-            print(f"\nResults:")
-            print(f"  Prompt tokens: {prompt_tokens}")
-            print(f"  Generated tokens: {generation_tokens}")
-            print(f"  Prefill speed: {prompt_speed:.2f} tokens/sec")
-            print(f"  Decode speed: {generation_speed:.2f} tokens/sec")
-            print(f"  Overall speed: {total_tokens_per_sec:.2f} tokens/sec")
-            print(f"  Peak memory: {peak_memory_gb:.3f} GB")
-            print(f"  Total time: {total_time:.2f} seconds")
-
-            return benchmark_result
+            # Print final results with statistics
+            print(f"\n📈 Final Results (median of {len(successful_results)} runs):")
+            print(f"  Prompt tokens: {final_result.prompt_tokens}")
+            print(f"  Generated tokens: {final_result.generated_tokens}")
+            print(f"  Prefill speed: {final_result.prefill_tokens_per_sec:.2f} tokens/sec")
+            print(f"  Decode speed: {final_result.decode_tokens_per_sec:.2f} tokens/sec (σ={np.std(decode_speeds):.2f})")
+            print(f"  Overall speed: {final_result.total_tokens_per_sec:.2f} tokens/sec")
+            print(f"  Peak memory: {final_result.peak_memory_gb:.3f} GB")
+            print(f"  Total time: {final_result.total_time_sec:.2f} seconds")
+
+            if len(decode_speeds) > 1:
+                print(f"  Performance consistency: {np.std(decode_speeds)/np.mean(decode_speeds)*100:.1f}% CV")
+
+            return final_result
 
         finally:
             # Clean up
             if os.path.exists(prompt_file):
                 os.unlink(prompt_file)
 
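The `σ` and `% CV` figures printed at the end of `run_single_benchmark` are plain numpy statistics over the per-run decode speeds; the coefficient of variation is just the standard deviation normalized by the mean. A quick self-contained check (note numpy's `np.std` defaults to the population standard deviation, ddof=0):

```python
import numpy as np

decode_speeds = [69.9, 70.4, 68.7]         # per-run decode tokens/sec
sigma = np.std(decode_speeds)              # population std, numpy's default
cv = sigma / np.mean(decode_speeds) * 100  # coefficient of variation in %
print(f"σ={sigma:.2f}, {cv:.1f}% CV")      # ≈ σ=0.71, 1.0% CV
```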
+    def _parse_benchmark_output(
+        self, stdout: str, config: BenchmarkConfig, total_time: float
+    ) -> Optional[BenchmarkResult]:
+        """Parse mlx-lm output to extract performance metrics"""
+        output_lines = stdout.strip().split("\n")
+
+        # Find the generated text (between ========== markers)
+        generated_text = ""
+        in_generation = False
+        prompt_tokens = 0
+        generation_tokens = 0
+        prompt_speed = 0.0
+        generation_speed = 0.0
+        peak_memory_str = ""
+
+        for line in output_lines:
+            if line.strip() == "==========":
+                in_generation = not in_generation
+            elif in_generation:
+                generated_text += line + "\n"
+            elif "Prompt:" in line and "tokens-per-sec" in line:
+                # Parse: "Prompt: 13 tokens, 310.367 tokens-per-sec"
+                parts = line.split(",")
+                prompt_tokens = int(parts[0].split(":")[1].strip().split()[0])
+                prompt_speed = float(parts[1].strip().split()[0])
+            elif "Generation:" in line and "tokens-per-sec" in line:
+                # Parse: "Generation: 468 tokens, 69.860 tokens-per-sec"
+                parts = line.split(",")
+                generation_tokens = int(parts[0].split(":")[1].strip().split()[0])
+                generation_speed = float(parts[1].strip().split()[0])
+            elif "Peak memory:" in line:
+                peak_memory_str = line.split(":")[1].strip()
+
+        # Parse peak memory
+        peak_memory_gb = 0.0
+        if peak_memory_str:
+            if "GB" in peak_memory_str:
+                peak_memory_gb = float(peak_memory_str.replace("GB", "").strip())
+            elif "MB" in peak_memory_str:
+                peak_memory_gb = float(peak_memory_str.replace("MB", "").strip()) / 1024
+
+        # Validate we got meaningful results
+        if generation_tokens == 0 or generation_speed == 0:
+            return None
+
+        # Calculate overall tokens per second
+        total_tokens_per_sec = generation_tokens / total_time if total_time > 0 else 0
+
+        return BenchmarkResult(
+            name=config.name,
+            prompt_tokens=prompt_tokens,
+            generated_tokens=generation_tokens,
+            prefill_tokens_per_sec=prompt_speed,
+            decode_tokens_per_sec=generation_speed,
+            total_tokens_per_sec=total_tokens_per_sec,
+            peak_memory_gb=peak_memory_gb,
+            total_time_sec=total_time,
+            prompt=config.prompt[:200] + "..." if len(config.prompt) > 200 else config.prompt,
+            generated_text=(
+                generated_text.strip()[:200] + "..."
+                if len(generated_text.strip()) > 200
+                else generated_text.strip()
+            ),
+        )
+
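To make the split chains in `_parse_benchmark_output` concrete, here is the prompt-line parse traced on the sample quoted in its comment (the generation line parses identically). The method also assumes `Optional` is imported from `typing` at the top of the file, which this hunk does not show:

```python
line = "Prompt: 13 tokens, 310.367 tokens-per-sec"

parts = line.split(",")  # ['Prompt: 13 tokens', ' 310.367 tokens-per-sec']

prompt_tokens = int(parts[0].split(":")[1].strip().split()[0])  # '13' -> 13
prompt_speed = float(parts[1].strip().split()[0])               # '310.367' -> 310.367

print(prompt_tokens, prompt_speed)  # 13 310.367
```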
     def run_full_benchmark_suite(self) -> Dict:
         """Run the complete benchmark suite"""
         print(f"\n{'=' * 80}")