@@ -46,79 +46,80 @@ def load_baseline_results() -> Optional[Dict[str, Any]]:
4646
def _default_baseline_results() -> Dict[str, Any]:
    """Return a fresh copy of the fallback baseline metrics.

    A new dict is built on every call so that callers mutating the result
    cannot affect later fallbacks.
    """
    return {
        "tokens_per_second": 180.0,
        "memory_efficiency": 0.08,
        "peak_memory_mb": 1700.0,
        "total_time": 12.0,
        "final_loss": 2.0,
    }


def run_baseline_if_needed() -> Dict[str, Any]:
    """Run the baseline training and return its metrics.

    Despite the name, the baseline is regenerated on every call; cached
    results are not consulted. The function looks for
    ``baseline_finetuning.py`` in a fixed list of directories, loads it as a
    module, and runs its ``BaselineTrainer`` with fixed settings.

    Returns:
        Whatever ``BaselineTrainer.train`` returns (the summary prints below
        call ``.get`` on it, so a dict-like result is assumed). If the script
        cannot be found, or loading/training raises, a default metrics dict
        with the keys ``tokens_per_second``, ``memory_efficiency``,
        ``peak_memory_mb``, ``total_time`` and ``final_loss`` is returned
        instead.
    """
    print("Regenerating baseline results for consistency...")

    current_dir = os.path.dirname(os.path.abspath(__file__))
    search_paths = [
        current_dir,
        os.path.dirname(current_dir),
        os.path.join(current_dir, 'examples', 'mlx_finetuning_optimization'),
        # NOTE(review): absolute path from one developer's machine; it simply
        # fails the existence check elsewhere. Consider removing it.
        '/Users/asankhaya/Documents/GitHub/openevolve/examples/mlx_finetuning_optimization',
    ]

    # First directory in the list that actually contains the script wins.
    candidates = (
        os.path.join(directory, 'baseline_finetuning.py')
        for directory in search_paths
    )
    baseline_path = next(
        (path for path in candidates if os.path.exists(path)), None
    )

    if baseline_path is None:
        print("Baseline script not found. Using consistent default baseline results...")
        return _default_baseline_results()

    # Make the script's directory importable so its sibling imports resolve;
    # remember whether we added it so that `finally` only undoes our change.
    baseline_dir = os.path.dirname(baseline_path)
    sys_path_added = baseline_dir not in sys.path
    if sys_path_added:
        sys.path.insert(0, baseline_dir)

    try:
        # Spec creation happens inside the try block so that a failed load
        # falls back to the defaults instead of propagating to the caller.
        spec = importlib.util.spec_from_file_location("baseline_finetuning", baseline_path)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load baseline module from {baseline_path}")
        baseline_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(baseline_module)

        # Fixed trainer settings; the original comments say these are meant to
        # match the evaluation run — TODO confirm against the evaluation code.
        trainer = baseline_module.BaselineTrainer("mlx-community/Qwen3-0.6B-bf16")
        trainer.config.batch_size = 2
        trainer.config.num_epochs = 1
        trainer.config.sequence_length = 128

        dataset = trainer.create_sample_dataset(num_samples=10)
        baseline_results = trainer.train(dataset, output_dir="./baseline_output")

        print("Baseline training completed with consistent parameters.")
        print(f"Baseline tokens/sec: {baseline_results.get('tokens_per_second', 0):.1f}")
        print(f"Baseline memory: {baseline_results.get('peak_memory_mb', 0):.1f}MB")
        print(f"Baseline loss: {baseline_results.get('final_loss', 0):.3f}")

    except Exception as e:
        # Deliberate best-effort: any failure while loading or training is
        # reported and replaced by the default metrics.
        print(f"Failed to run baseline: {e}")
        baseline_results = _default_baseline_results()
    finally:
        if sys_path_added and baseline_dir in sys.path:
            sys.path.remove(baseline_dir)

    return baseline_results
124125
@@ -157,15 +158,27 @@ def validate_training_metrics(optimization_results: Dict[str, Any], baseline_res
157158 opt_tokens_per_sec = optimization_results .get ("tokens_per_second" , 0.0 )
158159 baseline_tokens_per_sec = baseline_results .get ("tokens_per_second" , 1.0 )
159160
160- if opt_tokens_per_sec > baseline_tokens_per_sec * 20 : # 20x speed improvement is unrealistic
161- return False , f"Unrealistic speed improvement: { opt_tokens_per_sec :.1f} vs { baseline_tokens_per_sec :.1f} tokens/sec (>20x suspicious)"
161+ # FIXED: More lenient speed improvement detection (50x instead of 20x)
162+ # and allow for reasonable baseline variations
163+ speed_ratio = opt_tokens_per_sec / max (baseline_tokens_per_sec , 1.0 )
164+ if speed_ratio > 50 : # 50x speed improvement is unrealistic
165+ return False , f"Unrealistic speed improvement: { opt_tokens_per_sec :.1f} vs { baseline_tokens_per_sec :.1f} tokens/sec (>{ speed_ratio :.1f} x suspicious)"
166+
167+ # FIXED: Don't flag reasonable performance differences that could be due to:
168+ # - Different dataset sizes
169+ # - Different sequence lengths
170+ # - Different batch sizes
171+ # - Different hardware states
172+ if speed_ratio > 2.0 and speed_ratio <= 20.0 :
173+ print (f"ℹ️ Performance difference detected but within reasonable range: { speed_ratio :.1f} x vs baseline" )
174+ print (f" This could be due to dataset size, sequence length, or hardware differences" )
162175
163176 # Check memory efficiency improvements
164177 opt_memory_eff = optimization_results .get ("memory_efficiency" , 0.0 )
165178 baseline_memory_eff = baseline_results .get ("memory_efficiency" , 0.001 )
166179
167- if opt_memory_eff > baseline_memory_eff * 50 : # 50x memory efficiency is unrealistic
168- return False , f"Unrealistic memory efficiency: { opt_memory_eff :.4f} vs { baseline_memory_eff :.4f} (>50x suspicious)"
180+ if opt_memory_eff > baseline_memory_eff * 100 : # 100x memory efficiency is unrealistic
181+ return False , f"Unrealistic memory efficiency: { opt_memory_eff :.4f} vs { baseline_memory_eff :.4f} (>100x suspicious)"
169182
170183 # Check for infinite or NaN values
171184 metrics_to_check = ["tokens_per_second" , "memory_efficiency" , "peak_memory_mb" , "total_time" ]
0 commit comments