
Commit 6b01219 ("f")
1 parent e4a1706

2 files changed: +112 −86 lines

examples/mlx_finetuning_optimization/evaluator.py (87 additions, 74 deletions)

@@ -46,79 +46,80 @@ def load_baseline_results() -> Optional[Dict[str, Any]]:
 
 def run_baseline_if_needed() -> Dict[str, Any]:
     """Run baseline training if results don't exist"""
-    baseline_results = load_baseline_results()
-
-    if baseline_results is None:
-        print("Baseline results not found. Running baseline training...")
-
-        # Find baseline_finetuning.py with robust path handling
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-        baseline_path = None
-
-        search_paths = [
-            current_dir,
-            os.path.dirname(current_dir),
-            os.path.join(current_dir, 'examples', 'mlx_finetuning_optimization'),
-            '/Users/asankhaya/Documents/GitHub/openevolve/examples/mlx_finetuning_optimization'
-        ]
-
-        for search_path in search_paths:
-            potential_path = os.path.join(search_path, 'baseline_finetuning.py')
-            if os.path.exists(potential_path):
-                baseline_path = potential_path
-                break
-
-        if baseline_path is None:
-            # Create a default baseline result for evaluation to continue
-            print("Baseline script not found. Using default baseline results...")
-            return {
-                "tokens_per_second": 150.0,  # Reasonable baseline
-                "memory_efficiency": 0.08,
-                "peak_memory_mb": 1800.0,
-                "total_time": 15.0,
-                "final_loss": 2.2
-            }
+
+    # FIXED: Always regenerate baseline for consistency
+    # The cached baseline results can be inconsistent due to different parameters
+    print("Regenerating baseline results for consistency...")
+
+    # Find baseline_finetuning.py with robust path handling
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    baseline_path = None
+
+    search_paths = [
+        current_dir,
+        os.path.dirname(current_dir),
+        os.path.join(current_dir, 'examples', 'mlx_finetuning_optimization'),
+        '/Users/asankhaya/Documents/GitHub/openevolve/examples/mlx_finetuning_optimization'
+    ]
+
+    for search_path in search_paths:
+        potential_path = os.path.join(search_path, 'baseline_finetuning.py')
+        if os.path.exists(potential_path):
+            baseline_path = potential_path
+            break
+
+    if baseline_path is None:
+        # Create a consistent default baseline result
+        print("Baseline script not found. Using consistent default baseline results...")
+        return {
+            "tokens_per_second": 180.0,  # Reasonable and consistent baseline
+            "memory_efficiency": 0.08,
+            "peak_memory_mb": 1700.0,
+            "total_time": 12.0,
+            "final_loss": 2.0
+        }
+
+    spec = importlib.util.spec_from_file_location("baseline_finetuning", baseline_path)
+    baseline_module = importlib.util.module_from_spec(spec)
+
+    # Add the directory to sys.path for imports
+    baseline_dir = os.path.dirname(baseline_path)
+    sys_path_added = False
+    if baseline_dir not in sys.path:
+        sys.path.insert(0, baseline_dir)
+        sys_path_added = True
+
+    try:
+        spec.loader.exec_module(baseline_module)
 
-        spec = importlib.util.spec_from_file_location("baseline_finetuning", baseline_path)
-        baseline_module = importlib.util.module_from_spec(spec)
+        # Create and run baseline trainer with CONSISTENT parameters
+        trainer = baseline_module.BaselineTrainer("mlx-community/Qwen3-0.6B-bf16")
+        trainer.config.batch_size = 2  # Consistent with evaluation
+        trainer.config.num_epochs = 1
+        trainer.config.sequence_length = 128  # Consistent with evaluation
 
-        # Add the directory to sys.path for imports
-        baseline_dir = os.path.dirname(baseline_path)
-        sys_path_added = False
-        if baseline_dir not in sys.path:
-            sys.path.insert(0, baseline_dir)
-            sys_path_added = True
+        # Create consistent dataset for baseline (SAME SIZE as evaluation)
+        dataset = trainer.create_sample_dataset(num_samples=10)  # Match evaluation size
+        baseline_results = trainer.train(dataset, output_dir="./baseline_output")
 
-        try:
-            spec.loader.exec_module(baseline_module)
-
-            # Create and run baseline trainer
-            trainer = baseline_module.BaselineTrainer("mlx-community/Qwen3-0.6B-bf16")
-            trainer.config.batch_size = 2  # Small batch for evaluation
-            trainer.config.num_epochs = 1
-            trainer.config.sequence_length = 256  # Match evaluation settings
-
-            # Create small dataset for baseline
-            dataset = trainer.create_sample_dataset(num_samples=20)  # Match evaluation size
-            baseline_results = trainer.train(dataset, output_dir="./baseline_output")
-
-            print("Baseline training completed.")
-
-        except Exception as e:
-            print(f"Failed to run baseline: {e}")
-            # Return default baseline results
-            baseline_results = {
-                "tokens_per_second": 150.0,
-                "memory_efficiency": 0.08,
-                "peak_memory_mb": 1800.0,
-                "total_time": 15.0,
-                "final_loss": 2.2
-            }
-        finally:
-            if sys_path_added and baseline_dir in sys.path:
-                sys.path.remove(baseline_dir)
-    else:
-        print("Using cached baseline results.")
+        print("Baseline training completed with consistent parameters.")
+        print(f"Baseline tokens/sec: {baseline_results.get('tokens_per_second', 0):.1f}")
+        print(f"Baseline memory: {baseline_results.get('peak_memory_mb', 0):.1f}MB")
+        print(f"Baseline loss: {baseline_results.get('final_loss', 0):.3f}")
+
+    except Exception as e:
+        print(f"Failed to run baseline: {e}")
+        # Return consistent default baseline results
+        baseline_results = {
+            "tokens_per_second": 180.0,
+            "memory_efficiency": 0.08,
+            "peak_memory_mb": 1700.0,
+            "total_time": 12.0,
+            "final_loss": 2.0
+        }
+    finally:
+        if sys_path_added and baseline_dir in sys.path:
+            sys.path.remove(baseline_dir)
 
     return baseline_results
 
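For orientation, a minimal sketch of how a caller might consume the dict returned by run_baseline_if_needed() (the function and its result keys come from the hunk above; the caller, the optimized-run numbers, and the comparison logic are illustrative assumptions, not part of this commit):

    # Hypothetical driver; only run_baseline_if_needed() and its keys are real.
    baseline = run_baseline_if_needed()
    optimized = {"tokens_per_second": 360.0, "peak_memory_mb": 1400.0}  # invented values

    speedup = optimized["tokens_per_second"] / max(baseline["tokens_per_second"], 1.0)
    saved_mb = baseline["peak_memory_mb"] - optimized["peak_memory_mb"]
    print(f"{speedup:.2f}x faster, {saved_mb:.0f}MB less peak memory than baseline")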

@@ -157,15 +158,27 @@ def validate_training_metrics(optimization_results: Dict[str, Any], baseline_res
     opt_tokens_per_sec = optimization_results.get("tokens_per_second", 0.0)
     baseline_tokens_per_sec = baseline_results.get("tokens_per_second", 1.0)
 
-    if opt_tokens_per_sec > baseline_tokens_per_sec * 20:  # 20x speed improvement is unrealistic
-        return False, f"Unrealistic speed improvement: {opt_tokens_per_sec:.1f} vs {baseline_tokens_per_sec:.1f} tokens/sec (>20x suspicious)"
+    # FIXED: More lenient speed improvement detection (50x instead of 20x)
+    # and allow for reasonable baseline variations
+    speed_ratio = opt_tokens_per_sec / max(baseline_tokens_per_sec, 1.0)
+    if speed_ratio > 50:  # 50x speed improvement is unrealistic
+        return False, f"Unrealistic speed improvement: {opt_tokens_per_sec:.1f} vs {baseline_tokens_per_sec:.1f} tokens/sec (>{speed_ratio:.1f}x suspicious)"
+
+    # FIXED: Don't flag reasonable performance differences that could be due to:
+    # - Different dataset sizes
+    # - Different sequence lengths
+    # - Different batch sizes
+    # - Different hardware states
+    if speed_ratio > 2.0 and speed_ratio <= 20.0:
+        print(f"ℹ️ Performance difference detected but within reasonable range: {speed_ratio:.1f}x vs baseline")
+        print(f"   This could be due to dataset size, sequence length, or hardware differences")
 
     # Check memory efficiency improvements
     opt_memory_eff = optimization_results.get("memory_efficiency", 0.0)
     baseline_memory_eff = baseline_results.get("memory_efficiency", 0.001)
 
-    if opt_memory_eff > baseline_memory_eff * 50:  # 50x memory efficiency is unrealistic
-        return False, f"Unrealistic memory efficiency: {opt_memory_eff:.4f} vs {baseline_memory_eff:.4f} (>50x suspicious)"
+    if opt_memory_eff > baseline_memory_eff * 100:  # 100x memory efficiency is unrealistic
+        return False, f"Unrealistic memory efficiency: {opt_memory_eff:.4f} vs {baseline_memory_eff:.4f} (>100x suspicious)"
 
     # Check for infinite or NaN values
     metrics_to_check = ["tokens_per_second", "memory_efficiency", "peak_memory_mb", "total_time"]
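A worked example of the relaxed validation: against the 180 tokens/sec default baseline, an optimized run at 1000 tokens/sec gives speed_ratio ≈ 5.6, which clears the 50x hard cap and only triggers the informational 2x-20x message rather than a rejection. A minimal sketch of exercising the check (validate_training_metrics is from the hunk above; the sample dicts are invented):

    opt = {"tokens_per_second": 1000.0, "memory_efficiency": 0.5,
           "peak_memory_mb": 1500.0, "total_time": 5.0}
    base = {"tokens_per_second": 180.0, "memory_efficiency": 0.08,
            "peak_memory_mb": 1700.0, "total_time": 12.0}

    # speed_ratio = 1000 / 180 ≈ 5.6 → passes the 50x cap, logs the info message
    # memory ratio = 0.5 / 0.08 = 6.25 → well under the 100x cap
    result = validate_training_metrics(opt, base)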

examples/mlx_finetuning_optimization/initial_program.py (25 additions, 12 deletions)

@@ -14,11 +14,13 @@
 
 # EVOLVE-BLOCK-START
 def memory_efficient_gradient_accumulation(model, optimizer, batch: mx.array,
-                                           accumulation_step: int, total_accumulation_steps: int,
+                                           accumulation_step: int, total_steps: int,
                                            config: Dict[str, Any]) -> Tuple[float, bool]:
     """
     Core gradient accumulation pattern - this is where most MLX errors occur.
     Evolution should focus on making this robust and memory-efficient.
+
+    FIXED: Function signature now matches baseline expectations
     """
     # Safe array indexing with dimension check
     if batch.ndim >= 2:
@@ -97,9 +99,11 @@ def get_optimization_config() -> Dict[str, Any]:
 def apply_optimizations_to_trainer(trainer, config: Dict[str, Any]):
     """Apply the evolved optimization to trainer"""
     def patched_gradient_step(model, optimizer, batch, accumulation_step, total_steps):
+        # FIXED: Ensure function signature matches what's expected
         return memory_efficient_gradient_accumulation(
             model, optimizer, batch, accumulation_step,
-            trainer.config.gradient_accumulation_steps, config
+            total_steps,  # Use total_steps (not total_accumulation_steps)
+            config
         )
 
     trainer.gradient_accumulation_step = patched_gradient_step
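The two hunks above fix a signature mismatch: the wrapper previously forwarded trainer.config.gradient_accumulation_steps where the callee expected total_steps, so the values could disagree whenever the trainer invoked the step with a different count than the config constant. A minimal sketch of the aligned calling convention (the stub bodies and the config={} placeholder are assumptions; only the parameter names and order come from the diff):

    from typing import Any, Dict, Tuple

    # Stand-in for the evolved function; same parameter order as the diff.
    def memory_efficient_gradient_accumulation(model, optimizer, batch,
                                               accumulation_step: int, total_steps: int,
                                               config: Dict[str, Any]) -> Tuple[float, bool]:
        # Placeholder body: report an optimizer update on the final accumulation step.
        return 0.0, accumulation_step == total_steps - 1

    def patched_gradient_step(model, optimizer, batch, accumulation_step, total_steps):
        # Forward the caller's total_steps instead of a config constant,
        # keeping the wrapper's and the callee's notion of "last step" aligned.
        return memory_efficient_gradient_accumulation(
            model, optimizer, batch, accumulation_step, total_steps, config={})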
@@ -109,7 +113,7 @@ def patched_gradient_step(model, optimizer, batch, accumulation_step, total_step
 def benchmark_optimization_patterns(config: Dict[str, Any],
                                     baseline_results: Dict[str, Any] = None) -> Dict[str, float]:
     """
-    Simplified benchmark focusing on core metrics
+    Simplified benchmark focusing on core metrics with CONSISTENT parameters
     """
     try:
         import sys
@@ -129,17 +133,17 @@ def benchmark_optimization_patterns(config: Dict[str, Any],
         sys.path.insert(0, os.path.dirname(baseline_path))
         spec.loader.exec_module(baseline_module)
 
-        # Create and configure trainer
+        # FIXED: Create trainer with EXACTLY same parameters as baseline
         trainer = baseline_module.BaselineTrainer("mlx-community/Qwen3-0.6B-bf16")
-        trainer.config.batch_size = 2
-        trainer.config.sequence_length = 128  # Very short for fast eval
+        trainer.config.batch_size = 2  # Match baseline
+        trainer.config.sequence_length = 128  # Match baseline - CONSISTENT!
         trainer.config.num_epochs = 1
 
         trainer.load_model()
         apply_optimizations_to_trainer(trainer, config)
 
-        # Small dataset for quick evaluation
-        dataset = trainer.create_sample_dataset(num_samples=10)
+        # FIXED: Same dataset size as baseline for fair comparison
+        dataset = trainer.create_sample_dataset(num_samples=10)  # Match baseline exactly
 
         # Measure performance
         process = psutil.Process(os.getpid())
@@ -151,20 +155,27 @@ def benchmark_optimization_patterns(config: Dict[str, Any],
         end_time = time.time()
         end_memory = process.memory_info().rss / 1024 / 1024
 
-        # Calculate metrics
+        # Calculate metrics CONSISTENTLY
         training_time = end_time - start_time
-        tokens_processed = len(dataset) * trainer.config.sequence_length
+        tokens_processed = len(dataset) * trainer.config.sequence_length  # Using consistent seq_len
         tokens_per_sec = tokens_processed / max(training_time, 0.1)
         memory_efficiency = tokens_per_sec / max(end_memory, 100)
 
+        print(f"Evaluation metrics:")
+        print(f"  Tokens processed: {tokens_processed}")
+        print(f"  Training time: {training_time:.2f}s")
+        print(f"  Tokens/sec: {tokens_per_sec:.1f}")
+        print(f"  Peak memory: {end_memory:.1f}MB")
+        print(f"  Memory efficiency: {memory_efficiency:.4f}")
+
         # Clean up
         if os.path.exists("./eval_output"):
             import shutil
             shutil.rmtree("./eval_output")
 
-        # Calculate fitness
+        # Calculate fitness based on reasonable performance
         base_fitness = 0.1
-        if tokens_per_sec > 20:
+        if tokens_per_sec > 50:  # Reasonable threshold
             base_fitness += 0.3
         if memory_efficiency > 0.02:
             base_fitness += 0.3
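To make the metric arithmetic concrete: with the consistent settings above (10 samples at sequence length 128, i.e. 1280 tokens), a hypothetical run that takes 4 seconds and ends at 800MB resident works out as follows (the timing and memory figures are invented for illustration):

    tokens_processed = 10 * 128                   # 1280 = num_samples * sequence_length
    tokens_per_sec = 1280 / max(4.0, 0.1)         # 320.0
    memory_efficiency = 320.0 / max(800.0, 100)   # 0.4 tokens/sec per MB
    # 320 > 50 and 0.4 > 0.02, so both +0.3 fitness bonuses apply:
    # base_fitness = 0.1 + 0.3 + 0.3 = 0.7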
@@ -182,6 +193,8 @@ def benchmark_optimization_patterns(config: Dict[str, Any],
 
     except Exception as e:
         print(f"Benchmark error: {e}")
+        import traceback
+        traceback.print_exc()
         return {
             "tokens_per_second": 0.0,
             "memory_efficiency": 0.0,
