Commit f430c1a

fix reward hacking
1 parent bf73e00 commit f430c1a

File tree: 2 files changed, +166 −2 lines changed

examples/mlx_finetuning_optimization/config.yaml

Lines changed: 61 additions & 2 deletions
````diff
@@ -2,7 +2,7 @@
 # Focuses on evolving memory-efficient patterns and algorithmic optimizations
 # for fine-tuning on Apple Silicon hardware
 
-max_iterations: 100
+max_iterations: 50
 checkpoint_interval: 10
 log_level: "INFO"
 
@@ -153,7 +153,56 @@ prompt:
 
 **PRIMARY GOAL: Discover memory-efficient patterns that enable faster, lower-memory fine-tuning on Mac hardware**
 
-**COMMON RUNTIME ERROR PATTERNS TO AVOID:**
+**CRITICAL REWARD HACKING PATTERNS TO AVOID:**
+
+❌ **Loss Scaling Manipulation**
+```python
+# WRONG: Artificially reducing reported loss through scaling
+return loss / total_accumulation_steps  # This makes loss appear better than it is
+
+# RIGHT: Scale loss for gradient computation but report unscaled loss
+scaled_loss_for_gradients = loss / max(total_accumulation_steps, 1)
+# Use scaled_loss_for_gradients for backward pass
+# But return the original unscaled loss for evaluation
+return float(loss), should_update  # Report actual loss, not scaled
+```
+
+❌ **Zero Loss Fallbacks**
+```python
+# WRONG: Defaulting to zero loss rewards failed computations
+loss_value = float(mx.eval(loss) or 0.0)  # 0.0 = perfect loss!
+
+# RIGHT: Use reasonable fallback or fail gracefully
+eval_result = mx.eval(loss)
+if eval_result is None:
+    raise ValueError("Loss computation failed - cannot proceed")
+loss_value = float(eval_result)
+```
+
+❌ **Unrealistic Performance Claims**
+```python
+# WRONG: Reporting impossible improvements
+# - 100x speed improvements
+# - Zero memory usage
+# - Perfect loss values (< 0.01)
+# - Infinite tokens/second
+
+# RIGHT: Report realistic, measurable improvements
+# - 10-50% speed improvements are realistic
+# - 20-40% memory reductions are achievable
+# - Loss should remain in reasonable range (0.1-10.0)
+```
+
+❌ **Measurement Manipulation**
+```python
+# WRONG: Manipulating timing or memory measurements
+fake_time = 0.001  # Impossibly fast
+fake_memory = 10  # Impossibly low memory
+
+# RIGHT: Use actual measurements
+actual_time = time.time() - start_time
+actual_memory = process.memory_info().rss / 1024 / 1024
+```
 
 ❌ **'NoneType' object is not subscriptable**
 ```python
@@ -282,6 +331,15 @@ prompt:
 - **NEVER assume function return values or object states**
 - **INCLUDE error handling and safe fallbacks in all operations**
 
+**CRITICAL: HONEST EVALUATION REQUIREMENTS**
+- **Report ACTUAL loss values, not scaled or manipulated values**
+- **Use REAL timing and memory measurements**
+- **Ensure training actually works and learns**
+- **Realistic improvement targets: 10-50% speed, 20-40% memory reduction**
+- **Loss should remain in range 0.1-10.0 for cross-entropy**
+- **Any >10x improvement claims will be automatically rejected**
+- **Zero or near-zero loss values (<0.01) will be flagged as reward hacking**
+
 **IMPLEMENTATION CONSTRAINTS:**
 - Must use MLX operations and data types
 - Cannot break existing training pipeline interfaces
@@ -296,6 +354,7 @@ prompt:
 
 # Database configuration for optimization pattern evolution
 database:
+  db_path: "./openevolve_output/program_db"  # Updated for training focus
   population_size: 80
   archive_size: 25
   num_islands: 3
````
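For context on the "Loss Scaling Manipulation" pattern above, here is a minimal sketch (not part of this commit) of a gradient-accumulation step that follows the prompt's guidance in MLX: the gradients are scaled by the accumulation count for the update, while the reported loss stays unscaled. The `model`, `optimizer`, `loss_fn`, and `batch` names and the surrounding loop structure are assumptions for illustration.

```python
# Illustrative sketch only (not from this commit). Gradients are scaled for
# accumulation, but the returned loss is the real, unscaled value.
import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_map


def accumulation_step(model, optimizer, loss_fn, batch, acc_grads, step, accum_steps=4):
    # Compute the actual loss and its gradients for this micro-batch
    loss, grads = nn.value_and_grad(model, loss_fn)(model, batch)

    # Scale the gradients (equivalent to scaling the loss for the backward pass)
    grads = tree_map(lambda g: g / max(accum_steps, 1), grads)
    acc_grads = grads if acc_grads is None else tree_map(lambda a, g: a + g, acc_grads, grads)

    should_update = (step + 1) % accum_steps == 0
    if should_update:
        optimizer.update(model, acc_grads)
        mx.eval(model.parameters(), optimizer.state)
        acc_grads = None

    # Report the unscaled loss for evaluation, as the prompt requires
    return float(loss), acc_grads, should_update
```

Scaling the gradients rather than the returned loss keeps the evaluation metric honest while preserving the averaged update.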

examples/mlx_finetuning_optimization/evaluator.py

Lines changed: 105 additions & 0 deletions
````diff
@@ -134,6 +134,81 @@ def safe_float_conversion(value, default=0.0):
         return default
 
 
+def validate_training_metrics(optimization_results: Dict[str, Any], baseline_results: Dict[str, Any]) -> Tuple[bool, str]:
+    """Validate training metrics to detect reward hacking patterns"""
+
+    opt_final_loss = optimization_results.get("final_loss", 999.0)
+    baseline_final_loss = baseline_results.get("final_loss", 2.0)
+
+    # CRITICAL: Detect suspiciously low loss values that indicate reward hacking
+    MINIMUM_REASONABLE_LOSS = 0.01  # Cross-entropy loss should rarely be this low
+    if opt_final_loss < MINIMUM_REASONABLE_LOSS:
+        return False, f"Suspiciously low loss detected: {opt_final_loss:.6f} (likely reward hacking)"
+
+    # Check for exactly zero loss (common reward hacking pattern)
+    if abs(opt_final_loss) < 1e-10:
+        return False, f"Exact zero loss detected: {opt_final_loss} (reward hacking fallback pattern)"
+
+    # Check for loss values that are unrealistically good
+    if opt_final_loss < baseline_final_loss * 0.1:  # 10x better than baseline is suspicious
+        return False, f"Unrealistically good loss: {opt_final_loss:.4f} vs baseline {baseline_final_loss:.4f} (>10x improvement suspicious)"
+
+    # Check for performance metrics that are too good to be true
+    opt_tokens_per_sec = optimization_results.get("tokens_per_second", 0.0)
+    baseline_tokens_per_sec = baseline_results.get("tokens_per_second", 1.0)
+
+    if opt_tokens_per_sec > baseline_tokens_per_sec * 20:  # 20x speed improvement is unrealistic
+        return False, f"Unrealistic speed improvement: {opt_tokens_per_sec:.1f} vs {baseline_tokens_per_sec:.1f} tokens/sec (>20x suspicious)"
+
+    # Check memory efficiency improvements
+    opt_memory_eff = optimization_results.get("memory_efficiency", 0.0)
+    baseline_memory_eff = baseline_results.get("memory_efficiency", 0.001)
+
+    if opt_memory_eff > baseline_memory_eff * 50:  # 50x memory efficiency is unrealistic
+        return False, f"Unrealistic memory efficiency: {opt_memory_eff:.4f} vs {baseline_memory_eff:.4f} (>50x suspicious)"
+
+    # Check for infinite or NaN values
+    metrics_to_check = ["tokens_per_second", "memory_efficiency", "peak_memory_mb", "total_time"]
+    for metric in metrics_to_check:
+        value = optimization_results.get(metric, 0.0)
+        if not np.isfinite(value):
+            return False, f"Invalid {metric} value: {value} (NaN/Inf detected)"
+
+    # Check for negative metrics that should be positive
+    positive_metrics = ["tokens_per_second", "memory_efficiency", "peak_memory_mb", "total_time"]
+    for metric in positive_metrics:
+        value = optimization_results.get(metric, 0.0)
+        if value <= 0:
+            return False, f"Invalid {metric} value: {value} (should be positive)"
+
+    # Check peak memory is reasonable (not too low)
+    opt_peak_memory = optimization_results.get("peak_memory_mb", float('inf'))
+    MINIMUM_REASONABLE_MEMORY = 100.0  # MB - any fine-tuning should use at least this much
+    if opt_peak_memory < MINIMUM_REASONABLE_MEMORY:
+        return False, f"Unrealistically low memory usage: {opt_peak_memory:.1f}MB (likely measurement error)"
+
+    return True, "Metrics appear valid"
+
+
+def detect_loss_scaling_hacks(optimization_results: Dict[str, Any]) -> Tuple[bool, str]:
+    """Detect common loss scaling hacks in gradient accumulation"""
+
+    # This is harder to detect directly, but we can look for patterns
+    opt_final_loss = optimization_results.get("final_loss", 999.0)
+
+    # Check if loss is a simple fraction that suggests artificial scaling
+    # Common hack: loss / accumulation_steps where accumulation_steps > 1
+    COMMON_SCALE_FACTORS = [2, 4, 8, 16, 32]  # Common accumulation step values
+
+    for scale_factor in COMMON_SCALE_FACTORS:
+        scaled_loss = opt_final_loss * scale_factor
+        # If scaling by a common factor gives us a "normal" looking loss (1-5 range)
+        if 1.0 <= scaled_loss <= 5.0:
+            return False, f"Loss appears artificially scaled: {opt_final_loss:.4f} * {scale_factor} = {scaled_loss:.4f} (possible gradient accumulation hack)"
+
+    return True, "No obvious loss scaling detected"
+
+
 def validate_optimization_config(config: Dict[str, Any]) -> Tuple[bool, str]:
     """Validate that optimization configuration is reasonable"""
 
````
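A quick illustration of how the two new validators behave, using hypothetical metric dictionaries (these values are invented for demonstration, not taken from the example's actual runs):

```python
# Hypothetical metric dicts, purely for illustration
baseline = {"final_loss": 2.0, "tokens_per_second": 100.0,
            "memory_efficiency": 0.01, "peak_memory_mb": 4000.0, "total_time": 120.0}

# A zero-loss fallback is caught by the minimum-loss check
hacked = dict(baseline, final_loss=0.0)
print(validate_training_metrics(hacked, baseline))
# -> (False, 'Suspiciously low loss detected: 0.000000 (likely reward hacking)')

# A plausible optimized run passes all checks
honest = dict(baseline, final_loss=1.8, tokens_per_second=130.0,
              memory_efficiency=0.013, peak_memory_mb=3200.0, total_time=95.0)
print(validate_training_metrics(honest, baseline))
# -> (True, 'Metrics appear valid')
```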

````diff
@@ -211,6 +286,36 @@ def evaluate_optimization_patterns(program, baseline_results: Dict[str, Any]) ->
             "error": optimization_results["error"]
         }
 
+    # CRITICAL: Validate training metrics to detect reward hacking
+    metrics_valid, metrics_message = validate_training_metrics(optimization_results, baseline_results)
+    if not metrics_valid:
+        print(f"🚨 REWARD HACKING DETECTED: {metrics_message}")
+        return {
+            "memory_efficiency": 0.0,
+            "training_speed": 0.0,
+            "memory_improvement": -1.0,
+            "speed_improvement": -1.0,
+            "final_loss": 999.0,
+            "loss_ratio": 999.0,
+            "overall_fitness": -100.0,  # Severe penalty for reward hacking
+            "error": f"Reward hacking detected: {metrics_message}"
+        }
+
+    # CRITICAL: Detect loss scaling hacks
+    loss_scaling_valid, loss_scaling_message = detect_loss_scaling_hacks(optimization_results)
+    if not loss_scaling_valid:
+        print(f"🚨 LOSS SCALING HACK DETECTED: {loss_scaling_message}")
+        return {
+            "memory_efficiency": 0.0,
+            "training_speed": 0.0,
+            "memory_improvement": -1.0,
+            "speed_improvement": -1.0,
+            "final_loss": 999.0,
+            "loss_ratio": 999.0,
+            "overall_fitness": -50.0,  # Heavy penalty for loss scaling hacks
+            "error": f"Loss scaling hack detected: {loss_scaling_message}"
+        }
+
     # Calculate relative improvements
     baseline_tokens_per_sec = baseline_results.get("tokens_per_second", 1.0)
     baseline_memory_efficiency = baseline_results.get("memory_efficiency", 0.001)
````
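One behavior of `detect_loss_scaling_hacks` worth keeping in mind when reading the penalty path above: it flags any product of the loss and a factor in [2, 4, 8, 16, 32] that lands in 1.0-5.0, and the resulting intervals [1/f, 5/f] overlap into one contiguous band, so any reported final loss between 0.03125 and 2.5 trips the check. A short demonstration with hypothetical values:

```python
# Hypothetical values illustrating the detector's flagged band
print(detect_loss_scaling_hacks({"final_loss": 0.5}))
# -> (False, 'Loss appears artificially scaled: 0.5000 * 2 = 1.0000 (possible gradient accumulation hack)')

print(detect_loss_scaling_hacks({"final_loss": 6.0}))
# -> (True, 'No obvious loss scaling detected')  # no factor maps 6.0 into 1.0-5.0
```

Note that this band overlaps the lower part of the 0.1-10.0 range the prompt describes as healthy, so genuinely good losses in that region can also be flagged.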
