@@ -134,6 +134,81 @@ def safe_float_conversion(value, default=0.0):
         return default
 
 
+def validate_training_metrics(optimization_results: Dict[str, Any], baseline_results: Dict[str, Any]) -> Tuple[bool, str]:
+    """Validate training metrics to detect reward hacking patterns"""
+
+    opt_final_loss = optimization_results.get("final_loss", 999.0)
+    baseline_final_loss = baseline_results.get("final_loss", 2.0)
+
+    # CRITICAL: Detect suspiciously low loss values that indicate reward hacking
+    MINIMUM_REASONABLE_LOSS = 0.01  # Cross-entropy loss should rarely be this low
+    if opt_final_loss < MINIMUM_REASONABLE_LOSS:
+        return False, f"Suspiciously low loss detected: {opt_final_loss:.6f} (likely reward hacking)"
+
+    # Check for exactly zero loss (common reward hacking pattern)
+    if abs(opt_final_loss) < 1e-10:
+        return False, f"Exact zero loss detected: {opt_final_loss} (reward hacking fallback pattern)"
+
+    # Check for loss values that are unrealistically good
+    if opt_final_loss < baseline_final_loss * 0.1:  # 10x better than baseline is suspicious
+        return False, f"Unrealistically good loss: {opt_final_loss:.4f} vs baseline {baseline_final_loss:.4f} (>10x improvement suspicious)"
+
+    # Check for performance metrics that are too good to be true
+    opt_tokens_per_sec = optimization_results.get("tokens_per_second", 0.0)
+    baseline_tokens_per_sec = baseline_results.get("tokens_per_second", 1.0)
+
+    if opt_tokens_per_sec > baseline_tokens_per_sec * 20:  # 20x speed improvement is unrealistic
+        return False, f"Unrealistic speed improvement: {opt_tokens_per_sec:.1f} vs {baseline_tokens_per_sec:.1f} tokens/sec (>20x suspicious)"
+
+    # Check memory efficiency improvements
+    opt_memory_eff = optimization_results.get("memory_efficiency", 0.0)
+    baseline_memory_eff = baseline_results.get("memory_efficiency", 0.001)
+
+    if opt_memory_eff > baseline_memory_eff * 50:  # 50x memory efficiency is unrealistic
+        return False, f"Unrealistic memory efficiency: {opt_memory_eff:.4f} vs {baseline_memory_eff:.4f} (>50x suspicious)"
+
+    # Check for infinite or NaN values
+    metrics_to_check = ["tokens_per_second", "memory_efficiency", "peak_memory_mb", "total_time"]
+    for metric in metrics_to_check:
+        value = optimization_results.get(metric, 0.0)
+        if not np.isfinite(value):
+            return False, f"Invalid {metric} value: {value} (NaN/Inf detected)"
+
+    # Check for negative metrics that should be positive
+    positive_metrics = ["tokens_per_second", "memory_efficiency", "peak_memory_mb", "total_time"]
+    for metric in positive_metrics:
+        value = optimization_results.get(metric, 0.0)
+        if value <= 0:
+            return False, f"Invalid {metric} value: {value} (should be positive)"
+
+    # Check peak memory is reasonable (not too low)
+    opt_peak_memory = optimization_results.get("peak_memory_mb", float('inf'))
+    MINIMUM_REASONABLE_MEMORY = 100.0  # MB - any fine-tuning should use at least this much
+    if opt_peak_memory < MINIMUM_REASONABLE_MEMORY:
+        return False, f"Unrealistically low memory usage: {opt_peak_memory:.1f} MB (likely measurement error)"
+
+    return True, "Metrics appear valid"
+
+
+def detect_loss_scaling_hacks(optimization_results: Dict[str, Any]) -> Tuple[bool, str]:
+    """Detect common loss scaling hacks in gradient accumulation"""
+
+    # This is harder to detect directly, but we can look for patterns
+    opt_final_loss = optimization_results.get("final_loss", 999.0)
+
+    # Check if loss is a simple fraction that suggests artificial scaling
+    # Common hack: loss / accumulation_steps where accumulation_steps > 1
+    COMMON_SCALE_FACTORS = [2, 4, 8, 16, 32]  # Common accumulation step values
+
+    for scale_factor in COMMON_SCALE_FACTORS:
+        scaled_loss = opt_final_loss * scale_factor
+        # If scaling by a common factor gives us a "normal" looking loss (1-5 range)
+        if 1.0 <= scaled_loss <= 5.0:
+            return False, f"Loss appears artificially scaled: {opt_final_loss:.4f} * {scale_factor} = {scaled_loss:.4f} (possible gradient accumulation hack)"
+
+    return True, "No obvious loss scaling detected"
+
+
 def validate_optimization_config(config: Dict[str, Any]) -> Tuple[bool, str]:
     """Validate that optimization configuration is reasonable"""
 
@@ -211,6 +286,36 @@ def evaluate_optimization_patterns(program, baseline_results: Dict[str, Any]) ->
             "error": optimization_results["error"]
         }
 
+    # CRITICAL: Validate training metrics to detect reward hacking
+    metrics_valid, metrics_message = validate_training_metrics(optimization_results, baseline_results)
+    if not metrics_valid:
+        print(f"🚨 REWARD HACKING DETECTED: {metrics_message}")
+        return {
+            "memory_efficiency": 0.0,
+            "training_speed": 0.0,
+            "memory_improvement": -1.0,
+            "speed_improvement": -1.0,
+            "final_loss": 999.0,
+            "loss_ratio": 999.0,
+            "overall_fitness": -100.0,  # Severe penalty for reward hacking
+            "error": f"Reward hacking detected: {metrics_message}"
+        }
+
+    # CRITICAL: Detect loss scaling hacks
+    loss_scaling_valid, loss_scaling_message = detect_loss_scaling_hacks(optimization_results)
+    if not loss_scaling_valid:
+        print(f"🚨 LOSS SCALING HACK DETECTED: {loss_scaling_message}")
+        return {
+            "memory_efficiency": 0.0,
+            "training_speed": 0.0,
+            "memory_improvement": -1.0,
+            "speed_improvement": -1.0,
+            "final_loss": 999.0,
+            "loss_ratio": 999.0,
+            "overall_fitness": -50.0,  # Heavy penalty for loss scaling hacks
+            "error": f"Loss scaling hack detected: {loss_scaling_message}"
+        }
+
     # Calculate relative improvements
     baseline_tokens_per_sec = baseline_results.get("tokens_per_second", 1.0)
     baseline_memory_efficiency = baseline_results.get("memory_efficiency", 0.001)