Skip to content

Commit cb0a984

Browse files
committed
Update test_evaluator_timeout.py
1 parent 16e683d commit cb0a984

File tree

1 file changed

+44
-44
lines changed

1 file changed

+44
-44
lines changed

tests/test_evaluator_timeout.py

Lines changed: 44 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ def setUp(self):
2121
# Create a test evaluation file
2222
self.test_eval_file = tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False)
2323

24-
# Write test evaluation functions
24+
# Write test evaluation functions with shorter sleep times for faster tests
2525
self.test_eval_file.write(
2626
"""
2727
import time
@@ -32,12 +32,12 @@ def evaluate(program_path):
3232
code = f.read()
3333
3434
if 'SLEEP_LONG' in code:
35-
# Sleep for a long time to trigger timeout
36-
time.sleep(30)
35+
# Sleep for a long time to trigger timeout (reduced for faster tests)
36+
time.sleep(8)
3737
return {"score": 1.0}
3838
elif 'SLEEP_SHORT' in code:
3939
# Sleep for a short time that should not timeout
40-
time.sleep(1)
40+
time.sleep(0.5)
4141
return {"score": 0.8}
4242
elif 'RAISE_ERROR' in code:
4343
# Raise an error to trigger retries
@@ -51,7 +51,7 @@ def evaluate_stage1(program_path):
5151
code = f.read()
5252
5353
if 'STAGE1_TIMEOUT' in code:
54-
time.sleep(30)
54+
time.sleep(8)
5555
return {"stage1_score": 1.0}
5656
else:
5757
return {"stage1_score": 0.7}
@@ -61,7 +61,7 @@ def evaluate_stage2(program_path):
6161
code = f.read()
6262
6363
if 'STAGE2_TIMEOUT' in code:
64-
time.sleep(30)
64+
time.sleep(8)
6565
return {"stage2_score": 1.0}
6666
else:
6767
return {"stage2_score": 0.8}
@@ -71,7 +71,7 @@ def evaluate_stage3(program_path):
7171
code = f.read()
7272
7373
if 'STAGE3_TIMEOUT' in code:
74-
time.sleep(30)
74+
time.sleep(8)
7575
return {"stage3_score": 1.0}
7676
else:
7777
return {"stage3_score": 0.9}
@@ -84,8 +84,8 @@ def tearDown(self):
8484
if os.path.exists(self.test_eval_file.name):
8585
os.unlink(self.test_eval_file.name)
8686

87-
def _create_evaluator(self, timeout=5, cascade_evaluation=False):
88-
"""Helper to create evaluator with given settings"""
87+
def _create_evaluator(self, timeout=3, cascade_evaluation=False):
88+
"""Helper to create evaluator with given settings (shorter timeout for faster tests)"""
8989
config = EvaluatorConfig()
9090
config.timeout = timeout
9191
config.max_retries = 1 # Minimal retries for faster testing
@@ -103,7 +103,7 @@ def test_fast_evaluation_completes(self):
103103
"""Test that fast evaluations complete successfully"""
104104

105105
async def run_test():
106-
evaluator = self._create_evaluator(timeout=5)
106+
evaluator = self._create_evaluator(timeout=3)
107107
program_code = "def test(): return 'fast'"
108108
start_time = time.time()
109109

@@ -112,7 +112,7 @@ async def run_test():
112112
elapsed_time = time.time() - start_time
113113

114114
# Should complete quickly
115-
self.assertLess(elapsed_time, 3.0)
115+
self.assertLess(elapsed_time, 2.0)
116116
# Should return successful result
117117
self.assertIn("score", result)
118118
self.assertEqual(result["score"], 0.5)
@@ -126,7 +126,7 @@ def test_short_evaluation_completes(self):
126126
"""Test that evaluations shorter than timeout complete successfully"""
127127

128128
async def run_test():
129-
evaluator = self._create_evaluator(timeout=5)
129+
evaluator = self._create_evaluator(timeout=3)
130130
program_code = "# SLEEP_SHORT\ndef test(): return 'short'"
131131
start_time = time.time()
132132

@@ -135,7 +135,7 @@ async def run_test():
135135
elapsed_time = time.time() - start_time
136136

137137
# Should complete within timeout
138-
self.assertLess(elapsed_time, 5)
138+
self.assertLess(elapsed_time, 3)
139139
# Should return successful result
140140
self.assertIn("score", result)
141141
self.assertEqual(result["score"], 0.8)
@@ -149,7 +149,7 @@ def test_long_evaluation_times_out(self):
149149
"""Test that long evaluations time out properly"""
150150

151151
async def run_test():
152-
evaluator = self._create_evaluator(timeout=5)
152+
evaluator = self._create_evaluator(timeout=3)
153153
program_code = "# SLEEP_LONG\ndef test(): return 'long'"
154154
start_time = time.time()
155155

@@ -158,8 +158,8 @@ async def run_test():
158158
elapsed_time = time.time() - start_time
159159

160160
# Should complete around the timeout period (allowing some margin)
161-
self.assertGreater(elapsed_time, 4)
162-
self.assertLess(elapsed_time, 8)
161+
self.assertGreater(elapsed_time, 2.5)
162+
self.assertLess(elapsed_time, 5)
163163

164164
# Should return timeout result
165165
self.assertIn("error", result)
@@ -173,7 +173,7 @@ def test_cascade_evaluation_timeout_stage1(self):
173173
"""Test timeout in cascade evaluation stage 1"""
174174

175175
async def run_test():
176-
evaluator = self._create_evaluator(timeout=5, cascade_evaluation=True)
176+
evaluator = self._create_evaluator(timeout=3, cascade_evaluation=True)
177177
program_code = "# STAGE1_TIMEOUT\ndef test(): return 'stage1_timeout'"
178178
start_time = time.time()
179179

@@ -182,8 +182,8 @@ async def run_test():
182182
elapsed_time = time.time() - start_time
183183

184184
# Should timeout around the configured timeout
185-
self.assertGreater(elapsed_time, 4)
186-
self.assertLess(elapsed_time, 8)
185+
self.assertGreater(elapsed_time, 2.5)
186+
self.assertLess(elapsed_time, 5)
187187

188188
# Should return stage1 timeout result
189189
self.assertIn("stage1_passed", result)
@@ -197,7 +197,7 @@ def test_cascade_evaluation_timeout_stage2(self):
197197
"""Test timeout in cascade evaluation stage 2"""
198198

199199
async def run_test():
200-
evaluator = self._create_evaluator(timeout=5, cascade_evaluation=True)
200+
evaluator = self._create_evaluator(timeout=3, cascade_evaluation=True)
201201
program_code = "# STAGE2_TIMEOUT\ndef test(): return 'stage2_timeout'"
202202
start_time = time.time()
203203

@@ -206,8 +206,8 @@ async def run_test():
206206
elapsed_time = time.time() - start_time
207207

208208
# Should timeout on stage 2, but stage 1 should complete first
209-
self.assertGreater(elapsed_time, 4)
210-
self.assertLess(elapsed_time, 8)
209+
self.assertGreater(elapsed_time, 2.5)
210+
self.assertLess(elapsed_time, 5)
211211

212212
# Should have stage1 result but stage2 timeout
213213
self.assertIn("stage1_score", result)
@@ -223,7 +223,7 @@ def test_cascade_evaluation_timeout_stage3(self):
223223
"""Test timeout in cascade evaluation stage 3"""
224224

225225
async def run_test():
226-
evaluator = self._create_evaluator(timeout=5, cascade_evaluation=True)
226+
evaluator = self._create_evaluator(timeout=3, cascade_evaluation=True)
227227
program_code = "# STAGE3_TIMEOUT\ndef test(): return 'stage3_timeout'"
228228
start_time = time.time()
229229

@@ -232,8 +232,8 @@ async def run_test():
232232
elapsed_time = time.time() - start_time
233233

234234
# Should timeout on stage 3, but stages 1 and 2 should complete first
235-
self.assertGreater(elapsed_time, 4)
236-
self.assertLess(elapsed_time, 8)
235+
self.assertGreater(elapsed_time, 2.5)
236+
self.assertLess(elapsed_time, 5)
237237

238238
# Should have stage1 and stage2 results but stage3 timeout
239239
self.assertIn("stage1_score", result)
@@ -252,7 +252,7 @@ def test_timeout_config_respected(self):
252252

253253
async def run_test():
254254
# Create evaluator with different timeout
255-
evaluator = self._create_evaluator(timeout=10)
255+
evaluator = self._create_evaluator(timeout=5)
256256

257257
program_code = "# SLEEP_LONG\ndef test(): return 'long'"
258258
start_time = time.time()
@@ -261,9 +261,9 @@ async def run_test():
261261

262262
elapsed_time = time.time() - start_time
263263

264-
# Should timeout around 10 seconds, not 5
265-
self.assertGreater(elapsed_time, 9)
266-
self.assertLess(elapsed_time, 13)
264+
# Should timeout around 5 seconds, not 3
265+
self.assertGreater(elapsed_time, 4.5)
266+
self.assertLess(elapsed_time, 7)
267267

268268
# Should return timeout result
269269
self.assertIn("timeout", result)
@@ -277,7 +277,7 @@ def test_multiple_retries_with_errors(self):
277277
async def run_test():
278278
# Create evaluator with more retries
279279
config = EvaluatorConfig()
280-
config.timeout = 10 # Long timeout to avoid timeout during this test
280+
config.timeout = 8 # Long timeout to avoid timeout during this test
281281
config.max_retries = 2 # 3 total attempts
282282
config.cascade_evaluation = False
283283

@@ -300,7 +300,7 @@ async def run_test():
300300
# Each attempt should fail quickly, plus 1 second sleep between retries
301301
# So total time should be around 2-3 seconds (quick failures + 2 sleep periods)
302302
self.assertGreater(elapsed_time, 1.8) # At least 2 sleep periods
303-
self.assertLess(elapsed_time, 5) # But not too long
303+
self.assertLess(elapsed_time, 4) # But not too long
304304

305305
# Should return error result after all retries fail
306306
self.assertIn("error", result)
@@ -314,7 +314,7 @@ def test_timeout_does_not_trigger_retries(self):
314314
async def run_test():
315315
# Create evaluator with retries enabled
316316
config = EvaluatorConfig()
317-
config.timeout = 3 # Short timeout
317+
config.timeout = 2 # Short timeout
318318
config.max_retries = 2 # Would allow 3 attempts if retries were triggered
319319
config.cascade_evaluation = False
320320

@@ -333,10 +333,10 @@ async def run_test():
333333

334334
elapsed_time = time.time() - start_time
335335

336-
# Should timeout only once (~3 seconds), not retry multiple times
337-
# If retries were happening, this would take ~9 seconds
338-
self.assertGreater(elapsed_time, 2.5) # At least the timeout period
339-
self.assertLess(elapsed_time, 5) # But not multiple timeout periods
336+
# Should timeout only once (~2 seconds), not retry multiple times
337+
# If retries were happening, this would take ~6 seconds
338+
self.assertGreater(elapsed_time, 1.8) # At least the timeout period
339+
self.assertLess(elapsed_time, 3.5) # But not multiple timeout periods
340340

341341
# Should return timeout result
342342
self.assertIn("timeout", result)
@@ -350,7 +350,7 @@ def test_artifacts_on_timeout(self):
350350
async def run_test():
351351
# Enable artifacts
352352
with patch.dict(os.environ, {"ENABLE_ARTIFACTS": "true"}):
353-
evaluator = self._create_evaluator(timeout=5)
353+
evaluator = self._create_evaluator(timeout=3)
354354
program_code = "# SLEEP_LONG\ndef test(): return 'long'"
355355

356356
# Execute evaluation
@@ -382,7 +382,7 @@ async def run_test():
382382
"timeout_duration", artifacts, "Artifacts should contain timeout_duration"
383383
)
384384
self.assertEqual(
385-
artifacts["timeout_duration"], 5, "timeout_duration should match config"
385+
artifacts["timeout_duration"], 3, "timeout_duration should match config"
386386
)
387387

388388
print(f"✅ Artifacts captured correctly: {list(artifacts.keys())}")
@@ -405,8 +405,8 @@ async def run_test():
405405
import time
406406
407407
def evaluate(program_path):
408-
# Simulate a very long evaluation (like the 11-hour case)
409-
time.sleep(20) # 20 seconds to test timeout
408+
# Simulate a very long evaluation (like the 11-hour case)
409+
time.sleep(6) # 6 seconds to test timeout (reduced for faster tests)
410410
return {"accReturn": 0.1, "CalmarRatio": 0.9, "combined_score": 0.82}
411411
"""
412412
)
@@ -415,7 +415,7 @@ def evaluate(program_path):
415415
try:
416416
# Configure like user's config but with shorter timeout for testing
417417
config = EvaluatorConfig()
418-
config.timeout = 5 # 5 seconds instead of 600
418+
config.timeout = 3 # 3 seconds instead of 600
419419
config.max_retries = 1
420420
config.cascade_evaluation = False
421421
config.parallel_evaluations = 1
@@ -438,9 +438,9 @@ def search_algorithm():
438438
result = await evaluator.evaluate_program(program_code, "financial_test")
439439
elapsed_time = time.time() - start_time
440440

441-
# Should timeout in ~5 seconds, not 20+ seconds
442-
self.assertLess(elapsed_time, 8)
443-
self.assertGreater(elapsed_time, 4)
441+
# Should timeout in ~3 seconds, not 6+ seconds
442+
self.assertLess(elapsed_time, 5)
443+
self.assertGreater(elapsed_time, 2.5)
444444

445445
# Should return timeout error
446446
self.assertIn("error", result)

0 commit comments

Comments
 (0)