@@ -61,8 +61,7 @@ def sample_run_result() -> RunResult:
     )
 
 
-@pytest.fixture
-def sample_eval_result() -> EvalResult:
+def create_eval_result() -> EvalResult:
     return EvalResult(
         start=datetime.datetime.now() - datetime.timedelta(minutes=5),
         end=datetime.datetime.now(),
@@ -72,9 +71,14 @@ def sample_eval_result() -> EvalResult:
 
 
 @pytest.fixture
-def sample_full_result(sample_eval_result: EvalResult) -> FullResult:
+def sample_eval_result() -> EvalResult:
+    return create_eval_result()
+
+
+@pytest.fixture
+def sample_full_result() -> FullResult:
     return FullResult(
-        success=True, error="", system=sample_system_info(), runs={"test": sample_eval_result}
+        success=True, error="", system=sample_system_info(), runs={"test": create_eval_result()}
     )
 
 
@@ -188,6 +192,76 @@ def test_make_short_report_compilation_failed(sample_eval_result: EvalResult):
     assert result == ["❌ Compilation failed"]
 
 
+def test_make_short_report_testing_failed(sample_eval_result: EvalResult):
+    sample_eval_result.run.success = False
+    sample_eval_result.run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED
+    runs = {"test": sample_eval_result}
+
+    result = make_short_report(runs)
+    assert result == ["✅ Compilation successful", "❌ Running tests failed (timeout)"]
+
+    sample_eval_result.run.success = True
+    sample_eval_result.run.passed = False
+    sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL
+    result = make_short_report(runs)
+    assert result == ["✅ Compilation successful", "❌ Testing failed"]
+
+
+def test_make_short_report_benchmarking_failed(sample_eval_result: EvalResult):
+    sample_eval_result.run.success = False
+    sample_eval_result.compilation = None
+    sample_eval_result.run.exit_code = consts.ExitCode.CUDA_FAIL
+    runs = {"benchmark": sample_eval_result}
+
+    result = make_short_report(runs, full=False)
+    assert result == ["❌ Running benchmarks failed (cuda api error)"]
+
+    sample_eval_result.run.success = True
+    sample_eval_result.run.passed = False
+    sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL
+    result = make_short_report(runs)
+    assert result == ["❌ Tests missing", "❌ Benchmarking failed"]
+
+
+def test_make_short_report_profiling_failed(sample_eval_result: EvalResult):
+    sample_eval_result.run.success = False
+    sample_eval_result.compilation = None
+    sample_eval_result.run.exit_code = consts.ExitCode.PIPE_FAILED
+    runs = {"profile": sample_eval_result}
+
+    result = make_short_report(runs, full=False)
+    assert result == ["❌ Running profile failed (internal error 111)"]
+
+    sample_eval_result.run.success = True
+    sample_eval_result.run.passed = False
+    sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL
+    result = make_short_report(runs)
+    # TODO is this actually possible? Should profiling do **any** correctness testing?
+    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Profiling failed"]
+
+
+def test_make_short_report_leaderboard_failed(sample_eval_result: EvalResult):
+    sample_eval_result.run.success = False
+    sample_eval_result.compilation = None
+    sample_eval_result.run.exit_code = consts.ExitCode.TEST_SPEC
+    runs = {"leaderboard": sample_eval_result}
+
+    result = make_short_report(runs, full=False)
+    assert result == ["❌ Running leaderboard failed (internal error 113)"]
+
+    sample_eval_result.run.success = True
+    sample_eval_result.run.passed = False
+    sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL
+    result = make_short_report(runs)
+    # TODO is this actually possible? Should a leaderboard run do **any** correctness testing?
+    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard run failed"]
+
+
+def test_make_short_report_empty():
+    result = make_short_report({})
+    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard missing"]
+
+
 def test_make_short_report_full_success():
     runs = {}
     for run_type in ["test", "benchmark", "profile", "leaderboard"]:
@@ -218,8 +292,8 @@ def test_make_short_report_full_success():
     assert result == expected
 
 
-def test_make_short_report_missing_components(sample_eval_result: EvalResult):
-    runs = {"test": sample_eval_result}
+def test_make_short_report_missing_components():
+    runs = {"test": create_eval_result()}
 
     result = make_short_report(runs, full=True)
     expected = [
@@ -382,7 +456,7 @@ def test_generate_report_test_failure(sample_full_result: FullResult):
         exit_code=consts.ExitCode.VALIDATE_FAIL,
         duration=2.1,
         stdout="Running tests...",
-        stderr="",
+        stderr="Oh no a test failed!",
         result={
             "test-count": "2",
             "test.0.status": "pass",
@@ -394,23 +468,189 @@ def test_generate_report_test_failure(sample_full_result: FullResult):
     )
 
     report = generate_report(sample_full_result)
+    from libkernelbot.report import Log, Text
+
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Text(
+            text="# Testing failed\n"
+            "Command ```bash\n"
+            "python eval.py test```\n"
+            "ran successfully in 2.10 seconds, but did not pass all tests.\n"
+        ),
+        Log(
+            header="Test log",
+            content="✅ Basic functionality\n"
+            "❌ Edge case handling\n"
+            "> Expected [1, 2, 3] but got [1, 2, 0]",
+        ),
+        Log(header="Program stderr", content="Oh no a test failed!"),
+        Log(header="Program stdout", content="Running tests..."),
+    ]
 
-    # Should have system info + test failure text + test log + stdout log
-    assert len(report.data) == 4
 
-    text_items = [item.text for item in report.data if hasattr(item, "text")]
-    test_text = next(text for text in text_items if "Testing failed" in text)
+def test_generate_report_benchmark_failure(sample_full_result: FullResult):
+    from libkernelbot.report import Log, Text
 
-    assert "python eval.py test" in test_text
-    assert "ran successfully in 2.10 seconds" in test_text
-    assert "did not pass all tests" in test_text
+    sample_full_result.runs["benchmark"] = create_eval_result()
+    report = generate_report(sample_full_result)
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Log(header="Benchmarks", content="❗ Could not find any benchmarks"),
+    ]
 
-    # Check test log contains failure details
-    log_items = [item for item in report.data if hasattr(item, "header")]
-    test_log = next(item for item in log_items if item.header == "Test log")
-    assert "✅ Basic functionality" in test_log.content
-    assert "❌ Edge case handling" in test_log.content
-    assert "Expected [1, 2, 3] but got [1, 2, 0]" in test_log.content
+    sample_full_result.runs["benchmark"].run.passed = False
+    sample_full_result.runs["benchmark"].run.result = {
+        "benchmark-count": "2",
+        "benchmark.0.status": "pass",
+        "benchmark.0.spec": "Basic functionality",
+        "benchmark.0.mean": "10.5",
+        "benchmark.0.err": "0.5",
+        "benchmark.0.best": "9.8",
+        "benchmark.0.worst": "15.2",
+        "benchmark.1.status": "fail",
+        "benchmark.1.spec": "Edge case handling",
+        "benchmark.1.error": "Expected [1, 2, 3] but got [1, 2, 0]",
+    }
+    report = generate_report(sample_full_result)
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Log(
+            header="Benchmarks",
+            content="Basic functionality\n"
+            " ⏱ 10.5 ± 0.50 ns\n"
+            " ⚡ 9.80 ns 🐌 15.2 ns\n"
+            "\n"
+            "❌ Edge case handling failed testing:\n"
+            "\n"
+            "Expected [1, 2, 3] but got [1, 2, 0]\n",
+        ),
+    ]
+
+
+def test_generate_report_leaderboard_failure(sample_full_result: FullResult):
+    from libkernelbot.report import Log, Text
+
+    sample_full_result.runs["leaderboard"] = create_eval_result()
+    report = generate_report(sample_full_result)
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Log(header="Ranked Benchmark", content="❗ Could not find any benchmarks"),
+    ]
+
+    sample_full_result.runs["leaderboard"].run.success = False
+    sample_full_result.runs["leaderboard"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED
+    sample_full_result.runs["leaderboard"].run.duration = 10.0
+
+    report = generate_report(sample_full_result)
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Text(
+            text="# Running failed\n"
+            "Command ```bash\n"
+            "./test```\n"
+            "**timed out** after 10.00 seconds."
+        ),
+        Log(header="Program stdout", content="All tests passed"),
+    ]
+
+
+def test_generate_report_profile(sample_full_result: FullResult):
+    sample_full_result.runs["profile"] = create_eval_result()
+    sample_full_result.runs["profile"].run.result = {
+        "benchmark-count": "1",
+        "benchmark.0.spec": "Benchmark",
+        "benchmark.0.report": base64.b64encode(b"Profile report", b"+*").decode("utf-8"),
+    }
+    report = generate_report(sample_full_result)
+    from libkernelbot.report import Log, Text
+
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Log(header="Profiling", content="Benchmark\n\nProfile report\n"),
+    ]
 
 
 def test_run_result_report():