Commit 2e8fc7f

improve test coverage
1 parent 823046b commit 2e8fc7f

4 files changed (+333, −20 lines)


unit-tests/test_report.py

Lines changed: 260 additions & 20 deletions
@@ -61,8 +61,7 @@ def sample_run_result() -> RunResult:
     )


-@pytest.fixture
-def sample_eval_result() -> EvalResult:
+def create_eval_result() -> EvalResult:
     return EvalResult(
         start=datetime.datetime.now() - datetime.timedelta(minutes=5),
         end=datetime.datetime.now(),
@@ -72,9 +71,14 @@ def sample_eval_result() -> EvalResult:


 @pytest.fixture
-def sample_full_result(sample_eval_result: EvalResult) -> FullResult:
+def sample_eval_result() -> EvalResult:
+    return create_eval_result()
+
+
+@pytest.fixture
+def sample_full_result() -> FullResult:
     return FullResult(
-        success=True, error="", system=sample_system_info(), runs={"test": sample_eval_result}
+        success=True, error="", system=sample_system_info(), runs={"test": create_eval_result()}
     )


@@ -188,6 +192,76 @@ def test_make_short_report_compilation_failed(sample_eval_result: EvalResult):
     assert result == ["❌ Compilation failed"]


+def test_make_short_report_testing_failed(sample_eval_result: EvalResult):
+    sample_eval_result.run.success = False
+    sample_eval_result.run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED
+    runs = {"test": sample_eval_result}
+
+    result = make_short_report(runs)
+    assert result == ["✅ Compilation successful", "❌ Running tests failed (timeout)"]
+
+    sample_eval_result.run.success = True
+    sample_eval_result.run.passed = False
+    sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL
+    result = make_short_report(runs)
+    assert result == ["✅ Compilation successful", "❌ Testing failed"]
+
+
+def test_make_short_report_benchmarking_failed(sample_eval_result: EvalResult):
+    sample_eval_result.run.success = False
+    sample_eval_result.compilation = None
+    sample_eval_result.run.exit_code = consts.ExitCode.CUDA_FAIL
+    runs = {"benchmark": sample_eval_result}
+
+    result = make_short_report(runs, full=False)
+    assert result == ["❌ Running benchmarks failed (cuda api error)"]
+
+    sample_eval_result.run.success = True
+    sample_eval_result.run.passed = False
+    sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL
+    result = make_short_report(runs)
+    assert result == ["❌ Tests missing", "❌ Benchmarking failed"]
+
+
+def test_make_short_report_profiling_failed(sample_eval_result: EvalResult):
+    sample_eval_result.run.success = False
+    sample_eval_result.compilation = None
+    sample_eval_result.run.exit_code = consts.ExitCode.PIPE_FAILED
+    runs = {"profile": sample_eval_result}
+
+    result = make_short_report(runs, full=False)
+    assert result == ["❌ Running profile failed (internal error 111)"]
+
+    sample_eval_result.run.success = True
+    sample_eval_result.run.passed = False
+    sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL
+    result = make_short_report(runs)
+    # TODO is this actually possible? Should profiling do **any** correctness testing?
+    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Profiling failed"]
+
+
+def test_make_short_report_leaderboard_failed(sample_eval_result: EvalResult):
+    sample_eval_result.run.success = False
+    sample_eval_result.compilation = None
+    sample_eval_result.run.exit_code = consts.ExitCode.TEST_SPEC
+    runs = {"leaderboard": sample_eval_result}
+
+    result = make_short_report(runs, full=False)
+    assert result == ["❌ Running leaderboard failed (internal error 113)"]
+
+    sample_eval_result.run.success = True
+    sample_eval_result.run.passed = False
+    sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL
+    result = make_short_report(runs)
+    # TODO is this actually possible? Should profiling do **any** correctness testing?
+    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard run failed"]
+
+
+def test_make_short_report_empty():
+    result = make_short_report({})
+    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard missing"]
+
+
 def test_make_short_report_full_success():
     runs = {}
     for run_type in ["test", "benchmark", "profile", "leaderboard"]:
@@ -218,8 +292,8 @@ def test_make_short_report_full_success():
     assert result == expected


-def test_make_short_report_missing_components(sample_eval_result: EvalResult):
-    runs = {"test": sample_eval_result}
+def test_make_short_report_missing_components():
+    runs = {"test": create_eval_result()}

     result = make_short_report(runs, full=True)
     expected = [
@@ -382,7 +456,7 @@ def test_generate_report_test_failure(sample_full_result: FullResult):
         exit_code=consts.ExitCode.VALIDATE_FAIL,
         duration=2.1,
         stdout="Running tests...",
-        stderr="",
+        stderr="Oh no a test failed!",
         result={
             "test-count": "2",
             "test.0.status": "pass",
@@ -394,23 +468,189 @@ def test_generate_report_test_failure(sample_full_result: FullResult):
     )

     report = generate_report(sample_full_result)
+    from libkernelbot.report import Log, Text
+
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Text(
+            text="# Testing failed\n"
+            "Command ```bash\n"
+            "python eval.py test```\n"
+            "ran successfully in 2.10 seconds, but did not pass all tests.\n"
+        ),
+        Log(
+            header="Test log",
+            content="✅ Basic functionality\n"
+            "❌ Edge case handling\n"
+            "> Expected [1, 2, 3] but got [1, 2, 0]",
+        ),
+        Log(header="Program stderr", content="Oh no a test failed!"),
+        Log(header="Program stdout", content="Running tests..."),
+    ]

-    # Should have system info + test failure text + test log + stdout log
-    assert len(report.data) == 4

-    text_items = [item.text for item in report.data if hasattr(item, "text")]
-    test_text = next(text for text in text_items if "Testing failed" in text)
+def test_generate_report_benchmark_failure(sample_full_result: FullResult):
+    from libkernelbot.report import Log, Text

-    assert "python eval.py test" in test_text
-    assert "ran successfully in 2.10 seconds" in test_text
-    assert "did not pass all tests" in test_text
+    sample_full_result.runs["benchmark"] = create_eval_result()
+    report = generate_report(sample_full_result)
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Log(header="Benchmarks", content="❗ Could not find any benchmarks"),
+    ]

-    # Check test log contains failure details
-    log_items = [item for item in report.data if hasattr(item, "header")]
-    test_log = next(item for item in log_items if item.header == "Test log")
-    assert "✅ Basic functionality" in test_log.content
-    assert "❌ Edge case handling" in test_log.content
-    assert "Expected [1, 2, 3] but got [1, 2, 0]" in test_log.content
+    sample_full_result.runs["benchmark"].run.passed = False
+    sample_full_result.runs["benchmark"].run.result = {
+        "benchmark-count": "2",
+        "benchmark.0.status": "pass",
+        "benchmark.0.spec": "Basic functionality",
+        "benchmark.0.mean": "10.5",
+        "benchmark.0.err": "0.5",
+        "benchmark.0.best": "9.8",
+        "benchmark.0.worst": "15.2",
+        "benchmark.1.status": "fail",
+        "benchmark.1.spec": "Edge case handling",
+        "benchmark.1.error": "Expected [1, 2, 3] but got [1, 2, 0]",
+    }
+    report = generate_report(sample_full_result)
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Log(
+            header="Benchmarks",
+            content="Basic functionality\n"
+            " ⏱ 10.5 ± 0.50 ns\n"
+            " ⚡ 9.80 ns 🐌 15.2 ns\n"
+            "\n"
+            "❌ Edge case handling failed testing:\n"
+            "\n"
+            "Expected [1, 2, 3] but got [1, 2, 0]\n",
+        ),
+    ]
+
+
+def test_generate_report_leaderboard_failure(sample_full_result: FullResult):
+    from libkernelbot.report import Log, Text
+
+    sample_full_result.runs["leaderboard"] = create_eval_result()
+    report = generate_report(sample_full_result)
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Log(header="Ranked Benchmark", content="❗ Could not find any benchmarks"),
+    ]
+
+    sample_full_result.runs["leaderboard"].run.success = False
+    sample_full_result.runs["leaderboard"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED
+    sample_full_result.runs["leaderboard"].run.duration = 10.0
+
+    report = generate_report(sample_full_result)
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Text(
+            text="# Running failed\n"
+            "Command ```bash\n"
+            "./test```\n"
+            "**timed out** after 10.00 seconds."
+        ),
+        Log(header="Program stdout", content="All tests passed"),
+    ]
+
+
+def test_generate_report_profile(sample_full_result: FullResult):
+    sample_full_result.runs["profile"] = create_eval_result()
+    sample_full_result.runs["profile"].run.result = {
+        "benchmark-count": "1",
+        "benchmark.0.spec": "Benchmark",
+        "benchmark.0.report": base64.b64encode(b"Profile report", b"+*").decode("utf-8"),
+    }
+    report = generate_report(sample_full_result)
+    from libkernelbot.report import Log, Text
+
+    assert report.data == [
+        Text(
+            text="\n"
+            "Running on:\n"
+            "* GPU: `NVIDIA RTX 4090`\n"
+            "* CPU: `Intel i9-12900K`\n"
+            "* Platform: `Linux-5.15.0`\n"
+            "* Torch: `2.0.1+cu118`\n"
+        ),
+        Log(
+            header="✅ Passed 3/3 tests",
+            content="✅ Test addition\n"
+            "> Addition works correctly\n"
+            "✅ Test multiplication\n"
+            "❌ Test division\n"
+            "> Division by zero",
+        ),
+        Log(header="Profiling", content="Benchmark\n\n Profile report\n"),
+    ]


 def test_run_result_report():
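
Note on the fixture change above: the commit turns the old sample_eval_result fixture into a plain create_eval_result() factory and keeps only a thin fixture wrapper around it, so the new tests can build extra, independent EvalResult objects (for the "benchmark", "profile" and "leaderboard" runs) instead of sharing a single fixture instance. A minimal, self-contained sketch of that pattern follows; EvalResultStub is a hypothetical stand-in, not the real libkernelbot EvalResult.

# Sketch only: EvalResultStub stands in for libkernelbot's EvalResult.
from dataclasses import dataclass, field

import pytest


@dataclass
class EvalResultStub:
    runs: dict = field(default_factory=dict)


def create_eval_result() -> EvalResultStub:
    # Plain factory: any test can call it directly, as often as needed.
    return EvalResultStub()


@pytest.fixture
def sample_eval_result() -> EvalResultStub:
    # Thin fixture wrapper so tests that still take the fixture keep working.
    return create_eval_result()


def test_results_are_independent():
    a = create_eval_result()
    b = create_eval_result()
    a.runs["test"] = "mutated"
    assert "test" not in b.runs  # each call returns a fresh object

Calling a fixture function directly has raised an error since pytest 4, which is presumably why the factory is introduced here rather than reusing the fixture inside other tests.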

unit-tests/test_submission.py

Lines changed: 15 additions & 0 deletions
@@ -1,4 +1,5 @@
 import datetime
+import re
 from unittest import mock

 import pytest
@@ -284,6 +285,16 @@ def test_prepare_submission_checks(mock_backend):
     ):
         submission.prepare_submission(req, mock_backend)

+    req.file_name = "test.py"
+    req.gpus = ["A99"]
+    with pytest.raises(
+        KernelBotError,
+        match=re.escape(
+            "GPU A99 not available for `test_board`\nChoose one of: * A100\n * V100\n"
+        ),
+    ):
+        submission.prepare_submission(req, mock_backend)
+

 def test_compute_score():
     mock_task = mock.Mock()
@@ -329,3 +340,7 @@ def test_compute_score():
     mock_result.runs["leaderboard"].run.result["benchmark-count"] = "2"
     with pytest.raises(KernelBotError, match="exactly one benchmark"):
         submission.compute_score(mock_result, mock_task, 1)
+
+    mock_task.ranking_by = "WRONG"
+    with pytest.raises(KernelBotError, match="Invalid ranking criterion WRONG"):
+        submission.compute_score(mock_result, mock_task, 1)
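
Note on the new GPU check above: pytest.raises(match=...) treats its argument as a regular expression and applies re.search to the exception text, so the expected message is wrapped in re.escape because it contains regex metacharacters such as the literal "*" bullets. A small, self-contained sketch of the same idiom, using a stand-in error class and GPU lookup rather than the real submission.prepare_submission:

# Sketch only: KernelBotErrorStub and pick_gpu are illustrative stand-ins.
import re

import pytest


class KernelBotErrorStub(Exception):
    pass


def pick_gpu(gpu: str, available: list) -> str:
    if gpu not in available:
        raise KernelBotErrorStub(
            f"GPU {gpu} not available\nChoose one of: * " + "\n * ".join(available)
        )
    return gpu


def test_unavailable_gpu_message():
    # Without re.escape, the "*" would act as a regex quantifier and the
    # search could fail; escaping keeps the comparison literal.
    with pytest.raises(
        KernelBotErrorStub,
        match=re.escape("GPU A99 not available\nChoose one of: * A100\n * V100"),
    ):
        pick_gpu("A99", ["A100", "V100"])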
