@@ -168,3 +168,98 @@ async def test_evaluate_with_dict_list(monkeypatch, temp_dir, dummy_db_file):
     assert report["matches"] == 1
     assert report["accuracy"] == 1.0
     assert report["input_mode"] == "dict_list"
+
+
+def test_evaluate_with_list_outputs(mock_utils, mocker):
+    outputs = [{"question_id": 1, "completion": "SELECT 1"}]
+
+    report = evaluate(outputs, workdir_path=str(mock_utils))
+
+    assert report["total"] == 1
+    assert report["matches"] == 1
+    assert report["accuracy"] == 1.0
+    assert report["input_mode"] == "dict_list"
+
+
+def test_evaluate_with_jsonl_path(mock_utils, mocker):
+    # fake jsonl file
+    jsonl_path = mock_utils / "preds.jsonl"
+    jsonl_path.write_text("dummy", encoding="utf-8")
+
+    report = evaluate(str(jsonl_path), workdir_path=str(mock_utils))
+
+    assert report["total"] == 1
+    assert report["input_mode"] == "jsonl_path"
+
+
+def test_missing_workdir_and_no_questions_path_raises():
+    with pytest.raises(ValueError):
+        evaluate(
+            outputs=[{"question_id": 1, "completion": "x"}],
+            workdir_path=None,
+            questions_path=None,
+        )
+
+
+def test_missing_workdir_and_no_db_path_raises():
+    with pytest.raises(ValueError):
+        evaluate(
+            outputs=[{"question_id": 1, "completion": "x"}],
+            workdir_path=None,
+            db_path=None,
+        )
+
+
+def test_download_occurs_if_files_missing(mock_utils, mocker):
+    dl = mocker.patch("llmsql.evaluation.evaluate.download_benchmark_file")
+
+    evaluate(
+        [{"question_id": 1, "completion": "SELECT 1"}],
+        workdir_path=str(mock_utils),
+        questions_path=None,
+        db_path=None,
+    )
+
+    assert dl.call_count == 2  # questions + sqlite
+
+
+def test_saves_report_with_auto_filename(mock_utils, mocker):
+    save = mocker.patch("llmsql.evaluation.evaluate.save_json_report")
+
+    report = evaluate(
+        [{"question_id": 1, "completion": "SELECT 1"}],
+        workdir_path=str(mock_utils),
+        save_report=None,
+    )
+
+    # automatic UUID-based filename
+    args, kwargs = save.call_args
+    auto_filename = args[0]
+    assert auto_filename.startswith("evaluation_results_")
+    assert auto_filename.endswith(".json")
+
+    assert report["total"] == 1
+
+
+def test_mismatch_handling(mock_utils, mocker):
+    """Test branch where a mismatch is returned."""
+    mocker.patch(
+        "llmsql.evaluation.evaluate.evaluate_sample",
+        return_value=(
+            0,
+            {"info": "bad"},
+            {"pred_none": 0, "gold_none": 0, "sql_error": 0},
+        ),
+    )
+
+    log_mis = mocker.patch("llmsql.evaluation.evaluate.log_mismatch")
+
+    report = evaluate(
+        [{"question_id": 1, "completion": "SELECT X"}],
+        workdir_path=str(mock_utils),
+        max_mismatches=3,
+    )
+
+    assert report["matches"] == 0
+    assert len(report["mismatches"]) == 1
+    log_mis.assert_called_once()
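These tests lean on a `mock_utils` fixture that is not part of this hunk. Below is a minimal sketch of what such a fixture might look like, assuming it lives in `conftest.py`, returns a `pathlib.Path` work directory (the tests wrap it in `str(...)` and use the `/` operator on it), and stubs the evaluator's helpers so no real benchmark download or SQL execution happens. The file names, default return values, and the `save_json_report` patch are illustrative guesses, not the repository's actual fixture; only the `llmsql.evaluation.evaluate.evaluate_sample` patch target is confirmed by the tests above.

```python
# conftest.py -- hypothetical sketch, not the fixture shipped in this PR.
import json

import pytest


@pytest.fixture
def mock_utils(tmp_path, mocker):
    """Temp workdir plus patched evaluator helpers (names are guesses)."""
    # Guessed file names: the point is that questions/db files already exist
    # in the workdir, so evaluate() normally skips downloading them.
    questions = [{"question_id": 1, "question": "dummy", "sql": "SELECT 1"}]
    (tmp_path / "questions.jsonl").write_text(
        "\n".join(json.dumps(q) for q in questions), encoding="utf-8"
    )
    (tmp_path / "benchmark.sqlite").touch()

    # Default: every sample counts as a match; tests such as
    # test_mismatch_handling re-patch this with their own return value.
    mocker.patch(
        "llmsql.evaluation.evaluate.evaluate_sample",
        return_value=(1, None, {"pred_none": 0, "gold_none": 0, "sql_error": 0}),
    )
    # Keep report writing in memory so nothing is written outside tmp_path.
    mocker.patch("llmsql.evaluation.evaluate.save_json_report")

    return tmp_path
```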