
Commit 3524fef

Migrate evaluation to uv backend (#178)
* shift evaluation to `uv` backend
* add `--limit` and fix deepeval calls
* fix checks

---------

Signed-off-by: Jack Luar <[email protected]>
1 parent f5dd23b commit 3524fef

9 files changed: +2969 -58 lines changed


evaluation/Makefile

Lines changed: 7 additions & 14 deletions
@@ -1,25 +1,19 @@
 .PHONY: init
 init:
-	@python3 -m venv .venv && \
-	. .venv/bin/activate && \
-	pip install -r requirements.txt && \
-	pip install -e .
+	@uv sync
 
 .PHONY: init-dev
-init-dev: init
-	@. .venv/bin/activate && \
-	pip install -r requirements-test.txt
+init-dev:
+	@uv sync --extra test
 
 .PHONY: format
 format:
-	@. .venv/bin/activate && \
-	ruff format
+	@uv run ruff format
 
 .PHONY: check
 check:
-	@. .venv/bin/activate && \
-	mypy . && \
-	ruff check
+	@uv run mypy . && \
+	uv run ruff check
 
 .PHONY: clean
 clean:
@@ -29,7 +23,6 @@ clean:
 .PHONY: llm-tests
 llm-tests: clean
 	@bash -c '\
-	. .venv/bin/activate && \
 	cd auto_evaluation && \
-	./llm_tests.sh 2>&1 | tee llm_tests_output.txt; \
+	uv run ./llm_tests.sh 2>&1 | tee llm_tests_output.txt; \
 	exit $${PIPESTATUS[0]}'
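
For day-to-day use the targets behave as before, just backed by uv instead of a hand-managed virtualenv. A minimal sketch of the resulting workflow, assuming uv is installed and the commands are run from the evaluation/ directory:

    make init-dev    # uv sync --extra test: creates the project .venv with runtime + test deps
    make check       # uv run mypy . && uv run ruff check, no manual activation needed
    make format      # uv run ruff format

Because each target goes through `uv run`, there is no longer a `. .venv/bin/activate` step to forget; uv resolves the project environment itself on every invocation.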

evaluation/auto_evaluation/dataset/preprocess.py

Lines changed: 8 additions & 1 deletion
@@ -27,6 +27,13 @@ def write_data(results_list: list[dict[str, Any]], results_path: str):
 
 
 def read_deepeval_cache():
+    import os
+
+    cache_file = ".deepeval/.deepeval-cache.json"
+    if not os.path.exists(cache_file):
+        print(f"Warning: {cache_file} not found. Skipping cache read.")
+        return
+
     metric_scores = {
         "Contextual Precision": [],
         "Contextual Recall": [],
@@ -37,7 +44,7 @@ def read_deepeval_cache():
         "Contextual Recall": [],
         "Hallucination": [],
     }
-    with open(".deepeval-cache.json") as f:
+    with open(cache_file) as f:
         results = json.load(f)
     for _, value in results["test_cases_lookup_map"].items():
         for metric in value["cached_metrics_data"]:

evaluation/auto_evaluation/eval_main.py

Lines changed: 7 additions & 4 deletions
@@ -81,7 +81,7 @@ def sanity_check(self):
                 continue
         raise ValueError("Sanity check failed after timeout")
 
-    def evaluate(self, retriever: str):
+    def evaluate(self, retriever: str, limit: int | None = None):
         retrieval_tcs = []
         response_times = []
 
@@ -93,7 +93,8 @@ def evaluate(self, retriever: str):
         )
 
         # retrieval test cases
-        for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")):
+        questions = self.qns[:limit] if limit else self.qns
+        for i, qa_pair in enumerate(tqdm(questions, desc="Evaluating")):
             question, ground_truth = qa_pair["question"], qa_pair["ground_truth"]
             response, response_time = self.query(retriever, question)
             response_text = response["response"]
@@ -114,7 +115,6 @@ def evaluate(self, retriever: str):
         evaluate(
             test_cases=retrieval_tcs,
             metrics=[precision, recall, hallucination],
-            print_results=False,
         )
 
         # parse deepeval results
@@ -155,11 +155,14 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]:
 )
 parser.add_argument("--dataset", type=str, help="Path to dataset to evaluate on")
 parser.add_argument("--retriever", type=str, help="Retriever to evaluate on")
+parser.add_argument(
+    "--limit", type=int, help="Limit number of questions to evaluate", default=None
+)
 args = parser.parse_args()
 
 # Pull the dataset from huggingface hub
 hf_pull.main()
 
 # Evaluate the model on the dataset
 harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url)
-harness.evaluate(args.retriever)
+harness.evaluate(args.retriever, limit=args.limit)
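
To try the new flag directly, something like the following should work from evaluation/auto_evaluation, assuming the retrieval server used by llm_tests.sh is already up on localhost:8000 and the command runs inside the uv-managed environment (the retriever name is the one listed in llm_tests.sh):

    uv run python eval_main.py \
        --base_url http://localhost:8000 \
        --dataset ./dataset/EDA_Corpus_100_Question.csv \
        --retriever agent-retriever \
        --limit 5

Note that `self.qns[:limit] if limit else self.qns` treats 0 as falsy, so `--limit 0` falls back to the full question set rather than evaluating nothing.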

evaluation/auto_evaluation/llm_tests.sh

Lines changed: 18 additions & 4 deletions
@@ -4,13 +4,27 @@ retrievers=(
   "agent-retriever"
 )
 
+# Set default limit (empty means run all)
+LIMIT=${1:-}
+
 echo "==================================="
 echo "==> Dataset: EDA Corpus"
+if [ -n "$LIMIT" ]; then
+  echo "==> Running with limit: $LIMIT questions"
+fi
 for retriever in "${retrievers[@]}" ; do
   echo "==> Running tests for $retriever"
-  python eval_main.py \
-    --base_url http://localhost:8000 \
-    --dataset ./dataset/EDA_Corpus_100_Question.csv \
-    --retriever $retriever
+  if [ -n "$LIMIT" ]; then
+    python eval_main.py \
+      --base_url http://localhost:8000 \
+      --dataset ./dataset/EDA_Corpus_100_Question.csv \
+      --retriever $retriever \
+      --limit $LIMIT
+  else
+    python eval_main.py \
+      --base_url http://localhost:8000 \
+      --dataset ./dataset/EDA_Corpus_100_Question.csv \
+      --retriever $retriever
+  fi
 done
 echo "==================================="

evaluation/auto_evaluation/src/metrics/content.py

Lines changed: 3 additions & 4 deletions
@@ -1,5 +1,4 @@
 from deepeval.metrics import (
-    FaithfulnessMetric,
     AnswerRelevancyMetric,
     BiasMetric,
     ToxicityMetric,
@@ -12,23 +11,23 @@
 TOXICITY_THRESHOLD = 0.7
 
 
-def make_answer_relevancy_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_answer_relevancy_metric(model: DeepEvalBaseLLM) -> AnswerRelevancyMetric:
     return AnswerRelevancyMetric(
         threshold=ANSRELEVANCY_THRESHOLD,
         model=model,
         include_reason=True,
     )
 
 
-def make_bias_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_bias_metric(model: DeepEvalBaseLLM) -> BiasMetric:
     return BiasMetric(
         threshold=BIAS_THRESHOLD,
         model=model,
         include_reason=True,
     )
 
 
-def make_toxicity_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_toxicity_metric(model: DeepEvalBaseLLM) -> ToxicityMetric:
     return ToxicityMetric(
         threshold=TOXICITY_THRESHOLD,
         model=model,

evaluation/pyproject.toml

Lines changed: 31 additions & 8 deletions
@@ -1,12 +1,30 @@
 [build-system]
-requires = ['setuptools>=60', 'Cython==3.0.7', 'wheel==0.42.0']
-build-backend = "setuptools.build_meta"
+requires = ["hatchling"]
+build-backend = "hatchling.build"
 
 [project]
 name = "ora-evaluation"
 version = "1.0.0"
-dynamic = ["dependencies", "optional-dependencies"]
 requires-python = ">=3.12"
+dependencies = [
+    "google-api-python-client==2.151.0",
+    "google-auth==2.30.0",
+    "google-auth-httplib2==0.2.0",
+    "google-auth-oauthlib==1.2.0",
+    "gspread==6.1.2",
+    "python-dotenv==1.0.1",
+    "requests==2.32.4",
+    "streamlit==1.37.0",
+    "deepeval==3.0.0",
+    "langchain-google-vertexai==2.0.15",
+    "asyncio==3.4.3",
+    "huggingface-hub==0.26.2",
+    "instructor[vertexai]==1.5.2",
+    "openai==1.58.1",
+    "pydantic==2.10.4",
+    "tqdm==4.67.1",
+    "plotly==5.24.1",
+]
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Intended Audience :: Developers",
@@ -16,12 +34,17 @@ classifiers = [
     "Programming Language :: Python :: 3 :: Only",
 ]
 
-[tool.setuptools.dynamic]
-dependencies = { file = ["requirements.txt"] }
-optional-dependencies = { test = { file = ["requirements-test.txt"] } }
+[project.optional-dependencies]
+test = [
+    "mypy==1.10.1",
+    "ruff==0.5.1",
+    "types-requests==2.32.0.20250602",
+    "google-api-python-client-stubs==1.28.0",
+    "types-tqdm==4.67.0.20241221",
+]
 
-[tool.setuptools.packages.find]
-include = ["auto_evaluation", "human_evaluation", "script_based_evaluation"]
+[tool.hatch.build.targets.wheel]
+packages = ["auto_evaluation", "human_evaluation", "script_based_evaluation"]
 
 [tool.mypy]
 python_version = "3.12"

evaluation/requirements-test.txt

Lines changed: 0 additions & 5 deletions
This file was deleted.

evaluation/requirements.txt

Lines changed: 0 additions & 18 deletions
This file was deleted.
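
With the dependency pins moved into pyproject.toml, requirements.txt and requirements-test.txt are no longer needed; uv resolves everything from the project table. A rough equivalence, assuming the commands are run inside evaluation/:

    uv sync               # replaces: pip install -r requirements.txt && pip install -e .
    uv sync --extra test  # additionally installs the [project.optional-dependencies] test group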
