
Commit 3524fef

Migrate evaluation to uv backend (#178)
* shift evaluation to `uv` backend
* add `--limit` and fix deepeval calls
* fix checks

---------

Signed-off-by: Jack Luar <[email protected]>
1 parent f5dd23b commit 3524fef

9 files changed: +2969 -58 lines changed


evaluation/Makefile

Lines changed: 7 additions & 14 deletions
@@ -1,25 +1,19 @@
 .PHONY: init
 init:
-	@python3 -m venv .venv && \
-	. .venv/bin/activate && \
-	pip install -r requirements.txt && \
-	pip install -e .
+	@uv sync
 
 .PHONY: init-dev
-init-dev: init
-	@. .venv/bin/activate && \
-	pip install -r requirements-test.txt
+init-dev:
+	@uv sync --extra test
 
 .PHONY: format
 format:
-	@. .venv/bin/activate && \
-	ruff format
+	@uv run ruff format
 
 .PHONY: check
 check:
-	@. .venv/bin/activate && \
-	mypy . && \
-	ruff check
+	@uv run mypy . && \
+	uv run ruff check
 
 .PHONY: clean
 clean:
@@ -29,7 +23,6 @@ clean:
 .PHONY: llm-tests
 llm-tests: clean
 	@bash -c '\
-	. .venv/bin/activate && \
 	cd auto_evaluation && \
-	./llm_tests.sh 2>&1 | tee llm_tests_output.txt; \
+	uv run ./llm_tests.sh 2>&1 | tee llm_tests_output.txt; \
 	exit $${PIPESTATUS[0]}'
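
For day-to-day use the targets behave as before, just backed by uv instead of a hand-managed virtualenv. A minimal sketch of the resulting workflow, assuming uv is installed and the commands are run from the evaluation/ directory:

    make init-dev    # uv sync --extra test: creates the project .venv with runtime + test deps
    make check       # uv run mypy . && uv run ruff check, no manual activation needed
    make format      # uv run ruff format

Because each target goes through `uv run`, there is no longer a `. .venv/bin/activate` step to forget; uv resolves the project environment itself on every invocation.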

evaluation/auto_evaluation/dataset/preprocess.py

Lines changed: 8 additions & 1 deletion
@@ -27,6 +27,13 @@ def write_data(results_list: list[dict[str, Any]], results_path: str):
 
 
 def read_deepeval_cache():
+    import os
+
+    cache_file = ".deepeval/.deepeval-cache.json"
+    if not os.path.exists(cache_file):
+        print(f"Warning: {cache_file} not found. Skipping cache read.")
+        return
+
     metric_scores = {
         "Contextual Precision": [],
         "Contextual Recall": [],
@@ -37,7 +44,7 @@ def read_deepeval_cache():
         "Contextual Recall": [],
         "Hallucination": [],
     }
-    with open(".deepeval-cache.json") as f:
+    with open(cache_file) as f:
         results = json.load(f)
     for _, value in results["test_cases_lookup_map"].items():
         for metric in value["cached_metrics_data"]:

evaluation/auto_evaluation/eval_main.py

Lines changed: 7 additions & 4 deletions
@@ -81,7 +81,7 @@ def sanity_check(self):
                 continue
         raise ValueError("Sanity check failed after timeout")
 
-    def evaluate(self, retriever: str):
+    def evaluate(self, retriever: str, limit: int | None = None):
         retrieval_tcs = []
         response_times = []
 
@@ -93,7 +93,8 @@ def evaluate(self, retriever: str):
         )
 
         # retrieval test cases
-        for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")):
+        questions = self.qns[:limit] if limit else self.qns
+        for i, qa_pair in enumerate(tqdm(questions, desc="Evaluating")):
             question, ground_truth = qa_pair["question"], qa_pair["ground_truth"]
             response, response_time = self.query(retriever, question)
             response_text = response["response"]
@@ -114,7 +115,6 @@ def evaluate(self, retriever: str):
         evaluate(
             test_cases=retrieval_tcs,
             metrics=[precision, recall, hallucination],
-            print_results=False,
         )
 
         # parse deepeval results
@@ -155,11 +155,14 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]:
 )
 parser.add_argument("--dataset", type=str, help="Path to dataset to evaluate on")
 parser.add_argument("--retriever", type=str, help="Retriever to evaluate on")
+parser.add_argument(
+    "--limit", type=int, help="Limit number of questions to evaluate", default=None
+)
 args = parser.parse_args()
 
 # Pull the dataset from huggingface hub
 hf_pull.main()
 
 # Evaluate the model on the dataset
 harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url)
-harness.evaluate(args.retriever)
+harness.evaluate(args.retriever, limit=args.limit)
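
To try the new flag directly, something like the following should work from evaluation/auto_evaluation, assuming the retrieval server used by llm_tests.sh is already up on localhost:8000 and the command runs inside the uv-managed environment (the retriever name is the one listed in llm_tests.sh):

    uv run python eval_main.py \
        --base_url http://localhost:8000 \
        --dataset ./dataset/EDA_Corpus_100_Question.csv \
        --retriever agent-retriever \
        --limit 5

Note that `self.qns[:limit] if limit else self.qns` treats 0 as falsy, so `--limit 0` falls back to the full question set rather than evaluating nothing.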

evaluation/auto_evaluation/llm_tests.sh

Lines changed: 18 additions & 4 deletions
@@ -4,13 +4,27 @@ retrievers=(
   "agent-retriever"
 )
 
+# Set default limit (empty means run all)
+LIMIT=${1:-}
+
 echo "==================================="
 echo "==> Dataset: EDA Corpus"
+if [ -n "$LIMIT" ]; then
+  echo "==> Running with limit: $LIMIT questions"
+fi
 for retriever in "${retrievers[@]}" ; do
   echo "==> Running tests for $retriever"
-  python eval_main.py \
-    --base_url http://localhost:8000 \
-    --dataset ./dataset/EDA_Corpus_100_Question.csv \
-    --retriever $retriever
+  if [ -n "$LIMIT" ]; then
+    python eval_main.py \
+      --base_url http://localhost:8000 \
+      --dataset ./dataset/EDA_Corpus_100_Question.csv \
+      --retriever $retriever \
+      --limit $LIMIT
+  else
+    python eval_main.py \
+      --base_url http://localhost:8000 \
+      --dataset ./dataset/EDA_Corpus_100_Question.csv \
+      --retriever $retriever
+  fi
 done
 echo "==================================="

evaluation/auto_evaluation/src/metrics/content.py

Lines changed: 3 additions & 4 deletions
@@ -1,5 +1,4 @@
 from deepeval.metrics import (
-    FaithfulnessMetric,
     AnswerRelevancyMetric,
     BiasMetric,
     ToxicityMetric,
@@ -12,23 +11,23 @@
 TOXICITY_THRESHOLD = 0.7
 
 
-def make_answer_relevancy_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_answer_relevancy_metric(model: DeepEvalBaseLLM) -> AnswerRelevancyMetric:
     return AnswerRelevancyMetric(
         threshold=ANSRELEVANCY_THRESHOLD,
         model=model,
         include_reason=True,
     )
 
 
-def make_bias_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_bias_metric(model: DeepEvalBaseLLM) -> BiasMetric:
     return BiasMetric(
         threshold=BIAS_THRESHOLD,
         model=model,
         include_reason=True,
     )
 
 
-def make_toxicity_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_toxicity_metric(model: DeepEvalBaseLLM) -> ToxicityMetric:
     return ToxicityMetric(
         threshold=TOXICITY_THRESHOLD,
         model=model,

evaluation/pyproject.toml

Lines changed: 31 additions & 8 deletions
@@ -1,12 +1,30 @@
 [build-system]
-requires = ['setuptools>=60', 'Cython==3.0.7', 'wheel==0.42.0']
-build-backend = "setuptools.build_meta"
+requires = ["hatchling"]
+build-backend = "hatchling.build"
 
 [project]
 name = "ora-evaluation"
 version = "1.0.0"
-dynamic = ["dependencies", "optional-dependencies"]
 requires-python = ">=3.12"
+dependencies = [
+    "google-api-python-client==2.151.0",
+    "google-auth==2.30.0",
+    "google-auth-httplib2==0.2.0",
+    "google-auth-oauthlib==1.2.0",
+    "gspread==6.1.2",
+    "python-dotenv==1.0.1",
+    "requests==2.32.4",
+    "streamlit==1.37.0",
+    "deepeval==3.0.0",
+    "langchain-google-vertexai==2.0.15",
+    "asyncio==3.4.3",
+    "huggingface-hub==0.26.2",
+    "instructor[vertexai]==1.5.2",
+    "openai==1.58.1",
+    "pydantic==2.10.4",
+    "tqdm==4.67.1",
+    "plotly==5.24.1",
+]
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Intended Audience :: Developers",
@@ -16,12 +34,17 @@ classifiers = [
     "Programming Language :: Python :: 3 :: Only",
 ]
 
-[tool.setuptools.dynamic]
-dependencies = { file = ["requirements.txt"] }
-optional-dependencies = { test = { file = ["requirements-test.txt"] } }
+[project.optional-dependencies]
+test = [
+    "mypy==1.10.1",
+    "ruff==0.5.1",
+    "types-requests==2.32.0.20250602",
+    "google-api-python-client-stubs==1.28.0",
+    "types-tqdm==4.67.0.20241221",
+]
 
-[tool.setuptools.packages.find]
-include = ["auto_evaluation", "human_evaluation", "script_based_evaluation"]
+[tool.hatch.build.targets.wheel]
+packages = ["auto_evaluation", "human_evaluation", "script_based_evaluation"]
 
 [tool.mypy]
 python_version = "3.12"

evaluation/requirements-test.txt

Lines changed: 0 additions & 5 deletions
This file was deleted.

evaluation/requirements.txt

Lines changed: 0 additions & 18 deletions
This file was deleted.
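
With the dependency pins moved into pyproject.toml, requirements.txt and requirements-test.txt are no longer needed; uv resolves everything from the project table. A rough equivalence, assuming the commands are run inside evaluation/:

    uv sync               # replaces: pip install -r requirements.txt && pip install -e .
    uv sync --extra test  # additionally installs the [project.optional-dependencies] test group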
