21 changes: 7 additions & 14 deletions evaluation/Makefile
@@ -1,25 +1,19 @@
 .PHONY: init
 init:
-	@python3 -m venv .venv && \
-	. .venv/bin/activate && \
-	pip install -r requirements.txt && \
-	pip install -e .
+	@uv sync
 
 .PHONY: init-dev
-init-dev: init
-	@. .venv/bin/activate && \
-	pip install -r requirements-test.txt
+init-dev:
+	@uv sync --extra test
 
 .PHONY: format
 format:
-	@. .venv/bin/activate && \
-	ruff format
+	@uv run ruff format
 
 .PHONY: check
 check:
-	@. .venv/bin/activate && \
-	mypy . && \
-	ruff check
+	@uv run mypy . && \
+	uv run ruff check
 
 .PHONY: clean
 clean:
@@ -29,7 +23,6 @@ clean:
 .PHONY: llm-tests
 llm-tests: clean
 	@bash -c '\
-	. .venv/bin/activate && \
 	cd auto_evaluation && \
-	./llm_tests.sh 2>&1 | tee llm_tests_output.txt; \
+	uv run ./llm_tests.sh 2>&1 | tee llm_tests_output.txt; \
 	exit $${PIPESTATUS[0]}'
9 changes: 8 additions & 1 deletion evaluation/auto_evaluation/dataset/preprocess.py
@@ -27,6 +27,13 @@ def write_data(results_list: list[dict[str, Any]], results_path: str):
 
 
 def read_deepeval_cache():
+    import os
+
+    cache_file = ".deepeval/.deepeval-cache.json"
+    if not os.path.exists(cache_file):
+        print(f"Warning: {cache_file} not found. Skipping cache read.")
+        return
+
     metric_scores = {
         "Contextual Precision": [],
         "Contextual Recall": [],
@@ -37,7 +44,7 @@ def read_deepeval_cache():
         "Contextual Recall": [],
         "Hallucination": [],
     }
-    with open(".deepeval-cache.json") as f:
+    with open(cache_file) as f:
         results = json.load(f)
     for _, value in results["test_cases_lookup_map"].items():
         for metric in value["cached_metrics_data"]:
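Read together, the two hunks above add an existence guard and point both the check and the open() at the same relocated cache path. A minimal standalone sketch of the guarded read, using only what is visible in this diff; the helper name load_deepeval_cache is illustrative, not from the PR, and the per-metric aggregation that follows in the real function is elided:

import json
import os

CACHE_FILE = ".deepeval/.deepeval-cache.json"


def load_deepeval_cache() -> dict | None:
    """Return the parsed deepeval cache, or None if no cache has been written yet."""
    if not os.path.exists(CACHE_FILE):
        # Mirrors the PR's early return: a missing cache is a warning, not a crash.
        print(f"Warning: {CACHE_FILE} not found. Skipping cache read.")
        return None
    with open(CACHE_FILE) as f:
        results = json.load(f)
    # The real read_deepeval_cache() goes on to bucket scores from
    # results["test_cases_lookup_map"] into its per-metric lists.
    return results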
11 changes: 7 additions & 4 deletions evaluation/auto_evaluation/eval_main.py
@@ -81,7 +81,7 @@ def sanity_check(self):
                 continue
         raise ValueError("Sanity check failed after timeout")
 
-    def evaluate(self, retriever: str):
+    def evaluate(self, retriever: str, limit: int | None = None):
         retrieval_tcs = []
         response_times = []
 
@@ -93,7 +93,8 @@ def evaluate(self, retriever: str):
         )
 
         # retrieval test cases
-        for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")):
+        questions = self.qns[:limit] if limit else self.qns
+        for i, qa_pair in enumerate(tqdm(questions, desc="Evaluating")):
             question, ground_truth = qa_pair["question"], qa_pair["ground_truth"]
             response, response_time = self.query(retriever, question)
             response_text = response["response"]
@@ -114,7 +115,6 @@ def evaluate(self, retriever: str):
         evaluate(
             test_cases=retrieval_tcs,
             metrics=[precision, recall, hallucination],
-            print_results=False,
         )
 
         # parse deepeval results
@@ -155,11 +155,14 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]:
     )
     parser.add_argument("--dataset", type=str, help="Path to dataset to evaluate on")
    parser.add_argument("--retriever", type=str, help="Retriever to evaluate on")
+    parser.add_argument(
+        "--limit", type=int, help="Limit number of questions to evaluate", default=None
+    )
     args = parser.parse_args()
 
     # Pull the dataset from huggingface hub
     hf_pull.main()
 
     # Evaluate the model on the dataset
     harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url)
-    harness.evaluate(args.retriever)
+    harness.evaluate(args.retriever, limit=args.limit)
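The new flag flows from argparse (default None) into evaluate(), where the question list is truncated with a truthiness check. A small self-contained illustration of that idiom and its one edge case; qns here stands in for the harness's loaded question list and is not part of the PR:

# Illustration of the slicing idiom added to evaluate().
qns = [f"question {i}" for i in range(100)]


def take(limit: int | None) -> list[str]:
    # Mirrors `questions = self.qns[:limit] if limit else self.qns`.
    return qns[:limit] if limit else qns


assert len(take(None)) == 100  # no --limit: evaluate everything
assert len(take(10)) == 10     # --limit 10: first ten questions only
assert len(take(0)) == 100     # --limit 0 is falsy, so it also means "everything"

If --limit 0 were meant to run nothing, the guard would need to be `if limit is not None` instead of the plain truthiness test.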
22 changes: 18 additions & 4 deletions evaluation/auto_evaluation/llm_tests.sh
@@ -4,13 +4,27 @@ retrievers=(
     "agent-retriever"
 )
 
+# Set default limit (empty means run all)
+LIMIT=${1:-}
+
 echo "==================================="
 echo "==> Dataset: EDA Corpus"
+if [ -n "$LIMIT" ]; then
+    echo "==> Running with limit: $LIMIT questions"
+fi
 for retriever in "${retrievers[@]}" ; do
     echo "==> Running tests for $retriever"
-    python eval_main.py \
-        --base_url http://localhost:8000 \
-        --dataset ./dataset/EDA_Corpus_100_Question.csv \
-        --retriever $retriever
+    if [ -n "$LIMIT" ]; then
+        python eval_main.py \
+            --base_url http://localhost:8000 \
+            --dataset ./dataset/EDA_Corpus_100_Question.csv \
+            --retriever $retriever \
+            --limit $LIMIT
+    else
+        python eval_main.py \
+            --base_url http://localhost:8000 \
+            --dataset ./dataset/EDA_Corpus_100_Question.csv \
+            --retriever $retriever
+    fi
 done
 echo "==================================="
7 changes: 3 additions & 4 deletions evaluation/auto_evaluation/src/metrics/content.py
@@ -1,5 +1,4 @@
 from deepeval.metrics import (
-    FaithfulnessMetric,
     AnswerRelevancyMetric,
     BiasMetric,
     ToxicityMetric,
@@ -12,23 +11,23 @@
 TOXICITY_THRESHOLD = 0.7
 
 
-def make_answer_relevancy_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_answer_relevancy_metric(model: DeepEvalBaseLLM) -> AnswerRelevancyMetric:
     return AnswerRelevancyMetric(
         threshold=ANSRELEVANCY_THRESHOLD,
         model=model,
         include_reason=True,
     )
 
 
-def make_bias_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_bias_metric(model: DeepEvalBaseLLM) -> BiasMetric:
     return BiasMetric(
         threshold=BIAS_THRESHOLD,
         model=model,
         include_reason=True,
     )
 
 
-def make_toxicity_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_toxicity_metric(model: DeepEvalBaseLLM) -> ToxicityMetric:
     return ToxicityMetric(
         threshold=TOXICITY_THRESHOLD,
         model=model,
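With the annotations corrected, each factory's declared return type matches the metric it builds, so call sites type-check under the new `uv run mypy .` target. A hedged usage sketch: the import path is inferred from the repository layout, judge_llm is a placeholder for whatever DeepEvalBaseLLM wrapper the harness configures, and the question/answer strings are illustrative only.

from deepeval import evaluate
from deepeval.test_case import LLMTestCase

# Import path inferred from evaluation/auto_evaluation/src/metrics/content.py;
# adjust to match how the harness actually imports these helpers.
from auto_evaluation.src.metrics.content import (
    make_answer_relevancy_metric,
    make_bias_metric,
    make_toxicity_metric,
)


def run_content_checks(judge_llm, question: str, answer: str) -> None:
    """Score one answer with the three content metrics (thresholds live in content.py).

    judge_llm: any DeepEvalBaseLLM-compatible judge model (placeholder here).
    """
    tc = LLMTestCase(input=question, actual_output=answer)
    evaluate(
        test_cases=[tc],
        metrics=[
            make_answer_relevancy_metric(judge_llm),  # -> AnswerRelevancyMetric
            make_bias_metric(judge_llm),              # -> BiasMetric
            make_toxicity_metric(judge_llm),          # -> ToxicityMetric
        ],
    )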
39 changes: 31 additions & 8 deletions evaluation/pyproject.toml
@@ -1,12 +1,30 @@
 [build-system]
-requires = ['setuptools>=60', 'Cython==3.0.7', 'wheel==0.42.0']
-build-backend = "setuptools.build_meta"
+requires = ["hatchling"]
+build-backend = "hatchling.build"
 
 [project]
 name = "ora-evaluation"
 version = "1.0.0"
-dynamic = ["dependencies", "optional-dependencies"]
 requires-python = ">=3.12"
+dependencies = [
+    "google-api-python-client==2.151.0",
+    "google-auth==2.30.0",
+    "google-auth-httplib2==0.2.0",
+    "google-auth-oauthlib==1.2.0",
+    "gspread==6.1.2",
+    "python-dotenv==1.0.1",
+    "requests==2.32.4",
+    "streamlit==1.37.0",
+    "deepeval==3.0.0",
+    "langchain-google-vertexai==2.0.15",
+    "asyncio==3.4.3",
+    "huggingface-hub==0.26.2",
+    "instructor[vertexai]==1.5.2",
+    "openai==1.58.1",
+    "pydantic==2.10.4",
+    "tqdm==4.67.1",
+    "plotly==5.24.1",
+]
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Intended Audience :: Developers",
@@ -16,12 +34,17 @@ classifiers = [
     "Programming Language :: Python :: 3 :: Only",
 ]
 
-[tool.setuptools.dynamic]
-dependencies = { file = ["requirements.txt"] }
-optional-dependencies = { test = { file = ["requirements-test.txt"] } }
+[project.optional-dependencies]
+test = [
+    "mypy==1.10.1",
+    "ruff==0.5.1",
+    "types-requests==2.32.0.20250602",
+    "google-api-python-client-stubs==1.28.0",
+    "types-tqdm==4.67.0.20241221",
+]
 
-[tool.setuptools.packages.find]
-include = ["auto_evaluation", "human_evaluation", "script_based_evaluation"]
+[tool.hatch.build.targets.wheel]
+packages = ["auto_evaluation", "human_evaluation", "script_based_evaluation"]
 
 [tool.mypy]
 python_version = "3.12"
5 changes: 0 additions & 5 deletions evaluation/requirements-test.txt

This file was deleted.

18 changes: 0 additions & 18 deletions evaluation/requirements.txt

This file was deleted.
