From de9775ad36ba8a32759cc8d0dbf3c00a4d33d849 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Thu, 7 Nov 2024 18:27:47 +0000 Subject: [PATCH 01/10] initial scaffold for deepeval integration + remove unnecessary files Signed-off-by: Jack Luar --- .gitignore | 1 + evaluation/Makefile | 3 +- evaluation/auto_evaluation/__init__.py | 0 .../auto_evaluation/content_metrics.json | 1 - evaluation/auto_evaluation/dataset/hf_pull.py | 7 +- .../auto_evaluation/dataset/preprocess.py | 25 +++ evaluation/auto_evaluation/demo.py | 64 -------- evaluation/auto_evaluation/eval_main.py | 147 ++++++++++++++++++ .../auto_evaluation/retrieval_metrics.json | 1 - evaluation/pyproject.toml | 3 + 10 files changed, 184 insertions(+), 68 deletions(-) create mode 100644 evaluation/auto_evaluation/__init__.py delete mode 100644 evaluation/auto_evaluation/content_metrics.json create mode 100644 evaluation/auto_evaluation/dataset/preprocess.py delete mode 100644 evaluation/auto_evaluation/demo.py create mode 100644 evaluation/auto_evaluation/eval_main.py delete mode 100644 evaluation/auto_evaluation/retrieval_metrics.json diff --git a/.gitignore b/.gitignore index 10bfa179..78dbe117 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__/ backend/data/* backend/src/*.json *.pyc +*.egg-info/ frontend/*.json evaluation/human_evaluation/*.json /*.json diff --git a/evaluation/Makefile b/evaluation/Makefile index 72878508..6b07a4a5 100644 --- a/evaluation/Makefile +++ b/evaluation/Makefile @@ -1,7 +1,8 @@ init: @python3 -m venv .venv && \ . .venv/bin/activate && \ - pip install -r requirements.txt + pip install -r requirements.txt && \ + pip install -e . init-dev: init @. .venv/bin/activate && \ diff --git a/evaluation/auto_evaluation/__init__.py b/evaluation/auto_evaluation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluation/auto_evaluation/content_metrics.json b/evaluation/auto_evaluation/content_metrics.json deleted file mode 100644 index 274e99ee..00000000 --- a/evaluation/auto_evaluation/content_metrics.json +++ /dev/null @@ -1 +0,0 @@ -{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. 
Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. 
This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}, "{\"actual_output\": \"The capital of France is Paris.\", \"context\": null, \"expected_output\": \"Paris.\", \"hyperparameters\": null, \"input\": \"What is the capital of France?\", \"retrieval_context\": null}": {"cached_metrics_data": [{"metric_data": {"name": "Answer Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the response is perfectly relevant, addressing the input directly and completely with no irrelevant information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Statements:\n[\n \"The capital of France is Paris.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Bias", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output demonstrates no discernible bias.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Toxicity", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output is entirely harmless and positive.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}} \ No newline at end of file diff --git a/evaluation/auto_evaluation/dataset/hf_pull.py b/evaluation/auto_evaluation/dataset/hf_pull.py index c0541cbf..8df9a062 100644 --- a/evaluation/auto_evaluation/dataset/hf_pull.py +++ b/evaluation/auto_evaluation/dataset/hf_pull.py @@ -1,7 +1,8 @@ from huggingface_hub import snapshot_download import os -if __name__ == "__main__": + +def main(): cur_dir = os.path.dirname(os.path.abspath(__file__)) snapshot_download( "The-OpenROAD-Project/ORAssistant_Public_Evals", @@ -13,3 +14,7 @@ "README.md", ], ) + + +if __name__ == "__main__": + main() diff --git a/evaluation/auto_evaluation/dataset/preprocess.py b/evaluation/auto_evaluation/dataset/preprocess.py new file mode 100644 index 00000000..3178bd7e --- /dev/null +++ b/evaluation/auto_evaluation/dataset/preprocess.py @@ -0,0 +1,25 @@ +import csv +from typing import Any + + +def read_data(csv_file: str) -> list[dict]: + questions = [] + with open(csv_file, "r") as f: + reader = csv.reader(f) + header = next(reader) # Skip the header row + assert len(header) == 2, "CSV file must have exactly 2 columns" + for row in reader: + questions.append( + {"question": row[0].strip(), "ground_truth": row[1].strip()} + ) + return questions + + +def write_data(results_list: list[dict[str, Any]], results_path: str): + keys = results_list[0].keys() + with open(results_path, "w") as f: + writer = csv.writer(f) + writer.writerow(list(keys)) + for result in results_list: + writer.writerow([result[key] for key in keys]) + print(f"Results written to {results_path}") diff --git a/evaluation/auto_evaluation/demo.py 
b/evaluation/auto_evaluation/demo.py deleted file mode 100644 index 7b7b909f..00000000 --- a/evaluation/auto_evaluation/demo.py +++ /dev/null @@ -1,64 +0,0 @@ -import os - -from dotenv import load_dotenv -from src.models.vertex_ai import GoogleVertexAILangChain - -# from src.metrics.geval import make_correctness_metric -from src.metrics.content import ( - make_bias_metric, - make_toxicity_metric, - make_answer_relevancy_metric, -) -from src.metrics.retrieval import ( - make_contextual_precision_metric, - make_contextual_recall_metric, - make_contextual_relevancy_metric, - make_faithfulness_metric, - make_hallucination_metric, -) -from deepeval.test_case import LLMTestCase -from deepeval import evaluate - -cur_dir = os.path.dirname(__file__) -root_dir = os.path.join(cur_dir, "../../") -load_dotenv(os.path.join(root_dir, ".env")) - -if __name__ == "__main__": - model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") - print("Retrieval metrics") - precision, recall, relevancy, faithfulness, hallucination = ( - make_contextual_precision_metric(model), - make_contextual_recall_metric(model), - make_contextual_relevancy_metric(model), - make_faithfulness_metric(model), - make_hallucination_metric(model), - ) - - test_case = LLMTestCase( - input="What if these shoes don't fit?", - actual_output="We offer a 30-day full refund at no extra cost.", - expected_output="You are eligible for a 30 day full refund at no extra cost.", - context=[ - "All customers are eligible for a 30 day full refund at no extra cost." - ], - retrieval_context=[ - "All customers are eligible for a 30 day full refund at no extra cost." - ], - ) - evaluate([test_case], [precision, recall, relevancy, faithfulness, hallucination]) - os.rename(".deepeval-cache.json", "retrieval_metrics.json") - - print("Content metrics") - answer_relevancy, bias, toxicity = ( - make_answer_relevancy_metric(model), - make_bias_metric(model), - make_toxicity_metric(model), - ) - - test_case = LLMTestCase( - input="What is the capital of France?", - actual_output="The capital of France is Paris.", - expected_output="Paris.", - ) - evaluate([test_case], [answer_relevancy, bias, toxicity]) - os.rename(".deepeval-cache.json", "content_metrics.json") diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py new file mode 100644 index 00000000..dfe2d474 --- /dev/null +++ b/evaluation/auto_evaluation/eval_main.py @@ -0,0 +1,147 @@ +""" +Evaluation script which takes in arguments to dataset and +the model to evaluate on the dataset. +""" + +import argparse +import requests +import os + +from datetime import datetime +from dotenv import load_dotenv +from deepeval.test_case import LLMTestCase +from deepeval import evaluate + +from auto_evaluation.src.models.vertex_ai import GoogleVertexAILangChain +from auto_evaluation.src.metrics.retrieval import ( + make_contextual_precision_metric, + make_contextual_recall_metric, + make_contextual_relevancy_metric, + make_faithfulness_metric, + make_hallucination_metric, +) +from auto_evaluation.dataset import hf_pull, preprocess + +load_dotenv() + +# List of all available retrievers +ALL_RETRIEVERS = { + "agent-retriever": "/graphs/agent-retriever", + "agent-retriever-reranker": "/graphs/agent-retriever", + "hybrid": "/graphs/hybrid", + "sim": "/graphs/sim", + "ensemble": "/graphs/ensemble", +} + + +class EvaluationHarness: + # TODO: Use async for EvaluationHarness. 
+ # TODO: Also requires LLM Engine to be async + def __init__(self, base_url: str, dataset: str, reranker_base_url: str = ""): + self.base_url = base_url + self.dataset = dataset + self.reranker_base_url = reranker_base_url + self.qns = preprocess.read_data(self.dataset) + self.eval_model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") + self.log_dir = "logs" + os.makedirs(self.log_dir, exist_ok=True) + self.sanity_check() + + def sanity_check(self): + if not requests.get(f"{self.base_url}/health-check").status_code == 200: + raise ValueError("Endpoint is not running") + if not os.path.exists(self.dataset): + raise ValueError("Dataset path does not exist") + if ( + self.reranker_base_url + and not requests.get(f"{self.reranker_base_url}/health-check").status_code + == 200 + ): + raise ValueError("Reranker endpoint is not running") + + def get_logfile(self, retriever: str): + return os.path.join( + self.log_dir, f"{retriever}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" + ) + + def evaluate(self, retriever: str): + log_file = self.get_logfile(retriever) + overall = [] + for i, qa_pair in enumerate(self.qns): + question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] + response, response_time = self.query(retriever, question) + response_text = response["response"] + context = response["context"] + + # deepeval parallel evals + retrieval_tc = LLMTestCase( + input=question, + actual_output=response_text, + expected_output=ground_truth, + context=context, + retrieval_context=context, + ) + print("Retrieval metrics") + precision, recall, relevancy, faithfulness, hallucination = ( + make_contextual_precision_metric(self.eval_model), + make_contextual_recall_metric(self.eval_model), + make_contextual_relevancy_metric(self.eval_model), + make_faithfulness_metric(self.eval_model), + make_hallucination_metric(self.eval_model), + ) + evaluate( + [retrieval_tc], + [precision, recall, relevancy, faithfulness, hallucination], + ) + + result = { + "question": f"{i + 1}. 
{question}", + "ground_truth": ground_truth, + "retriever_type": retriever, + "response_time": response_time, + "response_text": response_text, + "tool": retriever, + "contextual_precision": precision.score, + "contextual_recall": recall.score, + "contextual_relevancy": relevancy.score, + "faithfulness": faithfulness.score, + "hallucination": hallucination.score, + } + overall.append(result) + + # Write to log file + preprocess.write_data(overall, log_file) + + def query(self, retriever: str, query: str) -> tuple[dict, float]: + """ + Returns the response json and the time taken to get the response (ms) + """ + endpoint = ALL_RETRIEVERS[retriever] + url = ( + f"{self.base_url}/{endpoint}" + if retriever != "agent-retriever-reranker" + else f"{self.reranker_base_url}/{endpoint}" + ) + payload = {"query": query, "list_context": True, "list_sources": True} + response = requests.post(url, json=payload) + return response.json(), response.elapsed.total_seconds() * 1000 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluation script") + parser.add_argument( + "--base_url", type=str, help="Base URL of the model to evaluate" + ) + parser.add_argument( + "--reranker_base_url", type=str, help="Base URL of the reranker", default="" + ) + parser.add_argument("--dataset", type=str, help="Path to dataset to evaluate on") + parser.add_argument("--retriever", type=str, help="Retriever to evaluate on") + args = parser.parse_args() + + # Pull the dataset from huggingface hub + hf_pull.main() + + # Evaluate the model on the dataset + harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url) + harness.evaluate(args.retriever) diff --git a/evaluation/auto_evaluation/retrieval_metrics.json b/evaluation/auto_evaluation/retrieval_metrics.json deleted file mode 100644 index 085c26e7..00000000 --- a/evaluation/auto_evaluation/retrieval_metrics.json +++ /dev/null @@ -1 +0,0 @@ -{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. 
Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. 
This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}} \ No newline at end of file diff --git a/evaluation/pyproject.toml b/evaluation/pyproject.toml index 6c8e7ebe..013e1111 100644 --- a/evaluation/pyproject.toml +++ b/evaluation/pyproject.toml @@ -20,6 +20,9 @@ classifiers = [ dependencies = { file = ["requirements.txt"] } optional-dependencies = { test = { file = ["requirements-test.txt"] } } +[tool.setuptools.packages.find] +include = ["auto_evaluation", "human_evaluation"] + [tool.mypy] python_version = "3.12" warn_unused_configs = true From 914c0aca8007c58b3d2768a7c402d5caf956594a Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sat, 9 Nov 2024 14:31:26 +0000 Subject: [PATCH 02/10] add llm_tests target and CI Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 10 ++++++++++ Makefile | 4 +++- evaluation/Makefile | 9 +++++++++ evaluation/auto_evaluation/llm_tests.sh | 18 ++++++++++++++++++ evaluation/llm_tests_output.txt | 16 ++++++++++++++++ 5 files changed, 56 insertions(+), 1 deletion(-) create mode 100755 evaluation/auto_evaluation/llm_tests.sh create mode 100644 evaluation/llm_tests_output.txt diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 31ff92d0..b7349400 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,6 +28,16 @@ jobs: - name: Build Docker image run: | make docker + - name: Run LLM CI + working-directory: evaluation + run: | + make llm-tests + - name: Create commit comment + working-directory: evaluation + uses: peter-evans/commit-comment@v3 + with: + token: ${{ secrets.GH_PATH }} + body-path: llm-tests-output.txt - name: Teardown if: always() run: | diff --git a/Makefile b/Makefile index 1ebc3f65..1c6a81fa 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,6 @@ -FOLDERS=backend frontend +.PHONY: init init-dev format check + +FOLDERS=backend frontend evaluation init: @for folder in $(FOLDERS); do (cd $$folder && make init && cd ../); done diff --git a/evaluation/Makefile b/evaluation/Makefile index 6b07a4a5..9dacad6b 100644 --- a/evaluation/Makefile +++ b/evaluation/Makefile @@ -1,3 +1,5 @@ +.PHONY: init init-dev format check clean + init: @python3 -m venv .venv && \ . .venv/bin/activate && \ @@ -16,3 +18,10 @@ format: check: @. .venv/bin/activate && \ ruff check --fix + +clean: + @rm -f llm_tests_output.txt + +llm-tests: clean + @. 
.venv/bin/activate && \ + ./auto_evaluation/llm_tests.sh > llm_tests_output.txt 2>&1 diff --git a/evaluation/auto_evaluation/llm_tests.sh b/evaluation/auto_evaluation/llm_tests.sh new file mode 100755 index 00000000..c9a12f5e --- /dev/null +++ b/evaluation/auto_evaluation/llm_tests.sh @@ -0,0 +1,18 @@ +#!/bin/bash -eu + +retrievers=( + "agent-retriever" \ + "ensemble" \ +) + +echo "===================================" +echo "==> Dataset: EDA Corpus" +for retriever in "${retrievers[@]}" ; do + echo "==> Running tests for $retriever" + python auto_evaluation/eval_main.py \ + --base_url http://localhost:8000 \ + --dataset ./auto_evaluation/dataset/EDA_Corpus_100_Question.csv \ + --retriever $retriever + echo "==> Done" +done +echo "===================================" diff --git a/evaluation/llm_tests_output.txt b/evaluation/llm_tests_output.txt new file mode 100644 index 00000000..26299ffa --- /dev/null +++ b/evaluation/llm_tests_output.txt @@ -0,0 +1,16 @@ +=================================== +==> Dataset: EDA Corpus +==> Running tests for agent-retriever +/home/luars/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/__init__.py:49: UserWarning: You are using deepeval version 1.4.9, however version 1.5.0 is available. You should consider upgrading via the "pip install --upgrade deepeval" command. + warnings.warn( + Fetching 3 files: 0%| | 0/3 [00:00 + harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/luars/ORAssistant/evaluation/auto_evaluation/eval_main.py", line 44, in __init__ + self.qns = preprocess.read_data(self.dataset) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/luars/ORAssistant/evaluation/auto_evaluation/dataset/preprocess.py", line 10, in read_data + assert len(header) == 2, "CSV file must have exactly 2 columns" +AssertionError: CSV file must have exactly 2 columns From 564445b48dba2d3318f879a661b76a0d3a6877af Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 05:38:43 +0000 Subject: [PATCH 03/10] fix CI syntax, seed env for evaluation Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 4 +- .gitignore | 5 +- evaluation/Makefile | 4 +- evaluation/auto_evaluation/eval_main.py | 96 +++++++++++-------- evaluation/auto_evaluation/llm_tests.sh | 4 +- .../auto_evaluation/src/models/vertex_ai.py | 3 + evaluation/llm_tests_output.txt | 16 ---- 7 files changed, 68 insertions(+), 64 deletions(-) delete mode 100644 evaluation/llm_tests_output.txt diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b7349400..c872253b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -31,13 +31,13 @@ jobs: - name: Run LLM CI working-directory: evaluation run: | + cp ../backend/.env . 
make llm-tests - name: Create commit comment - working-directory: evaluation uses: peter-evans/commit-comment@v3 with: token: ${{ secrets.GH_PATH }} - body-path: llm-tests-output.txt + body-path: evaluation/auto_evaluation/llm_tests_output.txt - name: Teardown if: always() run: | diff --git a/.gitignore b/.gitignore index 78dbe117..14a3dc29 100644 --- a/.gitignore +++ b/.gitignore @@ -22,7 +22,8 @@ documents.txt .venv # evaluations -.deepeval_telemtry.txt +**/.deepeval_telemtry.txt *.csv -*.deepeval-cache.json +**/.deepeval-cache.json temp_test_run_data.json +**/llm_tests_output.txt diff --git a/evaluation/Makefile b/evaluation/Makefile index 9dacad6b..d0dd015f 100644 --- a/evaluation/Makefile +++ b/evaluation/Makefile @@ -21,7 +21,9 @@ check: clean: @rm -f llm_tests_output.txt + @rm -f **/.deepeval-cache.json llm-tests: clean @. .venv/bin/activate && \ - ./auto_evaluation/llm_tests.sh > llm_tests_output.txt 2>&1 + cd auto_evaluation && \ + ./llm_tests.sh 2>&1 | tee llm_tests_output.txt diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py index dfe2d474..1b66ba0d 100644 --- a/evaluation/auto_evaluation/eval_main.py +++ b/evaluation/auto_evaluation/eval_main.py @@ -7,7 +7,6 @@ import requests import os -from datetime import datetime from dotenv import load_dotenv from deepeval.test_case import LLMTestCase from deepeval import evaluate @@ -21,8 +20,10 @@ make_hallucination_metric, ) from auto_evaluation.dataset import hf_pull, preprocess +from tqdm import tqdm # type: ignore -load_dotenv() +eval_root_path = os.path.join(os.path.dirname(__file__), "..") +load_dotenv(dotenv_path=os.path.join(eval_root_path, ".env")) # List of all available retrievers ALL_RETRIEVERS = { @@ -48,32 +49,39 @@ def __init__(self, base_url: str, dataset: str, reranker_base_url: str = ""): self.sanity_check() def sanity_check(self): - if not requests.get(f"{self.base_url}/health-check").status_code == 200: + if not requests.get(f"{self.base_url}/healthcheck").status_code == 200: raise ValueError("Endpoint is not running") if not os.path.exists(self.dataset): raise ValueError("Dataset path does not exist") if ( self.reranker_base_url - and not requests.get(f"{self.reranker_base_url}/health-check").status_code + and not requests.get(f"{self.reranker_base_url}/healthcheck").status_code == 200 ): raise ValueError("Reranker endpoint is not running") - def get_logfile(self, retriever: str): - return os.path.join( - self.log_dir, f"{retriever}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" + def evaluate(self, retriever: str): + retrieval_tcs = [] + response_times = [] + + # metrics + precision, recall, relevancy, faithfulness, hallucination = ( + make_contextual_precision_metric(self.eval_model), + make_contextual_recall_metric(self.eval_model), + make_contextual_relevancy_metric(self.eval_model), + make_faithfulness_metric(self.eval_model), + make_hallucination_metric(self.eval_model), ) - def evaluate(self, retriever: str): - log_file = self.get_logfile(retriever) - overall = [] - for i, qa_pair in enumerate(self.qns): + # retrieval test cases + for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")): + if i < 20: + continue question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] response, response_time = self.query(retriever, question) response_text = response["response"] context = response["context"] - # deepeval parallel evals retrieval_tc = LLMTestCase( input=question, actual_output=response_text, @@ -81,36 +89,33 @@ def evaluate(self, retriever: str): 
context=context, retrieval_context=context, ) - print("Retrieval metrics") - precision, recall, relevancy, faithfulness, hallucination = ( - make_contextual_precision_metric(self.eval_model), - make_contextual_recall_metric(self.eval_model), - make_contextual_relevancy_metric(self.eval_model), - make_faithfulness_metric(self.eval_model), - make_hallucination_metric(self.eval_model), - ) - evaluate( - [retrieval_tc], - [precision, recall, relevancy, faithfulness, hallucination], - ) + retrieval_tcs.append(retrieval_tc) + response_times.append(response_time) + + # parallel evaluate + evaluate( + retrieval_tcs, + [precision, recall, relevancy, faithfulness, hallucination], + ) - result = { - "question": f"{i + 1}. {question}", - "ground_truth": ground_truth, - "retriever_type": retriever, - "response_time": response_time, - "response_text": response_text, - "tool": retriever, - "contextual_precision": precision.score, - "contextual_recall": recall.score, - "contextual_relevancy": relevancy.score, - "faithfulness": faithfulness.score, - "hallucination": hallucination.score, - } - overall.append(result) + # result = { + # "question": f"{i + 1}. {question}", + # "ground_truth": ground_truth, + # "retriever_type": retriever, + # "response_time": response_time, + # "response_text": response_text, + # "tool": retriever, + # "contextual_precision": precision.score, + # "contextual_recall": recall.score, + # "contextual_relevancy": relevancy.score, + # "faithfulness": faithfulness.score, + # "hallucination": hallucination.score, + # } + # print(result) + # overall.append(result) # Write to log file - preprocess.write_data(overall, log_file) + # preprocess.write_data(overall, log_file) def query(self, retriever: str, query: str) -> tuple[dict, float]: """ @@ -123,8 +128,17 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]: else f"{self.reranker_base_url}/{endpoint}" ) payload = {"query": query, "list_context": True, "list_sources": True} - response = requests.post(url, json=payload) - return response.json(), response.elapsed.total_seconds() * 1000 + try: + response = requests.post(url, json=payload) + return response.json(), response.elapsed.total_seconds() * 1000 + except Exception as e: + print(f"Error querying {retriever}: {e}") + return { + "response": "invalid", + "sources": [], + "context": [], + "tool": "string", + }, -999999 if __name__ == "__main__": diff --git a/evaluation/auto_evaluation/llm_tests.sh b/evaluation/auto_evaluation/llm_tests.sh index c9a12f5e..fe7c3519 100755 --- a/evaluation/auto_evaluation/llm_tests.sh +++ b/evaluation/auto_evaluation/llm_tests.sh @@ -9,9 +9,9 @@ echo "===================================" echo "==> Dataset: EDA Corpus" for retriever in "${retrievers[@]}" ; do echo "==> Running tests for $retriever" - python auto_evaluation/eval_main.py \ + python eval_main.py \ --base_url http://localhost:8000 \ - --dataset ./auto_evaluation/dataset/EDA_Corpus_100_Question.csv \ + --dataset ./dataset/EDA_Corpus_100_Question.csv \ --retriever $retriever echo "==> Done" done diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py index 9d72fcbb..0edc6aeb 100644 --- a/evaluation/auto_evaluation/src/models/vertex_ai.py +++ b/evaluation/auto_evaluation/src/models/vertex_ai.py @@ -59,5 +59,8 @@ async def main_async(): if __name__ == "__main__": + from dotenv import load_dotenv + + load_dotenv() main() # asyncio.run(main_async()) diff --git a/evaluation/llm_tests_output.txt 
b/evaluation/llm_tests_output.txt deleted file mode 100644 index 26299ffa..00000000 --- a/evaluation/llm_tests_output.txt +++ /dev/null @@ -1,16 +0,0 @@ -=================================== -==> Dataset: EDA Corpus -==> Running tests for agent-retriever -/home/luars/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/__init__.py:49: UserWarning: You are using deepeval version 1.4.9, however version 1.5.0 is available. You should consider upgrading via the "pip install --upgrade deepeval" command. - warnings.warn( - Fetching 3 files: 0%| | 0/3 [00:00 - harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/luars/ORAssistant/evaluation/auto_evaluation/eval_main.py", line 44, in __init__ - self.qns = preprocess.read_data(self.dataset) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/luars/ORAssistant/evaluation/auto_evaluation/dataset/preprocess.py", line 10, in read_data - assert len(header) == 2, "CSV file must have exactly 2 columns" -AssertionError: CSV file must have exactly 2 columns From 74c87300d84ded437aac975cf6c1bbe6c19a7f19 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 06:26:32 +0000 Subject: [PATCH 04/10] use python3.12 for default workflow Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c872253b..1cf7e057 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -11,6 +11,10 @@ jobs: build-backend-docker: runs-on: self-hosted steps: + - name: Setup python + uses: actions/setup-python@v5 + with: + python-version: '3.12' - name: Checkout code uses: actions/checkout@v4 - name: Setup prereqs From 76f12a62c838b887f4bda5d04ce6733f4f89575d Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 12:36:00 +0000 Subject: [PATCH 05/10] * fix CI -> GH_PAT and list_contexts issue * add instructor for enforcing json llm outputs in deepeval * silent deepeval outputs * add json deepeval-cache parser * set two deepeval metrics as notimplemented -protobuf error Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 2 +- backend/Dockerfile | 2 +- backend/src/api/routers/graphs.py | 2 +- backend/src/tools/format_docs.py | 9 +-- .../auto_evaluation/dataset/preprocess.py | 31 ++++++++++ evaluation/auto_evaluation/eval_main.py | 41 +++++-------- evaluation/auto_evaluation/llm_tests.sh | 2 - .../auto_evaluation/src/metrics/retrieval.py | 12 ++-- .../auto_evaluation/src/models/vertex_ai.py | 57 +++++++++++++++---- evaluation/requirements.txt | 1 + 10 files changed, 103 insertions(+), 56 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1cf7e057..9b8f33b8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -40,7 +40,7 @@ jobs: - name: Create commit comment uses: peter-evans/commit-comment@v3 with: - token: ${{ secrets.GH_PATH }} + token: ${{ secrets.GH_PAT }} body-path: evaluation/auto_evaluation/llm_tests_output.txt - name: Teardown if: always() diff --git a/backend/Dockerfile b/backend/Dockerfile index bc6e29f1..bc6737f8 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py EXPOSE 8000 -CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] diff --git 
a/backend/src/api/routers/graphs.py b/backend/src/api/routers/graphs.py index 0666ab78..93e8b13f 100644 --- a/backend/src/api/routers/graphs.py +++ b/backend/src/api/routers/graphs.py @@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse: tool_index = 1 for tool in tools: urls.extend(list(output[tool_index].values())[0]["urls"]) - context.extend(list(set(list(output[tool_index].values())[0]["context"]))) + context.append(list(output[tool_index].values())[0]["context"]) tool_index += 1 else: llm_response = "LLM response extraction failed" diff --git a/backend/src/tools/format_docs.py b/backend/src/tools/format_docs.py index bcd9fbf8..a2376c41 100644 --- a/backend/src/tools/format_docs.py +++ b/backend/src/tools/format_docs.py @@ -5,7 +5,7 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]: doc_text = "" - doc_texts = "" + doc_texts = [] doc_urls = [] doc_srcs = [] @@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]: doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}" else: doc_text = doc.page_content + doc_texts.append(doc_text) if "url" in doc.metadata: doc_urls.append(doc.metadata["url"]) + + doc_output = "\n\n -------------------------- \n\n".join(doc_texts) - doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}" - - return doc_texts, doc_srcs, doc_urls + return doc_output, doc_srcs, doc_urls diff --git a/evaluation/auto_evaluation/dataset/preprocess.py b/evaluation/auto_evaluation/dataset/preprocess.py index 3178bd7e..49c59f0f 100644 --- a/evaluation/auto_evaluation/dataset/preprocess.py +++ b/evaluation/auto_evaluation/dataset/preprocess.py @@ -1,4 +1,5 @@ import csv +import json from typing import Any @@ -23,3 +24,33 @@ def write_data(results_list: list[dict[str, Any]], results_path: str): for result in results_list: writer.writerow([result[key] for key in keys]) print(f"Results written to {results_path}") + + +def read_deepeval_cache(): + metric_scores = { + "Contextual Precision": [], + "Contextual Recall": [], + "Hallucination": [], + } + metric_passes = { + "Contextual Precision": [], + "Contextual Recall": [], + "Hallucination": [], + } + with open(".deepeval-cache.json") as f: + results = json.load(f) + for _, value in results["test_cases_lookup_map"].items(): + for metric in value["cached_metrics_data"]: + metric_scores[metric["metric_data"]["name"]].append( + metric["metric_data"]["score"] + ) + metric_passes[metric["metric_data"]["name"]].append( + metric["metric_data"]["success"] + ) + + print("Metric Scores: ", metric_scores) + print("Metric Passes: ", metric_passes) + + +if __name__ == "__main__": + read_deepeval_cache() diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py index 1b66ba0d..1676b481 100644 --- a/evaluation/auto_evaluation/eval_main.py +++ b/evaluation/auto_evaluation/eval_main.py @@ -4,6 +4,7 @@ """ import argparse +import time import requests import os @@ -15,8 +16,6 @@ from auto_evaluation.src.metrics.retrieval import ( make_contextual_precision_metric, make_contextual_recall_metric, - make_contextual_relevancy_metric, - make_faithfulness_metric, make_hallucination_metric, ) from auto_evaluation.dataset import hf_pull, preprocess @@ -65,29 +64,29 @@ def evaluate(self, retriever: str): response_times = [] # metrics - precision, recall, relevancy, faithfulness, hallucination = ( + precision, recall, hallucination = ( make_contextual_precision_metric(self.eval_model), 
make_contextual_recall_metric(self.eval_model), - make_contextual_relevancy_metric(self.eval_model), - make_faithfulness_metric(self.eval_model), make_hallucination_metric(self.eval_model), ) # retrieval test cases for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")): - if i < 20: + if i >= 1: continue question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] response, response_time = self.query(retriever, question) response_text = response["response"] context = response["context"] + context_list = context[0].split("--------------------------") + # works for: precision, recall, hallucination retrieval_tc = LLMTestCase( input=question, actual_output=response_text, expected_output=ground_truth, - context=context, - retrieval_context=context, + context=context_list, + retrieval_context=context_list, ) retrieval_tcs.append(retrieval_tc) response_times.append(response_time) @@ -95,27 +94,12 @@ def evaluate(self, retriever: str): # parallel evaluate evaluate( retrieval_tcs, - [precision, recall, relevancy, faithfulness, hallucination], + [precision, recall, hallucination], + print_results=False, ) - # result = { - # "question": f"{i + 1}. {question}", - # "ground_truth": ground_truth, - # "retriever_type": retriever, - # "response_time": response_time, - # "response_text": response_text, - # "tool": retriever, - # "contextual_precision": precision.score, - # "contextual_recall": recall.score, - # "contextual_relevancy": relevancy.score, - # "faithfulness": faithfulness.score, - # "hallucination": hallucination.score, - # } - # print(result) - # overall.append(result) - - # Write to log file - # preprocess.write_data(overall, log_file) + # parse deepeval results + preprocess.read_deepeval_cache() def query(self, retriever: str, query: str) -> tuple[dict, float]: """ @@ -127,8 +111,9 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]: if retriever != "agent-retriever-reranker" else f"{self.reranker_base_url}/{endpoint}" ) - payload = {"query": query, "list_context": True, "list_sources": True} + payload = {"query": query, "list_context": True, "list_sources": False} try: + time.sleep(5) response = requests.post(url, json=payload) return response.json(), response.elapsed.total_seconds() * 1000 except Exception as e: diff --git a/evaluation/auto_evaluation/llm_tests.sh b/evaluation/auto_evaluation/llm_tests.sh index fe7c3519..d44cec1b 100755 --- a/evaluation/auto_evaluation/llm_tests.sh +++ b/evaluation/auto_evaluation/llm_tests.sh @@ -2,7 +2,6 @@ retrievers=( "agent-retriever" \ - "ensemble" \ ) echo "===================================" @@ -13,6 +12,5 @@ for retriever in "${retrievers[@]}" ; do --base_url http://localhost:8000 \ --dataset ./dataset/EDA_Corpus_100_Question.csv \ --retriever $retriever - echo "==> Done" done echo "===================================" diff --git a/evaluation/auto_evaluation/src/metrics/retrieval.py b/evaluation/auto_evaluation/src/metrics/retrieval.py index cd7d286d..fc6470df 100644 --- a/evaluation/auto_evaluation/src/metrics/retrieval.py +++ b/evaluation/auto_evaluation/src/metrics/retrieval.py @@ -35,18 +35,14 @@ def make_contextual_recall_metric(model: DeepEvalBaseLLM) -> ContextualRecallMet def make_contextual_relevancy_metric( model: DeepEvalBaseLLM, ) -> ContextualRelevancyMetric: - return ContextualRelevancyMetric( - threshold=RELEVANCY_THRESHOLD, - model=model, - include_reason=True, + raise NotImplementedError( + "ContextualRelevancyMetric is not implemented due to protobuf incompatability" ) def 
make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric: - return FaithfulnessMetric( - threshold=FAITHFULNESS_THRESHOLD, - model=model, - include_reason=True, + raise NotImplementedError( + "FaithfulnessMetric is not implemented due to protobuf incompatability" ) diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py index 0edc6aeb..ecffb108 100644 --- a/evaluation/auto_evaluation/src/models/vertex_ai.py +++ b/evaluation/auto_evaluation/src/models/vertex_ai.py @@ -3,10 +3,18 @@ Custom DeepEvalLLM wrapper. """ +import instructor + from typing import Any -from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory +# from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory +from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory # type: ignore from deepeval.models.base_model import DeepEvalBaseLLM +from pydantic import BaseModel + + +class Response(BaseModel): + content: str class GoogleVertexAILangChain(DeepEvalBaseLLM): @@ -26,17 +34,43 @@ def load_model(self, *args, **kwargs): HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, } - return ChatVertexAI( + return GenerativeModel( model_name=self.model_name, safety_settings=safety_settings, ) - def generate(self, prompt: str) -> Any: - return self.model.invoke(prompt).content + def generate(self, prompt: str, schema: BaseModel) -> Any: + instructor_client = instructor.from_vertexai( + client=self.load_model(), + mode=instructor.Mode.VERTEXAI_TOOLS, + ) + resp = instructor_client.messages.create( # type: ignore + messages=[ + { + "role": "user", + "content": prompt, + } + ], + response_model=schema, + ) + return resp - async def a_generate(self, prompt: str) -> Any: - response = await self.model.ainvoke(prompt) - return response.content + async def a_generate(self, prompt: str, schema: BaseModel) -> Any: + instructor_client = instructor.from_vertexai( + client=self.load_model(), + mode=instructor.Mode.VERTEXAI_TOOLS, + _async=True, + ) + resp = await instructor_client.messages.create( # type: ignore + messages=[ + { + "role": "user", + "content": prompt, + } + ], + response_model=schema, + ) + return resp def get_model_name(self): return self.model_name @@ -46,7 +80,7 @@ def main(): model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") prompt = "Write me a joke" print(f"Prompt: {prompt}") - response = model.generate(prompt) + response = model.generate(prompt, schema=Response) print(f"Response: {response}") @@ -54,13 +88,14 @@ async def main_async(): model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") prompt = "Write me a joke" print(f"Prompt: {prompt}") - response = await model.a_generate(prompt) + response = await model.a_generate(prompt, Response) print(f"Response: {response}") if __name__ == "__main__": + import asyncio from dotenv import load_dotenv load_dotenv() - main() - # asyncio.run(main_async()) + # main() + asyncio.run(main_async()) diff --git a/evaluation/requirements.txt b/evaluation/requirements.txt index 22f269d4..96e6f1ae 100644 --- a/evaluation/requirements.txt +++ b/evaluation/requirements.txt @@ -12,3 +12,4 @@ deepeval==1.4.9 langchain-google-vertexai==2.0.6 asyncio==3.4.3 huggingface-hub==0.26.2 +instructor[vertexai]==1.5.2 From 80e2da97f62d89b0ffe761cf6ce8808da8143786 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 12:49:36 +0000 Subject: [PATCH 06/10] populate env variables in 1 step 
Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9b8f33b8..dcbf0b76 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,14 +28,16 @@ jobs: cp backend/.env.example backend/.env sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' backend/.env sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|src/secret.json|g' backend/.env + cp backend/.env evaluation/.env + cp backend/.env frontend/.env cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src + cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/src - name: Build Docker image run: | make docker - name: Run LLM CI working-directory: evaluation run: | - cp ../backend/.env . make llm-tests - name: Create commit comment uses: peter-evans/commit-comment@v3 From b76dfb0c5b526c971d7422c6b866d2a2e2cec152 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 13:17:09 +0000 Subject: [PATCH 07/10] Copy secret.json to correct path Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index dcbf0b76..e5b86790 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -31,7 +31,7 @@ jobs: cp backend/.env evaluation/.env cp backend/.env frontend/.env cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src - cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/src + cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/auto_evaluation/src - name: Build Docker image run: | make docker From bff18861d2315c230933223db5b5de58c4e37e2d Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 13:38:18 +0000 Subject: [PATCH 08/10] add sleep to make sure docker comes up and is alive Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e5b86790..d6cf4205 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -35,6 +35,7 @@ jobs: - name: Build Docker image run: | make docker + sleep 900 # TODO: Remove this after docker-compose healthcheck timeout restored fixed. 
- name: Run LLM CI working-directory: evaluation run: | From 61d54ca7ace4d0e4b60a9440c46b9b11d1f9160e Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 14:31:37 +0000 Subject: [PATCH 09/10] evaluate all qns Signed-off-by: Jack Luar --- evaluation/auto_evaluation/eval_main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py index 1676b481..aac81503 100644 --- a/evaluation/auto_evaluation/eval_main.py +++ b/evaluation/auto_evaluation/eval_main.py @@ -72,8 +72,6 @@ def evaluate(self, retriever: str): # retrieval test cases for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")): - if i >= 1: - continue question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] response, response_time = self.query(retriever, question) response_text = response["response"] From b3f05ef5c81a3fee1de1a8af809af4bd36dbf5f6 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 15:23:11 +0000 Subject: [PATCH 10/10] cleanup and use average stats Signed-off-by: Jack Luar --- evaluation/auto_evaluation/dataset/preprocess.py | 8 ++++++-- evaluation/auto_evaluation/src/models/vertex_ai.py | 2 -- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/evaluation/auto_evaluation/dataset/preprocess.py b/evaluation/auto_evaluation/dataset/preprocess.py index 49c59f0f..3a7c6f9f 100644 --- a/evaluation/auto_evaluation/dataset/preprocess.py +++ b/evaluation/auto_evaluation/dataset/preprocess.py @@ -48,8 +48,12 @@ def read_deepeval_cache(): metric["metric_data"]["success"] ) - print("Metric Scores: ", metric_scores) - print("Metric Passes: ", metric_passes) + print("Average Metric Scores: ") + for key, value in metric_scores.items(): + print(key, sum(value) / len(value)) + print("Metric Passrates: ") + for key, value in metric_passes.items(): + print(key, value.count(True) / len(value)) if __name__ == "__main__": diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py index ecffb108..31a64748 100644 --- a/evaluation/auto_evaluation/src/models/vertex_ai.py +++ b/evaluation/auto_evaluation/src/models/vertex_ai.py @@ -6,8 +6,6 @@ import instructor from typing import Any - -# from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory # type: ignore from deepeval.models.base_model import DeepEvalBaseLLM from pydantic import BaseModel
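
Note: the ".deepeval-cache.json" layout that read_deepeval_cache() walks (test_cases_lookup_map -> cached_metrics_data -> metric_data) matches the deleted retrieval_metrics.json shown in PATCH 01/10. Below is a minimal standalone sketch of the average-score/pass-rate aggregation introduced in PATCH 05/10 and refined in PATCH 10/10, written against that assumed cache layout but without the hardcoded metric-name dictionaries; the file and function names are illustrative only and are not part of the patches above.

    import json
    from collections import defaultdict


    def aggregate_deepeval_cache(cache_path: str = ".deepeval-cache.json") -> dict[str, dict[str, float]]:
        """Average score and pass rate per metric found in a deepeval cache file."""
        with open(cache_path) as f:
            cache = json.load(f)

        scores: dict[str, list[float]] = defaultdict(list)
        passes: dict[str, list[bool]] = defaultdict(list)
        for test_case in cache["test_cases_lookup_map"].values():
            for cached in test_case["cached_metrics_data"]:
                data = cached["metric_data"]  # holds "name", "score", "success", ...
                scores[data["name"]].append(data["score"])
                passes[data["name"]].append(data["success"])

        return {
            name: {
                "avg_score": sum(vals) / len(vals),
                "pass_rate": passes[name].count(True) / len(passes[name]),
            }
            for name, vals in scores.items()
        }


    if __name__ == "__main__":
        for metric, stats in aggregate_deepeval_cache().items():
            print(f"{metric}: avg={stats['avg_score']:.2f}, pass rate={stats['pass_rate']:.0%}")

Keying the accumulators on whatever metric names appear in the cache keeps the aggregation valid if metrics are added or removed in eval_main.py later, for example once the protobuf issue blocking ContextualRelevancyMetric and FaithfulnessMetric is resolved.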