From de9775ad36ba8a32759cc8d0dbf3c00a4d33d849 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Thu, 7 Nov 2024 18:27:47 +0000 Subject: [PATCH 01/10] initial scaffold for deepeval integration + remove unnecessary files Signed-off-by: Jack Luar --- .gitignore | 1 + evaluation/Makefile | 3 +- evaluation/auto_evaluation/__init__.py | 0 .../auto_evaluation/content_metrics.json | 1 - evaluation/auto_evaluation/dataset/hf_pull.py | 7 +- .../auto_evaluation/dataset/preprocess.py | 25 +++ evaluation/auto_evaluation/demo.py | 64 -------- evaluation/auto_evaluation/eval_main.py | 147 ++++++++++++++++++ .../auto_evaluation/retrieval_metrics.json | 1 - evaluation/pyproject.toml | 3 + 10 files changed, 184 insertions(+), 68 deletions(-) create mode 100644 evaluation/auto_evaluation/__init__.py delete mode 100644 evaluation/auto_evaluation/content_metrics.json create mode 100644 evaluation/auto_evaluation/dataset/preprocess.py delete mode 100644 evaluation/auto_evaluation/demo.py create mode 100644 evaluation/auto_evaluation/eval_main.py delete mode 100644 evaluation/auto_evaluation/retrieval_metrics.json diff --git a/.gitignore b/.gitignore index 10bfa179..78dbe117 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__/ backend/data/* backend/src/*.json *.pyc +*.egg-info/ frontend/*.json evaluation/human_evaluation/*.json /*.json diff --git a/evaluation/Makefile b/evaluation/Makefile index 72878508..6b07a4a5 100644 --- a/evaluation/Makefile +++ b/evaluation/Makefile @@ -1,7 +1,8 @@ init: @python3 -m venv .venv && \ . .venv/bin/activate && \ - pip install -r requirements.txt + pip install -r requirements.txt && \ + pip install -e . init-dev: init @. .venv/bin/activate && \ diff --git a/evaluation/auto_evaluation/__init__.py b/evaluation/auto_evaluation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluation/auto_evaluation/content_metrics.json b/evaluation/auto_evaluation/content_metrics.json deleted file mode 100644 index 274e99ee..00000000 --- a/evaluation/auto_evaluation/content_metrics.json +++ /dev/null @@ -1 +0,0 @@ -{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. 
Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. 
This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}, "{\"actual_output\": \"The capital of France is Paris.\", \"context\": null, \"expected_output\": \"Paris.\", \"hyperparameters\": null, \"input\": \"What is the capital of France?\", \"retrieval_context\": null}": {"cached_metrics_data": [{"metric_data": {"name": "Answer Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the response is perfectly relevant, addressing the input directly and completely with no irrelevant information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Statements:\n[\n \"The capital of France is Paris.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Bias", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output demonstrates no discernible bias.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Toxicity", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output is entirely harmless and positive.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}} \ No newline at end of file diff --git a/evaluation/auto_evaluation/dataset/hf_pull.py b/evaluation/auto_evaluation/dataset/hf_pull.py index c0541cbf..8df9a062 100644 --- a/evaluation/auto_evaluation/dataset/hf_pull.py +++ b/evaluation/auto_evaluation/dataset/hf_pull.py @@ -1,7 +1,8 @@ from huggingface_hub import snapshot_download import os -if __name__ == "__main__": + +def main(): cur_dir = os.path.dirname(os.path.abspath(__file__)) snapshot_download( "The-OpenROAD-Project/ORAssistant_Public_Evals", @@ -13,3 +14,7 @@ "README.md", ], ) + + +if __name__ == "__main__": + main() diff --git a/evaluation/auto_evaluation/dataset/preprocess.py b/evaluation/auto_evaluation/dataset/preprocess.py new file mode 100644 index 00000000..3178bd7e --- /dev/null +++ b/evaluation/auto_evaluation/dataset/preprocess.py @@ -0,0 +1,25 @@ +import csv +from typing import Any + + +def read_data(csv_file: str) -> list[dict]: + questions = [] + with open(csv_file, "r") as f: + reader = csv.reader(f) + header = next(reader) # Skip the header row + assert len(header) == 2, "CSV file must have exactly 2 columns" + for row in reader: + questions.append( + {"question": row[0].strip(), "ground_truth": row[1].strip()} + ) + return questions + + +def write_data(results_list: list[dict[str, Any]], results_path: str): + keys = results_list[0].keys() + with open(results_path, "w") as f: + writer = csv.writer(f) + writer.writerow(list(keys)) + for result in results_list: + writer.writerow([result[key] for key in keys]) + print(f"Results written to {results_path}") diff --git a/evaluation/auto_evaluation/demo.py 
b/evaluation/auto_evaluation/demo.py deleted file mode 100644 index 7b7b909f..00000000 --- a/evaluation/auto_evaluation/demo.py +++ /dev/null @@ -1,64 +0,0 @@ -import os - -from dotenv import load_dotenv -from src.models.vertex_ai import GoogleVertexAILangChain - -# from src.metrics.geval import make_correctness_metric -from src.metrics.content import ( - make_bias_metric, - make_toxicity_metric, - make_answer_relevancy_metric, -) -from src.metrics.retrieval import ( - make_contextual_precision_metric, - make_contextual_recall_metric, - make_contextual_relevancy_metric, - make_faithfulness_metric, - make_hallucination_metric, -) -from deepeval.test_case import LLMTestCase -from deepeval import evaluate - -cur_dir = os.path.dirname(__file__) -root_dir = os.path.join(cur_dir, "../../") -load_dotenv(os.path.join(root_dir, ".env")) - -if __name__ == "__main__": - model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") - print("Retrieval metrics") - precision, recall, relevancy, faithfulness, hallucination = ( - make_contextual_precision_metric(model), - make_contextual_recall_metric(model), - make_contextual_relevancy_metric(model), - make_faithfulness_metric(model), - make_hallucination_metric(model), - ) - - test_case = LLMTestCase( - input="What if these shoes don't fit?", - actual_output="We offer a 30-day full refund at no extra cost.", - expected_output="You are eligible for a 30 day full refund at no extra cost.", - context=[ - "All customers are eligible for a 30 day full refund at no extra cost." - ], - retrieval_context=[ - "All customers are eligible for a 30 day full refund at no extra cost." - ], - ) - evaluate([test_case], [precision, recall, relevancy, faithfulness, hallucination]) - os.rename(".deepeval-cache.json", "retrieval_metrics.json") - - print("Content metrics") - answer_relevancy, bias, toxicity = ( - make_answer_relevancy_metric(model), - make_bias_metric(model), - make_toxicity_metric(model), - ) - - test_case = LLMTestCase( - input="What is the capital of France?", - actual_output="The capital of France is Paris.", - expected_output="Paris.", - ) - evaluate([test_case], [answer_relevancy, bias, toxicity]) - os.rename(".deepeval-cache.json", "content_metrics.json") diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py new file mode 100644 index 00000000..dfe2d474 --- /dev/null +++ b/evaluation/auto_evaluation/eval_main.py @@ -0,0 +1,147 @@ +""" +Evaluation script which takes in arguments to dataset and +the model to evaluate on the dataset. +""" + +import argparse +import requests +import os + +from datetime import datetime +from dotenv import load_dotenv +from deepeval.test_case import LLMTestCase +from deepeval import evaluate + +from auto_evaluation.src.models.vertex_ai import GoogleVertexAILangChain +from auto_evaluation.src.metrics.retrieval import ( + make_contextual_precision_metric, + make_contextual_recall_metric, + make_contextual_relevancy_metric, + make_faithfulness_metric, + make_hallucination_metric, +) +from auto_evaluation.dataset import hf_pull, preprocess + +load_dotenv() + +# List of all available retrievers +ALL_RETRIEVERS = { + "agent-retriever": "/graphs/agent-retriever", + "agent-retriever-reranker": "/graphs/agent-retriever", + "hybrid": "/graphs/hybrid", + "sim": "/graphs/sim", + "ensemble": "/graphs/ensemble", +} + + +class EvaluationHarness: + # TODO: Use async for EvaluationHarness. 
+ # TODO: Also requires LLM Engine to be async + def __init__(self, base_url: str, dataset: str, reranker_base_url: str = ""): + self.base_url = base_url + self.dataset = dataset + self.reranker_base_url = reranker_base_url + self.qns = preprocess.read_data(self.dataset) + self.eval_model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") + self.log_dir = "logs" + os.makedirs(self.log_dir, exist_ok=True) + self.sanity_check() + + def sanity_check(self): + if not requests.get(f"{self.base_url}/health-check").status_code == 200: + raise ValueError("Endpoint is not running") + if not os.path.exists(self.dataset): + raise ValueError("Dataset path does not exist") + if ( + self.reranker_base_url + and not requests.get(f"{self.reranker_base_url}/health-check").status_code + == 200 + ): + raise ValueError("Reranker endpoint is not running") + + def get_logfile(self, retriever: str): + return os.path.join( + self.log_dir, f"{retriever}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" + ) + + def evaluate(self, retriever: str): + log_file = self.get_logfile(retriever) + overall = [] + for i, qa_pair in enumerate(self.qns): + question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] + response, response_time = self.query(retriever, question) + response_text = response["response"] + context = response["context"] + + # deepeval parallel evals + retrieval_tc = LLMTestCase( + input=question, + actual_output=response_text, + expected_output=ground_truth, + context=context, + retrieval_context=context, + ) + print("Retrieval metrics") + precision, recall, relevancy, faithfulness, hallucination = ( + make_contextual_precision_metric(self.eval_model), + make_contextual_recall_metric(self.eval_model), + make_contextual_relevancy_metric(self.eval_model), + make_faithfulness_metric(self.eval_model), + make_hallucination_metric(self.eval_model), + ) + evaluate( + [retrieval_tc], + [precision, recall, relevancy, faithfulness, hallucination], + ) + + result = { + "question": f"{i + 1}. 
{question}", + "ground_truth": ground_truth, + "retriever_type": retriever, + "response_time": response_time, + "response_text": response_text, + "tool": retriever, + "contextual_precision": precision.score, + "contextual_recall": recall.score, + "contextual_relevancy": relevancy.score, + "faithfulness": faithfulness.score, + "hallucination": hallucination.score, + } + overall.append(result) + + # Write to log file + preprocess.write_data(overall, log_file) + + def query(self, retriever: str, query: str) -> tuple[dict, float]: + """ + Returns the response json and the time taken to get the response (ms) + """ + endpoint = ALL_RETRIEVERS[retriever] + url = ( + f"{self.base_url}/{endpoint}" + if retriever != "agent-retriever-reranker" + else f"{self.reranker_base_url}/{endpoint}" + ) + payload = {"query": query, "list_context": True, "list_sources": True} + response = requests.post(url, json=payload) + return response.json(), response.elapsed.total_seconds() * 1000 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluation script") + parser.add_argument( + "--base_url", type=str, help="Base URL of the model to evaluate" + ) + parser.add_argument( + "--reranker_base_url", type=str, help="Base URL of the reranker", default="" + ) + parser.add_argument("--dataset", type=str, help="Path to dataset to evaluate on") + parser.add_argument("--retriever", type=str, help="Retriever to evaluate on") + args = parser.parse_args() + + # Pull the dataset from huggingface hub + hf_pull.main() + + # Evaluate the model on the dataset + harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url) + harness.evaluate(args.retriever) diff --git a/evaluation/auto_evaluation/retrieval_metrics.json b/evaluation/auto_evaluation/retrieval_metrics.json deleted file mode 100644 index 085c26e7..00000000 --- a/evaluation/auto_evaluation/retrieval_metrics.json +++ /dev/null @@ -1 +0,0 @@ -{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately. 
Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. 
This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}} \ No newline at end of file diff --git a/evaluation/pyproject.toml b/evaluation/pyproject.toml index 6c8e7ebe..013e1111 100644 --- a/evaluation/pyproject.toml +++ b/evaluation/pyproject.toml @@ -20,6 +20,9 @@ classifiers = [ dependencies = { file = ["requirements.txt"] } optional-dependencies = { test = { file = ["requirements-test.txt"] } } +[tool.setuptools.packages.find] +include = ["auto_evaluation", "human_evaluation"] + [tool.mypy] python_version = "3.12" warn_unused_configs = true From 914c0aca8007c58b3d2768a7c402d5caf956594a Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sat, 9 Nov 2024 14:31:26 +0000 Subject: [PATCH 02/10] add llm_tests target and CI Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 10 ++++++++++ Makefile | 4 +++- evaluation/Makefile | 9 +++++++++ evaluation/auto_evaluation/llm_tests.sh | 18 ++++++++++++++++++ evaluation/llm_tests_output.txt | 16 ++++++++++++++++ 5 files changed, 56 insertions(+), 1 deletion(-) create mode 100755 evaluation/auto_evaluation/llm_tests.sh create mode 100644 evaluation/llm_tests_output.txt diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 31ff92d0..b7349400 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,6 +28,16 @@ jobs: - name: Build Docker image run: | make docker + - name: Run LLM CI + working-directory: evaluation + run: | + make llm-tests + - name: Create commit comment + working-directory: evaluation + uses: peter-evans/commit-comment@v3 + with: + token: ${{ secrets.GH_PATH }} + body-path: llm-tests-output.txt - name: Teardown if: always() run: | diff --git a/Makefile b/Makefile index 1ebc3f65..1c6a81fa 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,6 @@ -FOLDERS=backend frontend +.PHONY: init init-dev format check + +FOLDERS=backend frontend evaluation init: @for folder in $(FOLDERS); do (cd $$folder && make init && cd ../); done diff --git a/evaluation/Makefile b/evaluation/Makefile index 6b07a4a5..9dacad6b 100644 --- a/evaluation/Makefile +++ b/evaluation/Makefile @@ -1,3 +1,5 @@ +.PHONY: init init-dev format check clean + init: @python3 -m venv .venv && \ . .venv/bin/activate && \ @@ -16,3 +18,10 @@ format: check: @. .venv/bin/activate && \ ruff check --fix + +clean: + @rm -f llm_tests_output.txt + +llm-tests: clean + @. 
.venv/bin/activate && \ + ./auto_evaluation/llm_tests.sh > llm_tests_output.txt 2>&1 diff --git a/evaluation/auto_evaluation/llm_tests.sh b/evaluation/auto_evaluation/llm_tests.sh new file mode 100755 index 00000000..c9a12f5e --- /dev/null +++ b/evaluation/auto_evaluation/llm_tests.sh @@ -0,0 +1,18 @@ +#!/bin/bash -eu + +retrievers=( + "agent-retriever" \ + "ensemble" \ +) + +echo "===================================" +echo "==> Dataset: EDA Corpus" +for retriever in "${retrievers[@]}" ; do + echo "==> Running tests for $retriever" + python auto_evaluation/eval_main.py \ + --base_url http://localhost:8000 \ + --dataset ./auto_evaluation/dataset/EDA_Corpus_100_Question.csv \ + --retriever $retriever + echo "==> Done" +done +echo "===================================" diff --git a/evaluation/llm_tests_output.txt b/evaluation/llm_tests_output.txt new file mode 100644 index 00000000..26299ffa --- /dev/null +++ b/evaluation/llm_tests_output.txt @@ -0,0 +1,16 @@ +=================================== +==> Dataset: EDA Corpus +==> Running tests for agent-retriever +/home/luars/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/__init__.py:49: UserWarning: You are using deepeval version 1.4.9, however version 1.5.0 is available. You should consider upgrading via the "pip install --upgrade deepeval" command. + warnings.warn( + Fetching 3 files: 0%| | 0/3 [00:00 + harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/luars/ORAssistant/evaluation/auto_evaluation/eval_main.py", line 44, in __init__ + self.qns = preprocess.read_data(self.dataset) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/luars/ORAssistant/evaluation/auto_evaluation/dataset/preprocess.py", line 10, in read_data + assert len(header) == 2, "CSV file must have exactly 2 columns" +AssertionError: CSV file must have exactly 2 columns From 564445b48dba2d3318f879a661b76a0d3a6877af Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 05:38:43 +0000 Subject: [PATCH 03/10] fix CI syntax, seed env for evaluation Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 4 +- .gitignore | 5 +- evaluation/Makefile | 4 +- evaluation/auto_evaluation/eval_main.py | 96 +++++++++++-------- evaluation/auto_evaluation/llm_tests.sh | 4 +- .../auto_evaluation/src/models/vertex_ai.py | 3 + evaluation/llm_tests_output.txt | 16 ---- 7 files changed, 68 insertions(+), 64 deletions(-) delete mode 100644 evaluation/llm_tests_output.txt diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b7349400..c872253b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -31,13 +31,13 @@ jobs: - name: Run LLM CI working-directory: evaluation run: | + cp ../backend/.env . 
make llm-tests - name: Create commit comment - working-directory: evaluation uses: peter-evans/commit-comment@v3 with: token: ${{ secrets.GH_PATH }} - body-path: llm-tests-output.txt + body-path: evaluation/auto_evaluation/llm_tests_output.txt - name: Teardown if: always() run: | diff --git a/.gitignore b/.gitignore index 78dbe117..14a3dc29 100644 --- a/.gitignore +++ b/.gitignore @@ -22,7 +22,8 @@ documents.txt .venv # evaluations -.deepeval_telemtry.txt +**/.deepeval_telemtry.txt *.csv -*.deepeval-cache.json +**/.deepeval-cache.json temp_test_run_data.json +**/llm_tests_output.txt diff --git a/evaluation/Makefile b/evaluation/Makefile index 9dacad6b..d0dd015f 100644 --- a/evaluation/Makefile +++ b/evaluation/Makefile @@ -21,7 +21,9 @@ check: clean: @rm -f llm_tests_output.txt + @rm -f **/.deepeval-cache.json llm-tests: clean @. .venv/bin/activate && \ - ./auto_evaluation/llm_tests.sh > llm_tests_output.txt 2>&1 + cd auto_evaluation && \ + ./llm_tests.sh 2>&1 | tee llm_tests_output.txt diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py index dfe2d474..1b66ba0d 100644 --- a/evaluation/auto_evaluation/eval_main.py +++ b/evaluation/auto_evaluation/eval_main.py @@ -7,7 +7,6 @@ import requests import os -from datetime import datetime from dotenv import load_dotenv from deepeval.test_case import LLMTestCase from deepeval import evaluate @@ -21,8 +20,10 @@ make_hallucination_metric, ) from auto_evaluation.dataset import hf_pull, preprocess +from tqdm import tqdm # type: ignore -load_dotenv() +eval_root_path = os.path.join(os.path.dirname(__file__), "..") +load_dotenv(dotenv_path=os.path.join(eval_root_path, ".env")) # List of all available retrievers ALL_RETRIEVERS = { @@ -48,32 +49,39 @@ def __init__(self, base_url: str, dataset: str, reranker_base_url: str = ""): self.sanity_check() def sanity_check(self): - if not requests.get(f"{self.base_url}/health-check").status_code == 200: + if not requests.get(f"{self.base_url}/healthcheck").status_code == 200: raise ValueError("Endpoint is not running") if not os.path.exists(self.dataset): raise ValueError("Dataset path does not exist") if ( self.reranker_base_url - and not requests.get(f"{self.reranker_base_url}/health-check").status_code + and not requests.get(f"{self.reranker_base_url}/healthcheck").status_code == 200 ): raise ValueError("Reranker endpoint is not running") - def get_logfile(self, retriever: str): - return os.path.join( - self.log_dir, f"{retriever}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" + def evaluate(self, retriever: str): + retrieval_tcs = [] + response_times = [] + + # metrics + precision, recall, relevancy, faithfulness, hallucination = ( + make_contextual_precision_metric(self.eval_model), + make_contextual_recall_metric(self.eval_model), + make_contextual_relevancy_metric(self.eval_model), + make_faithfulness_metric(self.eval_model), + make_hallucination_metric(self.eval_model), ) - def evaluate(self, retriever: str): - log_file = self.get_logfile(retriever) - overall = [] - for i, qa_pair in enumerate(self.qns): + # retrieval test cases + for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")): + if i < 20: + continue question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] response, response_time = self.query(retriever, question) response_text = response["response"] context = response["context"] - # deepeval parallel evals retrieval_tc = LLMTestCase( input=question, actual_output=response_text, @@ -81,36 +89,33 @@ def evaluate(self, retriever: str): 
context=context, retrieval_context=context, ) - print("Retrieval metrics") - precision, recall, relevancy, faithfulness, hallucination = ( - make_contextual_precision_metric(self.eval_model), - make_contextual_recall_metric(self.eval_model), - make_contextual_relevancy_metric(self.eval_model), - make_faithfulness_metric(self.eval_model), - make_hallucination_metric(self.eval_model), - ) - evaluate( - [retrieval_tc], - [precision, recall, relevancy, faithfulness, hallucination], - ) + retrieval_tcs.append(retrieval_tc) + response_times.append(response_time) + + # parallel evaluate + evaluate( + retrieval_tcs, + [precision, recall, relevancy, faithfulness, hallucination], + ) - result = { - "question": f"{i + 1}. {question}", - "ground_truth": ground_truth, - "retriever_type": retriever, - "response_time": response_time, - "response_text": response_text, - "tool": retriever, - "contextual_precision": precision.score, - "contextual_recall": recall.score, - "contextual_relevancy": relevancy.score, - "faithfulness": faithfulness.score, - "hallucination": hallucination.score, - } - overall.append(result) + # result = { + # "question": f"{i + 1}. {question}", + # "ground_truth": ground_truth, + # "retriever_type": retriever, + # "response_time": response_time, + # "response_text": response_text, + # "tool": retriever, + # "contextual_precision": precision.score, + # "contextual_recall": recall.score, + # "contextual_relevancy": relevancy.score, + # "faithfulness": faithfulness.score, + # "hallucination": hallucination.score, + # } + # print(result) + # overall.append(result) # Write to log file - preprocess.write_data(overall, log_file) + # preprocess.write_data(overall, log_file) def query(self, retriever: str, query: str) -> tuple[dict, float]: """ @@ -123,8 +128,17 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]: else f"{self.reranker_base_url}/{endpoint}" ) payload = {"query": query, "list_context": True, "list_sources": True} - response = requests.post(url, json=payload) - return response.json(), response.elapsed.total_seconds() * 1000 + try: + response = requests.post(url, json=payload) + return response.json(), response.elapsed.total_seconds() * 1000 + except Exception as e: + print(f"Error querying {retriever}: {e}") + return { + "response": "invalid", + "sources": [], + "context": [], + "tool": "string", + }, -999999 if __name__ == "__main__": diff --git a/evaluation/auto_evaluation/llm_tests.sh b/evaluation/auto_evaluation/llm_tests.sh index c9a12f5e..fe7c3519 100755 --- a/evaluation/auto_evaluation/llm_tests.sh +++ b/evaluation/auto_evaluation/llm_tests.sh @@ -9,9 +9,9 @@ echo "===================================" echo "==> Dataset: EDA Corpus" for retriever in "${retrievers[@]}" ; do echo "==> Running tests for $retriever" - python auto_evaluation/eval_main.py \ + python eval_main.py \ --base_url http://localhost:8000 \ - --dataset ./auto_evaluation/dataset/EDA_Corpus_100_Question.csv \ + --dataset ./dataset/EDA_Corpus_100_Question.csv \ --retriever $retriever echo "==> Done" done diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py index 9d72fcbb..0edc6aeb 100644 --- a/evaluation/auto_evaluation/src/models/vertex_ai.py +++ b/evaluation/auto_evaluation/src/models/vertex_ai.py @@ -59,5 +59,8 @@ async def main_async(): if __name__ == "__main__": + from dotenv import load_dotenv + + load_dotenv() main() # asyncio.run(main_async()) diff --git a/evaluation/llm_tests_output.txt 
b/evaluation/llm_tests_output.txt deleted file mode 100644 index 26299ffa..00000000 --- a/evaluation/llm_tests_output.txt +++ /dev/null @@ -1,16 +0,0 @@ -=================================== -==> Dataset: EDA Corpus -==> Running tests for agent-retriever -/home/luars/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/__init__.py:49: UserWarning: You are using deepeval version 1.4.9, however version 1.5.0 is available. You should consider upgrading via the "pip install --upgrade deepeval" command. - warnings.warn( - Fetching 3 files: 0%| | 0/3 [00:00 - harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/luars/ORAssistant/evaluation/auto_evaluation/eval_main.py", line 44, in __init__ - self.qns = preprocess.read_data(self.dataset) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/luars/ORAssistant/evaluation/auto_evaluation/dataset/preprocess.py", line 10, in read_data - assert len(header) == 2, "CSV file must have exactly 2 columns" -AssertionError: CSV file must have exactly 2 columns From 74c87300d84ded437aac975cf6c1bbe6c19a7f19 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 06:26:32 +0000 Subject: [PATCH 04/10] use python3.12 for default workflow Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c872253b..1cf7e057 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -11,6 +11,10 @@ jobs: build-backend-docker: runs-on: self-hosted steps: + - name: Setup python + uses: actions/setup-python@v5 + with: + python-version: '3.12' - name: Checkout code uses: actions/checkout@v4 - name: Setup prereqs From 76f12a62c838b887f4bda5d04ce6733f4f89575d Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 12:36:00 +0000 Subject: [PATCH 05/10] * fix CI -> GH_PAT and list_contexts issue * add instructor for enforcing json llm outputs in deepeval * silent deepeval outputs * add json deepeval-cache parser * set two deepeval metrics as notimplemented -protobuf error Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 2 +- backend/Dockerfile | 2 +- backend/src/api/routers/graphs.py | 2 +- backend/src/tools/format_docs.py | 9 +-- .../auto_evaluation/dataset/preprocess.py | 31 ++++++++++ evaluation/auto_evaluation/eval_main.py | 41 +++++-------- evaluation/auto_evaluation/llm_tests.sh | 2 - .../auto_evaluation/src/metrics/retrieval.py | 12 ++-- .../auto_evaluation/src/models/vertex_ai.py | 57 +++++++++++++++---- evaluation/requirements.txt | 1 + 10 files changed, 103 insertions(+), 56 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1cf7e057..9b8f33b8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -40,7 +40,7 @@ jobs: - name: Create commit comment uses: peter-evans/commit-comment@v3 with: - token: ${{ secrets.GH_PATH }} + token: ${{ secrets.GH_PAT }} body-path: evaluation/auto_evaluation/llm_tests_output.txt - name: Teardown if: always() diff --git a/backend/Dockerfile b/backend/Dockerfile index bc6e29f1..bc6737f8 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py EXPOSE 8000 -CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] diff --git 
a/backend/src/api/routers/graphs.py b/backend/src/api/routers/graphs.py index 0666ab78..93e8b13f 100644 --- a/backend/src/api/routers/graphs.py +++ b/backend/src/api/routers/graphs.py @@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse: tool_index = 1 for tool in tools: urls.extend(list(output[tool_index].values())[0]["urls"]) - context.extend(list(set(list(output[tool_index].values())[0]["context"]))) + context.append(list(output[tool_index].values())[0]["context"]) tool_index += 1 else: llm_response = "LLM response extraction failed" diff --git a/backend/src/tools/format_docs.py b/backend/src/tools/format_docs.py index bcd9fbf8..a2376c41 100644 --- a/backend/src/tools/format_docs.py +++ b/backend/src/tools/format_docs.py @@ -5,7 +5,7 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]: doc_text = "" - doc_texts = "" + doc_texts = [] doc_urls = [] doc_srcs = [] @@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]: doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}" else: doc_text = doc.page_content + doc_texts.append(doc_text) if "url" in doc.metadata: doc_urls.append(doc.metadata["url"]) + + doc_output = "\n\n -------------------------- \n\n".join(doc_texts) - doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}" - - return doc_texts, doc_srcs, doc_urls + return doc_output, doc_srcs, doc_urls diff --git a/evaluation/auto_evaluation/dataset/preprocess.py b/evaluation/auto_evaluation/dataset/preprocess.py index 3178bd7e..49c59f0f 100644 --- a/evaluation/auto_evaluation/dataset/preprocess.py +++ b/evaluation/auto_evaluation/dataset/preprocess.py @@ -1,4 +1,5 @@ import csv +import json from typing import Any @@ -23,3 +24,33 @@ def write_data(results_list: list[dict[str, Any]], results_path: str): for result in results_list: writer.writerow([result[key] for key in keys]) print(f"Results written to {results_path}") + + +def read_deepeval_cache(): + metric_scores = { + "Contextual Precision": [], + "Contextual Recall": [], + "Hallucination": [], + } + metric_passes = { + "Contextual Precision": [], + "Contextual Recall": [], + "Hallucination": [], + } + with open(".deepeval-cache.json") as f: + results = json.load(f) + for _, value in results["test_cases_lookup_map"].items(): + for metric in value["cached_metrics_data"]: + metric_scores[metric["metric_data"]["name"]].append( + metric["metric_data"]["score"] + ) + metric_passes[metric["metric_data"]["name"]].append( + metric["metric_data"]["success"] + ) + + print("Metric Scores: ", metric_scores) + print("Metric Passes: ", metric_passes) + + +if __name__ == "__main__": + read_deepeval_cache() diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py index 1b66ba0d..1676b481 100644 --- a/evaluation/auto_evaluation/eval_main.py +++ b/evaluation/auto_evaluation/eval_main.py @@ -4,6 +4,7 @@ """ import argparse +import time import requests import os @@ -15,8 +16,6 @@ from auto_evaluation.src.metrics.retrieval import ( make_contextual_precision_metric, make_contextual_recall_metric, - make_contextual_relevancy_metric, - make_faithfulness_metric, make_hallucination_metric, ) from auto_evaluation.dataset import hf_pull, preprocess @@ -65,29 +64,29 @@ def evaluate(self, retriever: str): response_times = [] # metrics - precision, recall, relevancy, faithfulness, hallucination = ( + precision, recall, hallucination = ( make_contextual_precision_metric(self.eval_model), 
make_contextual_recall_metric(self.eval_model), - make_contextual_relevancy_metric(self.eval_model), - make_faithfulness_metric(self.eval_model), make_hallucination_metric(self.eval_model), ) # retrieval test cases for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")): - if i < 20: + if i >= 1: continue question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] response, response_time = self.query(retriever, question) response_text = response["response"] context = response["context"] + context_list = context[0].split("--------------------------") + # works for: precision, recall, hallucination retrieval_tc = LLMTestCase( input=question, actual_output=response_text, expected_output=ground_truth, - context=context, - retrieval_context=context, + context=context_list, + retrieval_context=context_list, ) retrieval_tcs.append(retrieval_tc) response_times.append(response_time) @@ -95,27 +94,12 @@ def evaluate(self, retriever: str): # parallel evaluate evaluate( retrieval_tcs, - [precision, recall, relevancy, faithfulness, hallucination], + [precision, recall, hallucination], + print_results=False, ) - # result = { - # "question": f"{i + 1}. {question}", - # "ground_truth": ground_truth, - # "retriever_type": retriever, - # "response_time": response_time, - # "response_text": response_text, - # "tool": retriever, - # "contextual_precision": precision.score, - # "contextual_recall": recall.score, - # "contextual_relevancy": relevancy.score, - # "faithfulness": faithfulness.score, - # "hallucination": hallucination.score, - # } - # print(result) - # overall.append(result) - - # Write to log file - # preprocess.write_data(overall, log_file) + # parse deepeval results + preprocess.read_deepeval_cache() def query(self, retriever: str, query: str) -> tuple[dict, float]: """ @@ -127,8 +111,9 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]: if retriever != "agent-retriever-reranker" else f"{self.reranker_base_url}/{endpoint}" ) - payload = {"query": query, "list_context": True, "list_sources": True} + payload = {"query": query, "list_context": True, "list_sources": False} try: + time.sleep(5) response = requests.post(url, json=payload) return response.json(), response.elapsed.total_seconds() * 1000 except Exception as e: diff --git a/evaluation/auto_evaluation/llm_tests.sh b/evaluation/auto_evaluation/llm_tests.sh index fe7c3519..d44cec1b 100755 --- a/evaluation/auto_evaluation/llm_tests.sh +++ b/evaluation/auto_evaluation/llm_tests.sh @@ -2,7 +2,6 @@ retrievers=( "agent-retriever" \ - "ensemble" \ ) echo "===================================" @@ -13,6 +12,5 @@ for retriever in "${retrievers[@]}" ; do --base_url http://localhost:8000 \ --dataset ./dataset/EDA_Corpus_100_Question.csv \ --retriever $retriever - echo "==> Done" done echo "===================================" diff --git a/evaluation/auto_evaluation/src/metrics/retrieval.py b/evaluation/auto_evaluation/src/metrics/retrieval.py index cd7d286d..fc6470df 100644 --- a/evaluation/auto_evaluation/src/metrics/retrieval.py +++ b/evaluation/auto_evaluation/src/metrics/retrieval.py @@ -35,18 +35,14 @@ def make_contextual_recall_metric(model: DeepEvalBaseLLM) -> ContextualRecallMet def make_contextual_relevancy_metric( model: DeepEvalBaseLLM, ) -> ContextualRelevancyMetric: - return ContextualRelevancyMetric( - threshold=RELEVANCY_THRESHOLD, - model=model, - include_reason=True, + raise NotImplementedError( + "ContextualRelevancyMetric is not implemented due to protobuf incompatability" ) def 
make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric: - return FaithfulnessMetric( - threshold=FAITHFULNESS_THRESHOLD, - model=model, - include_reason=True, + raise NotImplementedError( + "FaithfulnessMetric is not implemented due to protobuf incompatability" ) diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py index 0edc6aeb..ecffb108 100644 --- a/evaluation/auto_evaluation/src/models/vertex_ai.py +++ b/evaluation/auto_evaluation/src/models/vertex_ai.py @@ -3,10 +3,18 @@ Custom DeepEvalLLM wrapper. """ +import instructor + from typing import Any -from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory +# from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory +from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory # type: ignore from deepeval.models.base_model import DeepEvalBaseLLM +from pydantic import BaseModel + + +class Response(BaseModel): + content: str class GoogleVertexAILangChain(DeepEvalBaseLLM): @@ -26,17 +34,43 @@ def load_model(self, *args, **kwargs): HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, } - return ChatVertexAI( + return GenerativeModel( model_name=self.model_name, safety_settings=safety_settings, ) - def generate(self, prompt: str) -> Any: - return self.model.invoke(prompt).content + def generate(self, prompt: str, schema: BaseModel) -> Any: + instructor_client = instructor.from_vertexai( + client=self.load_model(), + mode=instructor.Mode.VERTEXAI_TOOLS, + ) + resp = instructor_client.messages.create( # type: ignore + messages=[ + { + "role": "user", + "content": prompt, + } + ], + response_model=schema, + ) + return resp - async def a_generate(self, prompt: str) -> Any: - response = await self.model.ainvoke(prompt) - return response.content + async def a_generate(self, prompt: str, schema: BaseModel) -> Any: + instructor_client = instructor.from_vertexai( + client=self.load_model(), + mode=instructor.Mode.VERTEXAI_TOOLS, + _async=True, + ) + resp = await instructor_client.messages.create( # type: ignore + messages=[ + { + "role": "user", + "content": prompt, + } + ], + response_model=schema, + ) + return resp def get_model_name(self): return self.model_name @@ -46,7 +80,7 @@ def main(): model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") prompt = "Write me a joke" print(f"Prompt: {prompt}") - response = model.generate(prompt) + response = model.generate(prompt, schema=Response) print(f"Response: {response}") @@ -54,13 +88,14 @@ async def main_async(): model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") prompt = "Write me a joke" print(f"Prompt: {prompt}") - response = await model.a_generate(prompt) + response = await model.a_generate(prompt, Response) print(f"Response: {response}") if __name__ == "__main__": + import asyncio from dotenv import load_dotenv load_dotenv() - main() - # asyncio.run(main_async()) + # main() + asyncio.run(main_async()) diff --git a/evaluation/requirements.txt b/evaluation/requirements.txt index 22f269d4..96e6f1ae 100644 --- a/evaluation/requirements.txt +++ b/evaluation/requirements.txt @@ -12,3 +12,4 @@ deepeval==1.4.9 langchain-google-vertexai==2.0.6 asyncio==3.4.3 huggingface-hub==0.26.2 +instructor[vertexai]==1.5.2 From 80e2da97f62d89b0ffe761cf6ce8808da8143786 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 12:49:36 +0000 Subject: [PATCH 06/10] populate env variables in 1 step 
Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9b8f33b8..dcbf0b76 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,14 +28,16 @@ jobs: cp backend/.env.example backend/.env sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' backend/.env sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|src/secret.json|g' backend/.env + cp backend/.env evaluation/.env + cp backend/.env frontend/.env cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src + cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/src - name: Build Docker image run: | make docker - name: Run LLM CI working-directory: evaluation run: | - cp ../backend/.env . make llm-tests - name: Create commit comment uses: peter-evans/commit-comment@v3 From b76dfb0c5b526c971d7422c6b866d2a2e2cec152 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 13:17:09 +0000 Subject: [PATCH 07/10] Copy secret.json to correct path Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index dcbf0b76..e5b86790 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -31,7 +31,7 @@ jobs: cp backend/.env evaluation/.env cp backend/.env frontend/.env cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src - cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/src + cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/auto_evaluation/src - name: Build Docker image run: | make docker From bff18861d2315c230933223db5b5de58c4e37e2d Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 13:38:18 +0000 Subject: [PATCH 08/10] add sleep to make sure docker comes up and is alive Signed-off-by: Jack Luar --- .github/workflows/ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e5b86790..d6cf4205 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -35,6 +35,7 @@ jobs: - name: Build Docker image run: | make docker + sleep 900 # TODO: Remove this after docker-compose healthcheck timeout restored fixed. 
- name: Run LLM CI working-directory: evaluation run: | From 61d54ca7ace4d0e4b60a9440c46b9b11d1f9160e Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 14:31:37 +0000 Subject: [PATCH 09/10] evaluate all qns Signed-off-by: Jack Luar --- evaluation/auto_evaluation/eval_main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py index 1676b481..aac81503 100644 --- a/evaluation/auto_evaluation/eval_main.py +++ b/evaluation/auto_evaluation/eval_main.py @@ -72,8 +72,6 @@ def evaluate(self, retriever: str): # retrieval test cases for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")): - if i >= 1: - continue question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] response, response_time = self.query(retriever, question) response_text = response["response"] From b3f05ef5c81a3fee1de1a8af809af4bd36dbf5f6 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 10 Nov 2024 15:23:11 +0000 Subject: [PATCH 10/10] cleanup and use average stats Signed-off-by: Jack Luar --- evaluation/auto_evaluation/dataset/preprocess.py | 8 ++++++-- evaluation/auto_evaluation/src/models/vertex_ai.py | 2 -- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/evaluation/auto_evaluation/dataset/preprocess.py b/evaluation/auto_evaluation/dataset/preprocess.py index 49c59f0f..3a7c6f9f 100644 --- a/evaluation/auto_evaluation/dataset/preprocess.py +++ b/evaluation/auto_evaluation/dataset/preprocess.py @@ -48,8 +48,12 @@ def read_deepeval_cache(): metric["metric_data"]["success"] ) - print("Metric Scores: ", metric_scores) - print("Metric Passes: ", metric_passes) + print("Average Metric Scores: ") + for key, value in metric_scores.items(): + print(key, sum(value) / len(value)) + print("Metric Passrates: ") + for key, value in metric_passes.items(): + print(key, value.count(True) / len(value)) if __name__ == "__main__": diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py index ecffb108..31a64748 100644 --- a/evaluation/auto_evaluation/src/models/vertex_ai.py +++ b/evaluation/auto_evaluation/src/models/vertex_ai.py @@ -6,8 +6,6 @@ import instructor from typing import Any - -# from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory # type: ignore from deepeval.models.base_model import DeepEvalBaseLLM from pydantic import BaseModel
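
Note: the ".deepeval-cache.json" layout that read_deepeval_cache() walks (test_cases_lookup_map -> cached_metrics_data -> metric_data) matches the deleted retrieval_metrics.json shown in PATCH 01/10. Below is a minimal standalone sketch of the average-score/pass-rate aggregation introduced in PATCH 05/10 and refined in PATCH 10/10, written against that assumed cache layout but without the hardcoded metric-name dictionaries; the file and function names are illustrative only and are not part of the patches above.

    import json
    from collections import defaultdict


    def aggregate_deepeval_cache(cache_path: str = ".deepeval-cache.json") -> dict[str, dict[str, float]]:
        """Average score and pass rate per metric found in a deepeval cache file."""
        with open(cache_path) as f:
            cache = json.load(f)

        scores: dict[str, list[float]] = defaultdict(list)
        passes: dict[str, list[bool]] = defaultdict(list)
        for test_case in cache["test_cases_lookup_map"].values():
            for cached in test_case["cached_metrics_data"]:
                data = cached["metric_data"]  # holds "name", "score", "success", ...
                scores[data["name"]].append(data["score"])
                passes[data["name"]].append(data["success"])

        return {
            name: {
                "avg_score": sum(vals) / len(vals),
                "pass_rate": passes[name].count(True) / len(passes[name]),
            }
            for name, vals in scores.items()
        }


    if __name__ == "__main__":
        for metric, stats in aggregate_deepeval_cache().items():
            print(f"{metric}: avg={stats['avg_score']:.2f}, pass rate={stats['pass_rate']:.0%}")

Keying the accumulators on whatever metric names appear in the cache keeps the aggregation valid if metrics are added or removed in eval_main.py later, for example once the protobuf issue blocking ContextualRelevancyMetric and FaithfulnessMetric is resolved.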