Commit c61751a

Deepeval (#22)
* test prompt logging
* add deepeval
* add test entry point
* update deepeval version
* add initial test
* add deepeval cache
* improve test separation
* add init unit testing docs
* ignore deepeval env file
* remove legacy evaluators and improve source node handling
* improve source node handling
* add unit testing docs
* finish domain specific tests
* fix source hyperlinks
1 parent 8f068af commit c61751a

7 files changed: +363 −38 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -6,3 +6,5 @@ evaluation_results/
 site/
 prompt.md
 summary.md
+.deepeval-cache.json
+.deepeval

bcorag/bcorag.py

Lines changed: 11 additions & 37 deletions
@@ -12,7 +12,6 @@
 from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core.query_engine import RetrieverQueryEngine
-from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
 from llama_index.llms.openai import OpenAI  # type: ignore
 from llama_index.embeddings.openai import OpenAIEmbedding  # type: ignore
 from llama_index.core.node_parser import SemanticSplitterNodeParser
@@ -28,14 +27,14 @@
 import os
 from contextlib import contextmanager, redirect_stdout
 import json
-from . import EVALUATION_LLM
 from .custom_types.core_types import (
     GitData,
     GitFilter,
     GitFilters,
     UserSelections,
     DomainKey,
     DomainContent,
+    add_source_nodes,
     default_domain_content,
 )
 from .custom_types.output_map_types import (
@@ -50,6 +49,9 @@
 import bcorag.misc_functions as misc_fns
 from .prompts import DOMAIN_MAP, QUERY_PROMPT, SUPPLEMENT_PROMPT
 
+# import llama_index.core
+# llama_index.core.set_global_handler("simple")
+
 
 @contextmanager
 def supress_stdout():
@@ -99,23 +101,22 @@ class BcoRag:
         The token counts or None if in production mode.
     _git_data : GitData or None
         The git data or None if no github repo was included.
-    _faithfulness_evaluator : Optional[FaithfulnessEvaluator]
-        The faithfulness evalauator instance.
-    _relevancy_evaluator : Optional[RelevancyEvaluator]
-        The relevancy evaluator instance.
     _documents : list[Documents]
         The list of documents (containers for the data source).
     _index : VectorStoreIndex
         The vector store index instance.
     _query_engine : RetrieverQueryEngine
         The query engine.
+    _other_docs : list[str] | None
+        Any other miscellaneous documents to include in the indexing process.
+    _domain_content : DomainContent
+        Holds the most recent generated domain.
     """

     def __init__(
         self,
         user_selections: UserSelections,
         output_dir: str = "./output",
-        evaluation_metrics: bool = False,
     ):
         """Constructor.
@@ -161,12 +162,6 @@ def __init__(
         )
         self._other_docs: list[str] | None = user_selections["other_docs"]
         self.domain_content: DomainContent = default_domain_content()
-        self._faithfulness_evaluator: Optional[FaithfulnessEvaluator] = None
-        self._relevancy_evaluator: Optional[RelevancyEvaluator] = None
-        if evaluation_metrics:
-            _evaluation_llm = OpenAI(model=EVALUATION_LLM, temperature=0.0)
-            self._faithfulness_evaluator = FaithfulnessEvaluator(llm=_evaluation_llm)
-            self._relevancy_evaluator = RelevancyEvaluator(llm=_evaluation_llm)

         openai_api_key = os.getenv("OPENAI_API_KEY")
         if not openai_api_key:
@@ -354,6 +349,9 @@ def perform_query(self, domain: DomainKey) -> str:
         query_response = str(response_object.response)

         self.domain_content[domain] = query_response
+        self.domain_content = add_source_nodes(
+            domain_content=self.domain_content, nodes=response_object.source_nodes
+        )

         source_str = ""
         for idx, source_node in enumerate(response_object.source_nodes):
@@ -370,30 +368,6 @@ def perform_query(self, domain: DomainKey) -> str:
             )
             source_str += "\n"

-        if self._faithfulness_evaluator and self._relevancy_evaluator:
-            for idx, source_node in enumerate(response_object.source_nodes):
-                faithfulness_eval = self._faithfulness_evaluator.evaluate(
-                    response=response_object.response,
-                    contexts=[source_node.get_content()],
-                )
-                relevancy_eval = self._relevancy_evaluator.evaluate(
-                    query=query_prompt,
-                    response=response_object.response,
-                    contexts=[source_node.get_content()],
-                )
-                for name, eval in {
-                    "faithfulness": faithfulness_eval,
-                    "relevancy": relevancy_eval,
-                }.items():
-                    self._display_info(
-                        {
-                            "passing": eval.passing,
-                            "score": eval.score,
-                            "feedback": eval.feedback,
-                        },
-                        f"{name.title()} Evaluation for node {idx + 1}:",
-                    )
-
         if self._debug:
             self._display_info(query_prompt, f"QUERY PROMPT for the {domain} domain:")
         self._token_counts["input"] += self._token_counter.prompt_llm_token_count  # type: ignore

bcorag/custom_types/core_types.py

Lines changed: 52 additions & 1 deletion
@@ -16,6 +16,7 @@
 from typing import TypedDict, Optional, Literal
 from enum import Enum
 from llama_index.readers.github import GithubRepositoryReader  # type: ignore
+from llama_index.core.schema import NodeWithScore

 ### General literals

@@ -252,6 +253,23 @@ def create_user_selections(
 ### Most recent generated domain schema


+class SourceNode(TypedDict):
+    """Holds the source node information for one node.
+
+    Attributes
+    ----------
+    node_id : str
+    content : str
+    metadata : str
+    score : str
+    """
+
+    node_id: str
+    content: str
+    metadata: str
+    score: str
+
+
 class DomainContent(TypedDict):
     """Holds the most recent generated domain for in memory storage.

@@ -262,7 +280,7 @@ class DomainContent(TypedDict):
     description: Optional[str]
     execution: Optional[str]
     parametric: Optional[str]
-    error: Optional[str]
+    error: Optional[list[str]]
     """

     usability: Optional[str]
@@ -271,6 +289,7 @@ class DomainContent(TypedDict):
     execution: Optional[str]
     parametric: Optional[str]
     error: Optional[str]
+    last_source_nodes: Optional[list[SourceNode]]


 def default_domain_content() -> DomainContent:
@@ -287,10 +306,42 @@ def default_domain_content() -> DomainContent:
         "execution": None,
         "parametric": None,
         "error": None,
+        "last_source_nodes": None,
     }
     return return_data


+def add_source_nodes(
+    domain_content: DomainContent, nodes: list[NodeWithScore]
+) -> DomainContent:
+    """Adds source node data to the domain content.
+
+    Parameters
+    ----------
+    domain_content : DomainContent
+        The domain content instance to add source node data to.
+    nodes : list[NodeWithScore]
+        List of nodes with score data.
+
+    Returns
+    -------
+    DomainContent
+        The updated domain content object.
+    """
+    node_list: list[SourceNode] = []
+    for node in nodes:
+        node_list.append(
+            {
+                "node_id": node.node.node_id,
+                "content": node.node.get_content(),
+                "metadata": node.node.get_metadata_str(),
+                "score": str(node.score),
+            }
+        )
+    domain_content["last_source_nodes"] = node_list
+    return domain_content
+
+
 ### Domain map prompting schemas

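The new `last_source_nodes` field keeps the retrieval context of the most recent generation available after the fact. A minimal consumption sketch (hedged; the `retrieval_context` helper below is illustrative and not part of this commit):

```python
from bcorag.custom_types.core_types import DomainContent


def retrieval_context(domain_content: DomainContent) -> list[str]:
    """Collect the retrieved chunk texts recorded by add_source_nodes().

    Illustrative helper only; returns an empty list if no query has run yet.
    """
    return [node["content"] for node in (domain_content["last_source_nodes"] or [])]
```

This is the shape of data that retrieval-grounded checks (such as the faithfulness metric described below) need as their context.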

docs/unit-testing.md

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+# Automated Testing
+
+The `test_bco_rag.py` script contains a suite of tests designed to evaluate the functionality of the BcoRag tool using the `pytest` framework and the open source LLM evaluation framework [DeepEval](https://docs.confident-ai.com/).
+
+## Test Cases
+
+There is one test case for each domain:
+
+- `test_usability`
+- `test_io`
+- `test_description`
+- `test_execution`
+- `test_parametric`
+- `test_error`
+
+## Test Metrics
+
+The test suite evaluates two different metrics:
+
+**Answer Relevancy**:
+
+The answer relevancy metric evaluates how relevant the finalized generated output (in our case, the generated domain) is to the original input prompt. It assesses relevancy (does the generated content directly relate to the question at hand), appropriateness (is the content appropriate given the context of the input), and focus (does the content stay on topic).
+
+> The answer relevancy metric measures the quality of your RAG pipeline's generator by evaluating how relevant the `actual_output` of your LLM application is compared to the provided input.
+
+- Source: [Answer Relevancy](https://docs.confident-ai.com/docs/metrics-answer-relevancy)
+
+**Faithfulness**:
+
+The faithfulness metric assesses how accurate and truthful the finalized generated output (in our case, the generated domain) is with respect to the source material (the retrieved content). It aims to ensure that the content is relevant, factual, and does not contradict the information gathered during the retrieval step.
+
+> The faithfulness metric measures the quality of your RAG pipeline's generator by evaluating whether the `actual_output` factually aligns with the contents of your `retrieval_context`.
+
+- Source: [Faithfulness](https://docs.confident-ai.com/docs/metrics-faithfulness)
+
+## Running The Tests
+
+Running all the tests at once is not recommended, as the test suite uses `gpt-4o` in the backend to evaluate the above metrics.
+
+To run one test at a time:
+
+`deepeval test run test_bco_rag.py::test_{domain}`
+
+To run all the tests at once:
+
+`deepeval test run test_bco_rag.py`
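For orientation, a single domain test in this style builds an `LLMTestCase` from the query prompt, the generated domain, and the retrieved source node contents, then asserts both metrics. A minimal sketch, with placeholder inputs standing in for real BcoRag output (the actual test bodies live in `test_bco_rag.py` and are not reproduced here):

```python
# Hedged sketch of a DeepEval domain test; placeholder strings below stand in
# for the generated domain and retrieval context produced by the RAG pipeline.
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# Placeholder (hypothetical) inputs.
USABILITY_PROMPT = "Generate the usability domain for the attached paper."
USABILITY_OUTPUT = "The workflow identifies variants from whole genome sequencing data ..."
RETRIEVED_CONTEXT = ["Chunk of the source paper retrieved for this query ..."]


def test_usability() -> None:
    test_case = LLMTestCase(
        input=USABILITY_PROMPT,
        actual_output=USABILITY_OUTPUT,
        retrieval_context=RETRIEVED_CONTEXT,
    )
    # Both metrics are LLM-judged (gpt-4o here), as described above.
    metrics = [
        AnswerRelevancyMetric(threshold=0.5, model="gpt-4o"),
        FaithfulnessMetric(threshold=0.5, model="gpt-4o"),
    ]
    assert_test(test_case, metrics)
```

A sketch like this would be run one test at a time with `deepeval test run <file>::test_usability`, matching the commands above.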

mkdocs.yml

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ nav:
   - Other Features:
     - In-Progress Documentation: "in-progress.md"
     - Parameter Search: "parameter-search.md"
+    - Automated Testing: "unit-testing.md"
     - Evaluation App: "evaluation-app.md"
   - Code Documentation:
     - Core:

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ mkdocs==1.6.0
 mkdocstrings[python]
 mkdocs-material==9.5.29
 pymdown-extensions==10.8.1
+deepeval==1.1.0
