Commit c61751a

Deepeval (#22)
* test prompt logging
* add deepeval
* add test entry point
* update deepeval version
* add initial test
* add deepeval cache
* improve test separation
* add init unit testing docs
* ignore deepeval env file
* remove legacy evaluators and improve source node handling
* improve source node handling
* add unit testing docs
* finish domain specific tests
* fix source hyperlinks
1 parent 8f068af commit c61751a

7 files changed: +363 −38 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -6,3 +6,5 @@ evaluation_results/
 site/
 prompt.md
 summary.md
+.deepeval-cache.json
+.deepeval

bcorag/bcorag.py

Lines changed: 11 additions & 37 deletions
@@ -12,7 +12,6 @@
 from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core.query_engine import RetrieverQueryEngine
-from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
 from llama_index.llms.openai import OpenAI  # type: ignore
 from llama_index.embeddings.openai import OpenAIEmbedding  # type: ignore
 from llama_index.core.node_parser import SemanticSplitterNodeParser
@@ -28,14 +27,14 @@
 import os
 from contextlib import contextmanager, redirect_stdout
 import json
-from . import EVALUATION_LLM
 from .custom_types.core_types import (
     GitData,
     GitFilter,
     GitFilters,
     UserSelections,
     DomainKey,
     DomainContent,
+    add_source_nodes,
     default_domain_content,
 )
 from .custom_types.output_map_types import (
@@ -50,6 +49,9 @@
 import bcorag.misc_functions as misc_fns
 from .prompts import DOMAIN_MAP, QUERY_PROMPT, SUPPLEMENT_PROMPT
 
+# import llama_index.core
+# llama_index.core.set_global_handler("simple")
+
 
 @contextmanager
 def supress_stdout():
@@ -99,23 +101,22 @@ class BcoRag:
         The token counts or None if in production mode.
     _git_data : GitData or None
         The git data or None if no github repo was included.
-    _faithfulness_evaluator : Optional[FaithfulnessEvaluator]
-        The faithfulness evalauator instance.
-    _relevancy_evaluator : Optional[RelevancyEvaluator]
-        The relevancy evaluator instance.
     _documents : list[Documents]
         The list of documents (containers for the data source).
     _index : VectorStoreIndex
         The vector store index instance.
     _query_engine : RetrieverQueryEngine
         The query engine.
+    _other_docs : list[str] | None
+        Any other miscellaneous documents to include in the indexing process.
+    _domain_content : DomainContent
+        Holds the most recent generated domain.
     """

     def __init__(
         self,
         user_selections: UserSelections,
         output_dir: str = "./output",
-        evaluation_metrics: bool = False,
     ):
         """Constructor.
@@ -161,12 +162,6 @@ def __init__(
         )
         self._other_docs: list[str] | None = user_selections["other_docs"]
         self.domain_content: DomainContent = default_domain_content()
-        self._faithfulness_evaluator: Optional[FaithfulnessEvaluator] = None
-        self._relevancy_evaluator: Optional[RelevancyEvaluator] = None
-        if evaluation_metrics:
-            _evaluation_llm = OpenAI(model=EVALUATION_LLM, temperature=0.0)
-            self._faithfulness_evaluator = FaithfulnessEvaluator(llm=_evaluation_llm)
-            self._relevancy_evaluator = RelevancyEvaluator(llm=_evaluation_llm)

         openai_api_key = os.getenv("OPENAI_API_KEY")
         if not openai_api_key:
@@ -354,6 +349,9 @@ def perform_query(self, domain: DomainKey) -> str:
         query_response = str(response_object.response)

         self.domain_content[domain] = query_response
+        self.domain_content = add_source_nodes(
+            domain_content=self.domain_content, nodes=response_object.source_nodes
+        )

         source_str = ""
         for idx, source_node in enumerate(response_object.source_nodes):
@@ -370,30 +368,6 @@ def perform_query(self, domain: DomainKey) -> str:
             )
             source_str += "\n"

-        if self._faithfulness_evaluator and self._relevancy_evaluator:
-            for idx, source_node in enumerate(response_object.source_nodes):
-                faithfulness_eval = self._faithfulness_evaluator.evaluate(
-                    response=response_object.response,
-                    contexts=[source_node.get_content()],
-                )
-                relevancy_eval = self._relevancy_evaluator.evaluate(
-                    query=query_prompt,
-                    response=response_object.response,
-                    contexts=[source_node.get_content()],
-                )
-                for name, eval in {
-                    "faithfulness": faithfulness_eval,
-                    "relevancy": relevancy_eval,
-                }.items():
-                    self._display_info(
-                        {
-                            "passing": eval.passing,
-                            "score": eval.score,
-                            "feedback": eval.feedback,
-                        },
-                        f"{name.title()} Evaluation for node {idx + 1}:",
-                    )
-
         if self._debug:
             self._display_info(query_prompt, f"QUERY PROMPT for the {domain} domain:")
         self._token_counts["input"] += self._token_counter.prompt_llm_token_count  # type: ignore

bcorag/custom_types/core_types.py

Lines changed: 52 additions & 1 deletion
@@ -16,6 +16,7 @@
 from typing import TypedDict, Optional, Literal
 from enum import Enum
 from llama_index.readers.github import GithubRepositoryReader  # type: ignore
+from llama_index.core.schema import NodeWithScore

 ### General literals

@@ -252,6 +253,23 @@ def create_user_selections(
 ### Most recent generated domain schema


+class SourceNode(TypedDict):
+    """Holds the source node information for one node.
+
+    Attributes
+    ----------
+    node_id : str
+    content : str
+    metadata : str
+    score : str
+    """
+
+    node_id: str
+    content: str
+    metadata: str
+    score: str
+
+
 class DomainContent(TypedDict):
     """Holds the most recent generated domain for in memory storage.

@@ -262,7 +280,7 @@ class DomainContent(TypedDict):
     description: Optional[str]
     execution: Optional[str]
     parametric: Optional[str]
-    error: Optional[str]
+    error: Optional[list[str]]
     """

     usability: Optional[str]
@@ -271,6 +289,7 @@ class DomainContent(TypedDict):
     execution: Optional[str]
     parametric: Optional[str]
     error: Optional[str]
+    last_source_nodes: Optional[list[SourceNode]]


 def default_domain_content() -> DomainContent:
@@ -287,10 +306,42 @@ def default_domain_content() -> DomainContent:
         "execution": None,
         "parametric": None,
         "error": None,
+        "last_source_nodes": None,
     }
     return return_data


+def add_source_nodes(
+    domain_content: DomainContent, nodes: list[NodeWithScore]
+) -> DomainContent:
+    """Adds source node data to the domain content.
+
+    Parameters
+    ----------
+    domain_content : DomainContent
+        The domain content instance to add source node data to.
+    nodes : list[NodeWithScore]
+        List of nodes with score data.
+
+    Returns
+    -------
+    DomainContent
+        The updated domain content object.
+    """
+    node_list: list[SourceNode] = []
+    for node in nodes:
+        node_list.append(
+            {
+                "node_id": node.node.node_id,
+                "content": node.node.get_content(),
+                "metadata": node.node.get_metadata_str(),
+                "score": str(node.score),
+            }
+        )
+    domain_content["last_source_nodes"] = node_list
+    return domain_content
+
+
 ### Domain map prompting schemas

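The new `last_source_nodes` field keeps the retrieval context of the most recent generation available after the fact. A minimal consumption sketch (hedged; the `retrieval_context` helper below is illustrative and not part of this commit):

```python
from bcorag.custom_types.core_types import DomainContent


def retrieval_context(domain_content: DomainContent) -> list[str]:
    """Collect the retrieved chunk texts recorded by add_source_nodes().

    Illustrative helper only; returns an empty list if no query has run yet.
    """
    return [node["content"] for node in (domain_content["last_source_nodes"] or [])]
```

This is the shape of data that retrieval-grounded checks (such as the faithfulness metric described below) need as their context.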

docs/unit-testing.md

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+# Automated Testing
+
+The `test_bco_rag.py` script contains a suite of tests designed to evaluate the functionality of the BcoRag tool using the `pytest` framework and the open source LLM evaluation framework [DeepEval](https://docs.confident-ai.com/).
+
+## Test Cases
+
+There is one test case for each domain:
+
+- `test_usability`
+- `test_io`
+- `test_description`
+- `test_execution`
+- `test_parametric`
+- `test_error`
+
+## Test Metrics
+
+The test suite evaluates two different metrics:
+
+**Answer Relevancy**:
+
+The answer relevancy metric evaluates how relevant the finalized generated output (in our case, the generated domain) is to the original input prompt. It assesses relevancy (does the generated content directly relate to the question at hand), appropriateness (is the content appropriate given the context of the input), and focus (does the content stay on topic).
+
+> The answer relevancy metric measures the quality of your RAG pipeline's generator by evaluating how relevant the `actual_output` of your LLM application is compared to the provided input.
+
+- Source: [Answer Relevancy](https://docs.confident-ai.com/docs/metrics-answer-relevancy)
+
+**Faithfulness**:
+
+The faithfulness metric assesses how accurate and truthful the finalized generated output (in our case, the generated domain) is with respect to the source material (the retrieved content). It aims to ensure that the content is relevant, factual, and does not contradict the information gathered during the retrieval step.
+
+> The faithfulness metric measures the quality of your RAG pipeline's generator by evaluating whether the `actual_output` factually aligns with the contents of your `retrieval_context`.
+
+- Source: [Faithfulness](https://docs.confident-ai.com/docs/metrics-faithfulness)
+
+## Running The Tests
+
+Running all the tests at once is not recommended, as the test suite uses `gpt-4o` in the backend to evaluate the above metrics.
+
+To run one test at a time:
+
+`deepeval test run test_bco_rag.py::test_{domain}`
+
+To run all the tests at once:
+
+`deepeval test run test_bco_rag.py`
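For orientation, a single domain test in this style builds an `LLMTestCase` from the query prompt, the generated domain, and the retrieved source node contents, then asserts both metrics. A minimal sketch, with placeholder inputs standing in for real BcoRag output (the actual test bodies live in `test_bco_rag.py` and are not reproduced here):

```python
# Hedged sketch of a DeepEval domain test; placeholder strings below stand in
# for the generated domain and retrieval context produced by the RAG pipeline.
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# Placeholder (hypothetical) inputs.
USABILITY_PROMPT = "Generate the usability domain for the attached paper."
USABILITY_OUTPUT = "The workflow identifies variants from whole genome sequencing data ..."
RETRIEVED_CONTEXT = ["Chunk of the source paper retrieved for this query ..."]


def test_usability() -> None:
    test_case = LLMTestCase(
        input=USABILITY_PROMPT,
        actual_output=USABILITY_OUTPUT,
        retrieval_context=RETRIEVED_CONTEXT,
    )
    # Both metrics are LLM-judged (gpt-4o here), as described above.
    metrics = [
        AnswerRelevancyMetric(threshold=0.5, model="gpt-4o"),
        FaithfulnessMetric(threshold=0.5, model="gpt-4o"),
    ]
    assert_test(test_case, metrics)
```

A sketch like this would be run one test at a time with `deepeval test run <file>::test_usability`, matching the commands above.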

mkdocs.yml

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ nav:
   - Other Features:
     - In-Progress Documentation: "in-progress.md"
     - Parameter Search: "parameter-search.md"
+    - Automated Testing: "unit-testing.md"
     - Evaluation App: "evaluation-app.md"
   - Code Documentation:
     - Core:

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ mkdocs==1.6.0
 mkdocstrings[python]
 mkdocs-material==9.5.29
 pymdown-extensions==10.8.1
+deepeval==1.1.0
