diff --git a/.github/workflows/test_core.yml b/.github/workflows/test_core.yml index c3d5f6eff..dad8078a1 100644 --- a/.github/workflows/test_core.yml +++ b/.github/workflows/test_core.yml @@ -61,6 +61,9 @@ jobs: - name: Install dev dependencies run: poetry install --no-interaction --with dev + - name: Install other dependencies + run: poetry run pip install beautifulsoup4 + #---------------------------------------------- # run test suite #---------------------------------------------- diff --git a/a.py b/a.py index 77e6c610f..77737010a 100644 --- a/a.py +++ b/a.py @@ -1,84 +1,84 @@ -""" -Example script demonstrating how to use DeepEval's PromptOptimizer. -""" - -from openai import OpenAI -from deepeval.optimizer import PromptOptimizer -from deepeval.prompt import Prompt -from deepeval.dataset import Golden -from deepeval.metrics import AnswerRelevancyMetric - -# Initialize OpenAI client -client = OpenAI() - - -def model_callback(prompt: Prompt, golden: Golden) -> str: - """ - Callback function that runs your LLM with the optimized prompt. - This is called during scoring to evaluate how well the prompt performs. - """ - # Interpolate the prompt template with the golden's input - final_prompt = prompt.interpolate(query=golden.input) - - # Call your LLM - response = client.chat.completions.create( - model="gpt-4o-mini", - messages=[{"role": "user", "content": final_prompt}], - ) - - return response.choices[0].message.content - - -# Define your initial prompt template (intentionally bad for testing optimization) -prompt = Prompt( - text_template="""idk maybe try to respond to this thing if u want lol - -{query} - -whatever:""" -) - -# Define your evaluation dataset (goldens) -goldens = [ - Golden( - input="What is the capital of France?", - expected_output="Paris", - ), - Golden( - input="Who wrote Romeo and Juliet?", - expected_output="William Shakespeare", - ), - Golden( - input="What is the chemical symbol for gold?", - expected_output="Au", - ), - Golden( - input="In what year did World War II end?", - expected_output="1945", - ), -] - -# Define metrics to optimize for -metrics = [AnswerRelevancyMetric(threshold=0.7)] - -from deepeval.optimizer.configs import DisplayConfig -from deepeval.optimizer.algorithms import GEPA - -# Create the optimizer -optimizer = PromptOptimizer( - model_callback=model_callback, - metrics=metrics, - optimizer_model="gpt-4o", # Model used for rewriting prompts - display_config=DisplayConfig(announce_ties=True), - algorithm=GEPA(iterations=1), -) - -# Run optimization -optimized_prompt = optimizer.optimize(prompt=prompt, goldens=goldens) - -# Print results -print("\n" + "=" * 60) -print("OPTIMIZATION COMPLETE") -print("=" * 60) -print(f"\nOriginal prompt:\n{prompt.text_template}") -print(f"\nOptimized prompt:\n{optimized_prompt.text_template}") +# """ +# Example script demonstrating how to use DeepEval's PromptOptimizer. +# """ + +# from openai import OpenAI +# from deepeval.optimizer import PromptOptimizer +# from deepeval.prompt import Prompt +# from deepeval.dataset import Golden +# from deepeval.metrics import AnswerRelevancyMetric + +# # Initialize OpenAI client +# client = OpenAI() + + +# def model_callback(prompt: Prompt, golden: Golden) -> str: +# """ +# Callback function that runs your LLM with the optimized prompt. +# This is called during scoring to evaluate how well the prompt performs. 
+# """ +# # Interpolate the prompt template with the golden's input +# final_prompt = prompt.interpolate(query=golden.input) + +# # Call your LLM +# response = client.chat.completions.create( +# model="gpt-4o-mini", +# messages=[{"role": "user", "content": final_prompt}], +# ) + +# return response.choices[0].message.content + + +# # Define your initial prompt template (intentionally bad for testing optimization) +# prompt = Prompt( +# text_template="""idk maybe try to respond to this thing if u want lol + +# {query} + +# whatever:""" +# ) + +# # Define your evaluation dataset (goldens) +# goldens = [ +# Golden( +# input="What is the capital of France?", +# expected_output="Paris", +# ), +# Golden( +# input="Who wrote Romeo and Juliet?", +# expected_output="William Shakespeare", +# ), +# Golden( +# input="What is the chemical symbol for gold?", +# expected_output="Au", +# ), +# Golden( +# input="In what year did World War II end?", +# expected_output="1945", +# ), +# ] + +# # Define metrics to optimize for +# metrics = [AnswerRelevancyMetric(threshold=0.7)] + +# from deepeval.optimizer.configs import DisplayConfig +# from deepeval.optimizer.algorithms import GEPA + +# # Create the optimizer +# optimizer = PromptOptimizer( +# model_callback=model_callback, +# metrics=metrics, +# optimizer_model="gpt-4o", # Model used for rewriting prompts +# display_config=DisplayConfig(announce_ties=True), +# algorithm=GEPA(iterations=1), +# ) + +# # Run optimization +# optimized_prompt = optimizer.optimize(prompt=prompt, goldens=goldens) + +# # Print results +# print("\n" + "=" * 60) +# print("OPTIMIZATION COMPLETE") +# print("=" * 60) +# print(f"\nOriginal prompt:\n{prompt.text_template}") +# print(f"\nOptimized prompt:\n{optimized_prompt.text_template}") diff --git a/deepeval/test_case/__init__.py b/deepeval/test_case/__init__.py index add3d339f..12e7c3168 100644 --- a/deepeval/test_case/__init__.py +++ b/deepeval/test_case/__init__.py @@ -4,6 +4,7 @@ ToolCall, ToolCallParams, MLLMImage, + Context, ) from .conversational_test_case import ( ConversationalTestCase, @@ -24,6 +25,7 @@ "LLMTestCaseParams", "ToolCall", "ToolCallParams", + "Context", "ConversationalTestCase", "Turn", "TurnParams", diff --git a/deepeval/test_case/conversational_test_case.py b/deepeval/test_case/conversational_test_case.py index 4e252a2e4..32b576339 100644 --- a/deepeval/test_case/conversational_test_case.py +++ b/deepeval/test_case/conversational_test_case.py @@ -6,7 +6,7 @@ model_validator, AliasChoices, ) -from typing import List, Optional, Dict, Literal +from typing import List, Optional, Dict, Literal, Union from copy import deepcopy from enum import Enum @@ -18,7 +18,7 @@ MCPToolCall, validate_mcp_servers, ) -from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY +from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY, Context class TurnParams(Enum): @@ -131,7 +131,7 @@ def validate_input(cls, data): class ConversationalTestCase(BaseModel): turns: List[Turn] scenario: Optional[str] = Field(default=None) - context: Optional[List[str]] = Field(default=None) + context: Optional[List[Union[str, Context]]] = Field(default=None) name: Optional[str] = Field(default=None) user_description: Optional[str] = Field( default=None, @@ -163,9 +163,41 @@ class ConversationalTestCase(BaseModel): _dataset_rank: Optional[int] = PrivateAttr(default=None) _dataset_alias: Optional[str] = PrivateAttr(default=None) _dataset_id: Optional[str] = PrivateAttr(default=None) + _context_items: Optional[List[Union[str, Context]]] = 
PrivateAttr( + default=None + ) @model_validator(mode="after") - def set_is_multimodal(self): + def post_init(self): + + self._handle_context_data() + self._set_is_multimodal() + + return self + + def _handle_context_data(self): + if self.context is None: + return + + self._context_items = self.context[:] + + resolved_context = [] + + for item in self.context: + if isinstance(item, Context): + resolved = item.resolve_contexts() + if isinstance(resolved, list): + resolved_context.extend(resolved) + else: + resolved_context.append(resolved) + else: + resolved_context.append(item) + + self.context = resolved_context + + return self + + def _set_is_multimodal(self): import re if self.multimodal is True: @@ -195,8 +227,6 @@ def set_is_multimodal(self): for context in turn.retrieval_context ) - return self - @model_validator(mode="before") def validate_input(cls, data): turns = data.get("turns") @@ -209,9 +239,12 @@ def validate_input(cls, data): # Ensure `context` is None or a list of strings if context is not None: if not isinstance(context, list) or not all( - isinstance(item, str) for item in context + (isinstance(item, str) or isinstance(item, Context)) + for item in context ): - raise TypeError("'context' must be None or a list of strings") + raise TypeError( + "'context' must be None or a list of or 'Context'" + ) if mcp_servers is not None: validate_mcp_servers(mcp_servers) diff --git a/deepeval/test_case/llm_test_case.py b/deepeval/test_case/llm_test_case.py index 65563db6d..f59bd3d2d 100644 --- a/deepeval/test_case/llm_test_case.py +++ b/deepeval/test_case/llm_test_case.py @@ -5,7 +5,9 @@ PrivateAttr, AliasChoices, ) -from typing import List, Optional, Dict, Any +from typing import List, Optional, Dict, Any, Literal, Union +from pathlib import Path +import requests from enum import Enum import json import uuid @@ -163,6 +165,87 @@ def as_data_uri(self) -> Optional[str]: return f"data:{self.mimeType};base64,{self.dataBase64}" +@dataclass +class Context: + type: Literal["file", "url"] + source: str + chunk_size: int = 2048 + chunk_overlap: int = 128 + _content: Optional[str] = None + + def __post_init__(self): + if self.chunk_overlap >= self.chunk_size: + raise ValueError("chunk_overlap must be smaller than chunk_size") + + if self.type == "file": + path = Path(self.source) + if not path.exists(): + raise ValueError(f"Context file does not exist: {self.source}") + + if self.type == "url": + if not self.source.startswith(("http://", "https://")): + raise ValueError(f"Invalid URL context source: {self.source}") + + def resolve_contexts(self) -> Union[str, List[str]]: + if self._content is None: + if self.type == "file": + self._content = self._load_file(self.source) + elif self.type == "url": + self._content = self._fetch_url(self.source) + + if len(self._content) <= self.chunk_size: + return self._content + + return self._chunk_text(self._content) + + def _load_file(self, path: str) -> str: + from deepeval.synthesizer.chunking.doc_chunker import DocumentChunker + + chunker = DocumentChunker(embedder=None) + chunker.load_doc(path, encoding="utf-8") + + if not chunker.sections: + return "" + + return " ".join(section.page_content for section in chunker.sections) + + def _fetch_url(self, url: str) -> str: + resp = requests.get(url, timeout=10) + resp.raise_for_status() + text = resp.text + + return self.html_to_text(text) + + def html_to_text(self, html: str) -> str: + try: + from bs4 import BeautifulSoup + except Exception as e: + raise Exception( + f"BeautifulSoup (bs4) is required for URL 
contexts." + f"Install with `pip install beautifulsoup4`. Root cause: {e}" + ) + + soup = BeautifulSoup(html, "html.parser") + + for tag in soup(["script", "style", "noscript"]): + tag.decompose() + + return soup.get_text(separator=" ", strip=True) + + def _chunk_text(self, text: str) -> List[str]: + chunks = [] + start = 0 + text_length = len(text) + + while start < text_length: + end = start + self.chunk_size + chunk = text[start:end] + chunks.append(chunk) + start = end - self.chunk_overlap + + return chunks + + class LLMTestCaseParams(Enum): INPUT = "input" ACTUAL_OUTPUT = "actual_output" @@ -312,7 +395,7 @@ class LLMTestCase(BaseModel): serialization_alias="expectedOutput", validation_alias=AliasChoices("expectedOutput", "expected_output"), ) - context: Optional[List[str]] = Field( + context: Optional[List[Union[str, Context]]] = Field( default=None, serialization_alias="context" ) retrieval_context: Optional[List[str]] = Field( @@ -371,10 +454,39 @@ class LLMTestCase(BaseModel): _identifier: Optional[str] = PrivateAttr( default_factory=lambda: str(uuid.uuid4()) ) + _context_items: Optional[List[Union[str, Context]]] = PrivateAttr( + default=None + ) @model_validator(mode="after") - def set_is_multimodal(self): - import re + def post_init(self): + + self._handle_context_data() + self._set_is_multimodal() + + return self + + def _handle_context_data(self): + if self.context is None: + return + + self._context_items = self.context[:] + + resolved_context = [] + + for item in self.context: + if isinstance(item, Context): + resolved = item.resolve_contexts() + if isinstance(resolved, list): + resolved_context.extend(resolved) + else: + resolved_context.append(resolved) + else: + resolved_context.append(item) + + self.context = resolved_context + + def _set_is_multimodal(self): if self.multimodal is True: return self @@ -404,7 +516,6 @@ def set_is_multimodal(self): ) self.multimodal = auto_detect - return self @model_validator(mode="before") def validate_input(cls, data): @@ -430,9 +541,12 @@ def validate_input(cls, data): # Ensure `context` is None or a list of strings if context is not None: if not isinstance(context, list) or not all( - isinstance(item, str) for item in context + (isinstance(item, str) or isinstance(item, Context)) + for item in context ): - raise TypeError("'context' must be None or a list of strings") + raise TypeError( + "'context' must be None or a list of strings or 'Context'" + ) # Ensure `retrieval_context` is None or a list of strings if retrieval_context is not None: diff --git a/docs/docs/evaluation-multiturn-test-cases.mdx b/docs/docs/evaluation-multiturn-test-cases.mdx index fb2c1dea8..c2a488806 100644 --- a/docs/docs/evaluation-multiturn-test-cases.mdx +++ b/docs/docs/evaluation-multiturn-test-cases.mdx @@ -133,17 +133,48 @@ test_case = ConversationalTestCase( The `context` is an **optional** parameter that represents additional data received by your LLM application as supplementary sources of golden truth. You can view it as the ideal segment of your knowledge base relevant as support information to a specific input. Context is **static** and should not be generated dynamically. +A context accepts a list of: + +- strings and f-strings with `MLLMImage` objects, and +- `Context` objects, which allow you to reference files or website URLs that `deepeval` will automatically fetch and process. 
+ ```python -from deepeval.test_case import Turn, ConversationalTestCase +from deepeval.test_case import Turn, ConversationalTestCase, Context + +website_context = Context( + type="url", + source="https://www.trydeepteam.com/docs/getting-started", +) + +file_context = Context( + type="file", + source="/path/to/your/file.pdf", +) test_case = ConversationalTestCase( - context=["Customers must be over 50 to be eligible for a refund."], - turns=[Turn(...)] + context=[ + "Customers must be over 50 to be eligible for a refund.", + website_context, + file_context, + ], + turns=[ + Turn(...) + ] ) ``` +The `Context` object allows you to pass large or external sources of truth (such as files or websites) directly into test cases. `deepeval` automatically handles **fetching**, **parsing**, and **chunking** this data before using it during evaluation. + +```python +class Context: + type: Literal["file", "url"] + source: str + chunk_size: int = 2048 + chunk_overlap: int = 128 +``` + :::info -A single-turn `LLMTestCase` also contains `context`. +Single-turn `LLMTestCase` also supports the `context` parameter and follows the same semantics. ::: ## Including Images diff --git a/docs/docs/evaluation-test-cases.mdx b/docs/docs/evaluation-test-cases.mdx index 740fe6a93..f68ae8006 100644 --- a/docs/docs/evaluation-test-cases.mdx +++ b/docs/docs/evaluation-test-cases.mdx @@ -278,22 +278,55 @@ test_case = LLMTestCase( The `context` is an **optional** parameter that represents additional data received by your LLM application as supplementary sources of golden truth. You can view it as the ideal segment of your knowledge base relevant as support information to a specific input. Context is **static** and should not be generated dynamically. -Unlike other parameters, a context accepts a list of strings. +Unlike other parameters, `context` accepts a list containing: + +- strings and f-strings with `MLLMImage` objects, and +- `Context` objects, which allow you to reference files or website URLs that `deepeval` will automatically fetch and process. ```python # A hypothetical LLM application example +from deepeval.test_case import Context import chatbot +website_context = Context( + type="url", + source="https://www.trydeepteam.com/docs/getting-started", +) +file_context = Context(type="file", source="/path/to/your/file") + input = "Why did the chicken cross the road?" test_case = LLMTestCase( input=input, actual_output=chatbot.run(input), expected_output="To get to the other side!", - context=["The chicken wanted to cross the road."] + context=[ + "The chicken wanted to cross the road.", + website_context, + file_context, + ] ) ``` +Sometimes, the context can live in large documents or on the internet. To make this seamless, `deepeval` allows you to pass file paths or URLs via the `Context` object. The data model for `Context` is shown below: + +```python +class Context: + type: Literal["file", "url"] + source: str + chunk_size: int = 2048 + chunk_overlap: int = 128 +``` + +The `Context` object accepts 2 mandatory and 2 optional parameters: + +- `type`: A string value of either `"file"` or `"url"`, which determines how deepeval resolves the context data. +- `source`: A string representing either the file path or the website URL from which the context should be fetched. +- [Optional] `chunk_size`: The maximum size (in characters) of each context chunk. Defaults to `2048`. +- [Optional] `chunk_overlap`: The number of overlapping characters between consecutive chunks. Defaults to `128`. 
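To make the two optional chunking parameters concrete, below is a minimal sketch of how an oversized context is split. It assumes a local `notes.txt` of roughly 5,000 characters exists; calling `resolve_contexts()` directly is shown purely for illustration, since `deepeval` invokes it for you when the test case is validated.

```python
from deepeval.test_case import Context

context = Context(
    type="file",
    source="notes.txt",  # assumed local file, ~5,000 characters
    chunk_size=1000,
    chunk_overlap=100,
)

# Content longer than `chunk_size` is returned as a list of chunks. Each chunk
# is up to 1,000 characters long and starts 900 characters after the previous
# one, so consecutive chunks share 100 overlapping characters.
chunks = context.resolve_contexts()
print(len(chunks), [len(chunk) for chunk in chunks])
```

Content at or below `chunk_size` is returned as a single string instead of a list.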
+
+The `Context` object currently supports the following file formats: `pdf`, `txt`, `docx`, `md`, `markdown`, and `mdx`.
+
 :::note
 Often times people confuse `expected_output` with `context` since due to their similar level of factual accuracy. However, while both are (or should be) factually correct, `expected_output` also takes aspects like tone and linguistic patterns into account, whereas context is strictly factual.
 :::
diff --git a/docs/guides/guides-using-custom-llms.mdx b/docs/guides/guides-using-custom-llms.mdx
index 54621c90a..e677c6b3b 100644
--- a/docs/guides/guides-using-custom-llms.mdx
+++ b/docs/guides/guides-using-custom-llms.mdx
@@ -538,6 +538,167 @@ You may wish to wish any JSON confinement libraries out there, and we're just su
 In the final section, we'll show several popular end-to-end examples of custom LLMs using either `lm-format-enforcer` or `instructor` for JSON confinement.
 
+## Multimodal Support For Images
+
+`deepeval` natively supports passing images in your test cases using the `MLLMImage` object. These objects are converted to special `deepeval` slug strings and parsed at the model level in the `generate` functions. This section will show you how to update your `generate` functions to support multimodal evaluations with your custom LLM.
+
+### Detecting Multimodal Inputs
+
+In `deepeval`, passing `MLLMImage`s in your test cases converts them to special slugs like `[DEEPEVAL:IMAGE:]`. An example is shown below:
+
+```python
+from deepeval.test_case import LLMTestCase, MLLMImage
+
+test_case = LLMTestCase(
+    input=f"What's in this image: {MLLMImage(url='./my_image.jpeg')}",
+    actual_output="The image contains a red car drifting in the rain.",
+)
+```
+
+From the above code, the `input` of the `LLMTestCase` will look like the following:
+
+```
+What's in this image: [DEEPEVAL:IMAGE:]
+```
+
+The evaluation models are passed the same strings with the slug shown above. `deepeval` provides a method called `check_if_multimodal` that can be used to check whether a string contains multimodal images.
+
+```python
+from deepeval.utils import check_if_multimodal
+
+print(check_if_multimodal(test_case.input)) # True
+print(check_if_multimodal(test_case.actual_output)) # False
+```
+
+### Parsing Multimodal Strings
+
+`deepeval` provides a helper method called `convert_to_multi_modal_array`, which converts a string containing the `deepeval` image slug into a list of strings and `MLLMImage` objects. You can use this list to do ETL and provide images separately to your custom LLM application in its native format. Here's an example:
+
+```python
+from deepeval.utils import convert_to_multi_modal_array
+
+print(convert_to_multi_modal_array(test_case.input))
+```
+
+This returns an ordered list containing plain strings and `MLLMImage` objects:
+
+```
+[
+    "What's in this image: ",
+    MLLMImage(...)
+]
+```
+
+The placeholder slug is replaced with the original `MLLMImage` instance you passed into the test case. From here, you can do simple ETL and map text and images into your LLM provider's native multimodal format.
+
+### Reimplementing `generate` and `a_generate` to Support Multimodal Inputs
+
+Different providers accept multimodal inputs differently.
As a concrete example, OpenAI expects images to be passed as part of a structured `content` array: + +```python +from openai import OpenAI +client = OpenAI() + +response = client.chat.completions.create( + model="gpt-4.1", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + }, + }, + ], + } + ], + max_tokens=300, +) +``` + +Below is an example showing how to construct an OpenAI-compatible multimodal payload inside a custom DeepEval evaluation model. + +```python {15,16,28} +from typing import Union +from pydantic import BaseModel +from deepeval.models import DeepEvalBaseLLM +from deepeval.test_case import MLLMImage +from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array + +class CustomMultimodalModel(DeepEvalBaseLLM): + def __init__(self, client): + self.client = client + + def load_model(self): + return self.client + + def generate(self, prompt: str, schema: BaseModel) -> BaseModel: + if check_if_multimodal(prompt): + multimodal_elements = convert_to_multi_modal_array(prompt) + content = [] + for element in multimodal_elements: + if isinstance(element, str): + content.append({"type": "text", "text": element}) + elif isinstance(element, MLLMImage): + if element.url and not element.local: + content.append({ + "type": "image_url", + "image_url": {"url": element.url}, + }) + else: + element.ensure_images_loaded() + data_uri = ( + f"data:{element.mimeType};base64,{element.dataBase64}" + ) + content.append({ + "type": "image_url", + "image_url": {"url": data_uri}, + }) + else: + content = [{"type": "text", "text": prompt}] + + response = self.client.chat.completions.create( + model="gpt-4.1", + messages=[ + { + "role": "user", + "content": content, + } + ], + ) + return schema.model_validate_json(response.choices[0].message.content) + + async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel: + return self.generate(prompt, schema) +``` + +Here's what we've done in the above code: + +- We first checked if the prompt provided to the model is multimodal using the `check_if_multimodal` method. +- If the prompt is multimodal, we converted it into the list of strings and `MLLMImage` instances using the `convert_to_multi_modal_array` method. +- If the image in the `MLLMImage` instance is an external image, we simply passed the external url since OpenAI API supports it +- If the image is local, we converted it to `base64` uri as OpenAI API requires `base64` uri data for local images. +- The `ensure_images_loaded` method for `MLLMImage` instances, makes sure that all local images are fetched and parsed to populate the `dataBase64` and `mimeType` attributes. + +This is how you can use the `deepeval`'s internal methods to allow your custom models to support multimodal inputs. + +:::note +If your custom model supports multimodal images, please override the `supports_multimodal` method to return `True` as that would allow `deepeval` to know that the evaluation model can handle multimodal data. Here's how to do it: + +```python +from pydantic import BaseModel + +class CustomMultimodalModel(DeepEvalBaseLLM): + ... 
+ + def supports_multimodal(self) -> Union[bool, None]: + return True +``` +::: + ## More Examples ### `Mistral-7B-Instruct-v0.3` through `transformers` @@ -589,6 +750,7 @@ class CustomMistral7B(DeepEvalBaseLLM): return self.model def generate(self, prompt: str, schema: BaseModel) -> BaseModel: + # Convert your multimodal inputs here model = self.load_model() pipeline = pipeline( "text-generation", @@ -659,6 +821,7 @@ class CustomGeminiFlash(DeepEvalBaseLLM): return self.model def generate(self, prompt: str, schema: BaseModel) -> BaseModel: + # Convert your multimodal inputs here client = self.load_model() instructor_client = instructor.from_gemini( client=client, @@ -709,6 +872,7 @@ class CustomClaudeOpus(DeepEvalBaseLLM): return self.model def generate(self, prompt: str, schema: BaseModel) -> BaseModel: + # Convert your multimodal inputs here client = self.load_model() instructor_client = instructor.from_anthropic(client) resp = instructor_client.messages.create( diff --git a/tests/test_core/test_test_case/test_multi_turn/test_conversational_test_case.py b/tests/test_core/test_test_case/test_multi_turn/test_conversational_test_case.py index b0a24c17d..e46dbc728 100644 --- a/tests/test_core/test_test_case/test_multi_turn/test_conversational_test_case.py +++ b/tests/test_core/test_test_case/test_multi_turn/test_conversational_test_case.py @@ -1,6 +1,7 @@ import pytest from pydantic import ValidationError -from deepeval.test_case import ConversationalTestCase, Turn, TurnParams +from deepeval.test_case import ConversationalTestCase, Turn, Context +import tempfile class TestConversationalTestCaseInitialization: @@ -90,16 +91,12 @@ def test_dict_turn_is_accepted(self): def test_invalid_context_type_raises_error(self): turns = [Turn(role="user", content="Hello")] - with pytest.raises( - TypeError, match="'context' must be None or a list of strings" - ): + with pytest.raises(TypeError): ConversationalTestCase(turns=turns, context="not a list") def test_invalid_context_items_raises_error(self): turns = [Turn(role="user", content="Hello")] - with pytest.raises( - TypeError, match="'context' must be None or a list of strings" - ): + with pytest.raises(TypeError): ConversationalTestCase( turns=turns, context=["valid", 123, "invalid"] ) @@ -115,6 +112,30 @@ def test_none_context_is_valid(self): test_case = ConversationalTestCase(turns=turns, context=None) assert test_case.context is None + def test_minimal_file_context(self): + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(b"hello world") + path = f.name + + context = Context(type="file", source=path) + assert context.type == "file" + assert context.source == path + assert context.chunk_size == 2048 + assert context.chunk_overlap == 128 + assert context._content is None + + def test_external_context_is_valid(self): + turns = [Turn(role="user", content="Hello")] + context = Context( + type="url", + source="https://www.trydeepteam.com/docs/getting-started", + ) + test_case = ConversationalTestCase( + turns=turns, context=["Testing context", context] + ) + + assert len(test_case.context) > 3 + class TestConversationalTestCaseComplexScenarios: diff --git a/tests/test_core/test_test_case/test_single_turn.py b/tests/test_core/test_test_case/test_single_turn.py index b61878b11..ef122cb2f 100644 --- a/tests/test_core/test_test_case/test_single_turn.py +++ b/tests/test_core/test_test_case/test_single_turn.py @@ -2,14 +2,14 @@ import uuid from unittest.mock import patch from pydantic import ValidationError - +import tempfile from 
deepeval.test_case import ( LLMTestCase, ToolCall, LLMTestCaseParams, ToolCallParams, ) -from deepeval.test_case.mcp import MCPServer +from deepeval.test_case import Context class TestLLMTestCaseInitialization: @@ -40,6 +40,7 @@ def test_minimal_initialization(self): assert test_case._dataset_rank is None assert test_case._dataset_alias is None assert test_case._dataset_id is None + assert test_case._context_items is None assert isinstance(test_case._identifier, str) def test_full_initialization(self): @@ -50,12 +51,16 @@ def test_full_initialization(self): output={"results": ["result1", "result2"]}, input_parameters={"query": "test query"}, ) + context = Context( + type="url", + source="https://www.trydeepteam.com/docs/getting-started", + ) test_case = LLMTestCase( input="What is machine learning?", actual_output="Machine learning is a subset of AI...", expected_output="Machine learning is a method of data analysis...", - context=["ML is important", "AI revolution"], + context=["ML is important", "AI revolution", context], retrieval_context=["Retrieved context 1", "Retrieved context 2"], additional_metadata={"source": "test", "version": 1.0}, tools_called=[tool_call], @@ -75,7 +80,12 @@ def test_full_initialization(self): test_case.expected_output == "Machine learning is a method of data analysis..." ) - assert test_case.context == ["ML is important", "AI revolution"] + assert len(test_case.context) > 3 + assert test_case._context_items == [ + "ML is important", + "AI revolution", + context, + ] assert test_case.retrieval_context == [ "Retrieved context 1", "Retrieved context 2", @@ -774,3 +784,170 @@ def test_tool_calling_scenario(self): assert test_case.tools_called[0].name == "web_search" assert test_case.tools_called[1].name == "calculator" assert len(test_case.expected_tools) == 1 + + +class TestContextInitialization: + def test_minimal_file_context(self): + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(b"hello world") + path = f.name + + context = Context(type="file", source=path) + assert context.type == "file" + assert context.source == path + assert context.chunk_size == 2048 + assert context.chunk_overlap == 128 + assert context._content is None + + def test_minimal_url_context(self): + context = Context( + type="url", + source="https://example.com", + ) + assert context.type == "url" + assert context.source.startswith("https://") + + def test_invalid_file_path(self): + with pytest.raises(ValueError): + Context(type="file", source="/non/existent/file.txt") + + def test_invalid_url(self): + with pytest.raises(ValueError): + Context(type="url", source="ftp://example.com") + + def test_invalid_chunk_overlap(self): + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(b"text") + path = f.name + + with pytest.raises(ValueError): + Context( + type="file", + source=path, + chunk_size=100, + chunk_overlap=100, + ) + + +class TestContextFileLoading: + def test_txt_file_loading(self): + content = "This is a simple text file." 
+
+        with tempfile.NamedTemporaryFile(
+            suffix=".txt", mode="w", delete=False
+        ) as f:
+            f.write(content)
+            path = f.name
+
+        context = Context(type="file", source=path)
+        resolved = context.resolve_contexts()
+
+        assert isinstance(resolved, str)
+        assert content in resolved
+
+    def test_file_chunking(self):
+        content = "A" * 5000
+
+        with tempfile.NamedTemporaryFile(
+            suffix=".txt", mode="w", delete=False
+        ) as f:
+            f.write(content)
+            path = f.name
+
+        context = Context(
+            type="file",
+            source=path,
+            chunk_size=1000,
+            chunk_overlap=100,
+        )
+
+        chunks = context.resolve_contexts()
+        assert isinstance(chunks, list)
+        assert len(chunks) > 1
+
+
+class TestContextURLLoading:
+    def test_html_stripping(self, monkeypatch):
+        html = """
+        <html>
+        <head><title>Test</title></head>
+        <body>
+            <script>alert('this should be stripped')</script>
+            <p>Hello</p>
+            <style>.hidden { display: none; }</style>
+            <div>World</div>
+        </body>
+        </html>
+        """
+
+        class MockResponse:
+            status_code = 200
+            text = html
+
+            def raise_for_status(self):
+                pass
+
+        monkeypatch.setattr(
+            "requests.get", lambda *args, **kwargs: MockResponse()
+        )
+
+        context = Context(
+            type="url",
+            source="https://example.com",
+        )
+
+        resolved = context.resolve_contexts()
+        assert "Hello" in resolved
+        assert "World" in resolved
+        assert "alert" not in resolved
+
+    def test_url_chunking(self, monkeypatch):
+        html = "<html><body>" + ("text " * 2000) + "</body></html>"
+
+        class MockResponse:
+            status_code = 200
+            text = html
+
+            def raise_for_status(self):
+                pass
+
+        monkeypatch.setattr(
+            "requests.get", lambda *args, **kwargs: MockResponse()
+        )
+
+        context = Context(
+            type="url",
+            source="https://example.com",
+            chunk_size=500,
+            chunk_overlap=50,
+        )
+
+        chunks = context.resolve_contexts()
+        assert isinstance(chunks, list)
+        assert len(chunks) > 1
+
+
+class TestContextResolutionBehavior:
+    def test_resolve_idempotent(self, monkeypatch):
+        html = "<html><body>Hello World</body></html>"
+
+        class MockResponse:
+            status_code = 200
+            text = html
+
+            def raise_for_status(self):
+                pass
+
+        monkeypatch.setattr(
+            "requests.get", lambda *args, **kwargs: MockResponse()
+        )
+
+        context = Context(
+            type="url",
+            source="https://example.com",
+        )
+
+        first = context.resolve_contexts()
+        second = context.resolve_contexts()
+
+        assert first == second
+        assert context._content is not None