46 commits
a921a15
context object added to LLMTestCase
A-Vamshi Dec 17, 2025
3393519
Merge branch 'confident-ai:main' into main
A-Vamshi Dec 17, 2025
0887bb7
context object added to ConversationalTestCase
A-Vamshi Dec 17, 2025
c1d71f0
lint
A-Vamshi Dec 17, 2025
51ac81a
docs updated for context
A-Vamshi Dec 17, 2025
08dc76c
Merge branch 'main' of https://github.com/A-Vamshi/deepeval
A-Vamshi Dec 17, 2025
6b0aea7
tests for context
A-Vamshi Dec 17, 2025
4908d0b
conversational context
A-Vamshi Dec 17, 2025
cea5937
lint
A-Vamshi Dec 17, 2025
652e2c9
fix workflow
A-Vamshi Dec 17, 2025
31284a9
.
A-Vamshi Dec 18, 2025
2124b6d
docs update
A-Vamshi Dec 18, 2025
7b14816
changed source_type to type in docs
A-Vamshi Dec 18, 2025
bb9bf14
changed source_type to type
A-Vamshi Dec 18, 2025
13f47b9
tests updated
A-Vamshi Dec 18, 2025
1d900d4
custom LLM docs updated for multimodal support
A-Vamshi Dec 18, 2025
41df442
Merge branch 'confident-ai:main' into main
A-Vamshi Dec 18, 2025
7e30d57
Merge branch 'confident-ai:main' into main
A-Vamshi Dec 22, 2025
54f8f8f
.
A-Vamshi Dec 22, 2025
3c9ea62
Merge branch 'confident-ai:main' into main
A-Vamshi Dec 23, 2025
4a1de34
portkey docs added
A-Vamshi Dec 23, 2025
4c475b6
.
A-Vamshi Dec 23, 2025
550b507
Merge branch 'confident-ai:main' into main
A-Vamshi Dec 23, 2025
0bbefde
Merge branch 'confident-ai:main' into main
A-Vamshi Dec 29, 2025
8be2a81
fix bedrock for backward compatibility
A-Vamshi Dec 29, 2025
41f6816
fix models to calculate cost only if available
A-Vamshi Dec 29, 2025
9ae7da5
fix metrics to use the standard method for generation
A-Vamshi Dec 29, 2025
81e47e6
.
A-Vamshi Dec 29, 2025
105c032
print None in console
A-Vamshi Dec 29, 2025
0d1edcb
.
A-Vamshi Dec 29, 2025
5cbb1d0
Merge branch 'confident-ai:main' into main
A-Vamshi Dec 30, 2025
a6cb6da
Merge branch 'confident-ai:main' into main
A-Vamshi Jan 5, 2026
a6e3531
Merge branch 'confident-ai:main' into main
A-Vamshi Jan 6, 2026
b40fe1c
Merge branch 'confident-ai:main' into main
A-Vamshi Jan 6, 2026
83de3c1
new metrics endpoint
A-Vamshi Jan 7, 2026
ca79ac6
helper methods to create payload
A-Vamshi Jan 7, 2026
087c9e0
GEval now supports upload method
A-Vamshi Jan 7, 2026
3c5d6de
ConversationalGEval now supports upload method
A-Vamshi Jan 7, 2026
0080f4e
lint
A-Vamshi Jan 7, 2026
0f75a85
tests added for upload method
A-Vamshi Jan 7, 2026
de92272
Merge branch 'confident-ai:main' into main
A-Vamshi Jan 7, 2026
bb53b92
Merge branch 'main' of https://github.com/A-Vamshi/deepeval
A-Vamshi Jan 7, 2026
8f2092a
.
A-Vamshi Jan 7, 2026
4a8e997
.
A-Vamshi Jan 7, 2026
2baae42
Merge branch 'main' into context_update
A-Vamshi Jan 7, 2026
8cddfea
Merge branch 'confident-ai:main' into context_update
A-Vamshi Jan 27, 2026
3 changes: 3 additions & 0 deletions .github/workflows/test_core.yml
@@ -61,6 +61,9 @@ jobs:
- name: Install dev dependencies
run: poetry install --no-interaction --with dev

- name: Install other dependencies
run: poetry run pip install beautifulsoup4

#----------------------------------------------
# run test suite
#----------------------------------------------
168 changes: 84 additions & 84 deletions a.py
@@ -1,84 +1,84 @@
"""
Example script demonstrating how to use DeepEval's PromptOptimizer.
"""

from openai import OpenAI
from deepeval.optimizer import PromptOptimizer
from deepeval.prompt import Prompt
from deepeval.dataset import Golden
from deepeval.metrics import AnswerRelevancyMetric

# Initialize OpenAI client
client = OpenAI()


def model_callback(prompt: Prompt, golden: Golden) -> str:
"""
Callback function that runs your LLM with the optimized prompt.
This is called during scoring to evaluate how well the prompt performs.
"""
# Interpolate the prompt template with the golden's input
final_prompt = prompt.interpolate(query=golden.input)

# Call your LLM
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": final_prompt}],
)

return response.choices[0].message.content


# Define your initial prompt template (intentionally bad for testing optimization)
prompt = Prompt(
text_template="""idk maybe try to respond to this thing if u want lol

{query}

whatever:"""
)

# Define your evaluation dataset (goldens)
goldens = [
Golden(
input="What is the capital of France?",
expected_output="Paris",
),
Golden(
input="Who wrote Romeo and Juliet?",
expected_output="William Shakespeare",
),
Golden(
input="What is the chemical symbol for gold?",
expected_output="Au",
),
Golden(
input="In what year did World War II end?",
expected_output="1945",
),
]

# Define metrics to optimize for
metrics = [AnswerRelevancyMetric(threshold=0.7)]

from deepeval.optimizer.configs import DisplayConfig
from deepeval.optimizer.algorithms import GEPA

# Create the optimizer
optimizer = PromptOptimizer(
model_callback=model_callback,
metrics=metrics,
optimizer_model="gpt-4o", # Model used for rewriting prompts
display_config=DisplayConfig(announce_ties=True),
algorithm=GEPA(iterations=1),
)

# Run optimization
optimized_prompt = optimizer.optimize(prompt=prompt, goldens=goldens)

# Print results
print("\n" + "=" * 60)
print("OPTIMIZATION COMPLETE")
print("=" * 60)
print(f"\nOriginal prompt:\n{prompt.text_template}")
print(f"\nOptimized prompt:\n{optimized_prompt.text_template}")
# """
# Example script demonstrating how to use DeepEval's PromptOptimizer.
# """

# from openai import OpenAI
# from deepeval.optimizer import PromptOptimizer
# from deepeval.prompt import Prompt
# from deepeval.dataset import Golden
# from deepeval.metrics import AnswerRelevancyMetric

# # Initialize OpenAI client
# client = OpenAI()


# def model_callback(prompt: Prompt, golden: Golden) -> str:
# """
# Callback function that runs your LLM with the optimized prompt.
# This is called during scoring to evaluate how well the prompt performs.
# """
# # Interpolate the prompt template with the golden's input
# final_prompt = prompt.interpolate(query=golden.input)

# # Call your LLM
# response = client.chat.completions.create(
# model="gpt-4o-mini",
# messages=[{"role": "user", "content": final_prompt}],
# )

# return response.choices[0].message.content


# # Define your initial prompt template (intentionally bad for testing optimization)
# prompt = Prompt(
# text_template="""idk maybe try to respond to this thing if u want lol

# {query}

# whatever:"""
# )

# # Define your evaluation dataset (goldens)
# goldens = [
# Golden(
# input="What is the capital of France?",
# expected_output="Paris",
# ),
# Golden(
# input="Who wrote Romeo and Juliet?",
# expected_output="William Shakespeare",
# ),
# Golden(
# input="What is the chemical symbol for gold?",
# expected_output="Au",
# ),
# Golden(
# input="In what year did World War II end?",
# expected_output="1945",
# ),
# ]

# # Define metrics to optimize for
# metrics = [AnswerRelevancyMetric(threshold=0.7)]

# from deepeval.optimizer.configs import DisplayConfig
# from deepeval.optimizer.algorithms import GEPA

# # Create the optimizer
# optimizer = PromptOptimizer(
# model_callback=model_callback,
# metrics=metrics,
# optimizer_model="gpt-4o", # Model used for rewriting prompts
# display_config=DisplayConfig(announce_ties=True),
# algorithm=GEPA(iterations=1),
# )

# # Run optimization
# optimized_prompt = optimizer.optimize(prompt=prompt, goldens=goldens)

# # Print results
# print("\n" + "=" * 60)
# print("OPTIMIZATION COMPLETE")
# print("=" * 60)
# print(f"\nOriginal prompt:\n{prompt.text_template}")
# print(f"\nOptimized prompt:\n{optimized_prompt.text_template}")
2 changes: 2 additions & 0 deletions deepeval/test_case/__init__.py
@@ -4,6 +4,7 @@
ToolCall,
ToolCallParams,
MLLMImage,
Context,
)
from .conversational_test_case import (
ConversationalTestCase,
@@ -24,6 +25,7 @@
"LLMTestCaseParams",
"ToolCall",
"ToolCallParams",
"Context",
"ConversationalTestCase",
"Turn",
"TurnParams",
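
With Context now re-exported from deepeval/test_case/__init__.py, downstream code can import it next to the other test case primitives instead of reaching into the internal llm_test_case module. A minimal sketch, assuming the package is installed from this branch:

# Context sits alongside LLMTestCase and ConversationalTestCase in the public API.
from deepeval.test_case import LLMTestCase, ConversationalTestCase, Context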
49 changes: 41 additions & 8 deletions deepeval/test_case/conversational_test_case.py
@@ -6,7 +6,7 @@
model_validator,
AliasChoices,
)
from typing import List, Optional, Dict, Literal
from typing import List, Optional, Dict, Literal, Union
from copy import deepcopy
from enum import Enum

@@ -18,7 +18,7 @@
MCPToolCall,
validate_mcp_servers,
)
from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY, Context


class TurnParams(Enum):
@@ -131,7 +131,7 @@ def validate_input(cls, data):
class ConversationalTestCase(BaseModel):
turns: List[Turn]
scenario: Optional[str] = Field(default=None)
context: Optional[List[str]] = Field(default=None)
context: Optional[List[Union[str, Context]]] = Field(default=None)
name: Optional[str] = Field(default=None)
user_description: Optional[str] = Field(
default=None,
@@ -163,9 +163,41 @@ class ConversationalTestCase(BaseModel):
_dataset_rank: Optional[int] = PrivateAttr(default=None)
_dataset_alias: Optional[str] = PrivateAttr(default=None)
_dataset_id: Optional[str] = PrivateAttr(default=None)
_context_items: Optional[List[Union[str, Context]]] = PrivateAttr(
default=None
)

@model_validator(mode="after")
def set_is_multimodal(self):
def post_init(self):

self._handle_context_data()
self._set_is_multimodal()

return self

def _handle_context_data(self):
if self.context is None:
return

self._context_items = self.context[:]

resolved_context = []

for item in self.context:
if isinstance(item, Context):
resolved = item.resolve_contexts()
if isinstance(resolved, list):
resolved_context.extend(resolved)
else:
resolved_context.append(resolved)
else:
resolved_context.append(item)

self.context = resolved_context

return self

def _set_is_multimodal(self):
import re

if self.multimodal is True:
@@ -195,8 +227,6 @@ def set_is_multimodal(self):
for context in turn.retrieval_context
)

return self

@model_validator(mode="before")
def validate_input(cls, data):
turns = data.get("turns")
@@ -209,9 +239,12 @@ def validate_input(cls, data):
# Ensure `context` is None or a list of strings / Context objects
if context is not None:
if not isinstance(context, list) or not all(
isinstance(item, str) for item in context
(isinstance(item, str) or isinstance(item, Context))
for item in context
):
raise TypeError("'context' must be None or a list of strings")
raise TypeError(
"'context' must be None or a list of or 'Context'"
)

if mcp_servers is not None:
validate_mcp_servers(mcp_servers)
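
For illustration, a minimal sketch of how the updated ConversationalTestCase validator might be exercised. The Context constructor argument below is an assumption made only for this example (the diff confirms that Context instances expose resolve_contexts(), not their exact fields), and the Turn fields follow the existing deepeval API:

from deepeval.test_case import ConversationalTestCase, Turn, Context

# Plain strings remain valid, so existing test cases are unaffected.
# A Context item (field name assumed) is resolved via resolve_contexts()
# during post-init and flattened into the same list of strings.
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What does the refund policy say?"),
        Turn(role="assistant", content="Refunds are accepted within 30 days."),
    ],
    context=[
        "Refunds are accepted within 30 days of purchase.",  # raw string
        Context(context="Shipping is free above $50."),  # hypothetical field name
    ],
)

# After validation, test_case.context holds only resolved strings; the
# original mixed list is preserved privately in _context_items.
print(test_case.context)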