Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tests/test_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def test_evaluator():
with open(eval_set_file_path, "w") as f:
json.dump(EVAL_SET_DATA, f)

base_evaluator.generate_eval_data(file_path=eval_set_file_path)
base_evaluator.build_eval_set(file_path=eval_set_file_path)

assert len(base_evaluator.invocation_list) == 1
assert len(base_evaluator.invocation_list[0].invocations) == 1
Expand All @@ -149,7 +149,7 @@ def test_tracing_file_to_evalset():
with open(tracing_file_path, "w") as f:
json.dump(TRACE_SET_DATA, f)

base_evaluator.generate_eval_data(file_path=tracing_file_path)
base_evaluator.build_eval_set(file_path=tracing_file_path)

assert len(base_evaluator.invocation_list) == 1
assert len(base_evaluator.invocation_list[0].invocations) == 1
Expand Down
4 changes: 2 additions & 2 deletions veadk/evaluation/adk_evaluator/adk_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

from veadk.agent import Agent

from ..base_evaluator import BaseEvaluator
from veadk.evaluation.base_evaluator import BaseEvaluator


def formatted_timestamp():
Expand Down Expand Up @@ -238,7 +238,7 @@ def __init__(
# TODO: implement
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove


@override
async def eval(
async def evaluate(
self,
eval_set_file_path: str,
eval_id: str = f"test_{formatted_timestamp()}",
Expand Down
46 changes: 26 additions & 20 deletions veadk/evaluation/base_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,13 @@
from veadk.utils.misc import formatted_timestamp


class InvocationTestData(BaseModel):
class ToolInvocation(BaseModel):
tool_name: str
tool_args: dict[str, Any] = {}
tool_result: Any = None


class Invocation(BaseModel):
invocation_id: str = ""
input: str
actual_output: str
Expand All @@ -38,8 +44,8 @@ class InvocationTestData(BaseModel):
latency: str = "" # ms


class EvalCaseData(BaseModel):
invocations: list[InvocationTestData]
class EvalTestCase(BaseModel):
invocations: list[Invocation]


class MetricResult(BaseModel):
Expand Down Expand Up @@ -78,23 +84,23 @@ def __init__(
):
self.name = name
self.agent = agent
self.invocation_list: list[EvalCaseData] = []
self.invocation_list: list[EvalTestCase] = []
self.result_list: list[EvalResultData] = []
self.agent_information_list: list[dict] = []

def _load_eval_set(self, eval_set_file: str) -> EvalSet:
from .eval_set_file_loader import load_eval_set_from_file
def _build_eval_set_from_eval_json(self, eval_json_path: str) -> EvalSet:
from veadk.evaluation.eval_set_file_loader import load_eval_set_from_file

return load_eval_set_from_file(eval_set_file)
return load_eval_set_from_file(eval_json_path)

def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:
def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
try:
with open(tracing_file, "r") as f:
with open(tracing_json_path, "r") as f:
tracing_data = json.load(f)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON format in file {tracing_file}: {e}")
raise ValueError(f"Invalid JSON format in file {tracing_json_path}: {e}")
except Exception as e:
raise ValueError(f"Error reading file {tracing_file}: {e}")
raise ValueError(f"Error reading file {tracing_json_path}: {e}")

# Group spans by trace_id
trace_groups = {}
Expand Down Expand Up @@ -188,9 +194,9 @@ def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:

return evalset

def generate_eval_data(self, file_path: str):
def build_eval_set(self, file_path: str):
"""Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
eval_case_data_list: list[EvalCaseData] = []
eval_case_data_list: list[EvalTestCase] = []

try:
with open(file_path, "r") as f:
Expand All @@ -201,22 +207,22 @@ def generate_eval_data(self, file_path: str):
raise ValueError(f"Error reading file {file_path}: {e}")

if isinstance(file_content, dict) and "eval_cases" in file_content:
eval_cases = self._load_eval_set(file_path).eval_cases
eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases
elif (
isinstance(file_content, list)
and len(file_content) > 0
and all(
isinstance(span, dict) and "trace_id" in span for span in file_content
)
):
eval_cases = self._load_eval_set_from_tracing(file_path).eval_cases
eval_cases = self._build_eval_set_from_tracing_json(file_path).eval_cases
else:
raise ValueError(
f"Unsupported file format in {file_path}. Please provide a valid file."
)

for eval_case in eval_cases:
eval_case_data = EvalCaseData(invocations=[])
eval_case_data = EvalTestCase(invocations=[])
if eval_case.session_input:
self.agent_information_list.append(
{
Expand Down Expand Up @@ -247,7 +253,7 @@ def generate_eval_data(self, file_path: str):
)

eval_case_data.invocations.append(
InvocationTestData(
Invocation(
invocation_id=invocation.invocation_id,
input=_input,
actual_output="",
Expand All @@ -261,7 +267,7 @@ def generate_eval_data(self, file_path: str):
eval_case_data_list.append(eval_case_data)
self.invocation_list = eval_case_data_list

async def _run_agent_for_actual_data(self):
async def generate_actual_outputs(self):
for eval_case_data, agent_information in zip(
self.invocation_list, self.agent_information_list
):
Expand Down Expand Up @@ -333,7 +339,7 @@ async def _run_agent_for_actual_data(self):
invocation.actual_tool = _actual_tool
invocation.latency = _latency

def get_data(self) -> list[list[dict[str, Any]]]:
def get_eval_set_information(self) -> list[list[dict[str, Any]]]:
"""Merge the evaluation data and return it in the format of list[list[dict]]"""
result = []
for i, eval_case in enumerate(self.invocation_list):
Expand All @@ -360,7 +366,7 @@ def get_data(self) -> list[list[dict[str, Any]]]:
return result

@abstractmethod
async def eval(
async def evaluate(
self,
eval_set_file_path: str,
metrics: list[Any],
Expand Down
13 changes: 8 additions & 5 deletions veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,11 @@
from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
from veadk.utils.logger import get_logger

from ..base_evaluator import BaseEvaluator, EvalResultData, MetricResult
from ..utils.prometheus import PrometheusPushgatewayConfig, push_to_prometheus
from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult
from veadk.evaluation.utils.prometheus import (
PrometheusPushgatewayConfig,
push_to_prometheus,
)

logger = get_logger(__name__)

Expand Down Expand Up @@ -66,19 +69,19 @@ def __init__(
self.prometheus_config = prometheus_config

@override
async def eval(
async def evaluate(
self,
eval_set_file_path: str,
metrics: list[BaseMetric],
eval_id: str = f"test_{formatted_timestamp()}",
):
"""Target to Google ADK, we will use the same evaluation case format as Google ADK."""
# Get evaluation data by parsing eval set file
self.generate_eval_data(eval_set_file_path)
self.build_eval_set(eval_set_file_path)

# Get actual data by running agent
logger.info("Start to run agent for actual data.")
await self._run_agent_for_actual_data()
await self.generate_actual_outputs()
eval_case_data_list = self.invocation_list

# Build test cases in Deepeval format
Expand Down