
Commit 8ef9f6b

refactor(evaluator): refine base_evaluator
Parent: 7cef667

4 files changed: +38 −29 lines changed


tests/test_evaluator.py

Lines changed: 2 additions & 2 deletions
@@ -129,7 +129,7 @@ def test_evaluator():
     with open(eval_set_file_path, "w") as f:
         json.dump(EVAL_SET_DATA, f)

-    base_evaluator.generate_eval_data(file_path=eval_set_file_path)
+    base_evaluator.build_eval_set(file_path=eval_set_file_path)

     assert len(base_evaluator.invocation_list) == 1
     assert len(base_evaluator.invocation_list[0].invocations) == 1

@@ -149,7 +149,7 @@ def test_tracing_file_to_evalset():
     with open(tracing_file_path, "w") as f:
         json.dump(TRACE_SET_DATA, f)

-    base_evaluator.generate_eval_data(file_path=tracing_file_path)
+    base_evaluator.build_eval_set(file_path=tracing_file_path)

     assert len(base_evaluator.invocation_list) == 1
     assert len(base_evaluator.invocation_list[0].invocations) == 1
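
A minimal sketch of the dispatch rule both tests now exercise through the renamed build_eval_set(); it mirrors the check added in base_evaluator.py below. The payloads and helper names here are illustrative, not the real EVAL_SET_DATA / TRACE_SET_DATA fixtures:

    # Illustrative payloads: the real fixtures live in this test module.
    eval_set_payload = {"eval_cases": []}        # dict with "eval_cases" -> eval-set branch
    tracing_payload = [{"trace_id": "trace-0"}]  # list of spans with "trace_id" -> tracing branch

    def looks_like_eval_set(content) -> bool:
        return isinstance(content, dict) and "eval_cases" in content

    def looks_like_tracing(content) -> bool:
        return (
            isinstance(content, list)
            and len(content) > 0
            and all(isinstance(span, dict) and "trace_id" in span for span in content)
        )

    assert looks_like_eval_set(eval_set_payload)
    assert looks_like_tracing(tracing_payload)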

veadk/evaluation/adk_evaluator/adk_evaluator.py

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@

 from veadk.agent import Agent

-from ..base_evaluator import BaseEvaluator
+from veadk.evaluation.base_evaluator import BaseEvaluator


 def formatted_timestamp():

@@ -238,7 +238,7 @@ def __init__(
         # TODO: implement

     @override
-    async def eval(
+    async def evaluate(
         self,
         eval_set_file_path: str,
         eval_id: str = f"test_{formatted_timestamp()}",
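
A hedged call-site sketch of the renamed ADK entry point (to be awaited inside an async function); only parameters visible in this hunk are used, and the evaluator instance and file path are placeholders:

    # `adk_evaluator` stands in for an instance of the evaluator class defined in
    # veadk/evaluation/adk_evaluator/adk_evaluator.py; its construction is not shown here.
    await adk_evaluator.evaluate(              # was: eval()
        eval_set_file_path="eval_set.json",    # hypothetical path
        eval_id="my_eval_run",                 # optional; defaults to f"test_{formatted_timestamp()}"
    )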

veadk/evaluation/base_evaluator.py

Lines changed: 26 additions & 20 deletions
@@ -28,7 +28,13 @@
 from veadk.utils.misc import formatted_timestamp


-class InvocationTestData(BaseModel):
+class ToolInvocation(BaseModel):
+    tool_name: str
+    tool_args: dict[str, Any] = {}
+    tool_result: Any = None
+
+
+class Invocation(BaseModel):
     invocation_id: str = ""
     input: str
     actual_output: str

@@ -38,8 +44,8 @@ class InvocationTestData(BaseModel):
     latency: str = ""  # ms


-class EvalCaseData(BaseModel):
-    invocations: list[InvocationTestData]
+class EvalTestCase(BaseModel):
+    invocations: list[Invocation]


 class MetricResult(BaseModel):

@@ -78,23 +84,23 @@ def __init__(
     ):
         self.name = name
         self.agent = agent
-        self.invocation_list: list[EvalCaseData] = []
+        self.invocation_list: list[EvalTestCase] = []
         self.result_list: list[EvalResultData] = []
         self.agent_information_list: list[dict] = []

-    def _load_eval_set(self, eval_set_file: str) -> EvalSet:
-        from .eval_set_file_loader import load_eval_set_from_file
+    def _build_eval_set_from_eval_json(self, eval_json_path: str) -> EvalSet:
+        from veadk.evaluation.eval_set_file_loader import load_eval_set_from_file

-        return load_eval_set_from_file(eval_set_file)
+        return load_eval_set_from_file(eval_json_path)

-    def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:
+    def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
         try:
-            with open(tracing_file, "r") as f:
+            with open(tracing_json_path, "r") as f:
                 tracing_data = json.load(f)
         except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON format in file {tracing_file}: {e}")
+            raise ValueError(f"Invalid JSON format in file {tracing_json_path}: {e}")
         except Exception as e:
-            raise ValueError(f"Error reading file {tracing_file}: {e}")
+            raise ValueError(f"Error reading file {tracing_json_path}: {e}")

         # Group spans by trace_id
         trace_groups = {}

@@ -188,9 +194,9 @@ def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:

         return evalset

-    def generate_eval_data(self, file_path: str):
+    def build_eval_set(self, file_path: str):
         """Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
-        eval_case_data_list: list[EvalCaseData] = []
+        eval_case_data_list: list[EvalTestCase] = []

         try:
             with open(file_path, "r") as f:

@@ -201,22 +207,22 @@ def generate_eval_data(self, file_path: str):
             raise ValueError(f"Error reading file {file_path}: {e}")

         if isinstance(file_content, dict) and "eval_cases" in file_content:
-            eval_cases = self._load_eval_set(file_path).eval_cases
+            eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases
         elif (
             isinstance(file_content, list)
             and len(file_content) > 0
             and all(
                 isinstance(span, dict) and "trace_id" in span for span in file_content
             )
         ):
-            eval_cases = self._load_eval_set_from_tracing(file_path).eval_cases
+            eval_cases = self._build_eval_set_from_tracing_json(file_path).eval_cases
         else:
             raise ValueError(
                 f"Unsupported file format in {file_path}. Please provide a valid file."
             )

         for eval_case in eval_cases:
-            eval_case_data = EvalCaseData(invocations=[])
+            eval_case_data = EvalTestCase(invocations=[])
             if eval_case.session_input:
                 self.agent_information_list.append(
                     {

@@ -247,7 +253,7 @@ def generate_eval_data(self, file_path: str):
                 )

             eval_case_data.invocations.append(
-                InvocationTestData(
+                Invocation(
                     invocation_id=invocation.invocation_id,
                     input=_input,
                     actual_output="",

@@ -261,7 +267,7 @@
             eval_case_data_list.append(eval_case_data)
         self.invocation_list = eval_case_data_list

-    async def _run_agent_for_actual_data(self):
+    async def generate_actual_outputs(self):
         for eval_case_data, agent_information in zip(
             self.invocation_list, self.agent_information_list
         ):

@@ -333,7 +339,7 @@ async def _run_agent_for_actual_data(self):
                 invocation.actual_tool = _actual_tool
                 invocation.latency = _latency

-    def get_data(self) -> list[list[dict[str, Any]]]:
+    def get_eval_set_information(self) -> list[list[dict[str, Any]]]:
         """Merge the evaluation data and return it in the format of list[list[dict]]"""
         result = []
         for i, eval_case in enumerate(self.invocation_list):

@@ -360,7 +366,7 @@ def get_data(self) -> list[list[dict[str, Any]]]:
         return result

     @abstractmethod
-    async def eval(
+    async def evaluate(
         self,
         eval_set_file_path: str,
         metrics: list[Any],
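
Taken together, the public surface of BaseEvaluator after this commit is build_eval_set(), generate_actual_outputs(), get_eval_set_information(), and the abstract evaluate(). A minimal subclass sketch wired to the new names; the class name and the **kwargs catch-all are hypothetical, and nothing is assumed about parts of the base class not shown in these hunks:

    from typing import Any

    from veadk.evaluation.base_evaluator import BaseEvaluator


    class PrintingEvaluator(BaseEvaluator):
        """Hypothetical subclass used only to illustrate the renamed API."""

        async def evaluate(                    # was: eval()
            self,
            eval_set_file_path: str,
            metrics: list[Any],
            **kwargs,
        ):
            # Parse an eval-set JSON or tracing JSON into self.invocation_list
            self.build_eval_set(eval_set_file_path)       # was: generate_eval_data()

            # Run the agent to fill actual_output / actual_tool / latency
            await self.generate_actual_outputs()          # was: _run_agent_for_actual_data()

            # Merged expected-vs-actual view, returned as list[list[dict]]
            for case in self.get_eval_set_information():  # was: get_data()
                print(case)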

veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py

Lines changed: 8 additions & 5 deletions
@@ -27,8 +27,11 @@
 from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
 from veadk.utils.logger import get_logger

-from ..base_evaluator import BaseEvaluator, EvalResultData, MetricResult
-from ..utils.prometheus import PrometheusPushgatewayConfig, push_to_prometheus
+from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult
+from veadk.evaluation.utils.prometheus import (
+    PrometheusPushgatewayConfig,
+    push_to_prometheus,
+)

 logger = get_logger(__name__)

@@ -66,19 +69,19 @@ def __init__(
         self.prometheus_config = prometheus_config

     @override
-    async def eval(
+    async def evaluate(
         self,
         eval_set_file_path: str,
         metrics: list[BaseMetric],
         eval_id: str = f"test_{formatted_timestamp()}",
     ):
         """Target to Google ADK, we will use the same evaluation case format as Google ADK."""
         # Get evaluation data by parsing eval set file
-        self.generate_eval_data(eval_set_file_path)
+        self.build_eval_set(eval_set_file_path)

         # Get actual data by running agent
         logger.info("Start to run agent for actual data.")
-        await self._run_agent_for_actual_data()
+        await self.generate_actual_outputs()
         eval_case_data_list = self.invocation_list

         # Build test cases in Deepeval format
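
A hedged usage sketch of the DeepEval path after the rename; the evaluator instance, file path, and metric choice are placeholders, and only parameters visible in this hunk are used:

    import asyncio

    from deepeval.metrics import AnswerRelevancyMetric  # any deepeval BaseMetric should work here

    async def run(evaluator):
        # `evaluator` stands in for an instance of the DeepEval-based evaluator in this module.
        await evaluator.evaluate(                        # was: eval()
            eval_set_file_path="eval_set.json",          # hypothetical path
            metrics=[AnswerRelevancyMetric(threshold=0.7)],
        )

    # asyncio.run(run(evaluator))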
