Skip to content

Commit 06281a5

Browse files
authored
Merge pull request #176 from volcengine/feat/cli-eval
feat(eval): support do eval from cli
2 parents 05a0b57 + 54e0b69 commit 06281a5

File tree

7 files changed

+322
-95
lines changed

7 files changed

+322
-95
lines changed

docs/content/90.cli/1.overview.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,12 @@ navigation:
55
icon: i-lucide-heading-1
66
---
77

8-
VeADK 提供了多类命令。
8+
VeADK 提供如下命令便捷您的操作:
9+
10+
| 命令 | 描述 | 说明 |
11+
| :-- | :-- | :-- |
12+
| `veadk init` | 生成可在 VeFaaS 中部署的项目脚手架 | 将会在您的目录中新增 `deploy.yaml` 文件 |
13+
| `veadk deploy` | 将某个项目部署到 VeFaaS 中 | |
14+
| `veadk prompt` | 优化智能体系统提示词 | 借助火山引擎 PromptPilot 产品 |
15+
| `veadk web` | 支持长短期记忆、知识库的前端调试界面 | 兼容 Google ADK web |
16+
| `veadk eval` | 支持不同后端的评测 | 评测后端包括 `adk`、`deepeval`,评测数据集源包括 Google ADK 评测集格式文件,以及 Tracing 文件 |

docs/content/90.cli/2.commands.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,37 @@ veadk web --session_service_uri="mysql+pymysql://{user}:{password}@{host}/{datab
5050
```
5151

5252
它们能够自动读取执行命令目录中的`agent.py`文件,并加载`root_agent`全局变量。
53+
54+
## 评测
55+
56+
通过使用 `veadk eval` 来进行智能体评测,相关参数如下:
57+
58+
::field-group
59+
::field{name="--agent-dir" type="string"}
60+
待评测的 Agent 目录。目录中需要符合 Google ADK 的项目结构,即需要具备一个导出 `root_agent` 的名为 `agent.py` 的文件。
61+
::
62+
63+
::field{name="--agent-a2a-url" type="string"}
64+
待评测的云端 Agent 路径,要求云端 Agent 使用 A2A 协议进行部署。
65+
::
66+
67+
::field{name="--evalset-file" type="string"}
68+
Google ADK 格式的评测集文件
69+
::
70+
71+
::field{name="--evaluator" type="string"}
72+
评测器类别:`adk` 为 Google ADK 内置评测器,评测模型为自身;`deepeval` 为 DeepEval 评测器,评测模型可以通过 `--judge-model-name` 参数指定。
73+
::
74+
75+
::field{name="--judge-model-name" type="string"}
76+
评测模型名称,默认为 `doubao-1-5-pro-256k-250115`。该参数在 `--evaluator` 值为 `adk` 时无效。
77+
::
78+
79+
::field{name="--volcengine-access-key" type="string"}
80+
火山引擎 Access Key
81+
::
82+
83+
::field{name="--volcengine-secret-key" type="string"}
84+
火山引擎 Secret Key
85+
::
86+
::

tests/test_evaluator.py

Lines changed: 39 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -62,60 +62,61 @@
6262
TRACE_SET_DATA = [
6363
{
6464
"name": "execute_tool get_city_weather",
65-
"span_id": 5421848634094108689,
66-
"trace_id": 115143748123782151752771111946932434777,
67-
"start_time": 1754884672226444000,
68-
"end_time": 1754884672226993000,
65+
"span_id": 4497348974122733469,
66+
"trace_id": 142655176138954930885272077198014871976,
67+
"start_time": 1758158957162250000,
68+
"end_time": 1758158957162426000,
6969
"attributes": {
7070
"gen_ai.tool.name": "get_city_weather",
71-
"gen_ai.tool.description": "Retrieves the weather information of a given city. the args must in English",
72-
"gen_ai.tool.call.id": "call_6ow5622pvcouw3tpvr7rfqtl",
73-
"gcp.vertex.agent.tool_call_args": '{"city": "Xi\'an"}',
71+
"gen_ai.tool.input": '{"name": "get_city_weather", "description": "Retrieves the weather information of a given city. the args must in English", "parameters": {"city": "Beijing"}}',
72+
"gen_ai.tool.output": '{"id": "call_w4bj25flpvs74zgyyiquqh5s", "name": "get_city_weather", "response": {"result": "Sunny, 25°C"}}',
7473
},
75-
"parent_span_id": 7997784243558253239,
74+
"parent_span_id": 574819447039686650,
7675
},
7776
{
7877
"name": "call_llm",
79-
"span_id": 7997784243558253239,
80-
"trace_id": 115143748123782151752771111946932434777,
78+
"span_id": 574819447039686650,
79+
"trace_id": 142655176138954930885272077198014871976,
80+
"start_time": 1758158945807630000,
81+
"end_time": 1758158957171304000,
8182
"attributes": {
82-
"session.id": "veadk_example_session",
83-
"user.id": "veadk_default_user",
83+
"gen_ai.app.name": "veadk_default_app",
84+
"gen_ai.user.id": "veadk_default_user",
85+
"gen_ai.prompt.0.role": "user",
86+
"gen_ai.prompt.0.content": "How is the weather like in BeiJing?",
8487
},
85-
"parent_span_id": 14844888006539887900,
88+
"parent_span_id": 13789664766018020416,
8689
},
8790
{
8891
"name": "call_llm",
89-
"span_id": 7789424022423491416,
90-
"trace_id": 115143748123782151752771111946932434777,
92+
"span_id": 9007934154052797946,
93+
"trace_id": 142655176138954930885272077198014871976,
94+
"start_time": 1758158957171713000,
95+
"end_time": 1758158964035230000,
9196
"attributes": {
92-
"session.id": "veadk_example_session",
93-
"user.id": "veadk_default_user",
97+
"gen_ai.app.name": "veadk_default_app",
98+
"gen_ai.user.id": "veadk_default_user",
99+
"gen_ai.prompt.0.content": "How is the weather like in BeiJing?",
100+
"gen_ai.completion.0.content": "The weather in Beijing is sunny with a temperature of 25°C.",
94101
},
95-
"parent_span_id": 14844888006539887900,
102+
"parent_span_id": 13789664766018020416,
96103
},
97104
{
98105
"name": "agent_run [chat_robot]",
99-
"span_id": 14844888006539887900,
100-
"trace_id": 115143748123782151752771111946932434777,
101-
"attributes": {
102-
"session.id": "veadk_example_session",
103-
"user.id": "veadk_default_user",
104-
},
105-
"parent_span_id": 2943363177785645047,
106+
"span_id": 13789664766018020416,
107+
"trace_id": 142655176138954930885272077198014871976,
108+
"start_time": 1758158945807350000,
109+
"end_time": 1758158964035291000,
110+
"attributes": {},
111+
"parent_span_id": 5589459087402275636,
106112
},
107113
{
108-
"name": "invocation [veadk_default_app]",
109-
"span_id": 2943363177785645047,
110-
"trace_id": 115143748123782151752771111946932434777,
111-
"start_time": 1754884660687962000,
112-
"end_time": 1754884676664833000,
113-
"attributes": {
114-
"input.value": '{"user_id": "veadk_default_user", "session_id": "veadk_example_session", "new_message": "{\\"parts\\": [{\\"video_metadata\\": null, \\"thought\\": null, \\"inline_data\\": null, \\"file_data\\": null, \\"thought_signature\\": null, \\"code_execution_result\\": null, \\"executable_code\\": null, \\"function_call\\": null, \\"function_response\\": null, \\"text\\": \\"How is the weather like in Xi\'an?\\"}], \\"role\\": \\"user\\"}", "run_config": "{\\"speech_config\\": null, \\"response_modalities\\": null, \\"save_input_blobs_as_artifacts\\": false, \\"support_cfc\\": false, \\"streaming_mode\\": \\"StreamingMode.NONE\\", \\"output_audio_transcription\\": null, \\"input_audio_transcription\\": null, \\"realtime_input_config\\": null, \\"enable_affective_dialog\\": null, \\"proactivity\\": null, \\"max_llm_calls\\": 500}"}',
115-
"user.id": "veadk_default_user",
116-
"session.id": "veadk_example_session",
117-
"output.value": '{"content":{"parts":[{"text":"The weather in Xi\'an is cool, with a temperature of 18\u00b0C."}],"role":"model"},"partial":false,"usage_metadata":{"candidates_token_count":132,"prompt_token_count":547,"total_token_count":679},"invocation_id":"e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441","author":"chat_robot","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{}},"id":"c0929124-9be0-4f75-a6ba-f7a531c9ccb6","timestamp":1754884672.227546}',
118-
},
114+
"name": "invocation",
115+
"span_id": 5589459087402275636,
116+
"trace_id": 142655176138954930885272077198014871976,
117+
"start_time": 1758158945807233000,
118+
"end_time": 1758158964035304000,
119+
"attributes": {},
119120
"parent_span_id": None,
120121
},
121122
]
@@ -154,8 +155,8 @@ def test_tracing_file_to_evalset():
154155
assert len(base_evaluator.invocation_list) == 1
155156
assert len(base_evaluator.invocation_list[0].invocations) == 1
156157
assert (
157-
base_evaluator.invocation_list[0].invocations[0].invocation_id
158-
== "e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441"
158+
base_evaluator.invocation_list[0].invocations[0].expected_output
159+
== "The weather in Beijing is sunny with a temperature of 25°C."
159160
)
160161

161162
os.remove(tracing_file_path)

veadk/cli/cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@
1616
import click
1717

1818
from veadk.cli.cli_deploy import deploy
19+
from veadk.cli.cli_eval import eval
1920
from veadk.cli.cli_init import init
21+
from veadk.cli.cli_pipeline import pipeline
2022
from veadk.cli.cli_prompt import prompt
2123
from veadk.cli.cli_web import web
22-
from veadk.cli.cli_pipeline import pipeline
2324
from veadk.version import VERSION
2425

2526

@@ -37,6 +38,7 @@ def veadk():
3738
veadk.add_command(prompt)
3839
veadk.add_command(web)
3940
veadk.add_command(pipeline)
41+
veadk.add_command(eval)
4042

4143
if __name__ == "__main__":
4244
veadk()

veadk/cli/cli_eval.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import click
16+
17+
from veadk.utils.logger import get_logger
18+
19+
logger = get_logger(__name__)
20+
21+
22+
# Default for `--agent-dir`; also used to tell an explicitly passed directory
# apart from the implicit one when `--agent-a2a-url` is given.
_DEFAULT_AGENT_DIR = "."


@click.command()
@click.option(
    "--agent-dir",
    default=_DEFAULT_AGENT_DIR,
    help="To-be-evaluated agent directory. Must export `root_agent` in `agent.py`",
)
@click.option(
    "--agent-a2a-url",
    default=None,
    help="To-be-evaluated agent URL. The agent should be deployed as A2A mode.",
)
@click.option(
    "--evalset-file",
    required=True,
    help="Google ADK formatted evalset file path",
)
@click.option(
    "--evaluator",
    type=click.Choice(["adk", "deepeval"], case_sensitive=False),
    # Without a value neither evaluator branch runs, so the command would
    # silently do nothing; make the option mandatory instead.
    required=True,
    help="Evaluator type, choose `adk` or `deepeval`",
)
@click.option(
    "--judge-model-name",
    default="doubao-1-5-pro-256k-250115",
    help="Judge model name, default is `doubao-1-5-pro-256k-250115`. Useless under `adk` evaluator.",
)
@click.option(
    "--volcengine-access-key",
    default=None,
    help="Volcengine access key for using Volcengine models",
)
@click.option(
    "--volcengine-secret-key",
    default=None,
    help="Volcengine secret key for using Volcengine models",
)
def eval(
    agent_dir: str,
    agent_a2a_url: str,
    evalset_file: str,
    evaluator: str,
    judge_model_name: str,
    volcengine_access_key: str,
    volcengine_secret_key: str,
) -> None:
    """Evaluate an agent against a Google ADK formatted evalset file.
    \f
    The agent is either loaded from a local directory or wrapped as a remote
    A2A agent, then evaluated with the selected evaluator.

    Args:
        agent_dir: Local agent directory; ignored when `agent_a2a_url` is set.
        agent_a2a_url: URL of a remotely deployed agent (A2A mode), or None.
        evalset_file: Path of the evalset file passed to the evaluator.
        evaluator: Either `adk` or `deepeval`.
        judge_model_name: Judge model used by the `deepeval` evaluator.
        volcengine_access_key: Exported as `VOLCENGINE_ACCESS_KEY` when that
            environment variable is not already set.
        volcengine_secret_key: Exported as `VOLCENGINE_SECRET_KEY` when that
            environment variable is not already set.

    Raises:
        ImportError: If the evaluation dependencies cannot be imported.
        ValueError: If neither an agent directory nor an A2A URL is given.
    """
    # Imports are kept inside the command body, as in the original, so they
    # only run when `veadk eval` is actually invoked.
    import asyncio
    import os
    from pathlib import Path

    from google.adk.cli.utils.agent_loader import AgentLoader

    from veadk.a2a.remote_ve_agent import RemoteVeAgent
    from veadk.config import getenv, settings
    from veadk.prompts.prompt_evaluator import eval_principle_prompt

    try:
        from deepeval.metrics import GEval, ToolCorrectnessMetric
        from deepeval.test_case import LLMTestCaseParams

        from veadk.evaluation.adk_evaluator import ADKEvaluator
        from veadk.evaluation.deepeval_evaluator import DeepevalEvaluator
    except ImportError as e:
        # Chain the original error so the actually missing module stays visible.
        raise ImportError(
            "Please install veadk with `[eval]` extras, e.g., `pip install veadk-python[eval]`"
        ) from e

    # ====== prepare agent instance ======
    if agent_a2a_url:
        # `agent_dir` always carries its default, so only warn when the user
        # evidently passed a directory of their own as well.
        if agent_dir and agent_dir != _DEFAULT_AGENT_DIR:
            logger.warning(
                "`--agent-dir` and `--agent-a2a-url` are both provided, will use `--agent-a2a-url`."
            )
        agent_instance = RemoteVeAgent(name="a2a_agent", url=agent_a2a_url)
        logger.info(f"Loaded agent from {agent_a2a_url}")
    elif agent_dir:
        # Resolve before splitting: `Path(".").name` is an empty string, so the
        # default directory would otherwise yield an empty agent name.
        agent_path = Path(agent_dir).resolve()
        agent_instance = AgentLoader(str(agent_path.parent)).load_agent(
            agent_path.name
        )
        logger.info(f"Loaded agent from {agent_dir}, agent name: {agent_instance.name}")
    else:
        raise ValueError(
            "Option `--agent-dir` or `--agent-a2a-url` should be provided one of them."
        )

    # ====== prepare envs ======
    # Values already present in the environment take precedence over the options.
    if volcengine_access_key and "VOLCENGINE_ACCESS_KEY" not in os.environ:
        os.environ["VOLCENGINE_ACCESS_KEY"] = volcengine_access_key
    if volcengine_secret_key and "VOLCENGINE_SECRET_KEY" not in os.environ:
        os.environ["VOLCENGINE_SECRET_KEY"] = volcengine_secret_key

    # ====== prepare evaluator instance ======
    if evaluator == "adk":
        # The evaluation itself must not depend on the judge model name; only
        # the notice does.
        if judge_model_name:
            logger.warning(
                "Using Google ADK evaluator, `--judge-model-name` will be ignored."
            )
        evaluator_instance = ADKEvaluator(agent=agent_instance)

        asyncio.run(evaluator_instance.evaluate(eval_set_file_path=evalset_file))

    elif evaluator == "deepeval":
        # NOTE(review): these values are not passed on below; the lookups are
        # kept because `getenv` presumably validates that the credentials
        # exist -- confirm against `veadk.config.getenv`.
        if not volcengine_access_key:
            volcengine_access_key = getenv("VOLCENGINE_ACCESS_KEY")
        if not volcengine_secret_key:
            volcengine_secret_key = getenv("VOLCENGINE_SECRET_KEY")

        evaluator_instance = DeepevalEvaluator(
            agent=agent_instance,
            judge_model_api_key=settings.model.api_key,
            judge_model_name=judge_model_name,
        )

        judge_model = evaluator_instance.judge_model

        metrics = [
            GEval(
                threshold=0.8,
                name="Base Evaluation",
                criteria=eval_principle_prompt,
                evaluation_params=[
                    LLMTestCaseParams.INPUT,
                    LLMTestCaseParams.ACTUAL_OUTPUT,
                    LLMTestCaseParams.EXPECTED_OUTPUT,
                ],
                model=judge_model,
            ),
            ToolCorrectnessMetric(threshold=0.5),
        ]

        asyncio.run(
            evaluator_instance.evaluate(
                eval_set_file_path=evalset_file, metrics=metrics
            )
        )

0 commit comments

Comments
 (0)