Skip to content

Commit 764e3dd

Browse files
committed
feat(eval): support do eval from cli
1 parent 05a0b57 commit 764e3dd

File tree

5 files changed

+210
-14
lines changed

5 files changed

+210
-14
lines changed

docs/content/90.cli/1.overview.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,12 @@ navigation:
55
icon: i-lucide-heading-1
66
---
77

8-
VeADK 提供了多类命令。
8+
VeADK 提供如下命令便捷您的操作:
9+
10+
| 命令 | 描述 | 说明 |
11+
| :-- | :-- | :-- |
12+
| `veadk init` | 生成可在 VeFaaS 中部署的项目脚手架 | 将会在您的目录中新增 `deploy.yaml` 文件 |
13+
| `veadk deploy` | 将某个项目部署到 VeFaaS 中 | |
14+
| `veadk prompt` | 优化智能体系统提示词 | 借助火山引擎 PromptPilot 产品 |
15+
| `veadk web` | 支持长短期记忆、知识库的前端调试界面 | 兼容 Google ADK web |
16+
| `veadk eval` | 支持不同后端的评测 | 评测后端包括 `adk` 和 `deepeval`,评测数据集源包括 Google ADK 评测集格式文件,以及 Tracing 文件 |

docs/content/90.cli/2.commands.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,37 @@ veadk web --session_service_uri="mysql+pymysql://{user}:{password}@{host}/{datab
5050
```
5151

5252
它们能够自动读取执行命令目录中的`agent.py`文件,并加载`root_agent`全局变量。
53+
54+
## 评测
55+
56+
通过使用 `veadk eval` 来进行智能体评测,相关参数如下:
57+
58+
::field-group
59+
::field{name="--agent-dir" type="string"}
60+
待评测的 Agent 目录。目录中需要符合 Google ADK 的项目结构,即需要具备一个导出 `root_agent` 的名为 `agent.py` 的文件。
61+
::
62+
63+
::field{name="--agent-a2a-url" type="string"}
64+
待评测的云端 Agent 路径,要求云端 Agent 使用 A2A 协议进行部署。
65+
::
66+
67+
::field{name="--evalset-file" type="string"}
68+
Google ADK 格式的评测集文件
69+
::
70+
71+
::field{name="--evaluator" type="string"}
72+
评测器类别:`adk` 为 Google ADK 内置评测器,评测模型为自身;`deepeval` 为 DeepEval 评测器,评测模型可以通过 `--judge-model-name` 参数指定。
73+
::
74+
75+
::field{name="--judge-model-name" type="string"}
76+
评测模型名称,默认为 `doubao-1-5-pro-256k-250115`。该参数在 `--evaluator` 值为 `adk` 时无效。
77+
::
78+
79+
::field{name="--volcengine-access-key" type="string"}
80+
火山引擎 Access Key
81+
::
82+
83+
::field{name="--volcengine-secret-key" type="string"}
84+
火山引擎 Secret Key
85+
::
86+
::

veadk/cli/cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@
1616
import click
1717

1818
from veadk.cli.cli_deploy import deploy
19+
from veadk.cli.cli_eval import eval
1920
from veadk.cli.cli_init import init
21+
from veadk.cli.cli_pipeline import pipeline
2022
from veadk.cli.cli_prompt import prompt
2123
from veadk.cli.cli_web import web
22-
from veadk.cli.cli_pipeline import pipeline
2324
from veadk.version import VERSION
2425

2526

@@ -37,6 +38,7 @@ def veadk():
3738
veadk.add_command(prompt)
3839
veadk.add_command(web)
3940
veadk.add_command(pipeline)
41+
veadk.add_command(eval)
4042

4143
if __name__ == "__main__":
4244
veadk()

veadk/cli/cli_eval.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
import click
2+
3+
from veadk.utils.logger import get_logger
4+
5+
logger = get_logger(__name__)
6+
7+
8+
def _given_on_command_line(param_name: str) -> bool:
    """Return True when the running click command got *param_name* from argv.

    Options that carry a default are always truthy inside the command body, so
    truthiness alone cannot tell "the user typed it" from "click filled it in".
    """
    from click.core import ParameterSource

    source = click.get_current_context().get_parameter_source(param_name)
    return source is ParameterSource.COMMANDLINE


@click.command()
@click.option(
    "--agent-dir",
    default=".",
    help="To-be-evaluated agent directory. Must export `root_agent` in `agent.py`",
)
@click.option(
    "--agent-a2a-url",
    default=None,
    help="To-be-evaluated agent URL. The agent should be deployed as A2A mode.",
)
@click.option(
    "--evalset-file",
    required=True,
    help="Google ADK formatted evalset file path",
)
@click.option(
    "--evaluator",
    type=click.Choice(["adk", "deepeval"], case_sensitive=False),
    # Without a value neither evaluator branch runs and the command would
    # exit silently after loading the agent, so make the omission an error.
    required=True,
    help="Evaluator type, choose `adk` or `deepeval`",
)
@click.option(
    "--judge-model-name",
    default="doubao-1-5-pro-256k-250115",
    help="Judge model name, default is `doubao-1-5-pro-256k-250115`. Useless under `adk` evaluator.",
)
@click.option(
    "--volcengine-access-key",
    default=None,
    help="Volcengine access key for using Volcengine models",
)
@click.option(
    "--volcengine-secret-key",
    default=None,
    help="Volcengine secret key for using Volcengine models",
)
def eval(  # noqa: A001 - name is imported by veadk.cli.cli and is the CLI command name
    agent_dir: str,
    agent_a2a_url: str | None,
    evalset_file: str,
    evaluator: str,
    judge_model_name: str,
    volcengine_access_key: str | None,
    volcengine_secret_key: str | None,
) -> None:
    """Evaluate an agent against a Google ADK formatted evalset.

    The agent is loaded from a remote A2A deployment when `--agent-a2a-url` is
    given, otherwise from the local `--agent-dir`, and is then run through the
    chosen `--evaluator`.
    \f

    Raises:
        ImportError: If the evaluation dependencies are not installed.
        ValueError: If neither an agent directory nor an A2A URL is available.
    """
    # Heavy / optional dependencies are imported lazily so that importing the
    # CLI module itself stays cheap and does not require the eval extras.
    import asyncio
    import os
    from pathlib import Path

    from google.adk.cli.utils.agent_loader import AgentLoader

    from veadk.a2a.remote_ve_agent import RemoteVeAgent
    from veadk.config import getenv, settings
    from veadk.prompts.prompt_evaluator import eval_principle_prompt

    try:
        from deepeval.metrics import GEval, ToolCorrectnessMetric
        from deepeval.test_case import LLMTestCaseParams

        from veadk.evaluation.adk_evaluator import ADKEvaluator
        from veadk.evaluation.deepeval_evaluator import DeepevalEvaluator
    except ImportError as err:
        # The extras name in the sentence now matches the pip command shown.
        raise ImportError(
            "Please install veadk with `[eval]` extras, e.g., `pip install veadk-python[eval]`"
        ) from err

    # ====== prepare agent instance ======
    if not agent_dir and not agent_a2a_url:
        raise ValueError(
            "Option `--agent-dir` or `--agent-a2a-url` should be provided one of them."
        )

    if agent_a2a_url:
        # `--agent-dir` defaults to ".", so only warn about a conflict when the
        # user actually typed it; otherwise every A2A run would be noisy.
        if agent_dir and _given_on_command_line("agent_dir"):
            logger.warning(
                "`--agent-dir` and `--agent-a2a-url` are both provided, will use `--agent-a2a-url`."
            )
        agent_instance = RemoteVeAgent(name="a2a_agent", url=agent_a2a_url)
        logger.info(f"Loaded agent from {agent_a2a_url}")
    else:
        # Make the path absolute *before* splitting it: `Path(".").name` is an
        # empty string, so the default `--agent-dir="."` would otherwise ask
        # the loader for an agent named "". `abspath` normalises "." and ".."
        # lexically without following symlinks, so the directory name the user
        # passed is preserved.
        agent_path = Path(os.path.abspath(agent_dir))
        agent_instance = AgentLoader(str(agent_path.parent)).load_agent(
            agent_path.name
        )
        logger.info(f"Loaded agent from {agent_dir}, agent name: {agent_instance.name}")

    # ====== prepare envs ======
    # Values already present in the environment win over the CLI options.
    if volcengine_access_key and "VOLCENGINE_ACCESS_KEY" not in os.environ:
        os.environ["VOLCENGINE_ACCESS_KEY"] = volcengine_access_key
    if volcengine_secret_key and "VOLCENGINE_SECRET_KEY" not in os.environ:
        os.environ["VOLCENGINE_SECRET_KEY"] = volcengine_secret_key

    # ====== prepare evaluator instance ======
    evaluator = evaluator.lower()

    if evaluator == "adk":
        # Running the ADK evaluator must not depend on the judge model name;
        # only mention the option when the user explicitly passed it.
        if _given_on_command_line("judge_model_name"):
            logger.warning(
                "Using Google ADK evaluator, `--judge-model-name` will be ignored."
            )
        evaluator_instance = ADKEvaluator(agent=agent_instance)

        asyncio.run(evaluator_instance.evaluate(eval_set_file_path=evalset_file))

    elif evaluator == "deepeval":
        # NOTE(review): these two values are not used below; presumably the
        # lookups are kept so a missing credential surfaces here — confirm
        # against `veadk.config.getenv` before removing them.
        if not volcengine_access_key:
            volcengine_access_key = getenv("VOLCENGINE_ACCESS_KEY")
        if not volcengine_secret_key:
            volcengine_secret_key = getenv("VOLCENGINE_SECRET_KEY")

        evaluator_instance = DeepevalEvaluator(
            agent=agent_instance,
            judge_model_api_key=settings.model.api_key,
            judge_model_name=judge_model_name,
        )

        judge_model = evaluator_instance.judge_model

        metrics = [
            GEval(
                threshold=0.8,
                name="Base Evaluation",
                criteria=eval_principle_prompt,
                evaluation_params=[
                    LLMTestCaseParams.INPUT,
                    LLMTestCaseParams.ACTUAL_OUTPUT,
                    LLMTestCaseParams.EXPECTED_OUTPUT,
                ],
                model=judge_model,
            ),
            ToolCorrectnessMetric(threshold=0.5),
        ]

        asyncio.run(
            evaluator_instance.evaluate(
                eval_set_file_path=evalset_file, metrics=metrics
            )
        )

veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,13 @@
2424
from typing_extensions import override
2525

2626
from veadk.config import getenv
27-
from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
28-
from veadk.utils.logger import get_logger
29-
3027
from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult
28+
from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
3129
from veadk.evaluation.utils.prometheus import (
3230
PrometheusPushgatewayConfig,
3331
push_to_prometheus,
3432
)
33+
from veadk.utils.logger import get_logger
3534

3635
logger = get_logger(__name__)
3736

@@ -45,20 +44,27 @@ class DeepevalEvaluator(BaseEvaluator):
4544
def __init__(
4645
self,
4746
agent,
48-
judge_model_api_key: str = getenv("MODEL_JUDGE_API_KEY"),
49-
judge_model_name: str = getenv(
50-
"MODEL_JUDGE_NAME",
51-
"doubao-seed-1-6-250615",
52-
),
53-
judge_model_api_base: str = getenv(
54-
"MODEL_JUDGE_API_BASE",
55-
"https://ark.cn-beijing.volces.com/api/v3/",
56-
),
47+
judge_model_api_key: str = "",
48+
judge_model_name: str = "",
49+
judge_model_api_base: str = "",
5750
name: str = "veadk_deepeval_evaluator",
5851
prometheus_config: PrometheusPushgatewayConfig | None = None,
5952
):
6053
super().__init__(agent=agent, name=name)
6154

55+
if not judge_model_api_key:
56+
judge_model_api_key = getenv("MODEL_JUDGE_API_KEY")
57+
if not judge_model_name:
58+
judge_model_name = getenv(
59+
"MODEL_JUDGE_NAME",
60+
"doubao-seed-1-6-250615",
61+
)
62+
if not judge_model_api_base:
63+
judge_model_api_base = getenv(
64+
"MODEL_JUDGE_API_BASE",
65+
"https://ark.cn-beijing.volces.com/api/v3/",
66+
)
67+
6268
self.judge_model_name = judge_model_name
6369
self.judge_model = LocalModel(
6470
model=judge_model_name,

0 commit comments

Comments
 (0)